about | summary | refs | log | tree | commit | diff
path: root/src/hash
diff options
context:
space:
mode:
author Jayanth Krishnamurthy <jayanth.krishnamurthy@ibm.com> 2024-04-29 12:37:27 -0500
committer Lynn Boger <laboger@linux.vnet.ibm.com> 2024-05-06 12:09:50 +0000
commit ac174400f460e9b577079e8606439e0bae62adb0 (patch)
tree 4eb1ceb69e41760f9cbd6d3c9f51efc4a30b3c92 /src/hash
parent ff0bc4669e00b590df4f185e417ed6dc1818e566 (diff)
download go-ac174400f460e9b577079e8606439e0bae62adb0.tar.gz
         go-ac174400f460e9b577079e8606439e0bae62adb0.zip
hash/crc32: improve asm for ppc64SlicingUpdateBy8
Improvements are made in the assembler code which improve time and space by 9-10%.

1. ANDCC followed by SLD is combined and replaced by CLRLSLDI.
2. MOVWZ can use an indexed load and eliminate an ADD instruction in some cases.
   Example: ADD R7,R10,R7 followed by MOVWZ 0(R7),R5 can be replaced with just
   MOVWZ (R7)(R10),R5.
3. Optimizations for the block after the "short" label include the same MOVWZ
   use of indexed load, as well as other improvements.

The gain from code changes can be seen as follows, generated by benchstat:

goos: linux
goarch: ppc64le
pkg: hash/crc32
cpu: POWER10

                                       | oldCrc.out    | newCrc.out
                                       | sec/op        | sec/op        vs base
CRC32/poly=IEEE/size=15/align=0-12       50.19n ± 1%     39.85n ± 0%   -20.59% (p=0.002 n=6)
CRC32/poly=IEEE/size=15/align=1-12       50.18n ± 1%     39.87n ± 0%   -20.54% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=0-12       40.25n ± 0%     36.95n ± 0%    -8.19% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=1-12       40.31n ± 0%     36.95n ± 0%    -8.36% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=0-12      38.03n ± 0%     38.17n ± 0%    +0.37% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=1-12      89.19n ± 1%     73.65n ± 0%   -17.43% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=0-12      50.73n ± 7%     50.14n ± 0%    -1.18% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=1-12      101.00n ± 37%   81.58n ± 0%   -19.23% (p=0.002 n=6)
CRC32/poly=IEEE/size=4kB/align=0-12      98.30n ± 45%    93.05n ± 0%    -5.34% (p=0.043 n=6)
CRC32/poly=IEEE/size=4kB/align=1-12      140.8n ± 0%     125.8n ± 0%   -10.65% (p=0.002 n=6)
CRC32/poly=IEEE/size=32kB/align=0-12     525.8n ± 0%     528.5n ± 0%    +0.52% (p=0.011 n=6)
CRC32/poly=IEEE/size=32kB/align=1-12     584.4n ± 1%     576.3n ± 0%    -1.39% (p=0.002 n=6)
geomean                                  90.51n          81.74n         -9.69%

                                       | oldCrc.out    | newCrc.out
                                       | B/s           | B/s           vs base
CRC32/poly=IEEE/size=15/align=0-12       285.0Mi ± 1%    359.0Mi ± 0%   +25.94% (p=0.002 n=6)
CRC32/poly=IEEE/size=15/align=1-12       285.1Mi ± 1%    358.8Mi ± 0%   +25.86% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=0-12       947.8Mi ± 0%    1032.3Mi ± 0%   +8.91% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=1-12       946.2Mi ± 0%    1032.5Mi ± 0%   +9.12% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=0-12      12.54Gi ± 0%    12.49Gi ± 0%    -0.37% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=1-12      5.346Gi ± 1%    6.475Gi ± 0%   +21.12% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=0-12      18.80Gi ± 7%    19.02Gi ± 0%    +1.20% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=1-12      9.454Gi ± 27%   11.690Gi ± 0%  +23.66% (p=0.002 n=6)
CRC32/poly=IEEE/size=4kB/align=0-12      38.86Gi ± 31%   41.00Gi ± 0%    +5.49% (p=0.041 n=6)
CRC32/poly=IEEE/size=4kB/align=1-12      27.10Gi ± 0%    30.32Gi ± 0%   +11.89% (p=0.002 n=6)
CRC32/poly=IEEE/size=32kB/align=0-12     58.05Gi ± 0%    57.74Gi ± 0%    -0.53% (p=0.009 n=6)
CRC32/poly=IEEE/size=32kB/align=1-12     52.22Gi ± 1%    52.95Gi ± 0%    +1.41% (p=0.002 n=6)
geomean                                  6.074Gi         6.724Gi        +10.70%

Change-Id: I378c0e84e798656384a8009f4ac48b51614489b2
Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-ppc64_power8,gotip-linux-ppc64le_power8,gotip-linux-ppc64le_power9,gotip-linux-ppc64le_power10
Reviewed-on: https://go-review.googlesource.com/c/go/+/582395
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Eli Bendersky <eliben@google.com>
Diffstat (limited to 'src/hash')
-rw-r--r-- src/hash/crc32/crc32_ppc64le.s 53
1 files changed, 21 insertions, 32 deletions
diff --git a/src/hash/crc32/crc32_ppc64le.s b/src/hash/crc32/crc32_ppc64le.s
index 84ef213312..fb7c783f93 100644
--- a/src/hash/crc32/crc32_ppc64le.s
+++ b/src/hash/crc32/crc32_ppc64le.s
@@ -63,67 +63,56 @@ loop:
RLDICL $40,R9,$56,R17 // p[7]
SLD $2,R17,R17 // p[7]*4
RLDICL $40,R7,$56,R8 // crc>>24
- ADD R17,R10,R17 // &tab[0][p[7]]
SLD $2,R8,R8 // crc>>24*4
RLDICL $48,R9,$56,R18 // p[6]
SLD $2,R18,R18 // p[6]*4
+ MOVWZ (R10)(R17),R21 // tab[0][p[7]]
ADD $1024,R10,R10 // tab[1]
- MOVWZ 0(R17),R21 // tab[0][p[7]]
RLDICL $56,R9,$56,R19 // p[5]
- ADD R10,R18,R18 // &tab[1][p[6]]
SLD $2,R19,R19 // p[5]*4:1
- MOVWZ 0(R18),R22 // tab[1][p[6]]
+ MOVWZ (R10)(R18),R22 // tab[1][p[6]]
ADD $1024,R10,R10 // tab[2]
XOR R21,R22,R21 // xor done R22
- ADD R19,R10,R19 // &tab[2][p[5]]
- ANDCC $255,R9,R20 // p[4] ??
- SLD $2,R20,R20 // p[4]*4
- MOVWZ 0(R19),R23 // tab[2][p[5]]
+ CLRLSLDI $56,R9,$2,R20
+ MOVWZ (R10)(R19),R23 // tab[2][p[5]]
ADD $1024,R10,R10 // &tab[3]
- ADD R20,R10,R20 // tab[3][p[4]]
XOR R21,R23,R21 // xor done R23
- ADD $1024,R10,R10 // &tab[4]
- MOVWZ 0(R20),R24 // tab[3][p[4]]
- ADD R10,R8,R23 // &tab[4][crc>>24]
+ MOVWZ (R10)(R20),R24 // tab[3][p[4]]
+ ADD $1024,R10,R10 // &tab[4]
XOR R21,R24,R21 // xor done R24
- MOVWZ 0(R23),R25 // tab[4][crc>>24]
+ MOVWZ (R10)(R8),R25 // tab[4][crc>>24]
RLDICL $48,R7,$56,R24 // crc>>16&0xFF
XOR R21,R25,R21 // xor done R25
ADD $1024,R10,R10 // &tab[5]
SLD $2,R24,R24 // crc>>16&0xFF*4
- ADD R24,R10,R24 // &tab[5][crc>>16&0xFF]
- MOVWZ 0(R24),R26 // tab[5][crc>>16&0xFF]
+ MOVWZ (R10)(R24),R26 // tab[5][crc>>16&0xFF]
XOR R21,R26,R21 // xor done R26
RLDICL $56,R7,$56,R25 // crc>>8
ADD $1024,R10,R10 // &tab[6]
SLD $2,R25,R25 // crc>>8&FF*2
- ADD R25,R10,R25 // &tab[6][crc>>8&0xFF]
MOVBZ R7,R26 // crc&0xFF
- ADD $1024,R10,R10 // &tab[7]
- MOVWZ 0(R25),R27 // tab[6][crc>>8&0xFF]
+ MOVWZ (R10)(R25),R27 // tab[6][crc>>8&0xFF]
+ ADD $1024,R10,R10 // &tab[7]
SLD $2,R26,R26 // crc&0xFF*2
XOR R21,R27,R21 // xor done R27
- ADD R26,R10,R26 // &tab[7][crc&0xFF]
ADD $8,R5 // p = p[8:]
- MOVWZ 0(R26),R28 // tab[7][crc&0xFF]
+ MOVWZ (R10)(R26),R28 // tab[7][crc&0xFF]
XOR R21,R28,R21 // xor done R28
MOVWZ R21,R7 // crc for next round
- BC 16,0,loop // next 8 bytes
+ BDNZ loop
ANDCC $7,R6,R8 // any leftover bytes
BEQ done // none --> done
MOVD R8,CTR // byte count
PCALIGN $16 // align short loop
short:
- MOVBZ 0(R5),R8 // get v
- MOVBZ R7,R9 // byte(crc) -> R8 BE vs LE?
- SRD $8,R7,R14 // crc>>8
- XOR R8,R9,R8 // byte(crc)^v -> R8
- ADD $1,R5 // ptr to next v
- SLD $2,R8 // convert index-> bytes
- ADD R8,R4,R9 // &tab[byte(crc)^v]
- MOVWZ 0(R9),R10 // tab[byte(crc)^v]
- XOR R10,R14,R7 // loop crc in R7
- BC 16,0,short
+ MOVBZ 0(R5),R8 // get v
+ XOR R8,R7,R8 // byte(crc)^v -> R8
+ RLDIC $2,R8,$54,R8 // rldicl r8,r8,2,22
+ SRD $8,R7,R14 // crc>>8
+ MOVWZ (R4)(R8),R10
+ ADD $1,R5
+ XOR R10,R14,R7 // loop crc in R7
+ BDNZ short
done:
NOR R7,R7,R7 // ^crc
MOVW R7,ret+40(FP) // return crc
@@ -333,7 +322,7 @@ cool_top:
LVX (R4+off112),V23 // next in buffer
ADD $128,R4 // bump up buffer pointer
- BC 16,0,cool_top // are we done?
+ BDNZ cool_top // are we done?
first_cool_down: