diff options
author | Jayanth Krishnamurthy <jayanth.krishnamurthy@ibm.com> | 2024-04-29 12:37:27 -0500 |
---|---|---|
committer | Lynn Boger <laboger@linux.vnet.ibm.com> | 2024-05-06 12:09:50 +0000 |
commit | ac174400f460e9b577079e8606439e0bae62adb0 (patch) | |
tree | 4eb1ceb69e41760f9cbd6d3c9f51efc4a30b3c92 /src/hash | |
parent | ff0bc4669e00b590df4f185e417ed6dc1818e566 (diff) | |
download | go-ac174400f460e9b577079e8606439e0bae62adb0.tar.gz go-ac174400f460e9b577079e8606439e0bae62adb0.zip |
hash/crc32: improve asm for ppc64SlicingUpdateBy8
Improvements are made to the assembler code which improve time and
space by 9-10%.
1. An ANDCC followed by an SLD is combined into, and replaced by, a single CLRLSLDI.
2. MOVWZ can use an indexed load and eliminate an ADD instruction in some cases.
Example: ADD R7,R10,R7 followed by MOVWZ 0(R7),R5 can be replaced with just MOVWZ (R7)(R10),R5.
3. Optimizations for the block after the "short" label include the same use of MOVWZ with an indexed load, as well as other improvements.
The gain from the code changes can be seen in the following output,
generated by benchstat:
goos: linux
goarch: ppc64le
pkg: hash/crc32
cpu: POWER10
| oldCrc.out | newCrc.out |
| sec/op | sec/op vs base |
CRC32/poly=IEEE/size=15/align=0-12 50.19n ± 1% 39.85n ± 0% -20.59% (p=0.002 n=6)
CRC32/poly=IEEE/size=15/align=1-12 50.18n ± 1% 39.87n ± 0% -20.54% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=0-12 40.25n ± 0% 36.95n ± 0% -8.19% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=1-12 40.31n ± 0% 36.95n ± 0% -8.36% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=0-12 38.03n ± 0% 38.17n ± 0% +0.37% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=1-12 89.19n ± 1% 73.65n ± 0% -17.43% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=0-12 50.73n ± 7% 50.14n ± 0% -1.18% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=1-12 101.00n ± 37% 81.58n ± 0% -19.23% (p=0.002 n=6)
CRC32/poly=IEEE/size=4kB/align=0-12 98.30n ± 45% 93.05n ± 0% -5.34% (p=0.043 n=6)
CRC32/poly=IEEE/size=4kB/align=1-12 140.8n ± 0% 125.8n ± 0% -10.65% (p=0.002 n=6)
CRC32/poly=IEEE/size=32kB/align=0-12 525.8n ± 0% 528.5n ± 0% +0.52% (p=0.011 n=6)
CRC32/poly=IEEE/size=32kB/align=1-12 584.4n ± 1% 576.3n ± 0% -1.39% (p=0.002 n=6)
geomean 90.51n 81.74n -9.69%
| oldCrc.out | newCrc.out |
| B/s | B/s vs base |
CRC32/poly=IEEE/size=15/align=0-12 285.0Mi ± 1% 359.0Mi ± 0% +25.94% (p=0.002 n=6)
CRC32/poly=IEEE/size=15/align=1-12 285.1Mi ± 1% 358.8Mi ± 0% +25.86% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=0-12 947.8Mi ± 0% 1032.3Mi ± 0% +8.91% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=1-12 946.2Mi ± 0% 1032.5Mi ± 0% +9.12% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=0-12 12.54Gi ± 0% 12.49Gi ± 0% -0.37% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=1-12 5.346Gi ± 1% 6.475Gi ± 0% +21.12% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=0-12 18.80Gi ± 7% 19.02Gi ± 0% +1.20% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=1-12 9.454Gi ± 27% 11.690Gi ± 0% +23.66% (p=0.002 n=6)
CRC32/poly=IEEE/size=4kB/align=0-12 38.86Gi ± 31% 41.00Gi ± 0% +5.49% (p=0.041 n=6)
CRC32/poly=IEEE/size=4kB/align=1-12 27.10Gi ± 0% 30.32Gi ± 0% +11.89% (p=0.002 n=6)
CRC32/poly=IEEE/size=32kB/align=0-12 58.05Gi ± 0% 57.74Gi ± 0% -0.53% (p=0.009 n=6)
CRC32/poly=IEEE/size=32kB/align=1-12 52.22Gi ± 1% 52.95Gi ± 0% +1.41% (p=0.002 n=6)
geomean 6.074Gi 6.724Gi +10.70%
Change-Id: I378c0e84e798656384a8009f4ac48b51614489b2
Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-ppc64_power8,gotip-linux-ppc64le_power8,gotip-linux-ppc64le_power9,gotip-linux-ppc64le_power10
Reviewed-on: https://go-review.googlesource.com/c/go/+/582395
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Eli Bendersky <eliben@google.com>
Diffstat (limited to 'src/hash')
-rw-r--r-- | src/hash/crc32/crc32_ppc64le.s | 53 |
1 files changed, 21 insertions, 32 deletions
diff --git a/src/hash/crc32/crc32_ppc64le.s b/src/hash/crc32/crc32_ppc64le.s index 84ef213312..fb7c783f93 100644 --- a/src/hash/crc32/crc32_ppc64le.s +++ b/src/hash/crc32/crc32_ppc64le.s @@ -63,67 +63,56 @@ loop: RLDICL $40,R9,$56,R17 // p[7] SLD $2,R17,R17 // p[7]*4 RLDICL $40,R7,$56,R8 // crc>>24 - ADD R17,R10,R17 // &tab[0][p[7]] SLD $2,R8,R8 // crc>>24*4 RLDICL $48,R9,$56,R18 // p[6] SLD $2,R18,R18 // p[6]*4 + MOVWZ (R10)(R17),R21 // tab[0][p[7]] ADD $1024,R10,R10 // tab[1] - MOVWZ 0(R17),R21 // tab[0][p[7]] RLDICL $56,R9,$56,R19 // p[5] - ADD R10,R18,R18 // &tab[1][p[6]] SLD $2,R19,R19 // p[5]*4:1 - MOVWZ 0(R18),R22 // tab[1][p[6]] + MOVWZ (R10)(R18),R22 // tab[1][p[6]] ADD $1024,R10,R10 // tab[2] XOR R21,R22,R21 // xor done R22 - ADD R19,R10,R19 // &tab[2][p[5]] - ANDCC $255,R9,R20 // p[4] ?? - SLD $2,R20,R20 // p[4]*4 - MOVWZ 0(R19),R23 // tab[2][p[5]] + CLRLSLDI $56,R9,$2,R20 + MOVWZ (R10)(R19),R23 // tab[2][p[5]] ADD $1024,R10,R10 // &tab[3] - ADD R20,R10,R20 // tab[3][p[4]] XOR R21,R23,R21 // xor done R23 - ADD $1024,R10,R10 // &tab[4] - MOVWZ 0(R20),R24 // tab[3][p[4]] - ADD R10,R8,R23 // &tab[4][crc>>24] + MOVWZ (R10)(R20),R24 // tab[3][p[4]] + ADD $1024,R10,R10 // &tab[4] XOR R21,R24,R21 // xor done R24 - MOVWZ 0(R23),R25 // tab[4][crc>>24] + MOVWZ (R10)(R8),R25 // tab[4][crc>>24] RLDICL $48,R7,$56,R24 // crc>>16&0xFF XOR R21,R25,R21 // xor done R25 ADD $1024,R10,R10 // &tab[5] SLD $2,R24,R24 // crc>>16&0xFF*4 - ADD R24,R10,R24 // &tab[5][crc>>16&0xFF] - MOVWZ 0(R24),R26 // tab[5][crc>>16&0xFF] + MOVWZ (R10)(R24),R26 // tab[5][crc>>16&0xFF] XOR R21,R26,R21 // xor done R26 RLDICL $56,R7,$56,R25 // crc>>8 ADD $1024,R10,R10 // &tab[6] SLD $2,R25,R25 // crc>>8&FF*2 - ADD R25,R10,R25 // &tab[6][crc>>8&0xFF] MOVBZ R7,R26 // crc&0xFF - ADD $1024,R10,R10 // &tab[7] - MOVWZ 0(R25),R27 // tab[6][crc>>8&0xFF] + MOVWZ (R10)(R25),R27 // tab[6][crc>>8&0xFF] + ADD $1024,R10,R10 // &tab[7] SLD $2,R26,R26 // crc&0xFF*2 XOR R21,R27,R21 // xor done R27 - ADD R26,R10,R26 
// &tab[7][crc&0xFF] ADD $8,R5 // p = p[8:] - MOVWZ 0(R26),R28 // tab[7][crc&0xFF] + MOVWZ (R10)(R26),R28 // tab[7][crc&0xFF] XOR R21,R28,R21 // xor done R28 MOVWZ R21,R7 // crc for next round - BC 16,0,loop // next 8 bytes + BDNZ loop ANDCC $7,R6,R8 // any leftover bytes BEQ done // none --> done MOVD R8,CTR // byte count PCALIGN $16 // align short loop short: - MOVBZ 0(R5),R8 // get v - MOVBZ R7,R9 // byte(crc) -> R8 BE vs LE? - SRD $8,R7,R14 // crc>>8 - XOR R8,R9,R8 // byte(crc)^v -> R8 - ADD $1,R5 // ptr to next v - SLD $2,R8 // convert index-> bytes - ADD R8,R4,R9 // &tab[byte(crc)^v] - MOVWZ 0(R9),R10 // tab[byte(crc)^v] - XOR R10,R14,R7 // loop crc in R7 - BC 16,0,short + MOVBZ 0(R5),R8 // get v + XOR R8,R7,R8 // byte(crc)^v -> R8 + RLDIC $2,R8,$54,R8 // rldicl r8,r8,2,22 + SRD $8,R7,R14 // crc>>8 + MOVWZ (R4)(R8),R10 + ADD $1,R5 + XOR R10,R14,R7 // loop crc in R7 + BDNZ short done: NOR R7,R7,R7 // ^crc MOVW R7,ret+40(FP) // return crc @@ -333,7 +322,7 @@ cool_top: LVX (R4+off112),V23 // next in buffer ADD $128,R4 // bump up buffer pointer - BC 16,0,cool_top // are we done? + BDNZ cool_top // are we done? first_cool_down: |