aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/asm_ppc64x.s
diff options
context:
space:
mode:
authorCarlos Eduardo Seo <cseo@linux.vnet.ibm.com>2017-02-27 21:32:29 -0300
committerLynn Boger <laboger@linux.vnet.ibm.com>2017-03-16 13:54:20 +0000
commitd60166d5eea5084e0957e9028237cc87ecadbf7d (patch)
tree554ca2d9bae8901c3570b307540ca6d0ee1e3af4 /src/runtime/asm_ppc64x.s
parentd5dc4905191c3f7cfeb52e93331d10ebd33301f5 (diff)
downloadgo-d60166d5eea5084e0957e9028237cc87ecadbf7d.tar.gz
go-d60166d5eea5084e0957e9028237cc87ecadbf7d.zip
runtime: improve IndexByte for ppc64x
This change adds a better implementation of IndexByte for ppc64x. Improvement for bytes·IndexByte: benchmark old ns/op new ns/op delta BenchmarkIndexByte/10-16 12.5 8.48 -32.16% BenchmarkIndexByte/32-16 34.4 9.85 -71.37% BenchmarkIndexByte/4K-16 3089 217 -92.98% BenchmarkIndexByte/4M-16 3154810 207051 -93.44% BenchmarkIndexByte/64M-16 50564811 5579093 -88.97% benchmark old MB/s new MB/s speedup BenchmarkIndexByte/10-16 800.41 1179.64 1.47x BenchmarkIndexByte/32-16 930.60 3249.10 3.49x BenchmarkIndexByte/4K-16 1325.71 18832.53 14.21x BenchmarkIndexByte/4M-16 1329.49 20257.29 15.24x BenchmarkIndexByte/64M-16 1327.19 12028.63 9.06x Improvement for strings·IndexByte: benchmark old ns/op new ns/op delta BenchmarkIndexByte-16 25.9 7.69 -70.31% Fixes #19030 Change-Id: Ifb82bbb3d643ec44b98eaa2d08a07f47e5c2fd11 Reviewed-on: https://go-review.googlesource.com/37670 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Diffstat (limited to 'src/runtime/asm_ppc64x.s')
-rw-r--r--src/runtime/asm_ppc64x.s200
1 files changed, 165 insertions, 35 deletions
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 4ab5dec5cd..caa000bb56 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -1113,53 +1113,183 @@ equal:
MOVBZ R3,ret+48(FP)
RET
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
- MOVD s+0(FP), R3
- MOVD s_len+8(FP), R4
- MOVBZ c+24(FP), R5 // byte to find
- MOVD R3, R6 // store base for later
- SUB $1, R3
- ADD R3, R4 // end-1
+TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
+ MOVD s+0(FP), R3 // R3 = byte array pointer
+ MOVD s_len+8(FP), R4 // R4 = length
+ MOVBZ c+24(FP), R5 // R5 = byte
+ MOVD $ret+32(FP), R14 // R14 = &ret
+ BR runtime·indexbytebody<>(SB)
+
+TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
+ MOVD s+0(FP), R3 // R3 = string
+ MOVD s_len+8(FP), R4 // R4 = length
+ MOVBZ c+16(FP), R5 // R5 = byte
+ MOVD $ret+24(FP), R14 // R14 = &ret
+ BR runtime·indexbytebody<>(SB)
+
+TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+ DCBT (R3) // Prepare cache line.
+ MOVD R3,R10 // Save base address for calculating the index later.
+ RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
+ RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
+
+ // Calculate last acceptable address and check for possible overflow
+ // using a saturated add.
+ // Overflows set last acceptable address to 0xffffffffffffffff.
+ ADD R4,R3,R7
+ SUBC R3,R7,R6
+ SUBE R0,R0,R9
+ MOVW R9,R6
+ OR R6,R7,R7
+
+ RLDIMI $16,R5,$32,R5
+ CMPU R4,$32 // Check if it's a small string (<32 bytes). Those will be processed differently.
+ MOVD $-1,R9
+ WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+ RLDIMI $32,R5,$0,R5
+ ADD $-1,R7,R7
+#ifdef GOARCH_ppc64le
+ SLD R6,R9,R9 // Prepare mask for Little Endian
+#else
+ SRD R6,R9,R9 // Same for Big Endian
+#endif
+ BLE small_string // Jump to the small string case if it's <32 bytes.
+
+ // Case for length >32 bytes
+ MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
+ CMPB R12,R5,R3 // Check for a match.
+ AND R9,R3,R3 // Mask bytes below s_base
+ RLDICL $0,R7,$61,R4 // length-1
+ RLDICR $0,R7,$60,R7 // Last doubleword in R7
+ CMPU R3,$0,CR7 // If we have a match, jump to the final computation
+ BNE CR7,done
+
+ // Check for doubleword alignment and jump to the loop setup if aligned.
+ MOVFL R8,CR7
+ BC 12,28,loop_setup
+
+ // Not aligned, so handle the second doubleword
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR7
+ BNE CR7,done
+
+loop_setup:
+ // We are now aligned to a 16-byte boundary. We will load two doublewords
+ // per loop iteration. The last doubleword is in R7, so our loop counter
+ // starts at (R7-R8)/16.
+ SUB R8,R7,R6
+ SRD $4,R6,R6
+ MOVD R6,CTR
+ // Note: when we have an align directive, align this loop to 32 bytes so
+ // it fits in a single icache sector.
loop:
- CMP R3, R4
+ // Load two doublewords, then compare and merge in a single register. We
+ // will check two doublewords per iteration, then find out which of them
+ // contains the byte later. This speeds up the search.
+ MOVD 8(R8),R12
+ MOVDU 16(R8),R11
+ CMPB R12,R5,R3
+ CMPB R11,R5,R9
+ OR R3,R9,R6
+ CMPU R6,$0,CR7
+ BNE CR7,found
+ BC 16,0,loop
+
+ // Counter zeroed, but we may have another doubleword to read
+ CMPU R8,R7
BEQ notfound
- MOVBZU 1(R3), R7
- CMP R7, R5
- BNE loop
- SUB R6, R3 // remove base
- MOVD R3, ret+32(FP)
- RET
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ BNE CR6,done
notfound:
- MOVD $-1, R3
- MOVD R3, ret+32(FP)
+ MOVD $-1,R3
+ MOVD R3,(R14)
RET
-TEXT strings·IndexByte(SB),NOSPLIT,$0-32
- MOVD p+0(FP), R3
- MOVD b_len+8(FP), R4
- MOVBZ c+16(FP), R5 // byte to find
- MOVD R3, R6 // store base for later
- SUB $1, R3
- ADD R3, R4 // end-1
+found:
+ // One of the doublewords from the loop contains the byte we are looking
+ // for. Check the first doubleword and adjust the address if found.
+ CMPU R3,$0,CR6
+ ADD $-8,R8,R8
+ BNE CR6,done
+
+ // Not found, so it must be in the second doubleword of the merged pair.
+ MOVD R9,R3
+ ADD $8,R8,R8
+
+done:
+ // At this point, R3 has 0xFF in the same position as the byte we are
+ // looking for in the doubleword. Use that to calculate the exact index
+ // of the byte.
+#ifdef GOARCH_ppc64le
+ ADD $-1,R3,R11
+ ANDN R3,R11,R11
+ POPCNTD R11,R11 // Count trailing zeros (Little Endian).
+#else
+ CNTLZD R3,R11 // Count leading zeros (Big Endian).
+#endif
+ CMPU R8,R7 // Check if we are at the last doubleword.
+ SRD $3,R11 // Convert trailing zeros to bytes.
+ ADD R11,R8,R3
+ CMPU R11,R4,CR7 // If at the last doubleword, check the byte offset.
+ BNE return
+ BLE CR7,return
+ MOVD $-1,R3
+ MOVD R3,(R14)
+ RET
-loop:
- CMP R3, R4
+return:
+ SUB R10,R3 // Calculate index.
+ MOVD R3,(R14)
+ RET
+
+small_string:
+ // We unroll this loop for better performance.
+ CMPU R4,$0 // Check for length=0
BEQ notfound
- MOVBZU 1(R3), R7
- CMP R7, R5
- BNE loop
- SUB R6, R3 // remove base
- MOVD R3, ret+24(FP)
- RET
+ MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
+ CMPB R12,R5,R3 // Check for a match.
+ AND R9,R3,R3 // Mask bytes below s_base.
+ CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
+ RLDICL $0,R7,$61,R4 // length-1
+ RLDICR $0,R7,$60,R7 // Last doubleword in R7.
+ CMPU R8,R7
+ BNE CR7,done
+ BEQ notfound // Hit length.
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
-notfound:
- MOVD $-1, R3
- MOVD R3, ret+24(FP)
- RET
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BR notfound
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
MOVD s1_base+0(FP), R5