diff options
author | Carlos Eduardo Seo <cseo@linux.vnet.ibm.com> | 2017-02-27 21:32:29 -0300 |
---|---|---|
committer | Lynn Boger <laboger@linux.vnet.ibm.com> | 2017-03-16 13:54:20 +0000 |
commit | d60166d5eea5084e0957e9028237cc87ecadbf7d (patch) | |
tree | 554ca2d9bae8901c3570b307540ca6d0ee1e3af4 /src/runtime/asm_ppc64x.s | |
parent | d5dc4905191c3f7cfeb52e93331d10ebd33301f5 (diff) | |
download | go-d60166d5eea5084e0957e9028237cc87ecadbf7d.tar.gz go-d60166d5eea5084e0957e9028237cc87ecadbf7d.zip |
runtime: improve IndexByte for ppc64x
This change adds a better implementation of IndexByte for ppc64x.
Improvement for bytes·IndexByte:
benchmark old ns/op new ns/op delta
BenchmarkIndexByte/10-16 12.5 8.48 -32.16%
BenchmarkIndexByte/32-16 34.4 9.85 -71.37%
BenchmarkIndexByte/4K-16 3089 217 -92.98%
BenchmarkIndexByte/4M-16 3154810 207051 -93.44%
BenchmarkIndexByte/64M-16 50564811 5579093 -88.97%
benchmark old MB/s new MB/s speedup
BenchmarkIndexByte/10-16 800.41 1179.64 1.47x
BenchmarkIndexByte/32-16 930.60 3249.10 3.49x
BenchmarkIndexByte/4K-16 1325.71 18832.53 14.21x
BenchmarkIndexByte/4M-16 1329.49 20257.29 15.24x
BenchmarkIndexByte/64M-16 1327.19 12028.63 9.06x
Improvement for strings·IndexByte:
benchmark old ns/op new ns/op delta
BenchmarkIndexByte-16 25.9 7.69 -70.31%
Fixes #19030
Change-Id: Ifb82bbb3d643ec44b98eaa2d08a07f47e5c2fd11
Reviewed-on: https://go-review.googlesource.com/37670
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Diffstat (limited to 'src/runtime/asm_ppc64x.s')
-rw-r--r-- | src/runtime/asm_ppc64x.s | 200 |
1 files changed, 165 insertions, 35 deletions
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s index 4ab5dec5cd..caa000bb56 100644 --- a/src/runtime/asm_ppc64x.s +++ b/src/runtime/asm_ppc64x.s @@ -1113,53 +1113,183 @@ equal: MOVBZ R3,ret+48(FP) RET -TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 - MOVD s+0(FP), R3 - MOVD s_len+8(FP), R4 - MOVBZ c+24(FP), R5 // byte to find - MOVD R3, R6 // store base for later - SUB $1, R3 - ADD R3, R4 // end-1 +TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40 + MOVD s+0(FP), R3 // R3 = byte array pointer + MOVD s_len+8(FP), R4 // R4 = length + MOVBZ c+24(FP), R5 // R5 = byte + MOVD $ret+32(FP), R14 // R14 = &ret + BR runtime·indexbytebody<>(SB) + +TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32 + MOVD s+0(FP), R3 // R3 = string + MOVD s_len+8(FP), R4 // R4 = length + MOVBZ c+16(FP), R5 // R5 = byte + MOVD $ret+24(FP), R14 // R14 = &ret + BR runtime·indexbytebody<>(SB) + +TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 + DCBT (R3) // Prepare cache line. + MOVD R3,R10 // Save base address for calculating the index later. + RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8. + RLDIMI $8,R5,$48,R5 // Replicating the byte across the register. + + // Calculate last acceptable address and check for possible overflow + // using a saturated add. + // Overflows set last acceptable address to 0xffffffffffffffff. + ADD R4,R3,R7 + SUBC R3,R7,R6 + SUBE R0,R0,R9 + MOVW R9,R6 + OR R6,R7,R7 + + RLDIMI $16,R5,$32,R5 + CMPU R4,$32 // Check if it's a small string (<32 bytes). Those will be processed differently. + MOVD $-1,R9 + WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28). + RLDIMI $32,R5,$0,R5 + ADD $-1,R7,R7 +#ifdef GOARCH_ppc64le + SLD R6,R9,R9 // Prepare mask for Little Endian +#else + SRD R6,R9,R9 // Same for Big Endian +#endif + BLE small_string // Jump to the small string case if it's <32 bytes. + + // Case for length >32 bytes + MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8. + CMPB R12,R5,R3 // Check for a match. + AND R9,R3,R3 // Mask bytes below s_base + RLDICL $0,R7,$61,R4 // length-1 + RLDICR $0,R7,$60,R7 // Last doubleword in R7 + CMPU R3,$0,CR7 // If we have a match, jump to the final computation + BNE CR7,done + + // Check for doubleword alignment and jump to the loop setup if aligned. + MOVFL R8,CR7 + BC 12,28,loop_setup + + // Not aligned, so handle the second doubleword + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR7 + BNE CR7,done + +loop_setup: + // We are now aligned to a 16-byte boundary. We will load two doublewords + // per loop iteration. The last doubleword is in R7, so our loop counter + // starts at (R7-R8)/16. + SUB R8,R7,R6 + SRD $4,R6,R6 + MOVD R6,CTR + // Note: when we have an align directive, align this loop to 32 bytes so + // it fits in a single icache sector. loop: - CMP R3, R4 + // Load two doublewords, then compare and merge in a single register. We + // will check two doublewords per iteration, then find out which of them + // contains the byte later. This speeds up the search. + MOVD 8(R8),R12 + MOVDU 16(R8),R11 + CMPB R12,R5,R3 + CMPB R11,R5,R9 + OR R3,R9,R6 + CMPU R6,$0,CR7 + BNE CR7,found + BC 16,0,loop + + // Counter zeroed, but we may have another doubleword to read + CMPU R8,R7 BEQ notfound - MOVBZU 1(R3), R7 - CMP R7, R5 - BNE loop - SUB R6, R3 // remove base - MOVD R3, ret+32(FP) - RET + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + BNE CR6,done notfound: - MOVD $-1, R3 - MOVD R3, ret+32(FP) + MOVD $-1,R3 + MOVD R3,(R14) RET -TEXT strings·IndexByte(SB),NOSPLIT,$0-32 - MOVD p+0(FP), R3 - MOVD b_len+8(FP), R4 - MOVBZ c+16(FP), R5 // byte to find - MOVD R3, R6 // store base for later - SUB $1, R3 - ADD R3, R4 // end-1 +found: + // One of the doublewords from the loop contains the byte we are looking + // for. Check the first doubleword and adjust the address if found. + CMPU R3,$0,CR6 + ADD $-8,R8,R8 + BNE CR6,done + + // Not found, so it must be in the second doubleword of the merged pair. + MOVD R9,R3 + ADD $8,R8,R8 + +done: + // At this point, R3 has 0xFF in the same position as the byte we are + // looking for in the doubleword. Use that to calculate the exact index + // of the byte. +#ifdef GOARCH_ppc64le + ADD $-1,R3,R11 + ANDN R3,R11,R11 + POPCNTD R11,R11 // Count trailing zeros (Little Endian). +#else + CNTLZD R3,R11 // Count leading zeros (Big Endian). +#endif + CMPU R8,R7 // Check if we are at the last doubleword. + SRD $3,R11 // Convert trailing zeros to bytes. + ADD R11,R8,R3 + CMPU R11,R4,CR7 // If at the last doubleword, check the byte offset. + BNE return + BLE CR7,return + MOVD $-1,R3 + MOVD R3,(R14) + RET -loop: - CMP R3, R4 +return: + SUB R10,R3 // Calculate index. + MOVD R3,(R14) + RET + +small_string: + // We unroll this loop for better performance. + CMPU R4,$0 // Check for length=0 BEQ notfound - MOVBZU 1(R3), R7 - CMP R7, R5 - BNE loop - SUB R6, R3 // remove base - MOVD R3, ret+24(FP) - RET + MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8. + CMPB R12,R5,R3 // Check for a match. + AND R9,R3,R3 // Mask bytes below s_base. + CMPU R3,$0,CR7 // If we have a match, jump to the final computation. + RLDICL $0,R7,$61,R4 // length-1 + RLDICR $0,R7,$60,R7 // Last doubleword in R7. + CMPU R8,R7 + BNE CR7,done + BEQ notfound // Hit length. + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BEQ notfound -notfound: - MOVD $-1, R3 - MOVD R3, ret+24(FP) - RET + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BEQ notfound + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BEQ notfound + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BR notfound TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 MOVD s1_base+0(FP), R5 |