author     Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>  2017-08-07 15:44:38 -0300
committer  Lynn Boger <laboger@linux.vnet.ibm.com>       2017-11-06 21:56:18 +0000
commit     be943df58860e7dec008ebb8d68428d54e311b94 (patch)
tree       69a3bce9cb554f691b8bfa73d8c079444f3c877f /src/runtime/asm_ppc64x.s
parent     4fcc835971ad63cf913ebe074ef6191e35a44ab9 (diff)
runtime: improve IndexByte for ppc64x
This change adds a better implementation of IndexByte in asm that
uses the vector registers/instructions on ppc64x.

benchmark                   old ns/op   new ns/op   delta
BenchmarkIndexByte/10-8     9.70        9.37        -3.40%
BenchmarkIndexByte/32-8     10.9        10.9        +0.00%
BenchmarkIndexByte/4K-8     254         92.8        -63.46%
BenchmarkIndexByte/4M-8     249246      118435      -52.48%
BenchmarkIndexByte/64M-8    10737987    7383096     -31.24%

benchmark                   old MB/s    new MB/s    speedup
BenchmarkIndexByte/10-8     1030.63     1067.24     1.04x
BenchmarkIndexByte/32-8     2922.69     2928.53     1.00x
BenchmarkIndexByte/4K-8     16065.95    44156.45    2.75x
BenchmarkIndexByte/4M-8     16827.96    35414.21    2.10x
BenchmarkIndexByte/64M-8    6249.67     9089.53     1.45x

Change-Id: I81dbdd620f7bb4e395ce4d1f2a14e8e91e39f9a1
Reviewed-on: https://go-review.googlesource.com/71710
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Diffstat (limited to 'src/runtime/asm_ppc64x.s')
-rw-r--r--  src/runtime/asm_ppc64x.s | 258
1 file changed, 191 insertions(+), 67 deletions(-)
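Before reading the diff, a rough model of the scalar technique may help: the byte search replicates the target byte across a doubleword, compares all eight bytes at once (CMPB in the asm; a XOR plus zero-byte test in the sketch below), and converts the resulting mask into an index. This is a hedged Go illustration, not part of the commit; indexByteWord and its SWAR constants are hypothetical.

    package main

    import (
        "fmt"
        "math/bits"
    )

    // indexByteWord returns the index of c inside the little-endian
    // doubleword w, or -1 if absent.
    func indexByteWord(w uint64, c byte) int {
        pat := uint64(c) * 0x0101010101010101 // byte replicated 8x
        x := w ^ pat                          // matching bytes become 0x00
        // Zero-byte test: the lowest set 0x80 bit marks the first match.
        m := (x - 0x0101010101010101) &^ x & 0x8080808080808080
        if m == 0 {
            return -1
        }
        return bits.TrailingZeros64(m) / 8
    }

    func main() {
        w := uint64(0x0000006f6c6c6568)    // "hello" packed little-endian
        fmt.Println(indexByteWord(w, 'l')) // 2
    }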
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 487187f4d8..e02ca16907 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -1084,24 +1084,17 @@ TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
DCBT (R3) // Prepare cache line.
- MOVD R3,R10 // Save base address for calculating the index later.
+ MOVD R3,R17 // Save base address for calculating the index later.
RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
-
- // Calculate last acceptable address and check for possible overflow
- // using a saturated add.
- // Overflows set last acceptable address to 0xffffffffffffffff.
- ADD R4,R3,R7
- SUBC R3,R7,R6
- SUBE R0,R0,R9
- MOVW R9,R6
- OR R6,R7,R7
+ ADD R4,R3,R7 // Last acceptable address in R7.
RLDIMI $16,R5,$32,R5
CMPU R4,$32 // Check if it's a small string (<32 bytes). Those will be processed differently.
MOVD $-1,R9
- WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+ WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
RLDIMI $32,R5,$0,R5
+ MOVD R7,R10 // Save last acceptable address in R10 for later.
ADD $-1,R7,R7
#ifdef GOARCH_ppc64le
SLD R6,R9,R9 // Prepare mask for Little Endian
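The RLDIMI instructions interleaved through this hunk build the comparison pattern by doubling the replicated region each time (1 byte, then 2, 4, and 8). A hedged Go equivalent of that replication, extending the sketch after the diffstat:

    // replicateByte mirrors the RLDIMI $8/$16/$32 sequence; the result
    // equals uint64(c) * 0x0101010101010101.
    func replicateByte(c byte) uint64 {
        v := uint64(c) // 0x00000000000000cc
        v |= v << 8    // 0x000000000000cccc
        v |= v << 16   // 0x00000000cccccccc
        v |= v << 32   // 0xcccccccccccccccc
        return v
    }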
@@ -1110,56 +1103,142 @@ TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
#endif
BLE small_string // Jump to the small string case if it's <32 bytes.
- // Case for length >32 bytes
+ // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+ // in V0, V1 and V10, then branch to the preloop.
+ ANDCC $63,R3,R11
+ BEQ CR0,qw_align
+ RLDICL $0,R3,$61,R11
+
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
CMPB R12,R5,R3 // Check for a match.
AND R9,R3,R3 // Mask bytes below s_base
- RLDICL $0,R7,$61,R4 // length-1
+ RLDICL $0,R7,$61,R6 // length-1
RLDICR $0,R7,$60,R7 // Last doubleword in R7
CMPU R3,$0,CR7 // If we have a match, jump to the final computation
BNE CR7,done
+ ADD $8,R8,R8
+ ADD $-8,R4,R4
+ ADD R4,R11,R4
- // Check for doubleword alignment and jump to the loop setup if aligned.
- MOVFL R8,CR7
- BC 12,28,loop_setup
+ // Check for quadword alignment
+ ANDCC $15,R8,R11
+ BEQ CR0,qw_align
- // Not aligned, so handle the second doubleword
- MOVDU 8(R8),R12
+ // Not aligned, so handle the next doubleword
+ MOVD 0(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR7
BNE CR7,done
+ ADD $8,R8,R8
+ ADD $-8,R4,R4
+
+ // Either quadword or 64-byte aligned at this point. We can use LVX.
+qw_align:
+
+ // Set up auxiliary data for the vectorized algorithm.
+ VSPLTISB $0,V0 // Replicate 0 across V0
+ VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
+ MTVRD R5,V1
+ LVSL (R0+R0),V11
+ VSLB V11,V10,V10
+ VSPLTB $7,V1,V1 // Replicate byte across V1
+ CMPU R4, $64 // If len <= 64, don't use the vectorized loop
+ BLE tail
+
+ // We will load 4 quadwords per iteration in the loop, so check for
+ // 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+ ANDCC $63,R8,R11
+ BEQ CR0,preloop
+
+ // Not 64-byte aligned. Load one quadword at a time until aligned.
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6 // Check for byte in V4
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ ADD $-16,R4,R4
+
+ ANDCC $63,R8,R11
+ BEQ CR0,preloop
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6 // Check for byte in V4
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ ADD $-16,R4,R4
+
+ ANDCC $63,R8,R11
+ BEQ CR0,preloop
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6 // Check for byte in V4
+ BNE CR6,found_qw_align
+ ADD $-16,R4,R4
+ ADD $16,R8,R8
+
+ // 64-byte aligned. Prepare for the main loop.
+preloop:
+ CMPU R4,$64
+ BLE tail // If len <= 64, don't use the vectorized loop
+
+ // We are now aligned to a 64-byte boundary. We will load 4 quadwords
+ // per loop iteration. The last acceptable address is in R10, so our
+ // loop counter starts at (R10-R8)/64.
+ SUB R8,R10,R6
+ SRD $6,R6,R9 // Loop counter in R9
+ MOVD R9,CTR
-loop_setup:
- // We are now aligned to a 16-byte boundary. We will load two doublewords
- // per loop iteration. The last doubleword is in R7, so our loop counter
- // starts at (R7-R8)/16.
- SUB R8,R7,R6
- SRD $4,R6,R6
- MOVD R6,CTR
+ MOVD $16,R11 // Load offsets for the vector loads
+ MOVD $32,R9
+ MOVD $48,R7
- // Note: when we have an align directive, align this loop to 32 bytes so
- // it fits in a single icache sector.
+ // Main loop: we will load 64 bytes per iteration
loop:
- // Load two doublewords, then compare and merge in a single register. We
- // will check two doublewords per iteration, then find out which of them
- // contains the byte later. This speeds up the search.
- MOVD 8(R8),R12
- MOVDU 16(R8),R11
- CMPB R12,R5,R3
- CMPB R11,R5,R9
- OR R3,R9,R6
- CMPU R6,$0,CR7
- BNE CR7,found
- BC 16,0,loop
-
- // Counter zeroed, but we may have another doubleword to read
- CMPU R8,R7
- BEQ notfound
-
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- BNE CR6,done
+ LVX (R8+R0),V2 // Load 4 16-byte vectors
+ LVX (R11+R8),V3
+ LVX (R9+R8),V4
+ LVX (R7+R8),V5
+ VCMPEQUB V1,V2,V6 // Look for byte in each vector
+ VCMPEQUB V1,V3,V7
+ VCMPEQUB V1,V4,V8
+ VCMPEQUB V1,V5,V9
+ VOR V6,V7,V11 // Compress the result in a single vector
+ VOR V8,V9,V12
+ VOR V11,V12,V11
+ VCMPEQUBCC V0,V11,V11 // Check for byte
+ BGE CR6,found
+ ADD $64,R8,R8
+ BC 16,0,loop // bdnz loop
+
+ // Handle the trailing bytes, or the case where R4 <= 64
+ RLDICL $0,R6,$58,R4
+tail:
+ CMPU R4,$0
+ BEQ notfound
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ CMPU R4,$16,CR6
+ BLE CR6,notfound
+ ADD $-16,R4,R4
+
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ CMPU R4,$16,CR6
+ BLE CR6,notfound
+ ADD $-16,R4,R4
+
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ CMPU R4,$16,CR6
+ BLE CR6,notfound
+ ADD $-16,R4,R4
+
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
notfound:
MOVD $-1,R3
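One detail worth calling out from the hunk above: the main loop issues four LVX loads and four VCMPEQUB compares per iteration, then VORs the results together so only one condition-register check and one branch are paid per 64 bytes. A scalar Go sketch of that shape, continuing the earlier sketch (helper names are hypothetical):

    // eq models one VCMPEQUB lane: 0xFF on match, 0x00 otherwise.
    func eq(a, b byte) byte {
        if a == b {
            return 0xFF
        }
        return 0
    }

    // anyMatch64 models one loop iteration over a 64-byte block.
    func anyMatch64(s *[64]byte, c byte) bool {
        var m0, m1, m2, m3 byte
        for i := 0; i < 16; i++ {
            m0 |= eq(s[i], c)    // VCMPEQUB on V2
            m1 |= eq(s[16+i], c) // VCMPEQUB on V3
            m2 |= eq(s[32+i], c) // VCMPEQUB on V4
            m3 |= eq(s[48+i], c) // VCMPEQUB on V5
        }
        // VOR merges; a single VCMPEQUBCC and branch decide per 64 bytes.
        return (m0|m1)|(m2|m3) != 0
    }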
@@ -1167,15 +1246,68 @@ notfound:
RET
found:
- // One of the doublewords from the loop contains the byte we are looking
- // for. Check the first doubleword and adjust the address if found.
- CMPU R3,$0,CR6
- ADD $-8,R8,R8
- BNE CR6,done
+ // We will now compress the results into a single doubleword,
+ // so it can be moved to a GPR for the final index calculation.
+
+ // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+ // first bit of each byte into bits 48-63.
+ VBPERMQ V6,V10,V6
+ VBPERMQ V7,V10,V7
+ VBPERMQ V8,V10,V8
+ VBPERMQ V9,V10,V9
+
+ // Shift each 16-bit component into its correct position for
+ // merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+ VSLDOI $2,V7,V7,V7
+ VSLDOI $4,V8,V8,V8
+ VSLDOI $6,V9,V9,V9
+#else
+ VSLDOI $6,V6,V6,V6
+ VSLDOI $4,V7,V7,V7
+ VSLDOI $2,V8,V8,V8
+#endif
- // Not found, so it must be in the second doubleword of the merged pair.
- MOVD R9,R3
- ADD $8,R8,R8
+ // Merge V6-V9 into a single doubleword and move to a GPR.
+ VOR V6,V7,V11
+ VOR V8,V9,V4
+ VOR V4,V11,V4
+ MFVRD V4,R3
+
+#ifdef GOARCH_ppc64le
+ ADD $-1,R3,R11
+ ANDN R3,R11,R11
+ POPCNTD R11,R11 // Count trailing zeros (Little Endian).
+#else
+ CNTLZD R3,R11 // Count leading zeros (Big Endian).
+#endif
+ ADD R8,R11,R3 // Calculate byte address
+
+return:
+ SUB R17,R3
+ MOVD R3,(R14)
+ RET
+
+found_qw_align:
+ // Use the same algorithm as above. Compress the result into
+ // a single doubleword and move it to a GPR for the final
+ // calculation.
+ VBPERMQ V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+ MFVRD V6,R3
+ ADD $-1,R3,R11
+ ANDN R3,R11,R11
+ POPCNTD R11,R11
+#else
+ VSLDOI $6,V6,V6,V6
+ MFVRD V6,R3
+ CNTLZD R3,R11
+#endif
+ ADD R8,R11,R3
+ CMPU R11,R4
+ BLT return
+ BR notfound
done:
// At this point, R3 has 0xFF in the same position as the byte we are
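The found paths in the hunk above compress the vector compare results before indexing: VBPERMQ gathers the top bit of each 0x00/0xFF lane into a 16-bit mask, and on little-endian the ADD $-1 / ANDN / POPCNTD triple counts trailing zeros without a dedicated instruction. A hedged Go model (the little-endian lane order and helper names are assumptions):

    import "math/bits"

    // laneMask models VBPERMQ: collect one bit per compare lane.
    func laneMask(m *[16]byte) uint64 {
        var mask uint64
        for i, b := range m {
            mask |= uint64(b>>7) << uint(i)
        }
        return mask
    }

    // ctz models ADD $-1 / ANDN / POPCNTD: the popcount of the
    // trailing-ones mask ^x & (x-1) equals TrailingZeros64(x).
    func ctz(x uint64) int {
        return bits.OnesCount64(^x & (x - 1))
    }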
@@ -1191,17 +1323,10 @@ done:
CMPU R8,R7 // Check if we are at the last doubleword.
SRD $3,R11 // Convert trailing zeros to bytes.
ADD R11,R8,R3
- CMPU R11,R4,CR7 // If at the last doubleword, check the byte offset.
+ CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
BNE return
BLE CR7,return
- MOVD $-1,R3
- MOVD R3,(R14)
- RET
-
-return:
- SUB R10,R3 // Calculate index.
- MOVD R3,(R14)
- RET
+ BR notfound
small_string:
// We unroll this loop for better performance.
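The small_string path that follows is fully unrolled: for strings under 32 bytes, straight-line doubleword checks avoid the loop counter and its branch. Its shape, reusing the hypothetical indexByteWord helper from the first sketch (the asm additionally masks bytes past the string's end, which this sketch omits):

    // smallIndex checks up to four doublewords with no loop.
    func smallIndex(w0, w1, w2, w3 uint64, c byte) int {
        if i := indexByteWord(w0, c); i >= 0 {
            return i
        }
        if i := indexByteWord(w1, c); i >= 0 {
            return 8 + i
        }
        if i := indexByteWord(w2, c); i >= 0 {
            return 16 + i
        }
        if i := indexByteWord(w3, c); i >= 0 {
            return 24 + i
        }
        return -1
    }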
@@ -1212,9 +1337,9 @@ small_string:
CMPB R12,R5,R3 // Check for a match.
AND R9,R3,R3 // Mask bytes below s_base.
CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
- RLDICL $0,R7,$61,R4 // length-1
+ RLDICL $0,R7,$61,R6 // length-1
RLDICR $0,R7,$60,R7 // Last doubleword in R7.
- CMPU R8,R7
+ CMPU R8,R7
BNE CR7,done
BEQ notfound // Hit length.
@@ -1242,7 +1367,6 @@ small_string:
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
- CMPU R8,R7
BNE CR6,done
BR notfound
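Both bytes.IndexByte and strings.IndexByte funnel into this shared body on ppc64x, so the benchmark gains apply to either package. A usage example:

    package main

    import (
        "bytes"
        "fmt"
        "strings"
    )

    func main() {
        // On ppc64x, both calls reach runtime·indexbytebody.
        fmt.Println(bytes.IndexByte([]byte("hello\nworld"), '\n')) // 5
        fmt.Println(strings.IndexByte("key:value", ':'))           // 3
    }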