author     Keith Randall <khr@google.com>       2018-03-01 16:38:41 -0800
committer  Keith Randall <khr@golang.org>       2018-03-02 22:46:15 +0000
commit     403ab0f2214f583db84a2dae275389be92072a35 (patch)
tree       9ff2fc82235a27acd95595f2152f03eec83fa71e /src/runtime/asm_ppc64x.s
parent     dcedcaa5fb15748cc2e5cb9461fc6b4f4fc942cb (diff)
download   go-403ab0f2214f583db84a2dae275389be92072a35.tar.gz
           go-403ab0f2214f583db84a2dae275389be92072a35.zip
internal/bytealg: move IndexByte assembly to the new bytealg package
Move the IndexByte function from the runtime to a new bytealg package. The new package will eventually hold all the optimized assembly for groveling through byte slices and strings. It seems a better home for this code than keeping it, somewhat arbitrarily, in the runtime.

Once this is in, the next step is to move the other functions (Compare, Equal, ...).

Update #19792

This change seems complicated enough that we might just declare it "not worth it" and abandon it. Opinions welcome.

The core assembly is all unchanged, except for minor modifications where the code reads CPU feature bits. The wrapper functions have been cleaned up, as they are now actually checked by vet.

Change-Id: I9fa75bee5d85db3a65b3fd3b7997e60367523796
Reviewed-on: https://go-review.googlesource.com/98016
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
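For readers looking only at this file's diff: the entry points removed below now live in the new internal/bytealg package, and bytes.IndexByte / strings.IndexByte are expected to dispatch to it. As a point of reference only (a portable sketch of the semantics, not code from this CL), the behaviour the ppc64x assembly implements is equivalent to:

// Reference-only sketch of the semantics of bytealg.IndexByte.
// On ppc64/ppc64le the real body is the assembly shown (removed) below,
// relocated into internal/bytealg by this change.
package bytealg

// IndexByte returns the index of the first instance of c in b,
// or -1 if c is not present.
func IndexByte(b []byte, c byte) int {
	for i, x := range b {
		if x == c {
			return i
		}
	}
	return -1
}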
Diffstat (limited to 'src/runtime/asm_ppc64x.s')
-rw-r--r--   src/runtime/asm_ppc64x.s   302
1 file changed, 0 insertions, 302 deletions
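The removed runtime·indexbytebody below searches a doubleword (and, with the vector unit, 64 bytes) at a time: it replicates the target byte across a register, compares whole words against it (CMPB / VCMPEQUB), and converts the position of the first matching byte into the result index. A hedged, pure-Go illustration of that word-at-a-time idea follows; the function name and the SWAR bit trick are illustrative only (the assembly uses hardware byte compares instead) and little-endian loads are assumed, as on ppc64le:

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// indexByteSWAR illustrates the word-at-a-time search used by the assembly:
// replicate c into every byte of a 64-bit pattern, then test 8 bytes per step.
func indexByteSWAR(s []byte, c byte) int {
	const lo = 0x0101010101010101
	const hi = 0x8080808080808080
	pattern := uint64(c) * lo // c replicated across the doubleword

	i := 0
	for ; i+8 <= len(s); i += 8 {
		w := binary.LittleEndian.Uint64(s[i:]) ^ pattern // matching bytes become 0x00
		if m := (w - lo) &^ w & hi; m != 0 { // nonzero iff some byte of w is zero
			return i + bits.TrailingZeros64(m)/8 // lowest set bit marks the first match
		}
	}
	for ; i < len(s); i++ { // handle the tail one byte at a time
		if s[i] == c {
			return i
		}
	}
	return -1
}

func main() {
	fmt.Println(indexByteSWAR([]byte("hello, gopher"), 'g')) // prints 7
}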
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index c0e872f7a9..0440751724 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -1068,308 +1068,6 @@ equal:
MOVBZ R3,ret+48(FP)
RET
-TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
- MOVD s+0(FP), R3 // R3 = byte array pointer
- MOVD s_len+8(FP), R4 // R4 = length
- MOVBZ c+24(FP), R5 // R5 = byte
- MOVD $ret+32(FP), R14 // R14 = &ret
- BR runtime·indexbytebody<>(SB)
-
-TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
- MOVD s+0(FP), R3 // R3 = string
- MOVD s_len+8(FP), R4 // R4 = length
- MOVBZ c+16(FP), R5 // R5 = byte
- MOVD $ret+24(FP), R14 // R14 = &ret
- BR runtime·indexbytebody<>(SB)
-
-TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
- DCBT (R3) // Prepare cache line.
- MOVD R3,R17 // Save base address for calculating the index later.
- RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
- RLDIMI $8,R5,$48,R5 // Replicate the byte across the register.
- ADD R4,R3,R7 // Last acceptable address in R7.
-
- RLDIMI $16,R5,$32,R5
- CMPU R4,$32 // Check if it's a small string (<32 bytes). Those will be processed differently.
- MOVD $-1,R9
- WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
- RLDIMI $32,R5,$0,R5
- MOVD R7,R10 // Save last acceptable address in R10 for later.
- ADD $-1,R7,R7
-#ifdef GOARCH_ppc64le
- SLD R6,R9,R9 // Prepare mask for Little Endian
-#else
- SRD R6,R9,R9 // Same for Big Endian
-#endif
- BLE small_string // Jump to the small string case if it's <32 bytes.
-
- // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
- // in V0, V1 and V10, then branch to the preloop.
- ANDCC $63,R3,R11
- BEQ CR0,qw_align
- RLDICL $0,R3,$61,R11
-
- MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
- CMPB R12,R5,R3 // Check for a match.
- AND R9,R3,R3 // Mask bytes below s_base
- RLDICL $0,R7,$61,R6 // length-1
- RLDICR $0,R7,$60,R7 // Last doubleword in R7
- CMPU R3,$0,CR7 // If we have a match, jump to the final computation
- BNE CR7,done
- ADD $8,R8,R8
- ADD $-8,R4,R4
- ADD R4,R11,R4
-
- // Check for quadword alignment
- ANDCC $15,R8,R11
- BEQ CR0,qw_align
-
- // Not aligned, so handle the next doubleword
- MOVD 0(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR7
- BNE CR7,done
- ADD $8,R8,R8
- ADD $-8,R4,R4
-
- // Either quadword aligned or 64-byte aligned at this point. We can use LVX.
-qw_align:
-
- // Set up auxiliary data for the vectorized algorithm.
- VSPLTISB $0,V0 // Replicate 0 across V0
- VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
- MTVRD R5,V1
- LVSL (R0+R0),V11
- VSLB V11,V10,V10
- VSPLTB $7,V1,V1 // Replicate byte across V1
- CMPU R4, $64 // If len <= 64, don't use the vectorized loop
- BLE tail
-
- // We will load 4 quadwords per iteration in the loop, so check for
- // 64-byte alignment. If 64-byte aligned, then branch to the preloop.
- ANDCC $63,R8,R11
- BEQ CR0,preloop
-
- // Not 64-byte aligned. Load one quadword at a time until aligned.
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6 // Check for byte in V4
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- ADD $-16,R4,R4
-
- ANDCC $63,R8,R11
- BEQ CR0,preloop
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6 // Check for byte in V4
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- ADD $-16,R4,R4
-
- ANDCC $63,R8,R11
- BEQ CR0,preloop
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6 // Check for byte in V4
- BNE CR6,found_qw_align
- ADD $-16,R4,R4
- ADD $16,R8,R8
-
- // 64-byte aligned. Prepare for the main loop.
-preloop:
- CMPU R4,$64
- BLE tail // If len <= 64, don't use the vectorized loop
-
- // We are now aligned to a 64-byte boundary. We will load 4 quadwords
- // per loop iteration. The last doubleword is in R10, so our loop counter
- // starts at (R10-R8)/64.
- SUB R8,R10,R6
- SRD $6,R6,R9 // Loop counter in R9
- MOVD R9,CTR
-
- MOVD $16,R11 // Load offsets for the vector loads
- MOVD $32,R9
- MOVD $48,R7
-
- // Main loop: we will load 64 bytes per iteration
-loop:
- LVX (R8+R0),V2 // Load 4 16-byte vectors
- LVX (R11+R8),V3
- LVX (R9+R8),V4
- LVX (R7+R8),V5
- VCMPEQUB V1,V2,V6 // Look for byte in each vector
- VCMPEQUB V1,V3,V7
- VCMPEQUB V1,V4,V8
- VCMPEQUB V1,V5,V9
- VOR V6,V7,V11 // Compress the result in a single vector
- VOR V8,V9,V12
- VOR V11,V12,V11
- VCMPEQUBCC V0,V11,V11 // Check for byte
- BGE CR6,found
- ADD $64,R8,R8
- BC 16,0,loop // bdnz loop
-
- // Handle the trailing bytes, or R4 <= 64
- RLDICL $0,R6,$58,R4
-tail:
- CMPU R4,$0
- BEQ notfound
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- CMPU R4,$16,CR6
- BLE CR6,notfound
- ADD $-16,R4,R4
-
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- CMPU R4,$16,CR6
- BLE CR6,notfound
- ADD $-16,R4,R4
-
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- CMPU R4,$16,CR6
- BLE CR6,notfound
- ADD $-16,R4,R4
-
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
-
-notfound:
- MOVD $-1,R3
- MOVD R3,(R14)
- RET
-
-found:
- // We will now compress the results into a single doubleword,
- // so it can be moved to a GPR for the final index calculation.
-
- // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
- // first bit of each byte into bits 48-63.
- VBPERMQ V6,V10,V6
- VBPERMQ V7,V10,V7
- VBPERMQ V8,V10,V8
- VBPERMQ V9,V10,V9
-
- // Shift each 16-bit component into its correct position for
- // merging into a single doubleword.
-#ifdef GOARCH_ppc64le
- VSLDOI $2,V7,V7,V7
- VSLDOI $4,V8,V8,V8
- VSLDOI $6,V9,V9,V9
-#else
- VSLDOI $6,V6,V6,V6
- VSLDOI $4,V7,V7,V7
- VSLDOI $2,V8,V8,V8
-#endif
-
- // Merge V6-V9 into a single doubleword and move to a GPR.
- VOR V6,V7,V11
- VOR V8,V9,V4
- VOR V4,V11,V4
- MFVRD V4,R3
-
-#ifdef GOARCH_ppc64le
- ADD $-1,R3,R11
- ANDN R3,R11,R11
- POPCNTD R11,R11 // Count trailing zeros (Little Endian).
-#else
- CNTLZD R3,R11 // Count leading zeros (Big Endian).
-#endif
- ADD R8,R11,R3 // Calculate byte address
-
-return:
- SUB R17,R3
- MOVD R3,(R14)
- RET
-
-found_qw_align:
- // Use the same algorithm as above. Compress the result into
- // a single doubleword and move it to a GPR for the final
- // calculation.
- VBPERMQ V6,V10,V6
-
-#ifdef GOARCH_ppc64le
- MFVRD V6,R3
- ADD $-1,R3,R11
- ANDN R3,R11,R11
- POPCNTD R11,R11
-#else
- VSLDOI $6,V6,V6,V6
- MFVRD V6,R3
- CNTLZD R3,R11
-#endif
- ADD R8,R11,R3
- CMPU R11,R4
- BLT return
- BR notfound
-
-done:
- // At this point, R3 has 0xFF in the same position as the byte we are
- // looking for in the doubleword. Use that to calculate the exact index
- // of the byte.
-#ifdef GOARCH_ppc64le
- ADD $-1,R3,R11
- ANDN R3,R11,R11
- POPCNTD R11,R11 // Count trailing zeros (Little Endian).
-#else
- CNTLZD R3,R11 // Count leading zeros (Big Endian).
-#endif
- CMPU R8,R7 // Check if we are at the last doubleword.
- SRD $3,R11 // Convert the zero count to a byte offset.
- ADD R11,R8,R3
- CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
- BNE return
- BLE CR7,return
- BR notfound
-
-small_string:
- // We unroll this loop for better performance.
- CMPU R4,$0 // Check for length=0
- BEQ notfound
-
- MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
- CMPB R12,R5,R3 // Check for a match.
- AND R9,R3,R3 // Mask bytes below s_base.
- CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
- RLDICL $0,R7,$61,R6 // length-1
- RLDICR $0,R7,$60,R7 // Last doubleword in R7.
- CMPU R8,R7
- BNE CR7,done
- BEQ notfound // Hit length.
-
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- CMPU R8,R7
- BNE CR6,done
- BEQ notfound
-
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- CMPU R8,R7
- BNE CR6,done
- BEQ notfound
-
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- CMPU R8,R7
- BNE CR6,done
- BEQ notfound
-
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- BNE CR6,done
- BR notfound
-
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
MOVD s1_base+0(FP), R5
MOVD s2_base+16(FP), R6