aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/asm_arm64.s
diff options
context:
space:
mode:
authorKeith Randall <khr@google.com>2018-03-01 16:38:41 -0800
committerKeith Randall <khr@golang.org>2018-03-02 22:46:15 +0000
commit403ab0f2214f583db84a2dae275389be92072a35 (patch)
tree9ff2fc82235a27acd95595f2152f03eec83fa71e /src/runtime/asm_arm64.s
parentdcedcaa5fb15748cc2e5cb9461fc6b4f4fc942cb (diff)
downloadgo-403ab0f2214f583db84a2dae275389be92072a35.tar.gz
go-403ab0f2214f583db84a2dae275389be92072a35.zip
internal/bytealg: move IndexByte assembly to the new bytealg package
Move the IndexByte function from the runtime to a new bytealg package. The new package will eventually hold all the optimized assembly for groveling through byte slices and strings. It seems a better home for this code than randomly keeping it in runtime. Once this is in, the next step is to move the other functions (Compare, Equal, ...). Update #19792 This change seems complicated enough that we might just declare "not worth it" and abandon. Opinions welcome. The core assembly is all unchanged, except minor modifications where the code reads cpu feature bits. The wrapper functions have been cleaned up as they are now actually checked by vet. Change-Id: I9fa75bee5d85db3a65b3fd3b7997e60367523796 Reviewed-on: https://go-review.googlesource.com/98016 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/runtime/asm_arm64.s')
-rw-r--r--src/runtime/asm_arm64.s120
1 files changed, 0 insertions, 120 deletions
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 2e08013097..6abb9945e2 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -800,126 +800,6 @@ samebytes:
//
// functions for other packages
//
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 // wrapper: loads the (b, c) arguments into registers and jumps to indexbytebody
- MOVD b+0(FP), R0 // R0 = data pointer of b
- MOVD b_len+8(FP), R2 // R2 = length of b
- MOVBU c+24(FP), R1 // R1 = byte to search for (zero-extended)
- MOVD $ret+32(FP), R8 // R8 = address of the result slot; the body stores the result there
- B runtime·indexbytebody<>(SB) // branch (no link): the body's RET returns to our caller
-
-TEXT strings·IndexByte(SB),NOSPLIT,$0-32 // wrapper: loads the (s, c) arguments into registers and jumps to indexbytebody
- MOVD s+0(FP), R0 // R0 = data pointer of s
- MOVD s_len+8(FP), R2 // R2 = length of s
- MOVBU c+16(FP), R1 // R1 = byte to search for (zero-extended)
- MOVD $ret+24(FP), R8 // R8 = address of the result slot; the body stores the result there
- B runtime·indexbytebody<>(SB) // branch (no link): the body's RET returns to our caller
-
-// input:
-// R0: data (pointer to the first byte)
-// R1: byte to search
-// R2: data len
-// R8: address to put result (index of the first match, or -1 if there is none)
-TEXT runtime·indexbytebody<>(SB),NOSPLIT,$0
- // Core algorithm:
- // For each 32-byte chunk we calculate a 64-bit syndrome value,
- // with two bits per byte. For each tuple, bit 0 is set if the
- // relevant byte matched the requested character and bit 1 is
- // not used (faster than using a 32bit syndrome). Since the bits
- // in the syndrome reflect exactly the order in which things occur
- // in the original string, counting trailing zeros allows us to
- // identify exactly which byte has matched.
-
- CBZ R2, fail // empty input: store -1
- MOVD R0, R11 // keep the original start pointer; used in tail to turn an address into an index
- // Magic constant 0x40100401 allows us to identify
- // which lane matches the requested byte.
- // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
- // Different bytes have different bit masks (i.e: 1, 4, 16, 64)
- MOVD $0x40100401, R5
- VMOV R1, V0.B16 // replicate the search byte into all 16 byte lanes of V0
- // Work with aligned 32-byte chunks
- BIC $0x1f, R0, R3 // R3 = R0 rounded down to a 32-byte boundary
- VMOV R5, V5.S4 // replicate the magic constant into the four 32-bit lanes of V5
- ANDS $0x1f, R0, R9 // R9 = offset of R0 inside its 32-byte block; sets Z for the BEQ below
- AND $0x1f, R2, R10 // R10 = len mod 32; used by masklast
- BEQ loop // R9 == 0: data already 32-byte aligned, go straight to the main loop
-
- // Input string is not 32-byte aligned. We calculate the
- // syndrome value for the aligned 32 bytes block containing
- // the first bytes and mask off the irrelevant part.
- VLD1.P (R3), [V1.B16, V2.B16] // load the aligned 32-byte block; R3 is post-incremented past it
- SUB $0x20, R9, R4 // R4 = R9 - 32
- ADDS R4, R2, R2 // R2 = len - (32 - R9): data left beyond this block; flags feed "BLS masklast" below
- VCMEQ V0.B16, V1.B16, V3.B16 // per byte: all-ones where it equals the search byte, else zero
- VCMEQ V0.B16, V2.B16, V4.B16
- VAND V5.B16, V3.B16, V3.B16 // keep only the per-lane marker bit (1, 4, 16 or 64) of matching bytes
- VAND V5.B16, V4.B16, V4.B16
- VADDP V4.B16, V3.B16, V6.B16 // 256->128
- VADDP V6.B16, V6.B16, V6.B16 // 128->64
- VMOV V6.D[0], R6 // R6 = 64-bit syndrome, two bits per input byte
- // Clear the irrelevant lower bits
- LSL $1, R9, R4 // R4 = 2*R9: syndrome bits belonging to bytes before the data start
- LSR R4, R6, R6 // shift right then left by R4 to zero those low bits
- LSL R4, R6, R6
- // The first block can also be the last
- BLS masklast // flags still from ADDS above: data ends inside this first block
- // Have we found something already?
- CBNZ R6, tail
-
-loop:
- VLD1.P (R3), [V1.B16, V2.B16] // load the next 32 bytes; R3 is post-incremented
- SUBS $0x20, R2, R2 // R2 -= 32; flags are used by BLS below and by BHS at end
- VCMEQ V0.B16, V1.B16, V3.B16
- VCMEQ V0.B16, V2.B16, V4.B16
- // If we're out of data we finish regardless of the result
- BLS end // at most 32 bytes were left before the SUBS: this is the last block
- // Use a fast check for the termination condition
- VORR V4.B16, V3.B16, V6.B16 // merge both 16-byte match masks
- VADDP V6.D2, V6.D2, V6.D2 // fold the two 64-bit halves together
- VMOV V6.D[0], R6 // R6 is tested for zero below
- // We're not out of data, loop if we haven't found the character
- CBZ R6, loop
-
-end:
- // Termination condition found, let's calculate the syndrome value
- VAND V5.B16, V3.B16, V3.B16 // same syndrome reduction as in the unaligned first-block path
- VAND V5.B16, V4.B16, V4.B16
- VADDP V4.B16, V3.B16, V6.B16
- VADDP V6.B16, V6.B16, V6.B16
- VMOV V6.D[0], R6 // R6 = syndrome of the block just loaded
- // Only do the clear for the last possible block with less than 32 bytes
- // Condition flags come from SUBS in the loop
- BHS tail // at least 32 bytes were left before the SUBS: full block, nothing to mask
-
-masklast:
- // Clear the irrelevant upper bits
- ADD R9, R10, R4 // R4 = start misalignment + len mod 32
- AND $0x1f, R4, R4 // R4 = (R9 + R10) mod 32
- SUB $0x20, R4, R4 // R4 = R4 - 32
- NEG R4<<1, R4 // R4 = -(R4<<1) = 2*(32 - previous R4): count of high syndrome bits to drop
- LSL R4, R6, R6 // shift left then right by R4 to zero those high bits
- LSR R4, R6, R6
-
-tail:
- // Check that we have found a character
- CBZ R6, fail
- // Count the trailing zeros using bit reversing
- RBIT R6, R6
- // Compensate the last post-increment
- SUB $0x20, R3, R3 // R3 = address of the block the syndrome describes
- // And count the leading zeros
- CLZ R6, R6
- // R6 is twice the offset into the fragment
- ADD R6>>1, R3, R0 // R0 = block address + R6/2
- // Compute the offset result
- SUB R11, R0, R0 // R0 = R0 - original data pointer (saved in R11 at entry)
- MOVD R0, (R8) // store the index through the result pointer
- RET
-
-fail:
- MOVD $-1, R0 // not found (or empty input): result is -1
- MOVD R0, (R8)
- RET
// Equal(a, b []byte) bool
TEXT bytes·Equal(SB),NOSPLIT,$0-49