aboutsummaryrefslogtreecommitdiff
path: root/src/bytes
diff options
context:
space:
mode:
authorWei Xiao <wei.xiao@arm.com>2018-02-22 04:39:47 +0000
committerCherry Zhang <cherryyz@google.com>2018-03-01 15:24:33 +0000
commit562346b7d0b4c4dd89e36a7ac3613b53c05bc631 (patch)
treec6192dc0b182c6b01a7d0f8810dd3f187a6399d3 /src/bytes
parent4c1aff87f1a160c3da962cda0c48462c88260d7b (diff)
downloadgo-562346b7d0b4c4dd89e36a7ac3613b53c05bc631.tar.gz
go-562346b7d0b4c4dd89e36a7ac3613b53c05bc631.zip
bytes: add asm version of Index for short strings on arm64
Currently we have special case for 1-byte strings, this extends it to strings shorter than 9 bytes on arm64. Benchmark results: name old time/op new time/op delta IndexByte/10-32 18.6ns ± 0% 18.1ns ± 0% -2.69% (p=0.008 n=5+5) IndexByte/32-32 16.8ns ± 1% 16.9ns ± 1% ~ (p=0.762 n=5+5) IndexByte/4K-32 464ns ± 0% 464ns ± 0% ~ (all equal) IndexByte/4M-32 528µs ± 1% 506µs ± 1% -4.17% (p=0.008 n=5+5) IndexByte/64M-32 18.7ms ± 0% 18.7ms ± 1% ~ (p=0.730 n=4+5) IndexBytePortable/10-32 33.8ns ± 0% 34.9ns ± 3% ~ (p=0.167 n=5+5) IndexBytePortable/32-32 65.3ns ± 0% 66.1ns ± 2% ~ (p=0.444 n=5+5) IndexBytePortable/4K-32 5.88µs ± 0% 5.88µs ± 0% ~ (p=0.325 n=5+5) IndexBytePortable/4M-32 6.03ms ± 0% 6.03ms ± 0% ~ (p=1.000 n=5+5) IndexBytePortable/64M-32 98.8ms ± 0% 98.9ms ± 0% +0.10% (p=0.008 n=5+5) IndexRune/10-32 57.7ns ± 0% 49.2ns ± 0% -14.73% (p=0.000 n=5+4) IndexRune/32-32 57.7ns ± 0% 58.6ns ± 0% +1.56% (p=0.008 n=5+5) IndexRune/4K-32 511ns ± 0% 513ns ± 0% +0.39% (p=0.008 n=5+5) IndexRune/4M-32 527µs ± 1% 527µs ± 1% ~ (p=0.690 n=5+5) IndexRune/64M-32 18.7ms ± 0% 18.7ms ± 1% ~ (p=0.190 n=4+5) IndexRuneASCII/10-32 23.8ns ± 0% 23.8ns ± 0% ~ (all equal) IndexRuneASCII/32-32 24.3ns ± 0% 24.3ns ± 0% ~ (all equal) IndexRuneASCII/4K-32 468ns ± 0% 468ns ± 0% ~ (all equal) IndexRuneASCII/4M-32 521µs ± 1% 531µs ± 2% +1.91% (p=0.016 n=5+5) IndexRuneASCII/64M-32 18.6ms ± 1% 18.5ms ± 0% ~ (p=0.730 n=5+4) Index/10-32 89.1ns ±13% 25.2ns ± 0% -71.72% (p=0.008 n=5+5) Index/32-32 225ns ± 2% 226ns ± 3% ~ (p=0.683 n=5+5) Index/4K-32 11.9µs ± 0% 11.8µs ± 0% -0.22% (p=0.008 n=5+5) Index/4M-32 12.1ms ± 0% 12.1ms ± 0% ~ (p=0.548 n=5+5) Index/64M-32 197ms ± 0% 197ms ± 0% ~ (p=0.690 n=5+5) IndexEasy/10-32 46.2ns ± 0% 22.1ns ± 8% -52.16% (p=0.008 n=5+5) IndexEasy/32-32 46.2ns ± 0% 47.2ns ± 0% +2.16% (p=0.008 n=5+5) IndexEasy/4K-32 499ns ± 0% 502ns ± 0% +0.44% (p=0.008 n=5+5) IndexEasy/4M-32 529µs ± 2% 529µs ± 1% ~ (p=0.841 n=5+5) IndexEasy/64M-32 18.6ms ± 1% 18.7ms ± 1% ~ (p=0.222 n=5+5) IndexAnyASCII/1:1-32 15.7ns ± 0% 15.7ns ± 0% ~ (all equal) IndexAnyASCII/1:2-32 17.2ns ± 0% 17.2ns ± 0% ~ (all equal) IndexAnyASCII/1:4-32 20.0ns ± 0% 20.0ns ± 0% ~ (all equal) IndexAnyASCII/1:8-32 34.8ns ± 0% 34.8ns ± 0% ~ (all equal) IndexAnyASCII/1:16-32 48.1ns ± 0% 48.1ns ± 0% ~ (all equal) IndexAnyASCII/16:1-32 97.9ns ± 1% 97.7ns ± 0% ~ (p=0.857 n=5+5) IndexAnyASCII/16:2-32 102ns ± 0% 102ns ± 0% ~ (all equal) IndexAnyASCII/16:4-32 116ns ± 1% 116ns ± 1% ~ (p=1.000 n=5+5) IndexAnyASCII/16:8-32 141ns ± 1% 141ns ± 0% ~ (p=0.571 n=5+4) IndexAnyASCII/16:16-32 178ns ± 0% 178ns ± 0% ~ (all equal) IndexAnyASCII/256:1-32 1.09µs ± 0% 1.09µs ± 0% ~ (all equal) IndexAnyASCII/256:2-32 1.09µs ± 0% 1.10µs ± 0% +0.27% (p=0.008 n=5+5) IndexAnyASCII/256:4-32 1.11µs ± 0% 1.11µs ± 0% ~ (p=0.397 n=5+5) IndexAnyASCII/256:8-32 1.10µs ± 0% 1.10µs ± 0% ~ (p=0.444 n=5+5) IndexAnyASCII/256:16-32 1.14µs ± 0% 1.14µs ± 0% ~ (all equal) IndexAnyASCII/4096:1-32 16.5µs ± 0% 16.5µs ± 0% ~ (p=1.000 n=5+5) IndexAnyASCII/4096:2-32 17.0µs ± 0% 17.0µs ± 0% ~ (p=0.159 n=5+4) IndexAnyASCII/4096:4-32 17.1µs ± 0% 17.1µs ± 0% ~ (p=0.921 n=4+5) IndexAnyASCII/4096:8-32 16.5µs ± 0% 16.5µs ± 0% ~ (p=0.460 n=5+5) IndexAnyASCII/4096:16-32 16.5µs ± 0% 16.5µs ± 0% ~ (p=0.794 n=5+4) IndexPeriodic/IndexPeriodic2-32 189µs ± 0% 189µs ± 0% ~ (p=0.841 n=5+5) IndexPeriodic/IndexPeriodic4-32 189µs ± 0% 189µs ± 0% -0.03% (p=0.016 n=5+4) IndexPeriodic/IndexPeriodic8-32 189µs ± 0% 189µs ± 0% ~ (p=0.651 n=5+5) IndexPeriodic/IndexPeriodic16-32 175µs ± 9% 174µs ± 7% ~ (p=1.000 n=5+5) IndexPeriodic/IndexPeriodic32-32 75.1µs ± 0% 75.1µs ± 0% ~ (p=0.690 n=5+5) IndexPeriodic/IndexPeriodic64-32 42.6µs ± 0% 44.7µs ± 0% +4.98% (p=0.008 n=5+5) name old speed new speed delta IndexByte/10-32 538MB/s ± 0% 552MB/s ± 0% +2.65% (p=0.008 n=5+5) IndexByte/32-32 1.90GB/s ± 1% 1.90GB/s ± 1% ~ (p=0.548 n=5+5) IndexByte/4K-32 8.82GB/s ± 0% 8.81GB/s ± 0% ~ (p=0.548 n=5+5) IndexByte/4M-32 7.95GB/s ± 1% 8.29GB/s ± 1% +4.35% (p=0.008 n=5+5) IndexByte/64M-32 3.58GB/s ± 0% 3.60GB/s ± 1% ~ (p=0.730 n=4+5) IndexBytePortable/10-32 296MB/s ± 0% 286MB/s ± 3% ~ (p=0.381 n=4+5) IndexBytePortable/32-32 490MB/s ± 0% 485MB/s ± 2% ~ (p=0.286 n=5+5) IndexBytePortable/4K-32 697MB/s ± 0% 697MB/s ± 0% ~ (p=0.413 n=5+5) IndexBytePortable/4M-32 696MB/s ± 0% 695MB/s ± 0% ~ (p=0.897 n=5+5) IndexBytePortable/64M-32 679MB/s ± 0% 678MB/s ± 0% -0.10% (p=0.008 n=5+5) IndexRune/10-32 173MB/s ± 0% 203MB/s ± 0% +17.24% (p=0.016 n=5+4) IndexRune/32-32 555MB/s ± 0% 546MB/s ± 0% -1.62% (p=0.008 n=5+5) IndexRune/4K-32 8.01GB/s ± 0% 7.98GB/s ± 0% -0.38% (p=0.008 n=5+5) IndexRune/4M-32 7.97GB/s ± 1% 7.95GB/s ± 1% ~ (p=0.690 n=5+5) IndexRune/64M-32 3.59GB/s ± 0% 3.58GB/s ± 1% ~ (p=0.190 n=4+5) IndexRuneASCII/10-32 420MB/s ± 0% 420MB/s ± 0% ~ (p=0.190 n=5+4) IndexRuneASCII/32-32 1.32GB/s ± 0% 1.32GB/s ± 0% ~ (p=0.333 n=5+5) IndexRuneASCII/4K-32 8.75GB/s ± 0% 8.75GB/s ± 0% ~ (p=0.690 n=5+5) IndexRuneASCII/4M-32 8.04GB/s ± 1% 7.89GB/s ± 2% -1.87% (p=0.016 n=5+5) IndexRuneASCII/64M-32 3.61GB/s ± 1% 3.62GB/s ± 0% ~ (p=0.730 n=5+4) Index/10-32 113MB/s ±14% 397MB/s ± 0% +249.76% (p=0.008 n=5+5) Index/32-32 142MB/s ± 2% 141MB/s ± 3% ~ (p=0.794 n=5+5) Index/4K-32 345MB/s ± 0% 346MB/s ± 0% +0.22% (p=0.008 n=5+5) Index/4M-32 345MB/s ± 0% 345MB/s ± 0% ~ (p=0.619 n=5+5) Index/64M-32 341MB/s ± 0% 341MB/s ± 0% ~ (p=0.595 n=5+5) IndexEasy/10-32 216MB/s ± 0% 453MB/s ± 8% +109.60% (p=0.008 n=5+5) IndexEasy/32-32 692MB/s ± 0% 678MB/s ± 0% -2.01% (p=0.008 n=5+5) IndexEasy/4K-32 8.19GB/s ± 0% 8.16GB/s ± 0% -0.45% (p=0.008 n=5+5) IndexEasy/4M-32 7.93GB/s ± 2% 7.93GB/s ± 1% ~ (p=0.841 n=5+5) IndexEasy/64M-32 3.60GB/s ± 1% 3.59GB/s ± 1% ~ (p=0.222 n=5+5) Change-Id: I4ca69378a2df6f9ba748c6a2706953ee1bd07343 Reviewed-on: https://go-review.googlesource.com/96555 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
Diffstat (limited to 'src/bytes')
-rw-r--r--src/bytes/bytes_arm64.go13
-rw-r--r--src/bytes/bytes_arm64.s123
2 files changed, 136 insertions, 0 deletions
diff --git a/src/bytes/bytes_arm64.go b/src/bytes/bytes_arm64.go
index 846eeba486..3d9ed3dd22 100644
--- a/src/bytes/bytes_arm64.go
+++ b/src/bytes/bytes_arm64.go
@@ -6,6 +6,12 @@ package bytes
func countByte(s []byte, c byte) int // bytes_arm64.s
+// 8 bytes can be completely loaded into 1 register.
+const shortStringLen = 8
+
+//go:noescape
+func indexShortStr(s, sep []byte) int
+
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep []byte) int {
n := len(sep)
@@ -21,6 +27,13 @@ func Index(s, sep []byte) int {
return -1
case n > len(s):
return -1
+ case n <= shortStringLen:
+ // Use brute force when both s and sep are small.
+ // Empirical data shows that it can get better
+ // performance when len(s) <= 16.
+ if len(s) <= 16 {
+ return indexShortStr(s, sep)
+ }
}
c := sep[0]
i := 0
diff --git a/src/bytes/bytes_arm64.s b/src/bytes/bytes_arm64.s
index 5e229d772b..84e96d52ce 100644
--- a/src/bytes/bytes_arm64.s
+++ b/src/bytes/bytes_arm64.s
@@ -72,3 +72,126 @@ tail:
done:
MOVD R11, ret+32(FP)
RET
+
+// indexShortStr(s, sep []byte) int
+// precondition: 2 <= len(sep) <= 8
+TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
+ // main idea is to load 'sep' into separate register(s)
+ // to avoid repeatedly re-load it again and again
+ // for sebsequent substring comparisons
+ MOVD s+0(FP), R0
+ MOVD s_len+8(FP), R1
+ MOVD sep+24(FP), R2
+ MOVD sep_len+32(FP), R3
+ SUB R3, R1, R4
+ // R4 contains the start of last substring for comparsion
+ ADD R0, R4, R4
+ ADD $1, R0, R8
+ TBZ $3, R3, len_2_7
+len_8:
+ // R5 contains 8-byte sep
+ MOVD (R2), R5
+loop_8:
+ // R6 contains substring for comparison
+ MOVD.P 1(R0), R6
+ CMP R5, R6
+ BEQ found
+ CMP R4, R0
+ BLS loop_8
+ JMP not_found
+len_2_7:
+ TBZ $2, R3, len_2_3
+ TBZ $1, R3, len_4_5
+ TBZ $0, R3, len_6
+len_7:
+ // R5 and R6 contain 7-byte sep
+ MOVWU (R2), R5
+ // 1-byte overlap with R5
+ MOVWU 3(R2), R6
+loop_7:
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE not_equal_7
+ MOVWU 2(R0), R3
+ CMP R6, R3
+ BEQ found
+not_equal_7:
+ CMP R4, R0
+ BLS loop_7
+ JMP not_found
+len_6:
+ // R5 and R6 contain 6-byte sep
+ MOVWU (R2), R5
+ MOVHU 4(R2), R6
+loop_6:
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE not_equal_6
+ MOVHU 3(R0), R3
+ CMP R6, R3
+ BEQ found
+not_equal_6:
+ CMP R4, R0
+ BLS loop_6
+ JMP not_found
+len_4_5:
+ TBZ $0, R3, len_4
+len_5:
+ // R5 and R7 contain 5-byte sep
+ MOVWU (R2), R5
+ MOVBU 4(R2), R7
+loop_5:
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE not_equal_5
+ MOVBU 3(R0), R3
+ CMP R7, R3
+ BEQ found
+not_equal_5:
+ CMP R4, R0
+ BLS loop_5
+ JMP not_found
+len_4:
+ // R5 contains 4-byte sep
+ MOVWU (R2), R5
+loop_4:
+ MOVWU.P 1(R0), R6
+ CMP R5, R6
+ BEQ found
+ CMP R4, R0
+ BLS loop_4
+ JMP not_found
+len_2_3:
+ TBZ $0, R3, len_2
+len_3:
+ // R6 and R7 contain 3-byte sep
+ MOVHU (R2), R6
+ MOVBU 2(R2), R7
+loop_3:
+ MOVHU.P 1(R0), R3
+ CMP R6, R3
+ BNE not_equal_3
+ MOVBU 1(R0), R3
+ CMP R7, R3
+ BEQ found
+not_equal_3:
+ CMP R4, R0
+ BLS loop_3
+ JMP not_found
+len_2:
+ // R5 contains 2-byte sep
+ MOVHU (R2), R5
+loop_2:
+ MOVHU.P 1(R0), R6
+ CMP R5, R6
+ BEQ found
+ CMP R4, R0
+ BLS loop_2
+not_found:
+ MOVD $-1, R0
+ MOVD R0, ret+48(FP)
+ RET
+found:
+ SUB R8, R0, R0
+ MOVD R0, ret+48(FP)
+ RET