aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/asm_arm64.s
diff options
context:
space:
mode:
author: Wei Xiao <wei.xiao@arm.com> 2017-06-21 03:30:14 +0000
committer: Cherry Zhang <cherryyz@google.com> 2017-10-25 14:37:25 +0000
commit: 78ddf2741f51fdb0f6db4190ef2053a36be91751 (patch)
tree: 0f6ff19c4e2bc8c22abd32f3f4f52b58e8eff789 /src/runtime/asm_arm64.s
parent: 0c68b79e9ce91ea471b2c1dcfa9da6a375300ad5 (diff)
download: go-78ddf2741f51fdb0f6db4190ef2053a36be91751.tar.gz
          go-78ddf2741f51fdb0f6db4190ef2053a36be91751.zip
bytes: add optimized Equal for arm64
Use SIMD instructions when comparing chunks bigger than 16 bytes.

Benchmark results of bytes:

name  old time/op  new time/op  delta
Equal/0-8  6.52ns ± 1%  5.51ns ± 0%  -15.43%  (p=0.000 n=8+9)
Equal/1-8  11.5ns ± 0%  10.5ns ± 0%  -8.70%  (p=0.000 n=10+10)
Equal/6-8  19.0ns ± 0%  13.5ns ± 0%  -28.95%  (p=0.000 n=10+10)
Equal/9-8  31.0ns ± 0%  13.5ns ± 0%  -56.45%  (p=0.000 n=10+10)
Equal/15-8  40.0ns ± 0%  15.5ns ± 0%  -61.25%  (p=0.000 n=10+10)
Equal/16-8  41.5ns ± 0%  14.5ns ± 0%  -65.06%  (p=0.000 n=10+10)
Equal/20-8  47.5ns ± 0%  17.0ns ± 0%  -64.21%  (p=0.000 n=10+10)
Equal/32-8  65.6ns ± 0%  17.0ns ± 0%  -74.09%  (p=0.000 n=10+10)
Equal/4K-8  6.17µs ± 0%  0.57µs ± 1%  -90.76%  (p=0.000 n=10+10)
Equal/4M-8  6.41ms ± 0%  1.11ms ±14%  -82.71%  (p=0.000 n=8+10)
Equal/64M-8  104ms ± 0%  33ms ± 0%  -68.64%  (p=0.000 n=10+10)
EqualPort/1-8  13.0ns ± 0%  13.0ns ± 0%  ~  (all equal)
EqualPort/6-8  22.0ns ± 0%  22.7ns ± 0%  +3.06%  (p=0.000 n=8+9)
EqualPort/32-8  78.1ns ± 0%  78.1ns ± 0%  ~  (all equal)
EqualPort/4K-8  7.54µs ± 0%  7.61µs ± 0%  +0.92%  (p=0.000 n=10+8)
EqualPort/4M-8  8.16ms ± 2%  8.05ms ± 1%  -1.31%  (p=0.023 n=10+10)
EqualPort/64M-8  142ms ± 0%  142ms ± 0%  +0.37%  (p=0.000 n=10+10)
CompareBytesEqual-8  39.0ns ± 0%  41.6ns ± 2%  +6.67%  (p=0.000 n=9+10)

name  old speed  new speed  delta
Equal/1-8  86.9MB/s ± 0%  95.2MB/s ± 0%  +9.53%  (p=0.000 n=8+8)
Equal/6-8  315MB/s ± 0%  444MB/s ± 0%  +40.74%  (p=0.000 n=9+10)
Equal/9-8  290MB/s ± 0%  666MB/s ± 0%  +129.63%  (p=0.000 n=8+10)
Equal/15-8  375MB/s ± 0%  967MB/s ± 0%  +158.09%  (p=0.000 n=10+10)
Equal/16-8  385MB/s ± 0%  1103MB/s ± 0%  +186.24%  (p=0.000 n=10+9)
Equal/20-8  421MB/s ± 0%  1175MB/s ± 0%  +179.44%  (p=0.000 n=9+10)
Equal/32-8  488MB/s ± 0%  1881MB/s ± 0%  +285.34%  (p=0.000 n=10+8)
Equal/4K-8  664MB/s ± 0%  7181MB/s ± 1%  +981.32%  (p=0.000 n=10+10)
Equal/4M-8  654MB/s ± 0%  3822MB/s ±16%  +484.15%  (p=0.000 n=8+10)
Equal/64M-8  645MB/s ± 0%  2056MB/s ± 0%  +218.90%  (p=0.000 n=10+10)
EqualPort/1-8  76.8MB/s ± 0%  76.7MB/s ± 0%  -0.09%  (p=0.023 n=10+10)
EqualPort/6-8  272MB/s ± 0%  264MB/s ± 0%  -2.94%  (p=0.000 n=8+10)
EqualPort/32-8  410MB/s ± 0%  410MB/s ± 0%  +0.01%  (p=0.004 n=9+10)
EqualPort/4K-8  543MB/s ± 0%  538MB/s ± 0%  -0.91%  (p=0.000 n=9+9)
EqualPort/4M-8  514MB/s ± 2%  521MB/s ± 1%  +1.31%  (p=0.023 n=10+10)
EqualPort/64M-8  473MB/s ± 0%  472MB/s ± 0%  -0.37%  (p=0.000 n=10+10)

Benchmark results of go1:

name  old time/op  new time/op  delta
BinaryTree17-8  6.53s ± 0%  6.52s ± 2%  ~  (p=0.286 n=4+5)
Fannkuch11-8  6.35s ± 1%  6.33s ± 0%  ~  (p=0.690 n=5+5)
FmtFprintfEmpty-8  108ns ± 1%  99ns ± 1%  -8.31%  (p=0.008 n=5+5)
FmtFprintfString-8  172ns ± 1%  188ns ± 0%  +9.43%  (p=0.016 n=5+4)
FmtFprintfInt-8  207ns ± 0%  202ns ± 0%  -2.42%  (p=0.008 n=5+5)
FmtFprintfIntInt-8  277ns ± 1%  271ns ± 1%  -2.02%  (p=0.008 n=5+5)
FmtFprintfPrefixedInt-8  386ns ± 0%  380ns ± 0%  -1.55%  (p=0.008 n=5+5)
FmtFprintfFloat-8  492ns ± 0%  494ns ± 1%  ~  (p=0.175 n=4+5)
FmtManyArgs-8  1.32µs ± 1%  1.31µs ± 2%  ~  (p=0.651 n=5+5)
GobDecode-8  16.8ms ± 2%  16.9ms ± 1%  ~  (p=0.310 n=5+5)
GobEncode-8  14.1ms ± 1%  14.1ms ± 1%  ~  (p=1.000 n=5+5)
Gzip-8  788ms ± 0%  789ms ± 0%  ~  (p=0.548 n=5+5)
Gunzip-8  83.6ms ± 0%  83.6ms ± 0%  ~  (p=0.548 n=5+5)
HTTPClientServer-8  120µs ± 0%  120µs ± 1%  ~  (p=0.690 n=5+5)
JSONEncode-8  33.2ms ± 0%  33.6ms ± 0%  +1.20%  (p=0.008 n=5+5)
JSONDecode-8  152ms ± 1%  146ms ± 1%  -3.70%  (p=0.008 n=5+5)
Mandelbrot200-8  10.0ms ± 0%  10.0ms ± 0%  ~  (p=0.151 n=5+5)
GoParse-8  7.97ms ± 0%  8.06ms ± 0%  +1.15%  (p=0.008 n=5+5)
RegexpMatchEasy0_32-8  233ns ± 1%  239ns ± 4%  ~  (p=0.135 n=5+5)
RegexpMatchEasy0_1K-8  1.86µs ± 0%  1.86µs ± 0%  ~  (p=0.167 n=5+5)
RegexpMatchEasy1_32-8  250ns ± 0%  263ns ± 1%  +5.28%  (p=0.008 n=5+5)
RegexpMatchEasy1_1K-8  2.28µs ± 0%  2.13µs ± 0%  -6.64%  (p=0.000 n=4+5)
RegexpMatchMedium_32-8  332ns ± 1%  319ns ± 0%  -3.97%  (p=0.008 n=5+5)
RegexpMatchMedium_1K-8  85.5µs ± 2%  79.1µs ± 1%  -7.42%  (p=0.008 n=5+5)
RegexpMatchHard_32-8  4.34µs ± 1%  4.42µs ± 7%  ~  (p=0.881 n=5+5)
RegexpMatchHard_1K-8  130µs ± 1%  127µs ± 0%  -2.18%  (p=0.008 n=5+5)
Revcomp-8  1.35s ± 1%  1.34s ± 0%  -0.58%  (p=0.016 n=5+4)
Template-8  160ms ± 2%  158ms ± 1%  ~  (p=0.222 n=5+5)
TimeParse-8  795ns ± 2%  772ns ± 2%  -2.87%  (p=0.024 n=5+5)
TimeFormat-8  782ns ± 0%  784ns ± 0%  ~  (p=0.198 n=5+5)

name  old speed  new speed  delta
GobDecode-8  45.8MB/s ± 2%  45.5MB/s ± 1%  ~  (p=0.310 n=5+5)
GobEncode-8  54.3MB/s ± 1%  54.4MB/s ± 1%  ~  (p=0.984 n=5+5)
Gzip-8  24.6MB/s ± 0%  24.6MB/s ± 0%  ~  (p=0.540 n=5+5)
Gunzip-8  232MB/s ± 0%  232MB/s ± 0%  ~  (p=0.548 n=5+5)
JSONEncode-8  58.4MB/s ± 0%  57.7MB/s ± 0%  -1.19%  (p=0.008 n=5+5)
JSONDecode-8  12.8MB/s ± 1%  13.3MB/s ± 1%  +3.85%  (p=0.008 n=5+5)
GoParse-8  7.27MB/s ± 0%  7.18MB/s ± 0%  -1.13%  (p=0.008 n=5+5)
RegexpMatchEasy0_32-8  137MB/s ± 1%  134MB/s ± 4%  ~  (p=0.151 n=5+5)
RegexpMatchEasy0_1K-8  551MB/s ± 0%  550MB/s ± 0%  ~  (p=0.222 n=5+5)
RegexpMatchEasy1_32-8  128MB/s ± 0%  121MB/s ± 1%  -5.09%  (p=0.008 n=5+5)
RegexpMatchEasy1_1K-8  449MB/s ± 0%  481MB/s ± 0%  +7.12%  (p=0.016 n=4+5)
RegexpMatchMedium_32-8  3.00MB/s ± 0%  3.13MB/s ± 0%  +4.33%  (p=0.016 n=4+5)
RegexpMatchMedium_1K-8  12.0MB/s ± 2%  12.9MB/s ± 1%  +7.98%  (p=0.008 n=5+5)
RegexpMatchHard_32-8  7.38MB/s ± 1%  7.25MB/s ± 7%  ~  (p=0.952 n=5+5)
RegexpMatchHard_1K-8  7.88MB/s ± 1%  8.05MB/s ± 0%  +2.21%  (p=0.008 n=5+5)
Revcomp-8  188MB/s ± 1%  189MB/s ± 0%  +0.58%  (p=0.016 n=5+4)
Template-8  12.2MB/s ± 2%  12.3MB/s ± 1%  ~  (p=0.183 n=5+5)

Change-Id: I65e79f3f8f8b2914678311c4f1b0a2d98459e220
Reviewed-on: https://go-review.googlesource.com/71110
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
Diffstat (limited to 'src/runtime/asm_arm64.s')
-rw-r--r-- src/runtime/asm_arm64.s | 132
1 files changed, 103 insertions, 29 deletions
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 8f2e03c7ef..e4b2c37038 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -723,26 +723,18 @@ TEXT runtime·abort(SB),NOSPLIT,$-8-0
B (ZR)
UNDEF
-// memequal(p, q unsafe.Pointer, size uintptr) bool
+// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT,$-8-25
- MOVD a+0(FP), R1
+ MOVD size+16(FP), R1 // R1 = size; memeqbody expects the length in R1
+ // short path to handle 0-byte case: a and b are never read when size is 0
+ CBZ R1, equal
+ MOVD a+0(FP), R0 // R0 = a
MOVD b+8(FP), R2
- MOVD size+16(FP), R3
- ADD R1, R3, R6
+ MOVD $ret+24(FP), R8 // R8 = address of the result slot, written by memeqbody
+ B runtime·memeqbody<>(SB) // branch without link: memeqbody's RET returns to memequal's caller
+equal:
MOVD $1, R0
MOVB R0, ret+24(FP)
- CMP R1, R2
- BEQ done
-loop:
- CMP R1, R6
- BEQ done
- MOVBU.P 1(R1), R4
- MOVBU.P 1(R2), R5
- CMP R4, R5
- BEQ loop
-
- MOVB $0, ret+24(FP)
-done:
RET
// memequal_varlen(a, b unsafe.Pointer) bool
@@ -865,28 +857,110 @@ notfound:
MOVD R0, ret+24(FP)
RET
-// TODO: share code with memequal?
+// Equal(a, b []byte) bool
TEXT bytes·Equal(SB),NOSPLIT,$0-49
MOVD a_len+8(FP), R1
MOVD b_len+32(FP), R3
- CMP R1, R3 // unequal lengths are not equal
- BNE notequal
+ CMP R1, R3 // compare a_len (R1) with b_len (R3)
+ // unequal lengths are not equal
+ BNE not_equal
+ // short path to handle 0-byte case: lengths match and are zero, so no data is read
+ CBZ R1, equal
MOVD a+0(FP), R0
MOVD b+24(FP), R2
- ADD R0, R1 // end
-loop:
- CMP R0, R1
- BEQ equal // reaches the end
- MOVBU.P 1(R0), R4
- MOVBU.P 1(R2), R5
- CMP R4, R5
- BEQ loop
-notequal:
+ MOVD $ret+48(FP), R8 // R8 = address of the result slot, written by memeqbody
+ B runtime·memeqbody<>(SB) // branch without link: memeqbody's RET returns to Equal's caller
+equal:
+ MOVD $1, R0
+ MOVB R0, ret+48(FP) // ret = true
+ RET
+not_equal:
MOVB ZR, ret+48(FP)
RET
+
+// input (compares R1 bytes at R0 with R1 bytes at R2 and stores the boolean result through R8):
+// R0: pointer a
+// R1: data len in bytes; both callers in this change branch here only with a non-zero length
+// R2: pointer b
+// R8: address to put result (one byte is stored: 1 if equal, 0 otherwise)
+TEXT runtime·memeqbody<>(SB),NOSPLIT,$0 // clobbers R0-R6, V0-V11 and the condition flags
+ CMP $1, R1
+ // handle 1-byte special case for better performance
+ BEQ one
+ CMP $16, R1
+ // handle specially if length < 16
+ BLO tail // unsigned comparison: len < 16 needs no vector work
+ BIC $0x3f, R1, R3 // R3 = len rounded down to a multiple of 64
+ CBZ R3, chunk16 // len < 64: skip the 64-byte loop
+ // work with 64-byte chunks
+ ADD R3, R0, R6 // end of chunks: R6 = R0 + R3
+chunk64_loop:
+ VLD1.P (R0), [V0.D2, V1.D2, V2.D2, V3.D2] // load 64 bytes of a, R0 += 64
+ VLD1.P (R2), [V4.D2, V5.D2, V6.D2, V7.D2] // load 64 bytes of b, R2 += 64
+ VCMEQ V0.D2, V4.D2, V8.D2 // each 64-bit lane becomes all ones if equal, zero otherwise
+ VCMEQ V1.D2, V5.D2, V9.D2
+ VCMEQ V2.D2, V6.D2, V10.D2
+ VCMEQ V3.D2, V7.D2, V11.D2
+ VAND V8.B16, V9.B16, V8.B16 // fold the four masks into V8: a lane stays all ones only if it matched in every register
+ VAND V8.B16, V10.B16, V8.B16
+ VAND V8.B16, V11.B16, V8.B16
+ CMP R0, R6 // set flags for the loop test now; VMOV and CBZ below do not modify them
+ VMOV V8.D[0], R4 // R4 = low lane of the combined mask
+ VMOV V8.D[1], R5 // R5 = high lane of the combined mask
+ CBZ R4, not_equal // a zero lane means at least one byte differed
+ CBZ R5, not_equal
+ BNE chunk64_loop // uses the flags from CMP R0, R6: more 64-byte chunks remain
+ AND $0x3f, R1, R1 // R1 = bytes left after the 64-byte chunks
+ CBZ R1, equal // nothing left and every chunk matched
+chunk16:
+ // work with 16-byte chunks
+ BIC $0xf, R1, R3 // R3 = remaining len rounded down to a multiple of 16
+ CBZ R3, tail // fewer than 16 bytes remain
+ ADD R3, R0, R6 // end of chunks: R6 = R0 + R3
+chunk16_loop:
+ VLD1.P (R0), [V0.D2] // load 16 bytes of a, R0 += 16
+ VLD1.P (R2), [V1.D2] // load 16 bytes of b, R2 += 16
+ VCMEQ V0.D2, V1.D2, V2.D2 // each 64-bit lane becomes all ones if equal, zero otherwise
+ CMP R0, R6 // set flags for the loop test now; VMOV and CBZ below do not modify them
+ VMOV V2.D[0], R4
+ VMOV V2.D[1], R5
+ CBZ R4, not_equal // a zero lane means at least one byte differed
+ CBZ R5, not_equal
+ BNE chunk16_loop // uses the flags from CMP R0, R6: more 16-byte chunks remain
+ AND $0xf, R1, R1 // R1 = bytes left after the 16-byte chunks
+ CBZ R1, equal // nothing left and every chunk matched
+tail:
+ // special compare of tail with length < 16: bits 3..0 of R1 select 8-, 4-, 2- and 1-byte pieces
+ TBZ $3, R1, lt_8 // bit 3 clear: no 8-byte piece
+ MOVD.P 8(R0), R4 // load 8 bytes of a, R0 += 8
+ MOVD.P 8(R2), R5 // load 8 bytes of b, R2 += 8
+ CMP R4, R5
+ BNE not_equal
+lt_8:
+ TBZ $2, R1, lt_4 // bit 2 clear: no 4-byte piece
+ MOVWU.P 4(R0), R4 // load 4 bytes of a (zero-extended), R0 += 4
+ MOVWU.P 4(R2), R5 // load 4 bytes of b (zero-extended), R2 += 4
+ CMP R4, R5
+ BNE not_equal
+lt_4:
+ TBZ $1, R1, lt_2 // bit 1 clear: no 2-byte piece
+ MOVHU.P 2(R0), R4 // load 2 bytes of a (zero-extended), R0 += 2
+ MOVHU.P 2(R2), R5 // load 2 bytes of b (zero-extended), R2 += 2
+ CMP R4, R5
+ BNE not_equal
+lt_2:
+ TBZ $0, R1, equal // bit 0 clear: no trailing byte, everything matched
+one:
+ MOVBU (R0), R4 // final (or only) byte of a
+ MOVBU (R2), R5 // final (or only) byte of b
+ CMP R4, R5
+ BNE not_equal
equal:
MOVD $1, R0
- MOVB R0, ret+48(FP)
+ MOVB R0, (R8) // store true through the result pointer
+ RET
+not_equal:
+ MOVB ZR, (R8) // store false through the result pointer
RET
TEXT runtime·return0(SB), NOSPLIT, $0