aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/asm_arm64.s
diff options
context:
space:
mode:
author: Wei Xiao <Wei.Xiao@arm.com> 2017-07-07 10:27:57 +0800
committer: Cherry Zhang <cherryyz@google.com> 2017-10-17 12:55:17 +0000
commit: 18508740b98db671608773c5f39bfa2718370f50 (patch)
tree: 4049769fd004d89e85afc6a46d5c005199a6c1a3 /src/runtime/asm_arm64.s
parent: 378de1ae43c6406ae5159f235f834da73403a541 (diff)
download: go-18508740b98db671608773c5f39bfa2718370f50.tar.gz
go-18508740b98db671608773c5f39bfa2718370f50.zip
reflect: optimize CALLFN wrapper for arm64
Optimize arm64 CALLFN wrapper with LDP/STP instructions.
This provides a significant speedup for big argument copy.

Benchmark results for reflect:

name                      old time/op    new time/op    delta
Call-8                    79.0ns ± 4%    73.6ns ± 4%    -6.78%  (p=0.000 n=10+10)
CallArgCopy/size=128-8    80.5ns ± 0%    60.3ns ± 0%   -25.06%  (p=0.000 n=10+9)
CallArgCopy/size=256-8     119ns ± 2%      67ns ± 1%   -43.59%  (p=0.000 n=8+10)
CallArgCopy/size=1024-8    524ns ± 1%      99ns ± 1%   -81.03%  (p=0.000 n=10+10)
CallArgCopy/size=4096-8    837ns ± 0%     231ns ± 1%   -72.42%  (p=0.000 n=9+9)
CallArgCopy/size=65536-8  13.6µs ± 6%     3.1µs ± 1%   -77.38%  (p=0.000 n=10+10)
PtrTo-8                   12.9ns ± 0%    13.1ns ± 3%    +1.86%  (p=0.000 n=10+10)
FieldByName1-8            28.7ns ± 2%    28.6ns ± 2%       ~    (p=0.408 n=9+10)
FieldByName2-8             928ns ± 4%     946ns ± 8%       ~    (p=0.326 n=9+10)
FieldByName3-8            5.35µs ± 5%    5.32µs ± 5%       ~    (p=0.755 n=10+10)
InterfaceBig-8            2.57ns ± 0%    2.57ns ± 0%       ~    (all equal)
InterfaceSmall-8          2.57ns ± 0%    2.57ns ± 0%       ~    (all equal)
New-8                     9.09ns ± 1%    8.83ns ± 1%    -2.81%  (p=0.000 n=10+9)

name                      old alloc/op   new alloc/op   delta
Call-8                    0.00B          0.00B             ~    (all equal)

name                      old allocs/op  new allocs/op  delta
Call-8                    0.00           0.00              ~    (all equal)

name                      old speed      new speed      delta
CallArgCopy/size=128-8    1.59GB/s ± 0%   2.12GB/s ± 1%   +33.46%  (p=0.000 n=10+9)
CallArgCopy/size=256-8    2.14GB/s ± 2%   3.81GB/s ± 1%   +78.02%  (p=0.000 n=8+10)
CallArgCopy/size=1024-8   1.95GB/s ± 1%  10.30GB/s ± 0%  +427.99%  (p=0.000 n=10+9)
CallArgCopy/size=4096-8   4.89GB/s ± 0%  17.69GB/s ± 1%  +261.87%  (p=0.000 n=9+9)
CallArgCopy/size=65536-8  4.84GB/s ± 6%  21.36GB/s ± 1%  +341.67%  (p=0.000 n=10+10)

Change-Id: I775d88b30c43cb2eda1d0612ac15e6d283e70beb
Reviewed-on: https://go-review.googlesource.com/70570
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/runtime/asm_arm64.s')
-rw-r--r-- src/runtime/asm_arm64.s 30
1 files changed, 20 insertions, 10 deletions
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 64311be479..8f2e03c7ef 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -368,16 +368,26 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-24; \
NO_LOCAL_POINTERS; \
/* copy arguments to stack */ \
MOVD arg+16(FP), R3; \
- MOVWU argsize+24(FP), R4; \
- MOVD RSP, R5; \
- ADD $(8-1), R5; \
- SUB $1, R3; \
- ADD R5, R4; \
- CMP R5, R4; \
- BEQ 4(PC); \
- MOVBU.W 1(R3), R6; \
- MOVBU.W R6, 1(R5); \
- B -4(PC); \
+ MOVWU argsize+24(FP), R4; \
+ ADD $8, RSP, R5; \
+ BIC $0xf, R4, R6; \
+ CBZ R6, 6(PC); \
+ /* if R6=(argsize&~15) != 0 */ \
+ ADD R6, R5, R6; \
+ /* copy 16 bytes a time */ \
+ LDP.P 16(R3), (R7, R8); \
+ STP.P (R7, R8), 16(R5); \
+ CMP R5, R6; \
+ BNE -3(PC); \
+ AND $0xf, R4, R6; \
+ CBZ R6, 6(PC); \
+ /* if R6=(argsize&15) != 0 */ \
+ ADD R6, R5, R6; \
+ /* copy 1 byte a time for the rest */ \
+ MOVBU.P 1(R3), R7; \
+ MOVBU.P R7, 1(R5); \
+ CMP R5, R6; \
+ BNE -3(PC); \
/* call function */ \
MOVD f+8(FP), R26; \
MOVD (R26), R0; \