diff options
author | Wei Xiao <Wei.Xiao@arm.com> | 2017-07-07 10:27:57 +0800 |
---|---|---|
committer | Cherry Zhang <cherryyz@google.com> | 2017-10-17 12:55:17 +0000 |
commit | 18508740b98db671608773c5f39bfa2718370f50 (patch) | |
tree | 4049769fd004d89e85afc6a46d5c005199a6c1a3 /src/runtime/asm_arm64.s | |
parent | 378de1ae43c6406ae5159f235f834da73403a541 (diff) | |
download | go-18508740b98db671608773c5f39bfa2718370f50.tar.gz go-18508740b98db671608773c5f39bfa2718370f50.zip |
reflect: optimize CALLFN wrapper for arm64
Optimize arm64 CALLFN wrapper with LDP/STP instructions.
This provides a significant speedup for big argument copy.
Benchmark results for reflect:
```
name                      old time/op     new time/op     delta
Call-8                    79.0ns ± 4%     73.6ns ± 4%     -6.78%    (p=0.000 n=10+10)
CallArgCopy/size=128-8    80.5ns ± 0%     60.3ns ± 0%     -25.06%   (p=0.000 n=10+9)
CallArgCopy/size=256-8    119ns ± 2%      67ns ± 1%       -43.59%   (p=0.000 n=8+10)
CallArgCopy/size=1024-8   524ns ± 1%      99ns ± 1%       -81.03%   (p=0.000 n=10+10)
CallArgCopy/size=4096-8   837ns ± 0%      231ns ± 1%      -72.42%   (p=0.000 n=9+9)
CallArgCopy/size=65536-8  13.6µs ± 6%     3.1µs ± 1%      -77.38%   (p=0.000 n=10+10)
PtrTo-8                   12.9ns ± 0%     13.1ns ± 3%     +1.86%    (p=0.000 n=10+10)
FieldByName1-8            28.7ns ± 2%     28.6ns ± 2%     ~         (p=0.408 n=9+10)
FieldByName2-8            928ns ± 4%      946ns ± 8%      ~         (p=0.326 n=9+10)
FieldByName3-8            5.35µs ± 5%     5.32µs ± 5%     ~         (p=0.755 n=10+10)
InterfaceBig-8            2.57ns ± 0%     2.57ns ± 0%     ~         (all equal)
InterfaceSmall-8          2.57ns ± 0%     2.57ns ± 0%     ~         (all equal)
New-8                     9.09ns ± 1%     8.83ns ± 1%     -2.81%    (p=0.000 n=10+9)

name                      old alloc/op    new alloc/op    delta
Call-8                    0.00B           0.00B           ~         (all equal)

name                      old allocs/op   new allocs/op   delta
Call-8                    0.00            0.00            ~         (all equal)

name                      old speed       new speed       delta
CallArgCopy/size=128-8    1.59GB/s ± 0%   2.12GB/s ± 1%   +33.46%   (p=0.000 n=10+9)
CallArgCopy/size=256-8    2.14GB/s ± 2%   3.81GB/s ± 1%   +78.02%   (p=0.000 n=8+10)
CallArgCopy/size=1024-8   1.95GB/s ± 1%   10.30GB/s ± 0%  +427.99%  (p=0.000 n=10+9)
CallArgCopy/size=4096-8   4.89GB/s ± 0%   17.69GB/s ± 1%  +261.87%  (p=0.000 n=9+9)
CallArgCopy/size=65536-8  4.84GB/s ± 6%   21.36GB/s ± 1%  +341.67%  (p=0.000 n=10+10)
```
Change-Id: I775d88b30c43cb2eda1d0612ac15e6d283e70beb
Reviewed-on: https://go-review.googlesource.com/70570
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/runtime/asm_arm64.s')
-rw-r--r-- | src/runtime/asm_arm64.s | 30 |
1 files changed, 20 insertions, 10 deletions
```diff
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 64311be479..8f2e03c7ef 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -368,16 +368,26 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-24; \
 	NO_LOCAL_POINTERS; \
 	/* copy arguments to stack */ \
 	MOVD arg+16(FP), R3; \
-	MOVWU argsize+24(FP), R4; \
-	MOVD RSP, R5; \
-	ADD $(8-1), R5; \
-	SUB $1, R3; \
-	ADD R5, R4; \
-	CMP R5, R4; \
-	BEQ 4(PC); \
-	MOVBU.W 1(R3), R6; \
-	MOVBU.W R6, 1(R5); \
-	B -4(PC); \
+	MOVWU argsize+24(FP), R4; \
+	ADD $8, RSP, R5; \
+	BIC $0xf, R4, R6; \
+	CBZ R6, 6(PC); \
+	/* if R6=(argsize&~15) != 0 */ \
+	ADD R6, R5, R6; \
+	/* copy 16 bytes a time */ \
+	LDP.P 16(R3), (R7, R8); \
+	STP.P (R7, R8), 16(R5); \
+	CMP R5, R6; \
+	BNE -3(PC); \
+	AND $0xf, R4, R6; \
+	CBZ R6, 6(PC); \
+	/* if R6=(argsize&15) != 0 */ \
+	ADD R6, R5, R6; \
+	/* copy 1 byte a time for the rest */ \
+	MOVBU.P 1(R3), R7; \
+	MOVBU.P R7, 1(R5); \
+	CMP R5, R6; \
+	BNE -3(PC); \
 	/* call function */ \
 	MOVD f+8(FP), R26; \
 	MOVD (R26), R0; \
```