aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/asm_arm64.s
diff options
context:
space:
mode:
authorZheng Xu <zheng.xu@arm.com>2018-08-29 14:55:03 +0800
committerCherry Zhang <cherryyz@google.com>2018-08-29 18:28:34 +0000
commit8f4fd3f34e8e218cb90435b5a8c6ba9be23a1e1e (patch)
tree0f0b36f71a6081c670cae5c6ed24b7a87c85a9ea /src/runtime/asm_arm64.s
parent7b88b22acf8cb5e32f34e2a396d797c5c0125566 (diff)
downloadgo-8f4fd3f34e8e218cb90435b5a8c6ba9be23a1e1e.tar.gz
go-8f4fd3f34e8e218cb90435b5a8c6ba9be23a1e1e.zip
build: support frame-pointer for arm64
Supporting frame-pointer makes Linux's perf and other profilers much more useful because it lets them gather a stack trace efficiently on profiling events. Major changes include: 1. save FP on the word below where RSP is pointing to (proposed by Cherry and Austin) 2. adjust some specific offsets in runtime assembly and wrapper code 3. add support to FP in goroutine scheduler 4. adjust link stack overflow check to take the extra word into account 5. adjust nosplit test cases to enable frame sizes which are 16 bytes aligned Performance impacts on go1 benchmarks: Enable frame-pointer (by default) name old time/op new time/op delta BinaryTree17-46 5.94s ± 0% 6.00s ± 0% +1.03% (p=0.029 n=4+4) Fannkuch11-46 2.84s ± 1% 2.77s ± 0% -2.58% (p=0.008 n=5+5) FmtFprintfEmpty-46 55.0ns ± 1% 58.9ns ± 1% +7.06% (p=0.008 n=5+5) FmtFprintfString-46 102ns ± 0% 105ns ± 0% +2.94% (p=0.008 n=5+5) FmtFprintfInt-46 118ns ± 0% 117ns ± 1% -1.19% (p=0.000 n=4+5) FmtFprintfIntInt-46 181ns ± 0% 182ns ± 1% ~ (p=0.444 n=5+5) FmtFprintfPrefixedInt-46 215ns ± 1% 214ns ± 0% ~ (p=0.254 n=5+4) FmtFprintfFloat-46 292ns ± 0% 296ns ± 0% +1.46% (p=0.029 n=4+4) FmtManyArgs-46 720ns ± 0% 732ns ± 0% +1.72% (p=0.008 n=5+5) GobDecode-46 9.82ms ± 1% 10.03ms ± 2% +2.10% (p=0.008 n=5+5) GobEncode-46 8.14ms ± 0% 8.72ms ± 1% +7.14% (p=0.008 n=5+5) Gzip-46 420ms ± 0% 424ms ± 0% +0.92% (p=0.008 n=5+5) Gunzip-46 48.2ms ± 0% 48.4ms ± 0% +0.41% (p=0.008 n=5+5) HTTPClientServer-46 201µs ± 4% 201µs ± 0% ~ (p=0.730 n=5+4) JSONEncode-46 17.1ms ± 0% 17.7ms ± 1% +3.80% (p=0.008 n=5+5) JSONDecode-46 88.0ms ± 0% 90.1ms ± 0% +2.42% (p=0.008 n=5+5) Mandelbrot200-46 5.06ms ± 0% 5.07ms ± 0% ~ (p=0.310 n=5+5) GoParse-46 5.04ms ± 0% 5.12ms ± 0% +1.53% (p=0.008 n=5+5) RegexpMatchEasy0_32-46 117ns ± 0% 117ns ± 0% ~ (all equal) RegexpMatchEasy0_1K-46 332ns ± 0% 329ns ± 0% -0.78% (p=0.008 n=5+5) RegexpMatchEasy1_32-46 104ns ± 0% 113ns ± 0% +8.65% (p=0.029 n=4+4) RegexpMatchEasy1_1K-46 563ns ± 0% 569ns ± 0% +1.10% (p=0.008 n=5+5) RegexpMatchMedium_32-46 167ns ± 2% 177ns ± 1% +5.74% (p=0.008 n=5+5) RegexpMatchMedium_1K-46 49.5µs ± 0% 53.4µs ± 0% +7.81% (p=0.008 n=5+5) RegexpMatchHard_32-46 2.56µs ± 1% 2.72µs ± 0% +6.01% (p=0.008 n=5+5) RegexpMatchHard_1K-46 77.0µs ± 0% 81.8µs ± 0% +6.24% (p=0.016 n=5+4) Revcomp-46 631ms ± 1% 627ms ± 1% ~ (p=0.095 n=5+5) Template-46 81.8ms ± 0% 86.3ms ± 0% +5.55% (p=0.008 n=5+5) TimeParse-46 423ns ± 0% 432ns ± 0% +2.32% (p=0.008 n=5+5) TimeFormat-46 478ns ± 2% 497ns ± 1% +3.89% (p=0.008 n=5+5) [Geo mean] 71.6µs 73.3µs +2.45% name old speed new speed delta GobDecode-46 78.1MB/s ± 1% 76.6MB/s ± 2% -2.04% (p=0.008 n=5+5) GobEncode-46 94.3MB/s ± 0% 88.0MB/s ± 1% -6.67% (p=0.008 n=5+5) Gzip-46 46.2MB/s ± 0% 45.8MB/s ± 0% -0.91% (p=0.008 n=5+5) Gunzip-46 403MB/s ± 0% 401MB/s ± 0% -0.41% (p=0.008 n=5+5) JSONEncode-46 114MB/s ± 0% 109MB/s ± 1% -3.66% (p=0.008 n=5+5) JSONDecode-46 22.0MB/s ± 0% 21.5MB/s ± 0% -2.35% (p=0.008 n=5+5) GoParse-46 11.5MB/s ± 0% 11.3MB/s ± 0% -1.51% (p=0.008 n=5+5) RegexpMatchEasy0_32-46 272MB/s ± 0% 272MB/s ± 1% ~ (p=0.190 n=4+5) RegexpMatchEasy0_1K-46 3.08GB/s ± 0% 3.11GB/s ± 0% +0.77% (p=0.008 n=5+5) RegexpMatchEasy1_32-46 306MB/s ± 0% 283MB/s ± 0% -7.63% (p=0.029 n=4+4) RegexpMatchEasy1_1K-46 1.82GB/s ± 0% 1.80GB/s ± 0% -1.07% (p=0.008 n=5+5) RegexpMatchMedium_32-46 5.99MB/s ± 0% 5.64MB/s ± 1% -5.77% (p=0.016 n=4+5) RegexpMatchMedium_1K-46 20.7MB/s ± 0% 19.2MB/s ± 0% -7.25% (p=0.008 n=5+5) RegexpMatchHard_32-46 12.5MB/s ± 1% 11.8MB/s ± 0% -5.66% (p=0.008 n=5+5) RegexpMatchHard_1K-46 13.3MB/s ± 0% 12.5MB/s ± 1% -6.01% (p=0.008 n=5+5) Revcomp-46 402MB/s ± 1% 405MB/s ± 1% ~ (p=0.095 n=5+5) Template-46 23.7MB/s ± 0% 22.5MB/s ± 0% -5.25% (p=0.008 n=5+5) [Geo mean] 82.2MB/s 79.6MB/s -3.26% Disable frame-pointer (GOEXPERIMENT=noframepointer) name old time/op new time/op delta BinaryTree17-46 5.94s ± 0% 5.96s ± 0% +0.39% (p=0.029 n=4+4) Fannkuch11-46 2.84s ± 1% 2.79s ± 1% -1.68% (p=0.008 n=5+5) FmtFprintfEmpty-46 55.0ns ± 1% 55.2ns ± 3% ~ (p=0.794 n=5+5) FmtFprintfString-46 102ns ± 0% 103ns ± 0% +0.98% (p=0.016 n=5+4) FmtFprintfInt-46 118ns ± 0% 115ns ± 0% -2.54% (p=0.029 n=4+4) FmtFprintfIntInt-46 181ns ± 0% 179ns ± 0% -1.10% (p=0.000 n=5+4) FmtFprintfPrefixedInt-46 215ns ± 1% 213ns ± 0% ~ (p=0.143 n=5+4) FmtFprintfFloat-46 292ns ± 0% 300ns ± 0% +2.83% (p=0.029 n=4+4) FmtManyArgs-46 720ns ± 0% 739ns ± 0% +2.64% (p=0.008 n=5+5) GobDecode-46 9.82ms ± 1% 9.78ms ± 1% ~ (p=0.151 n=5+5) GobEncode-46 8.14ms ± 0% 8.12ms ± 1% ~ (p=0.690 n=5+5) Gzip-46 420ms ± 0% 420ms ± 0% ~ (p=0.548 n=5+5) Gunzip-46 48.2ms ± 0% 48.0ms ± 0% -0.33% (p=0.032 n=5+5) HTTPClientServer-46 201µs ± 4% 199µs ± 3% ~ (p=0.548 n=5+5) JSONEncode-46 17.1ms ± 0% 17.2ms ± 0% ~ (p=0.056 n=5+5) JSONDecode-46 88.0ms ± 0% 88.6ms ± 0% +0.64% (p=0.008 n=5+5) Mandelbrot200-46 5.06ms ± 0% 5.07ms ± 0% ~ (p=0.548 n=5+5) GoParse-46 5.04ms ± 0% 5.07ms ± 0% +0.65% (p=0.008 n=5+5) RegexpMatchEasy0_32-46 117ns ± 0% 112ns ± 4% -4.27% (p=0.016 n=4+5) RegexpMatchEasy0_1K-46 332ns ± 0% 330ns ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-46 104ns ± 0% 110ns ± 1% +5.29% (p=0.029 n=4+4) RegexpMatchEasy1_1K-46 563ns ± 0% 567ns ± 2% ~ (p=0.151 n=5+5) RegexpMatchMedium_32-46 167ns ± 2% 166ns ± 0% ~ (p=0.333 n=5+4) RegexpMatchMedium_1K-46 49.5µs ± 0% 49.6µs ± 0% ~ (p=0.841 n=5+5) RegexpMatchHard_32-46 2.56µs ± 1% 2.49µs ± 0% -2.81% (p=0.008 n=5+5) RegexpMatchHard_1K-46 77.0µs ± 0% 75.8µs ± 0% -1.55% (p=0.008 n=5+5) Revcomp-46 631ms ± 1% 628ms ± 0% ~ (p=0.095 n=5+5) Template-46 81.8ms ± 0% 84.3ms ± 1% +3.05% (p=0.008 n=5+5) TimeParse-46 423ns ± 0% 425ns ± 0% +0.52% (p=0.008 n=5+5) TimeFormat-46 478ns ± 2% 478ns ± 1% ~ (p=1.000 n=5+5) [Geo mean] 71.6µs 71.6µs -0.01% name old speed new speed delta GobDecode-46 78.1MB/s ± 1% 78.5MB/s ± 1% ~ (p=0.151 n=5+5) GobEncode-46 94.3MB/s ± 0% 94.5MB/s ± 1% ~ (p=0.690 n=5+5) Gzip-46 46.2MB/s ± 0% 46.2MB/s ± 0% ~ (p=0.571 n=5+5) Gunzip-46 403MB/s ± 0% 404MB/s ± 0% +0.33% (p=0.032 n=5+5) JSONEncode-46 114MB/s ± 0% 113MB/s ± 0% ~ (p=0.056 n=5+5) JSONDecode-46 22.0MB/s ± 0% 21.9MB/s ± 0% -0.64% (p=0.008 n=5+5) GoParse-46 11.5MB/s ± 0% 11.4MB/s ± 0% -0.64% (p=0.008 n=5+5) RegexpMatchEasy0_32-46 272MB/s ± 0% 285MB/s ± 4% +4.74% (p=0.016 n=4+5) RegexpMatchEasy0_1K-46 3.08GB/s ± 0% 3.10GB/s ± 1% ~ (p=0.151 n=5+5) RegexpMatchEasy1_32-46 306MB/s ± 0% 290MB/s ± 1% -5.21% (p=0.029 n=4+4) RegexpMatchEasy1_1K-46 1.82GB/s ± 0% 1.81GB/s ± 2% ~ (p=0.151 n=5+5) RegexpMatchMedium_32-46 5.99MB/s ± 0% 6.02MB/s ± 1% ~ (p=0.063 n=4+5) RegexpMatchMedium_1K-46 20.7MB/s ± 0% 20.7MB/s ± 0% ~ (p=0.659 n=5+5) RegexpMatchHard_32-46 12.5MB/s ± 1% 12.8MB/s ± 0% +2.88% (p=0.008 n=5+5) RegexpMatchHard_1K-46 13.3MB/s ± 0% 13.5MB/s ± 0% +1.58% (p=0.008 n=5+5) Revcomp-46 402MB/s ± 1% 405MB/s ± 0% ~ (p=0.095 n=5+5) Template-46 23.7MB/s ± 0% 23.0MB/s ± 1% -2.95% (p=0.008 n=5+5) [Geo mean] 82.2MB/s 82.3MB/s +0.04% Frame-pointer is enabled on Linux by default but can be disabled by setting: GOEXPERIMENT=noframepointer. Fixes #10110 Change-Id: I1bfaca6dba29a63009d7c6ab04ed7a1413d9479e Reviewed-on: https://go-review.googlesource.com/61511 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/runtime/asm_arm64.s')
-rw-r--r--src/runtime/asm_arm64.s34
1 files changed, 25 insertions, 9 deletions
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index af389be9fe..6a6a699241 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -39,10 +39,9 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
#endif
MOVD $setg_gcc<>(SB), R1 // arg 1: setg
MOVD g, R0 // arg 0: G
+ SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved.
BL (R12)
- MOVD _cgo_init(SB), R12
- CMP $0, R12
- BEQ nocgo
+ ADD $16, RSP
nocgo:
// update stackguard after _cgo_init
@@ -107,6 +106,7 @@ TEXT runtime·gosave(SB), NOSPLIT|NOFRAME, $0-8
MOVD buf+0(FP), R3
MOVD RSP, R0
MOVD R0, gobuf_sp(R3)
+ MOVD R29, gobuf_bp(R3)
MOVD LR, gobuf_pc(R3)
MOVD g, gobuf_g(R3)
MOVD ZR, gobuf_lr(R3)
@@ -128,10 +128,12 @@ TEXT runtime·gogo(SB), NOSPLIT, $24-8
MOVD 0(g), R4 // make sure g is not nil
MOVD gobuf_sp(R5), R0
MOVD R0, RSP
+ MOVD gobuf_bp(R5), R29
MOVD gobuf_lr(R5), LR
MOVD gobuf_ret(R5), R0
MOVD gobuf_ctxt(R5), R26
MOVD $0, gobuf_sp(R5)
+ MOVD $0, gobuf_bp(R5)
MOVD $0, gobuf_ret(R5)
MOVD $0, gobuf_lr(R5)
MOVD $0, gobuf_ctxt(R5)
@@ -147,6 +149,7 @@ TEXT runtime·mcall(SB), NOSPLIT|NOFRAME, $0-8
// Save caller state in g->sched
MOVD RSP, R0
MOVD R0, (g_sched+gobuf_sp)(g)
+ MOVD R29, (g_sched+gobuf_bp)(g)
MOVD LR, (g_sched+gobuf_pc)(g)
MOVD $0, (g_sched+gobuf_lr)(g)
MOVD g, (g_sched+gobuf_g)(g)
@@ -163,6 +166,7 @@ TEXT runtime·mcall(SB), NOSPLIT|NOFRAME, $0-8
MOVD 0(R26), R4 // code pointer
MOVD (g_sched+gobuf_sp)(g), R0
MOVD R0, RSP // sp = m->g0->sched.sp
+ MOVD (g_sched+gobuf_bp)(g), R29
MOVD R3, -8(RSP)
MOVD $0, -16(RSP)
SUB $16, RSP
@@ -211,6 +215,7 @@ switch:
MOVD R6, (g_sched+gobuf_pc)(g)
MOVD RSP, R0
MOVD R0, (g_sched+gobuf_sp)(g)
+ MOVD R29, (g_sched+gobuf_bp)(g)
MOVD $0, (g_sched+gobuf_lr)(g)
MOVD g, (g_sched+gobuf_g)(g)
@@ -224,6 +229,7 @@ switch:
MOVD $runtime·mstart(SB), R4
MOVD R4, 0(R3)
MOVD R3, RSP
+ MOVD (g_sched+gobuf_bp)(g), R29
// call target function
MOVD 0(R26), R3 // code pointer
@@ -235,7 +241,9 @@ switch:
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R0
MOVD R0, RSP
+ MOVD (g_sched+gobuf_bp)(g), R29
MOVD $0, (g_sched+gobuf_sp)(g)
+ MOVD $0, (g_sched+gobuf_bp)(g)
RET
noswitch:
@@ -244,6 +252,7 @@ noswitch:
// at an intermediate systemstack.
MOVD 0(R26), R3 // code pointer
MOVD.P 16(RSP), R30 // restore LR
+ SUB $8, RSP, R29 // restore FP
B (R3)
/*
@@ -278,6 +287,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
// Set g->sched to context in f
MOVD RSP, R0
MOVD R0, (g_sched+gobuf_sp)(g)
+ MOVD R29, (g_sched+gobuf_bp)(g)
MOVD LR, (g_sched+gobuf_pc)(g)
MOVD R3, (g_sched+gobuf_lr)(g)
MOVD R26, (g_sched+gobuf_ctxt)(g)
@@ -294,6 +304,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R0
MOVD R0, RSP
+ MOVD (g_sched+gobuf_bp)(g), R29
MOVD.W $0, -16(RSP) // create a call frame on g0 (saved LR; keep 16-aligned)
BL runtime·newstack(SB)
@@ -843,8 +854,9 @@ TEXT runtime·jmpdefer(SB), NOSPLIT|NOFRAME, $0-16
// Save state of caller into g->sched. Smashes R0.
TEXT gosave<>(SB),NOSPLIT|NOFRAME,$0
MOVD LR, (g_sched+gobuf_pc)(g)
- MOVD RSP, R0
+ MOVD RSP, R0
MOVD R0, (g_sched+gobuf_sp)(g)
+ MOVD R29, (g_sched+gobuf_bp)(g)
MOVD $0, (g_sched+gobuf_lr)(g)
MOVD $0, (g_sched+gobuf_ret)(g)
// Assert ctxt is zero. See func save.
@@ -885,6 +897,7 @@ TEXT ·asmcgocall(SB),NOSPLIT,$0-20
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R0
MOVD R0, RSP
+ MOVD (g_sched+gobuf_bp)(g), R29
MOVD R9, R0
// Now on a scheduling stack (a pthread-created stack).
@@ -996,6 +1009,7 @@ needm:
MOVD m_g0(R8), R3
MOVD RSP, R0
MOVD R0, (g_sched+gobuf_sp)(R3)
+ MOVD R29, (g_sched+gobuf_bp)(R3)
havem:
// Now there's a valid m, and we're running on its m->g0.
@@ -1003,7 +1017,7 @@ havem:
// Save current sp in m->g0->sched.sp in preparation for
// switch back to m->curg stack.
// NOTE: unwindm knows that the saved g->sched.sp is at 16(RSP) aka savedsp-16(SP).
- // Beware that the frame size is actually 32.
+ // Beware that the frame size is actually 32+16.
MOVD m_g0(R8), R3
MOVD (g_sched+gobuf_sp)(R3), R4
MOVD R4, savedsp-16(SP)
@@ -1030,10 +1044,12 @@ havem:
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4
MOVD (g_sched+gobuf_pc)(g), R5
- MOVD R5, -(24+8)(R4)
+ MOVD R5, -48(R4)
+ MOVD (g_sched+gobuf_bp)(g), R5
+ MOVD R5, -56(R4)
MOVD ctxt+24(FP), R0
- MOVD R0, -(16+8)(R4)
- MOVD $-(24+8)(R4), R0 // maintain 16-byte SP alignment
+ MOVD R0, -40(R4)
+ MOVD $-48(R4), R0 // maintain 16-byte SP alignment
MOVD R0, RSP
BL runtime·cgocallbackg(SB)
@@ -1041,7 +1057,7 @@ havem:
MOVD 0(RSP), R5
MOVD R5, (g_sched+gobuf_pc)(g)
MOVD RSP, R4
- ADD $(24+8), R4, R4
+ ADD $48, R4, R4
MOVD R4, (g_sched+gobuf_sp)(g)
// Switch back to m->g0's stack and restore m->g0->sched.sp.