aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/internal/obj/arm64/asm7.go
diff options
context:
space:
mode:
author: eric fang <eric.fang@arm.com> 2022-07-11 02:40:14 +0000
committer: Eric Fang <eric.fang@arm.com> 2022-10-28 01:44:39 +0000
commit: 537c4354cb9fdf8812c0448bd8f8a3b9f9ab1736 (patch)
tree: b4ca4b162e181b92d5d34df975a1ad38e411e229 /src/cmd/internal/obj/arm64/asm7.go
parent: 7f255ba065ee7bd41da806b297cd643e3ead3fee (diff)
download: go-537c4354cb9fdf8812c0448bd8f8a3b9f9ab1736.tar.gz
go-537c4354cb9fdf8812c0448bd8f8a3b9f9ab1736.zip
cmd/internal/obj/arm64: optimize ADRP+ADD+LD/ST to ADRP+LD/ST(offset)
This CL optimizes the sequence of instructions ADRP+ADD+LD/ST to the
sequence of ADRP+LD/ST(offset). This saves an ADD instruction.

The test result of compilecmp:

name       old text-bytes  new text-bytes  delta
HelloSize  763kB ± 0%      755kB ± 0%      -1.06%  (p=0.000 n=20+20)

name       old data-bytes  new data-bytes  delta
HelloSize  13.5kB ± 0%     13.5kB ± 0%     ~       (all equal)

name       old bss-bytes   new bss-bytes   delta
HelloSize  227kB ± 0%      227kB ± 0%      ~       (all equal)

name       old exe-bytes   new exe-bytes   delta
HelloSize  1.33MB ± 0%     1.33MB ± 0%     -0.02%  (p=0.000 n=20+20)

file       before     after      Δ        %
addr2line  3760392    3759504    -888     -0.024%
api        5361511    5295351    -66160   -1.234%
asm        5014157    4948674    -65483   -1.306%
buildid    2579949    2579485    -464     -0.018%
cgo        4492817    4491737    -1080    -0.024%
compile    23359229   23156074   -203155  -0.870%
cover      4823337    4756937    -66400   -1.377%
dist       3332850    3331794    -1056    -0.032%
doc        3902649    3836745    -65904   -1.689%
fix        3269708    3268828    -880     -0.027%
link       6510760    6443496    -67264   -1.033%
nm         3670740    3604348    -66392   -1.809%
objdump    4069599    4068967    -632     -0.016%
pack       2374824    2374208    -616     -0.026%
pprof      13874860   13805700   -69160   -0.498%
test2json  2599210    2598530    -680     -0.026%
trace      13231640   13162872   -68768   -0.520%
vet        7360899    7292267    -68632   -0.932%
total      113589131  112775517  -813614  -0.716%

Change-Id: Ie1cf277e149ddd3f352d05fa0753d0ced7e0b894
Reviewed-on: https://go-review.googlesource.com/c/go/+/444715
Run-TryBot: Eric Fang <eric.fang@arm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Damien Neil <dneil@google.com>
Diffstat (limited to 'src/cmd/internal/obj/arm64/asm7.go')
-rw-r--r-- src/cmd/internal/obj/arm64/asm7.go 85
1 files changed, 68 insertions, 17 deletions
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index 95053163c5..d364cde25b 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -37,6 +37,7 @@ import (
"log"
"math"
"sort"
+ "strings"
)
// ctxt7 holds state while assembling a single function.
@@ -73,7 +74,7 @@ type Optab struct {
a3 uint8
a4 uint8
type_ int8
- size int8
+ size_ int8 // the value of this field is not static, use the size() method to return the value
param int16
flag int8
scond uint16
@@ -1021,6 +1022,27 @@ func pcAlignPadLength(pc int64, alignedValue int64, ctxt *obj.Link) int {
return int(-pc & (alignedValue - 1))
}
+// size returns the size in bytes of the machine instruction sequence emitted when p is encoded with o.
+// Usually it just returns o.size_ directly; in some cases it checks whether the optimization
+// conditions are met, and if so returns the size of the optimized instruction sequence.
+// These optimizations need to be kept in sync with the asmout function, which calls size to pick the encoding.
+func (o *Optab) size(ctxt *obj.Link, p *obj.Prog) int {
+ // Optimize adrp+add+ld/st to adrp+ld/st(offset).
+ sz := movesize(p.As)
+ if sz != -1 {
+ // Relocations R_AARCH64_LDST{64,32,16,8}_ABS_LO12_NC can only generate 8-byte, 4-byte,
+ // 2-byte and 1-byte aligned addresses, so the address of the load/store must be aligned.
+ // Also, symbols with the prefix "go:string." are Go strings, which go into the symbol
+ // table; their addresses are not necessarily aligned, so rule them out.
+ align := int64(1 << sz)
+ if o.a1 == C_ADDR && p.From.Offset%align == 0 && !strings.HasPrefix(p.From.Sym.Name, "go:string.") ||
+ o.a4 == C_ADDR && p.To.Offset%align == 0 && !strings.HasPrefix(p.To.Sym.Name, "go:string.") {
+ return 8 // optimized form: adrp + ld/st(offset), two 4-byte instructions
+ }
+ }
+ return int(o.size_) // no optimization applies: use the size recorded in the optab entry
+}
+
func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
if ctxt.Retpoline {
ctxt.Diag("-spectre=ret not supported on arm64")
@@ -1050,7 +1072,7 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
}
p.Pc = pc
o = c.oplook(p)
- m = int(o.size)
+ m = o.size(c.ctxt, p)
if m == 0 {
switch p.As {
case obj.APCALIGN:
@@ -1131,7 +1153,7 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
bflag = 1
}
}
- m = int(o.size)
+ m = o.size(c.ctxt, p)
if m == 0 {
switch p.As {
@@ -1176,8 +1198,9 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
psz += 4
}
- if int(o.size) > 4*len(out) {
- log.Fatalf("out array in span7 is too small, need at least %d for %v", o.size/4, p)
+ sz := o.size(c.ctxt, p)
+ if sz > 4*len(out) {
+ log.Fatalf("out array in span7 is too small, need at least %d for %v", sz/4, p)
}
if p.As == obj.APCALIGN {
alignedValue := p.From.Offset
@@ -1190,7 +1213,7 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
}
} else {
c.asmout(p, o, out[:])
- for i = 0; i < int(o.size/4); i++ {
+ for i = 0; i < sz/4; i++ {
c.ctxt.Arch.ByteOrder.PutUint32(bp, out[i])
bp = bp[4:]
psz += 4
@@ -1238,7 +1261,7 @@ func (c *ctxt7) isRestartable(p *obj.Prog) bool {
// If p doesn't use REGTMP, it can be simply preempted, so we don't
// mark it.
o := c.oplook(p)
- return o.size > 4 && o.flag&NOTUSETMP == 0
+ return o.size(c.ctxt, p) > 4 && o.flag&NOTUSETMP == 0
}
/*
@@ -3414,7 +3437,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
op = int32(c.opirr(p, AADD))
}
- if int(o.size) == 8 {
+ if int(o.size(c.ctxt, p)) == 8 {
// NOTE: this case does not use REGTMP. If it ever does,
// remove the NOTUSETMP flag in optab.
o1 = c.oaddi(p, op, v&0xfff000, r, rt)
@@ -4460,31 +4483,43 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
o2 |= uint32(r&31) << 5
o2 |= uint32(rt & 31)
- /* reloc ops */
- case 64: /* movT R,addr -> adrp + add + movT R, (REGTMP) */
+ /* reloc ops */
+ case 64: /* movT R,addr -> adrp + movT R, (REGTMP) */
if p.From.Reg == REGTMP {
c.ctxt.Diag("cannot use REGTMP as source: %v\n", p)
}
o1 = ADR(1, 0, REGTMP)
- o2 = c.opirr(p, AADD) | REGTMP&31<<5 | REGTMP&31
rel := obj.Addrel(c.cursym)
rel.Off = int32(c.pc)
rel.Siz = 8
rel.Sym = p.To.Sym
rel.Add = p.To.Offset
- rel.Type = objabi.R_ADDRARM64
- o3 = c.olsr12u(p, int32(c.opstr(p, p.As)), 0, REGTMP, int(p.From.Reg))
+ // For unaligned access, fall back to adrp + add + movT R, (REGTMP).
+ if o.size(c.ctxt, p) != 8 {
+ o2 = c.opirr(p, AADD) | REGTMP&31<<5 | REGTMP&31
+ o3 = c.olsr12u(p, int32(c.opstr(p, p.As)), 0, REGTMP, int(p.From.Reg))
+ rel.Type = objabi.R_ADDRARM64
+ break
+ }
+ o2 = c.olsr12u(p, int32(c.opstr(p, p.As)), 0, REGTMP, int(p.From.Reg))
+ rel.Type = c.addrRelocType(p)
- case 65: /* movT addr,R -> adrp + add + movT (REGTMP), R */
+ case 65: /* movT addr,R -> adrp + movT (REGTMP), R */
o1 = ADR(1, 0, REGTMP)
- o2 = c.opirr(p, AADD) | REGTMP&31<<5 | REGTMP&31
rel := obj.Addrel(c.cursym)
rel.Off = int32(c.pc)
rel.Siz = 8
rel.Sym = p.From.Sym
rel.Add = p.From.Offset
- rel.Type = objabi.R_ADDRARM64
- o3 = c.olsr12u(p, int32(c.opldr(p, p.As)), 0, REGTMP, int(p.To.Reg))
+ // For unaligned access, fall back to adrp + add + movT (REGTMP), R.
+ if o.size(c.ctxt, p) != 8 {
+ o2 = c.opirr(p, AADD) | REGTMP&31<<5 | REGTMP&31
+ o3 = c.olsr12u(p, int32(c.opldr(p, p.As)), 0, REGTMP, int(p.To.Reg))
+ rel.Type = objabi.R_ADDRARM64
+ break
+ }
+ o2 = c.olsr12u(p, int32(c.opldr(p, p.As)), 0, REGTMP, int(p.To.Reg))
+ rel.Type = c.addrRelocType(p)
case 66: /* ldp O(R)!, (r1, r2); ldp (R)O!, (r1, r2) */
v := int32(c.regoff(&p.From))
@@ -5676,6 +5711,22 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
out[4] = o5
}
+func (c *ctxt7) addrRelocType(p *obj.Prog) objabi.RelocType { // maps movesize(p.As) to the matching R_ARM64_PCREL_LDST* relocation
+ switch movesize(p.As) {
+ case 0: // 1<<0 = 1-byte access
+ return objabi.R_ARM64_PCREL_LDST8
+ case 1: // 1<<1 = 2-byte access
+ return objabi.R_ARM64_PCREL_LDST16
+ case 2: // 1<<2 = 4-byte access
+ return objabi.R_ARM64_PCREL_LDST32
+ case 3: // 1<<3 = 8-byte access
+ return objabi.R_ARM64_PCREL_LDST64
+ default: // any other movesize result has no LDST relocation here; report it
+ c.ctxt.Diag("use R_ADDRARM64 relocation type for: %v\n", p)
+ }
+ return -1 // reached only after the diagnostic above has been reported
+}
+
/*
* basic Rm op Rn -> Rd (using shifted register with 0)
* also op Rn -> Rt