diff options
author | eric fang <eric.fang@arm.com> | 2022-07-11 02:40:14 +0000 |
---|---|---|
committer | Eric Fang <eric.fang@arm.com> | 2022-10-28 01:44:39 +0000 |
commit | 537c4354cb9fdf8812c0448bd8f8a3b9f9ab1736 (patch) | |
tree | b4ca4b162e181b92d5d34df975a1ad38e411e229 /src/cmd/internal/obj/arm64/asm7.go | |
parent | 7f255ba065ee7bd41da806b297cd643e3ead3fee (diff) | |
download | go-537c4354cb9fdf8812c0448bd8f8a3b9f9ab1736.tar.gz go-537c4354cb9fdf8812c0448bd8f8a3b9f9ab1736.zip |
cmd/internal/obj/arm64: optimize ADRP+ADD+LD/ST to ADRP+LD/ST(offset)
This CL optimizes the sequence of instructions ADRP+ADD+LD/ST to the
sequence of ADRP+LD/ST(offset). This saves an ADD instruction.
The test result of compilecmp:
name old text-bytes new text-bytes delta
HelloSize 763kB ± 0% 755kB ± 0% -1.06% (p=0.000 n=20+20)
name old data-bytes new data-bytes delta
HelloSize 13.5kB ± 0% 13.5kB ± 0% ~ (all equal)
name old bss-bytes new bss-bytes delta
HelloSize 227kB ± 0% 227kB ± 0% ~ (all equal)
name old exe-bytes new exe-bytes delta
HelloSize 1.33MB ± 0% 1.33MB ± 0% -0.02% (p=0.000 n=20+20)
file before after Δ %
addr2line 3760392 3759504 -888 -0.024%
api 5361511 5295351 -66160 -1.234%
asm 5014157 4948674 -65483 -1.306%
buildid 2579949 2579485 -464 -0.018%
cgo 4492817 4491737 -1080 -0.024%
compile 23359229 23156074 -203155 -0.870%
cover 4823337 4756937 -66400 -1.377%
dist 3332850 3331794 -1056 -0.032%
doc 3902649 3836745 -65904 -1.689%
fix 3269708 3268828 -880 -0.027%
link 6510760 6443496 -67264 -1.033%
nm 3670740 3604348 -66392 -1.809%
objdump 4069599 4068967 -632 -0.016%
pack 2374824 2374208 -616 -0.026%
pprof 13874860 13805700 -69160 -0.498%
test2json 2599210 2598530 -680 -0.026%
trace 13231640 13162872 -68768 -0.520%
vet 7360899 7292267 -68632 -0.932%
total 113589131 112775517 -813614 -0.716%
Change-Id: Ie1cf277e149ddd3f352d05fa0753d0ced7e0b894
Reviewed-on: https://go-review.googlesource.com/c/go/+/444715
Run-TryBot: Eric Fang <eric.fang@arm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Damien Neil <dneil@google.com>
Diffstat (limited to 'src/cmd/internal/obj/arm64/asm7.go')
-rw-r--r-- | src/cmd/internal/obj/arm64/asm7.go | 85 |
1 file changed, 68 insertions, 17 deletions
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index 95053163c5..d364cde25b 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -37,6 +37,7 @@ import ( "log" "math" "sort" + "strings" ) // ctxt7 holds state while assembling a single function. @@ -73,7 +74,7 @@ type Optab struct { a3 uint8 a4 uint8 type_ int8 - size int8 + size_ int8 // the value of this field is not static, use the size() method to return the value param int16 flag int8 scond uint16 @@ -1021,6 +1022,27 @@ func pcAlignPadLength(pc int64, alignedValue int64, ctxt *obj.Link) int { return int(-pc & (alignedValue - 1)) } +// size returns the size of the sequence of machine instructions when p is encoded with o. +// Usually it just returns o.size directly, in some cases it checks whether the optimization +// conditions are met, and if so returns the size of the optimized instruction sequence. +// These optimizations need to be synchronized with the asmout function. +func (o *Optab) size(ctxt *obj.Link, p *obj.Prog) int { + // Optimize adrp+add+ld/st to adrp+ld/st(offset). + sz := movesize(p.As) + if sz != -1 { + // Relocations R_AARCH64_LDST{64,32,16,8}_ABS_LO12_NC can only generate 8-byte, 4-byte, + // 2-byte and 1-byte aligned addresses, so the address of load/store must be aligned. + // Also symbols with prefix of "go:string." are Go strings, which will go into + // the symbol table, their addresses are not necessarily aligned, rule this out. 
+ align := int64(1 << sz) + if o.a1 == C_ADDR && p.From.Offset%align == 0 && !strings.HasPrefix(p.From.Sym.Name, "go:string.") || + o.a4 == C_ADDR && p.To.Offset%align == 0 && !strings.HasPrefix(p.To.Sym.Name, "go:string.") { + return 8 + } + } + return int(o.size_) +} + func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { if ctxt.Retpoline { ctxt.Diag("-spectre=ret not supported on arm64") @@ -1050,7 +1072,7 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { } p.Pc = pc o = c.oplook(p) - m = int(o.size) + m = o.size(c.ctxt, p) if m == 0 { switch p.As { case obj.APCALIGN: @@ -1131,7 +1153,7 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { bflag = 1 } } - m = int(o.size) + m = o.size(c.ctxt, p) if m == 0 { switch p.As { @@ -1176,8 +1198,9 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { psz += 4 } - if int(o.size) > 4*len(out) { - log.Fatalf("out array in span7 is too small, need at least %d for %v", o.size/4, p) + sz := o.size(c.ctxt, p) + if sz > 4*len(out) { + log.Fatalf("out array in span7 is too small, need at least %d for %v", sz/4, p) } if p.As == obj.APCALIGN { alignedValue := p.From.Offset @@ -1190,7 +1213,7 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { } } else { c.asmout(p, o, out[:]) - for i = 0; i < int(o.size/4); i++ { + for i = 0; i < sz/4; i++ { c.ctxt.Arch.ByteOrder.PutUint32(bp, out[i]) bp = bp[4:] psz += 4 @@ -1238,7 +1261,7 @@ func (c *ctxt7) isRestartable(p *obj.Prog) bool { // If p doesn't use REGTMP, it can be simply preempted, so we don't // mark it. o := c.oplook(p) - return o.size > 4 && o.flag&NOTUSETMP == 0 + return o.size(c.ctxt, p) > 4 && o.flag&NOTUSETMP == 0 } /* @@ -3414,7 +3437,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { op = int32(c.opirr(p, AADD)) } - if int(o.size) == 8 { + if int(o.size(c.ctxt, p)) == 8 { // NOTE: this case does not use REGTMP. 
If it ever does, // remove the NOTUSETMP flag in optab. o1 = c.oaddi(p, op, v&0xfff000, r, rt) @@ -4460,31 +4483,43 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { o2 |= uint32(r&31) << 5 o2 |= uint32(rt & 31) - /* reloc ops */ - case 64: /* movT R,addr -> adrp + add + movT R, (REGTMP) */ + /* reloc ops */ + case 64: /* movT R,addr -> adrp + movT R, (REGTMP) */ if p.From.Reg == REGTMP { c.ctxt.Diag("cannot use REGTMP as source: %v\n", p) } o1 = ADR(1, 0, REGTMP) - o2 = c.opirr(p, AADD) | REGTMP&31<<5 | REGTMP&31 rel := obj.Addrel(c.cursym) rel.Off = int32(c.pc) rel.Siz = 8 rel.Sym = p.To.Sym rel.Add = p.To.Offset - rel.Type = objabi.R_ADDRARM64 - o3 = c.olsr12u(p, int32(c.opstr(p, p.As)), 0, REGTMP, int(p.From.Reg)) + // For unaligned access, fall back to adrp + add + movT R, (REGTMP). + if o.size(c.ctxt, p) != 8 { + o2 = c.opirr(p, AADD) | REGTMP&31<<5 | REGTMP&31 + o3 = c.olsr12u(p, int32(c.opstr(p, p.As)), 0, REGTMP, int(p.From.Reg)) + rel.Type = objabi.R_ADDRARM64 + break + } + o2 = c.olsr12u(p, int32(c.opstr(p, p.As)), 0, REGTMP, int(p.From.Reg)) + rel.Type = c.addrRelocType(p) - case 65: /* movT addr,R -> adrp + add + movT (REGTMP), R */ + case 65: /* movT addr,R -> adrp + movT (REGTMP), R */ o1 = ADR(1, 0, REGTMP) - o2 = c.opirr(p, AADD) | REGTMP&31<<5 | REGTMP&31 rel := obj.Addrel(c.cursym) rel.Off = int32(c.pc) rel.Siz = 8 rel.Sym = p.From.Sym rel.Add = p.From.Offset - rel.Type = objabi.R_ADDRARM64 - o3 = c.olsr12u(p, int32(c.opldr(p, p.As)), 0, REGTMP, int(p.To.Reg)) + // For unaligned access, fall back to adrp + add + movT (REGTMP), R. 
+ if o.size(c.ctxt, p) != 8 { + o2 = c.opirr(p, AADD) | REGTMP&31<<5 | REGTMP&31 + o3 = c.olsr12u(p, int32(c.opldr(p, p.As)), 0, REGTMP, int(p.To.Reg)) + rel.Type = objabi.R_ADDRARM64 + break + } + o2 = c.olsr12u(p, int32(c.opldr(p, p.As)), 0, REGTMP, int(p.To.Reg)) + rel.Type = c.addrRelocType(p) case 66: /* ldp O(R)!, (r1, r2); ldp (R)O!, (r1, r2) */ v := int32(c.regoff(&p.From)) @@ -5676,6 +5711,22 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { out[4] = o5 } +func (c *ctxt7) addrRelocType(p *obj.Prog) objabi.RelocType { + switch movesize(p.As) { + case 0: + return objabi.R_ARM64_PCREL_LDST8 + case 1: + return objabi.R_ARM64_PCREL_LDST16 + case 2: + return objabi.R_ARM64_PCREL_LDST32 + case 3: + return objabi.R_ARM64_PCREL_LDST64 + default: + c.ctxt.Diag("use R_ADDRARM64 relocation type for: %v\n", p) + } + return -1 +} + /* * basic Rm op Rn -> Rd (using shifted register with 0) * also op Rn -> Rt |