diff options
author | Joel Sing <joel@sing.id.au> | 2023-08-02 20:29:52 +1000 |
---|---|---|
committer | Joel Sing <joel@sing.id.au> | 2023-08-28 17:40:40 +0000 |
commit | 19e2e3c291e28d337f155f5f01afa21910431752 (patch) | |
tree | 9ad70c49a65908aee1687d000fbf397d4056bb60 | |
parent | b2e809bab59a692aa6a69e1bd1d32eeeab4622e3 (diff) | |
download | go-19e2e3c291e28d337f155f5f01afa21910431752.tar.gz go-19e2e3c291e28d337f155f5f01afa21910431752.zip |
cmd/internal/obj/arm64: avoid unnecessary pool literal usage for load/store pairs
Implement better classification for load and store pair operations. This in
turn allows us to avoid using pool literals when the offset fits in a 24-bit
unsigned immediate. In this case, the offset can be calculated using two
add immediate instructions, rather than loading the offset from the pool
literal and then adding the offset to the base register. This requires the
same number of instructions, but it avoids a load from memory and does
not require the offset to be stored in the literal pool.
Updates #59615
Change-Id: I316ec3d54f1d06ae9d930e98d0c32471775fcb26
Reviewed-on: https://go-review.googlesource.com/c/go/+/515615
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joedian Reid <joedian@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
-rw-r--r-- | src/cmd/internal/obj/arm64/asm7.go | 145 |
1 files changed, 130 insertions, 15 deletions
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index 2e5d84f647..84c6b4d5d4 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -710,7 +710,8 @@ var optab = []Optab{ {AFLDPQ, C_PQAUTO_16, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, REGSP, 0, 0}, {AFLDPQ, C_UAUTO4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, REGSP, 0, 0}, {AFLDPQ, C_NAUTO4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, REGSP, 0, 0}, - {AFLDPQ, C_LAUTO, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, REGSP, LFROM, 0}, + {AFLDPQ, C_LAUTO, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, REGSP, 0, 0}, + {AFLDPQ, C_LAUTOPOOL, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, REGSP, LFROM, 0}, {AFLDPQ, C_NQOREG_16, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, 0}, {AFLDPQ, C_NQOREG_16, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, C_XPRE}, {AFLDPQ, C_NQOREG_16, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, C_XPOST}, @@ -719,14 +720,16 @@ var optab = []Optab{ {AFLDPQ, C_PQOREG_16, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, C_XPOST}, {AFLDPQ, C_UOREG4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, 0, 0, 0}, {AFLDPQ, C_NOREG4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, 0, 0, 0}, - {AFLDPQ, C_LOREG, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, 0, LFROM, 0}, + {AFLDPQ, C_LOREG, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, 0, 0, 0}, + {AFLDPQ, C_LOREGPOOL, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, 0, LFROM, 0}, {AFLDPQ, C_ADDR, C_NONE, C_NONE, C_PAIR, C_NONE, 88, 12, 0, 0, 0}, {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_NQAUTO_16, C_NONE, 67, 4, REGSP, 0, 0}, {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_PQAUTO_16, C_NONE, 67, 4, REGSP, 0, 0}, {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_UAUTO4K, C_NONE, 76, 8, REGSP, 0, 0}, {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_NAUTO4K, C_NONE, 76, 8, REGSP, 0, 0}, - {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_LAUTO, C_NONE, 77, 12, REGSP, LTO, 0}, + {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_LAUTO, C_NONE, 77, 12, REGSP, 0, 0}, + {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_LAUTOPOOL, C_NONE, 77, 12, REGSP, LTO, 0}, 
{AFSTPQ, C_PAIR, C_NONE, C_NONE, C_NQOREG_16, C_NONE, 67, 4, 0, 0, 0}, {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_NQOREG_16, C_NONE, 67, 4, 0, 0, C_XPRE}, {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_NQOREG_16, C_NONE, 67, 4, 0, 0, C_XPOST}, @@ -735,14 +738,16 @@ var optab = []Optab{ {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_PQOREG_16, C_NONE, 67, 4, 0, 0, C_XPOST}, {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_UOREG4K, C_NONE, 76, 8, 0, 0, 0}, {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_NOREG4K, C_NONE, 76, 8, 0, 0, 0}, - {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_LOREG, C_NONE, 77, 12, 0, LTO, 0}, + {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_LOREG, C_NONE, 77, 12, 0, 0, 0}, + {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_LOREGPOOL, C_NONE, 77, 12, 0, LTO, 0}, {AFSTPQ, C_PAIR, C_NONE, C_NONE, C_ADDR, C_NONE, 87, 12, 0, 0, 0}, {ALDP, C_NPAUTO, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, REGSP, 0, 0}, {ALDP, C_PPAUTO, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, REGSP, 0, 0}, {ALDP, C_UAUTO4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, REGSP, 0, 0}, {ALDP, C_NAUTO4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, REGSP, 0, 0}, - {ALDP, C_LAUTO, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, REGSP, LFROM, 0}, + {ALDP, C_LAUTO, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, REGSP, 0, 0}, + {ALDP, C_LAUTOPOOL, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, REGSP, LFROM, 0}, {ALDP, C_NPOREG, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, 0}, {ALDP, C_NPOREG, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, C_XPRE}, {ALDP, C_NPOREG, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, C_XPOST}, @@ -751,14 +756,16 @@ var optab = []Optab{ {ALDP, C_PPOREG, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, C_XPOST}, {ALDP, C_UOREG4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, 0, 0, 0}, {ALDP, C_NOREG4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, 0, 0, 0}, - {ALDP, C_LOREG, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, 0, LFROM, 0}, + {ALDP, C_LOREG, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, 0, 0, 0}, + {ALDP, C_LOREGPOOL, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, 0, LFROM, 0}, {ALDP, C_ADDR, C_NONE, C_NONE, C_PAIR, C_NONE, 
88, 12, 0, 0, 0}, {ASTP, C_PAIR, C_NONE, C_NONE, C_NPAUTO, C_NONE, 67, 4, REGSP, 0, 0}, {ASTP, C_PAIR, C_NONE, C_NONE, C_PPAUTO, C_NONE, 67, 4, REGSP, 0, 0}, {ASTP, C_PAIR, C_NONE, C_NONE, C_UAUTO4K, C_NONE, 76, 8, REGSP, 0, 0}, {ASTP, C_PAIR, C_NONE, C_NONE, C_NAUTO4K, C_NONE, 76, 8, REGSP, 0, 0}, - {ASTP, C_PAIR, C_NONE, C_NONE, C_LAUTO, C_NONE, 77, 12, REGSP, LTO, 0}, + {ASTP, C_PAIR, C_NONE, C_NONE, C_LAUTO, C_NONE, 77, 12, REGSP, 0, 0}, + {ASTP, C_PAIR, C_NONE, C_NONE, C_LAUTOPOOL, C_NONE, 77, 12, REGSP, LTO, 0}, {ASTP, C_PAIR, C_NONE, C_NONE, C_NPOREG, C_NONE, 67, 4, 0, 0, 0}, {ASTP, C_PAIR, C_NONE, C_NONE, C_NPOREG, C_NONE, 67, 4, 0, 0, C_XPRE}, {ASTP, C_PAIR, C_NONE, C_NONE, C_NPOREG, C_NONE, 67, 4, 0, 0, C_XPOST}, @@ -767,7 +774,8 @@ var optab = []Optab{ {ASTP, C_PAIR, C_NONE, C_NONE, C_PPOREG, C_NONE, 67, 4, 0, 0, C_XPOST}, {ASTP, C_PAIR, C_NONE, C_NONE, C_UOREG4K, C_NONE, 76, 8, 0, 0, 0}, {ASTP, C_PAIR, C_NONE, C_NONE, C_NOREG4K, C_NONE, 76, 8, 0, 0, 0}, - {ASTP, C_PAIR, C_NONE, C_NONE, C_LOREG, C_NONE, 77, 12, 0, LTO, 0}, + {ASTP, C_PAIR, C_NONE, C_NONE, C_LOREG, C_NONE, 77, 12, 0, 0, 0}, + {ASTP, C_PAIR, C_NONE, C_NONE, C_LOREGPOOL, C_NONE, 77, 12, 0, LTO, 0}, {ASTP, C_PAIR, C_NONE, C_NONE, C_ADDR, C_NONE, 87, 12, 0, 0, 0}, // differ from LDP/STP for C_NSAUTO_4/C_PSAUTO_4/C_NSOREG_4/C_PSOREG_4 @@ -775,7 +783,8 @@ var optab = []Optab{ {ALDPW, C_PSAUTO_4, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, REGSP, 0, 0}, {ALDPW, C_UAUTO4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, REGSP, 0, 0}, {ALDPW, C_NAUTO4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, REGSP, 0, 0}, - {ALDPW, C_LAUTO, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, REGSP, LFROM, 0}, + {ALDPW, C_LAUTO, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, REGSP, 0, 0}, + {ALDPW, C_LAUTOPOOL, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, REGSP, LFROM, 0}, {ALDPW, C_NSOREG_4, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, 0}, {ALDPW, C_NSOREG_4, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, C_XPRE}, {ALDPW, C_NSOREG_4, C_NONE, C_NONE, 
C_PAIR, C_NONE, 66, 4, 0, 0, C_XPOST}, @@ -784,14 +793,16 @@ var optab = []Optab{ {ALDPW, C_PSOREG_4, C_NONE, C_NONE, C_PAIR, C_NONE, 66, 4, 0, 0, C_XPOST}, {ALDPW, C_UOREG4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, 0, 0, 0}, {ALDPW, C_NOREG4K, C_NONE, C_NONE, C_PAIR, C_NONE, 74, 8, 0, 0, 0}, - {ALDPW, C_LOREG, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, 0, LFROM, 0}, + {ALDPW, C_LOREG, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, 0, 0, 0}, + {ALDPW, C_LOREGPOOL, C_NONE, C_NONE, C_PAIR, C_NONE, 75, 12, 0, LFROM, 0}, {ALDPW, C_ADDR, C_NONE, C_NONE, C_PAIR, C_NONE, 88, 12, 0, 0, 0}, {ASTPW, C_PAIR, C_NONE, C_NONE, C_NSAUTO_4, C_NONE, 67, 4, REGSP, 0, 0}, {ASTPW, C_PAIR, C_NONE, C_NONE, C_PSAUTO_4, C_NONE, 67, 4, REGSP, 0, 0}, {ASTPW, C_PAIR, C_NONE, C_NONE, C_UAUTO4K, C_NONE, 76, 8, REGSP, 0, 0}, {ASTPW, C_PAIR, C_NONE, C_NONE, C_NAUTO4K, C_NONE, 76, 8, REGSP, 0, 0}, - {ASTPW, C_PAIR, C_NONE, C_NONE, C_LAUTO, C_NONE, 77, 12, REGSP, LTO, 0}, + {ASTPW, C_PAIR, C_NONE, C_NONE, C_LAUTO, C_NONE, 77, 12, REGSP, 0, 0}, + {ASTPW, C_PAIR, C_NONE, C_NONE, C_LAUTOPOOL, C_NONE, 77, 12, REGSP, LTO, 0}, {ASTPW, C_PAIR, C_NONE, C_NONE, C_NSOREG_4, C_NONE, 67, 4, 0, 0, 0}, {ASTPW, C_PAIR, C_NONE, C_NONE, C_NSOREG_4, C_NONE, 67, 4, 0, 0, C_XPRE}, {ASTPW, C_PAIR, C_NONE, C_NONE, C_NSOREG_4, C_NONE, 67, 4, 0, 0, C_XPOST}, @@ -800,7 +811,8 @@ var optab = []Optab{ {ASTPW, C_PAIR, C_NONE, C_NONE, C_PSOREG_4, C_NONE, 67, 4, 0, 0, C_XPOST}, {ASTPW, C_PAIR, C_NONE, C_NONE, C_UOREG4K, C_NONE, 76, 8, 0, 0, 0}, {ASTPW, C_PAIR, C_NONE, C_NONE, C_NOREG4K, C_NONE, 76, 8, 0, 0, 0}, - {ASTPW, C_PAIR, C_NONE, C_NONE, C_LOREG, C_NONE, 77, 12, 0, LTO, 0}, + {ASTPW, C_PAIR, C_NONE, C_NONE, C_LOREG, C_NONE, 77, 12, 0, 0, 0}, + {ASTPW, C_PAIR, C_NONE, C_NONE, C_LOREGPOOL, C_NONE, 77, 12, 0, LTO, 0}, {ASTPW, C_PAIR, C_NONE, C_NONE, C_ADDR, C_NONE, 87, 12, 0, 0, 0}, {ASWPD, C_ZREG, C_NONE, C_NONE, C_ZOREG, C_ZREG, 47, 4, 0, 0, 0}, @@ -1479,6 +1491,14 @@ func isNEGop(op obj.As) bool { return false } +func 
isLoadStorePairOp(op obj.As) bool { + switch op { + case AFLDPQ, AFSTPQ, ALDP, ASTP, ALDPW, ASTPW: + return true + } + return false +} + func isMOVop(op obj.As) bool { switch op { case AMOVB, AMOVBU, AMOVH, AMOVHU, AMOVW, AMOVWU, AMOVD, AFMOVS, AFMOVD, AFMOVQ: @@ -1984,6 +2004,33 @@ func (c *ctxt7) loadStoreClass(p *obj.Prog, lsc int, v int64) int { return lsc } +// loadStorePairClass reclassifies a load or store pair operation based on its offset. +func (c *ctxt7) loadStorePairClass(p *obj.Prog, lsc int, v int64) int { + // Avoid reclassification of pre/post-indexed loads and stores. + if p.Scond == C_XPRE || p.Scond == C_XPOST { + return lsc + } + + if cmp(C_NAUTO4K, lsc) || cmp(C_NOREG4K, lsc) { + return lsc + } + if cmp(C_UAUTO4K, lsc) || cmp(C_UOREG4K, lsc) { + return lsc + } + + needsPool := true + if v >= 0 && v <= 0xffffff { + needsPool = false + } + if needsPool && cmp(C_LAUTO, lsc) { + return C_LAUTOPOOL + } + if needsPool && cmp(C_LOREG, lsc) { + return C_LOREGPOOL + } + return lsc +} + func (c *ctxt7) aclass(a *obj.Addr) int { switch a.Type { case obj.TYPE_NONE: @@ -2212,6 +2259,10 @@ func (c *ctxt7) oplook(p *obj.Prog) *Optab { // More specific classification of large offset loads and stores. a1 = c.loadStoreClass(p, a1, c.instoffset) } + if isLoadStorePairOp(p.As) && (cmp(C_LAUTO, a1) || cmp(C_LOREG, a1)) { + // More specific classification of large offset loads and stores. + a1 = c.loadStorePairClass(p, a1, c.instoffset) + } } p.From.Class = int8(a1) } @@ -2238,6 +2289,10 @@ func (c *ctxt7) oplook(p *obj.Prog) *Optab { // More specific classification of large offset loads and stores. a4 = c.loadStoreClass(p, a4, c.instoffset) } + if isLoadStorePairOp(p.As) && (cmp(C_LAUTO, a4) || cmp(C_LOREG, a4)) { + // More specific classification of large offset loads and stores. 
+ a4 = c.loadStorePairClass(p, a4, c.instoffset) + } } p.To.Class = int8(a4) } @@ -4035,12 +4090,12 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { break storeusepool: - if r == REGTMP || p.From.Reg == REGTMP { - c.ctxt.Diag("REGTMP used in large offset store: %v", p) - } if p.Pool == nil { c.ctxt.Diag("%v: constant is not in pool", p) } + if r == REGTMP || p.From.Reg == REGTMP { + c.ctxt.Diag("REGTMP used in large offset store: %v", p) + } o1 = c.omovlit(AMOVD, p, &p.To, REGTMP) o2 = c.olsxrr(p, int32(c.opstrr(p, p.As, false)), int(p.From.Reg), int(r), REGTMP) @@ -4831,6 +4886,11 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { o2 = c.opldpstp(p, o, 0, REGTMP, rt1, rt2, 1) case 75: + // If offset L fits in a 24 bit unsigned immediate: + // add $lo, R, Rtmp + // add $hi, Rtmp, Rtmp + // ldp (Rtmp), (R1, R2) + // Otherwise, use constant pool: // mov $L, Rtmp (from constant pool) // add Rtmp, R, Rtmp // ldp (Rtmp), (R1, R2) @@ -4844,6 +4904,31 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { if rf == obj.REG_NONE { c.ctxt.Diag("invalid ldp source: %v", p) } + + v := c.regoff(&p.From) + if v >= -4095 && v <= 4095 { + c.ctxt.Diag("%v: bad type for offset %d (should be add/sub+ldp)", p, v) + } + + hi, lo, err := splitImm24uScaled(v, 0) + if err != nil { + goto loadpairusepool + } + if p.Pool != nil { + c.ctxt.Diag("%v: unused constant in pool (%v)\n", p, v) + } + o1 = c.oaddi(p, AADD, lo, REGTMP, int16(rf)) + o2 = c.oaddi(p, AADD, hi, REGTMP, REGTMP) + o3 = c.opldpstp(p, o, 0, REGTMP, rt1, rt2, 1) + break + + loadpairusepool: + if p.Pool == nil { + c.ctxt.Diag("%v: constant is not in pool", p) + } + if rf == REGTMP || p.From.Reg == REGTMP { + c.ctxt.Diag("REGTMP used in large offset load: %v", p) + } o1 = c.omovlit(AMOVD, p, &p.From, REGTMP) o2 = c.opxrrr(p, AADD, REGTMP, rf, REGTMP, false) o3 = c.opldpstp(p, o, 0, REGTMP, rt1, rt2, 1) @@ -4866,6 +4951,11 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { o2 = 
c.opldpstp(p, o, 0, REGTMP, rf1, rf2, 0) case 77: + // If offset L fits in a 24 bit unsigned immediate: + // add $lo, R, Rtmp + // add $hi, Rtmp, Rtmp + // stp (R1, R2), (Rtmp) + // Otherwise, use constant pool: // mov $L, Rtmp (from constant pool) // add Rtmp, R, Rtmp // stp (R1, R2), (Rtmp) @@ -4879,6 +4969,31 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { if rt == obj.REG_NONE { c.ctxt.Diag("invalid stp destination: %v", p) } + + v := c.regoff(&p.To) + if v >= -4095 && v <= 4095 { + c.ctxt.Diag("%v: bad type for offset %d (should be add/sub+stp)", p, v) + } + + hi, lo, err := splitImm24uScaled(v, 0) + if err != nil { + goto storepairusepool + } + if p.Pool != nil { + c.ctxt.Diag("%v: unused constant in pool (%v)\n", p, v) + } + o1 = c.oaddi(p, AADD, lo, REGTMP, int16(rt)) + o2 = c.oaddi(p, AADD, hi, REGTMP, REGTMP) + o3 = c.opldpstp(p, o, 0, REGTMP, rf1, rf2, 0) + break + + storepairusepool: + if p.Pool == nil { + c.ctxt.Diag("%v: constant is not in pool", p) + } + if rt == REGTMP || p.From.Reg == REGTMP { + c.ctxt.Diag("REGTMP used in large offset store: %v", p) + } o1 = c.omovlit(AMOVD, p, &p.To, REGTMP) o2 = c.opxrrr(p, AADD, REGTMP, rt, REGTMP, false) o3 = c.opldpstp(p, o, 0, REGTMP, rf1, rf2, 0) |