author    Jonathan Swinney <jswinney@amazon.com>  2020-11-04 16:18:23 +0000
committer Cherry Zhang <cherryyz@google.com>      2020-11-05 23:21:33 +0000
commit    ecc3f5112ebaf23c4b1ac4c5eedfa406d82ecc9a (patch)
tree      f73baf58f3df5fcdcec20b65e3223260f14472ea /src/cmd/compile/internal/arm64
parent    8e5778ed70ec3d371615a663520a586745fb7bee (diff)
cmd/compile: improve atomic swap intrinsics on arm64
ARMv8.1 added new instructions for atomic memory operations. This change
builds on the previous change that added support for atomic add
(0a7ac93c27c9ade79fe0f66ae0bb81484c241ae5) with similar support for the
atomic compare-and-swap, swap, or, and and intrinsics. Since the new
instructions are not guaranteed to be present, their use is guarded by a
branch on a CPU feature flag.

Performance on an ARMv8.1 machine:

name                 old time/op  new time/op  delta
CompareAndSwap-16    37.9ns ±16%  24.1ns ± 4%  -36.44%  (p=0.000 n=10+9)
CompareAndSwap64-16  38.6ns ±15%  24.1ns ± 3%  -37.47%  (p=0.000 n=10+10)

name            old time/op  new time/op  delta
Swap-16         46.9ns ±32%  12.5ns ± 6%  -73.40%  (p=0.000 n=10+10)
Swap64-16       53.4ns ± 1%  12.5ns ± 6%  -76.56%  (p=0.000 n=10+10)

name            old time/op  new time/op  delta
Or8-16          8.81ns ± 0%  5.61ns ± 0%  -36.32%  (p=0.000 n=10+10)
Or-16           7.21ns ± 0%  5.61ns ± 0%  -22.19%  (p=0.000 n=10+10)
Or8Parallel-16  59.8ns ± 3%  12.5ns ± 2%  -79.10%  (p=0.000 n=10+10)
OrParallel-16   51.7ns ± 3%  12.5ns ± 2%  -75.84%  (p=0.000 n=10+10)

name             old time/op  new time/op  delta
And8-16          8.81ns ± 0%  5.61ns ± 0%  -36.32%  (p=0.000 n=10+10)
And-16           7.21ns ± 0%  5.61ns ± 0%  -22.19%  (p=0.000 n=10+10)
And8Parallel-16  59.1ns ± 6%  12.8ns ± 3%  -78.33%  (p=0.000 n=10+10)
AndParallel-16   51.4ns ± 7%  12.8ns ± 3%  -75.03%  (p=0.000 n=10+10)

Performance on an ARMv8.0 machine (no LSE atomic instructions):

name                 old time/op  new time/op  delta
CompareAndSwap-16    61.3ns ± 0%  62.4ns ± 0%  +1.70%  (p=0.000 n=8+9)
CompareAndSwap64-16  62.0ns ± 3%  61.3ns ± 2%    ~     (p=0.093 n=10+10)

name            old time/op  new time/op  delta
Swap-16          127ns ± 2%   131ns ± 2%  +2.91%  (p=0.001 n=10+10)
Swap64-16        128ns ± 1%   131ns ± 2%  +2.43%  (p=0.001 n=10+10)

name            old time/op  new time/op  delta
Or8-16          14.9ns ± 0%  15.3ns ± 0%  +2.68%  (p=0.000 n=10+10)
Or-16           11.8ns ± 0%  12.3ns ± 0%  +4.24%  (p=0.000 n=10+10)
Or8Parallel-16   137ns ± 1%   144ns ± 1%  +4.97%  (p=0.000 n=10+10)
OrParallel-16    128ns ± 1%   136ns ± 1%  +6.34%  (p=0.000 n=10+10)

name             old time/op  new time/op  delta
And8-16          14.9ns ± 0%  15.3ns ± 0%  +2.68%  (p=0.000 n=10+10)
And-16           11.8ns ± 0%  12.3ns ± 0%  +4.24%  (p=0.000 n=10+10)
And8Parallel-16   134ns ± 2%   141ns ± 1%  +5.29%  (p=0.000 n=10+10)
AndParallel-16    125ns ± 2%   134ns ± 1%  +7.10%  (p=0.000 n=10+10)

Fixes #39304

Change-Id: Idaca68701d4751650be6b4bedca3d57f51571712
Reviewed-on: https://go-review.googlesource.com/c/go/+/234217
Run-TryBot: Emmanuel Odeke <emmanuel@orijtech.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Trust: fannie zhang <Fannie.Zhang@arm.com>
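For reference, the capability the guard keys on is also visible to user
code. A minimal sketch, assuming golang.org/x/sys/cpu for the check; the
compiler's generated guard reads the runtime-internal equivalent of this
flag, not this package:

    package main

    import (
        "fmt"
        "sync/atomic"

        "golang.org/x/sys/cpu"
    )

    func main() {
        // True on ARMv8.1+ cores that advertise the LSE extension in
        // their ID registers; false elsewhere (and on non-arm64 hosts).
        fmt.Println("LSE atomics:", cpu.ARM64.HasATOMICS)

        var v uint64
        // With this change, on arm64 this call lowers to the guarded
        // fast path: a single SWPALD when LSE is present, and the
        // LDAXR/STLXR retry loop otherwise.
        old := atomic.SwapUint64(&v, 42)
        fmt.Println(old, v) // 0 42
    }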
Diffstat (limited to 'src/cmd/compile/internal/arm64')
-rw-r--r--  src/cmd/compile/internal/arm64/ssa.go  125
1 file changed, 125 insertions(+), 0 deletions(-)
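A note on the third hunk below: LSE has no atomic-AND instruction, so the
And variants are lowered as MVN (bitwise NOT) followed by LDCLRAL (atomic
bit clear), plus a plain AND afterwards, because LDCLRAL returns the old
value while the lowered op must produce the new one. A short, runnable Go
sketch of the identity this relies on (values are illustrative):

    package main

    import "fmt"

    func main() {
        old, mask := uint32(0b1100), uint32(0b1010)

        // LDCLRAL(^mask) atomically stores old &^ ^mask; by the identity
        // x &^ ^y == x & y, that is exactly old & mask.
        stored := old &^ ^mask
        fmt.Printf("%04b %04b\n", stored, old&mask) // 1000 1000

        // LDCLRAL leaves the old value in Rout; the trailing
        // "AND Rarg1, Rout" in the generated code recomputes the new
        // value, old & mask, as the op's result.
        fmt.Printf("%04b\n", old&mask) // 1000
    }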
diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index 5c695ef84c..22b28a9308 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -581,6 +581,24 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p2.From.Reg = arm64.REGTMP
p2.To.Type = obj.TYPE_BRANCH
gc.Patch(p2, p)
+ case ssa.OpARM64LoweredAtomicExchange64Variant,
+ ssa.OpARM64LoweredAtomicExchange32Variant:
+ swap := arm64.ASWPALD
+ if v.Op == ssa.OpARM64LoweredAtomicExchange32Variant {
+ swap = arm64.ASWPALW
+ }
+ r0 := v.Args[0].Reg()
+ r1 := v.Args[1].Reg()
+ out := v.Reg0()
+
+ // SWPALD Rarg1, (Rarg0), Rout
+ p := s.Prog(swap)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = r1
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = r0
+ p.RegTo2 = out
+
case ssa.OpARM64LoweredAtomicAdd64,
ssa.OpARM64LoweredAtomicAdd32:
// LDAXR (Rarg0), Rout
@@ -687,6 +705,56 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p5.To.Type = obj.TYPE_REG
p5.To.Reg = out
gc.Patch(p2, p5)
+ case ssa.OpARM64LoweredAtomicCas64Variant,
+ ssa.OpARM64LoweredAtomicCas32Variant:
+ // Rarg0: ptr
+ // Rarg1: old
+ // Rarg2: new
+ // MOV Rarg1, Rtmp
+ // CASAL Rtmp, (Rarg0), Rarg2
+ // CMP Rarg1, Rtmp
+ // CSET EQ, Rout
+ cas := arm64.ACASALD
+ cmp := arm64.ACMP
+ mov := arm64.AMOVD
+ if v.Op == ssa.OpARM64LoweredAtomicCas32Variant {
+ cas = arm64.ACASALW
+ cmp = arm64.ACMPW
+ mov = arm64.AMOVW
+ }
+ r0 := v.Args[0].Reg()
+ r1 := v.Args[1].Reg()
+ r2 := v.Args[2].Reg()
+ out := v.Reg0()
+
+ // MOV Rarg1, Rtmp
+ p := s.Prog(mov)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = r1
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = arm64.REGTMP
+
+ // CASAL Rtmp, (Rarg0), Rarg2
+ p1 := s.Prog(cas)
+ p1.From.Type = obj.TYPE_REG
+ p1.From.Reg = arm64.REGTMP
+ p1.To.Type = obj.TYPE_MEM
+ p1.To.Reg = r0
+ p1.RegTo2 = r2
+
+ // CMP Rarg1, Rtmp
+ p2 := s.Prog(cmp)
+ p2.From.Type = obj.TYPE_REG
+ p2.From.Reg = r1
+ p2.Reg = arm64.REGTMP
+
+ // CSET EQ, Rout
+ p3 := s.Prog(arm64.ACSET)
+ p3.From.Type = obj.TYPE_REG
+ p3.From.Reg = arm64.COND_EQ
+ p3.To.Type = obj.TYPE_REG
+ p3.To.Reg = out
+
case ssa.OpARM64LoweredAtomicAnd8,
ssa.OpARM64LoweredAtomicAnd32,
ssa.OpARM64LoweredAtomicOr8,
@@ -725,6 +793,63 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p3.From.Reg = arm64.REGTMP
p3.To.Type = obj.TYPE_BRANCH
gc.Patch(p3, p)
+ case ssa.OpARM64LoweredAtomicAnd8Variant,
+ ssa.OpARM64LoweredAtomicAnd32Variant:
+ atomic_clear := arm64.ALDCLRALW
+ if v.Op == ssa.OpARM64LoweredAtomicAnd8Variant {
+ atomic_clear = arm64.ALDCLRALB
+ }
+ r0 := v.Args[0].Reg()
+ r1 := v.Args[1].Reg()
+ out := v.Reg0()
+
+ // MVN Rarg1, Rtmp
+ p := s.Prog(arm64.AMVN)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = r1
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = arm64.REGTMP
+
+ // LDCLRALW Rtmp, (Rarg0), Rout
+ p1 := s.Prog(atomic_clear)
+ p1.From.Type = obj.TYPE_REG
+ p1.From.Reg = arm64.REGTMP
+ p1.To.Type = obj.TYPE_MEM
+ p1.To.Reg = r0
+ p1.RegTo2 = out
+
+ // AND Rarg1, Rout
+ p2 := s.Prog(arm64.AAND)
+ p2.From.Type = obj.TYPE_REG
+ p2.From.Reg = r1
+ p2.To.Type = obj.TYPE_REG
+ p2.To.Reg = out
+
+ case ssa.OpARM64LoweredAtomicOr8Variant,
+ ssa.OpARM64LoweredAtomicOr32Variant:
+ atomic_or := arm64.ALDORALW
+ if v.Op == ssa.OpARM64LoweredAtomicOr8Variant {
+ atomic_or = arm64.ALDORALB
+ }
+ r0 := v.Args[0].Reg()
+ r1 := v.Args[1].Reg()
+ out := v.Reg0()
+
+ // LDORALW Rarg1, (Rarg0), Rout
+ p := s.Prog(atomic_or)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = r1
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = r0
+ p.RegTo2 = out
+
+ // ORR Rarg1, Rout
+ p2 := s.Prog(arm64.AORR)
+ p2.From.Type = obj.TYPE_REG
+ p2.From.Reg = r1
+ p2.To.Type = obj.TYPE_REG
+ p2.To.Reg = out
+
case ssa.OpARM64MOVBreg,
ssa.OpARM64MOVBUreg,
ssa.OpARM64MOVHreg,