author | Jonathan Swinney <jswinney@amazon.com> | 2020-11-04 16:18:23 +0000
committer | Cherry Zhang <cherryyz@google.com> | 2020-11-05 23:21:33 +0000
commit | ecc3f5112ebaf23c4b1ac4c5eedfa406d82ecc9a (patch)
tree | f73baf58f3df5fcdcec20b65e3223260f14472ea /src/cmd/compile/internal/arm64
parent | 8e5778ed70ec3d371615a663520a586745fb7bee (diff)
download | go-ecc3f5112ebaf23c4b1ac4c5eedfa406d82ecc9a.tar.gz, go-ecc3f5112ebaf23c4b1ac4c5eedfa406d82ecc9a.zip
cmd/compile: improve atomic swap intrinsics on arm64
ARMv8.1 added new instructions for atomic memory operations. This
change builds on the previous change, which added support for atomic
add (0a7ac93c27c9ade79fe0f66ae0bb81484c241ae5), and adds similar
support for the atomic-compare-and-swap, atomic-swap, atomic-or, and
atomic-and intrinsics. Since the new instructions are not guaranteed
to be present, their use is guarded by a branch on a CPU feature flag,
as sketched below.
Performance on an ARMv8.1 machine:
name old time/op new time/op delta
CompareAndSwap-16 37.9ns ±16% 24.1ns ± 4% -36.44% (p=0.000 n=10+9)
CompareAndSwap64-16 38.6ns ±15% 24.1ns ± 3% -37.47% (p=0.000 n=10+10)
name old time/op new time/op delta
Swap-16 46.9ns ±32% 12.5ns ± 6% -73.40% (p=0.000 n=10+10)
Swap64-16 53.4ns ± 1% 12.5ns ± 6% -76.56% (p=0.000 n=10+10)
name old time/op new time/op delta
Or8-16 8.81ns ± 0% 5.61ns ± 0% -36.32% (p=0.000 n=10+10)
Or-16 7.21ns ± 0% 5.61ns ± 0% -22.19% (p=0.000 n=10+10)
Or8Parallel-16 59.8ns ± 3% 12.5ns ± 2% -79.10% (p=0.000 n=10+10)
OrParallel-16 51.7ns ± 3% 12.5ns ± 2% -75.84% (p=0.000 n=10+10)
name old time/op new time/op delta
And8-16 8.81ns ± 0% 5.61ns ± 0% -36.32% (p=0.000 n=10+10)
And-16 7.21ns ± 0% 5.61ns ± 0% -22.19% (p=0.000 n=10+10)
And8Parallel-16 59.1ns ± 6% 12.8ns ± 3% -78.33% (p=0.000 n=10+10)
AndParallel-16 51.4ns ± 7% 12.8ns ± 3% -75.03% (p=0.000 n=10+10)
Performance on an ARMv8.0 machine (without the new atomic instructions):
name old time/op new time/op delta
CompareAndSwap-16 61.3ns ± 0% 62.4ns ± 0% +1.70% (p=0.000 n=8+9)
CompareAndSwap64-16 62.0ns ± 3% 61.3ns ± 2% ~ (p=0.093 n=10+10)
name old time/op new time/op delta
Swap-16 127ns ± 2% 131ns ± 2% +2.91% (p=0.001 n=10+10)
Swap64-16 128ns ± 1% 131ns ± 2% +2.43% (p=0.001 n=10+10)
name old time/op new time/op delta
Or8-16 14.9ns ± 0% 15.3ns ± 0% +2.68% (p=0.000 n=10+10)
Or-16 11.8ns ± 0% 12.3ns ± 0% +4.24% (p=0.000 n=10+10)
Or8Parallel-16 137ns ± 1% 144ns ± 1% +4.97% (p=0.000 n=10+10)
OrParallel-16 128ns ± 1% 136ns ± 1% +6.34% (p=0.000 n=10+10)
name old time/op new time/op delta
And8-16 14.9ns ± 0% 15.3ns ± 0% +2.68% (p=0.000 n=10+10)
And-16 11.8ns ± 0% 12.3ns ± 0% +4.24% (p=0.000 n=10+10)
And8Parallel-16 134ns ± 2% 141ns ± 1% +5.29% (p=0.000 n=10+10)
AndParallel-16 125ns ± 2% 134ns ± 1% +7.10% (p=0.000 n=10+10)
Fixes #39304
Change-Id: Idaca68701d4751650be6b4bedca3d57f51571712
Reviewed-on: https://go-review.googlesource.com/c/go/+/234217
Run-TryBot: Emmanuel Odeke <emmanuel@orijtech.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Trust: fannie zhang <Fannie.Zhang@arm.com>
Diffstat (limited to 'src/cmd/compile/internal/arm64')
-rw-r--r-- | src/cmd/compile/internal/arm64/ssa.go | 125 |
1 file changed, 125 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index 5c695ef84c..22b28a9308 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -581,6 +581,24 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
         p2.From.Reg = arm64.REGTMP
         p2.To.Type = obj.TYPE_BRANCH
         gc.Patch(p2, p)
+    case ssa.OpARM64LoweredAtomicExchange64Variant,
+        ssa.OpARM64LoweredAtomicExchange32Variant:
+        swap := arm64.ASWPALD
+        if v.Op == ssa.OpARM64LoweredAtomicExchange32Variant {
+            swap = arm64.ASWPALW
+        }
+        r0 := v.Args[0].Reg()
+        r1 := v.Args[1].Reg()
+        out := v.Reg0()
+
+        // SWPALD  Rarg1, (Rarg0), Rout
+        p := s.Prog(swap)
+        p.From.Type = obj.TYPE_REG
+        p.From.Reg = r1
+        p.To.Type = obj.TYPE_MEM
+        p.To.Reg = r0
+        p.RegTo2 = out
+
     case ssa.OpARM64LoweredAtomicAdd64,
         ssa.OpARM64LoweredAtomicAdd32:
         // LDAXR  (Rarg0), Rout
@@ -687,6 +705,56 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
         p5.To.Type = obj.TYPE_REG
         p5.To.Reg = out
         gc.Patch(p2, p5)
+    case ssa.OpARM64LoweredAtomicCas64Variant,
+        ssa.OpARM64LoweredAtomicCas32Variant:
+        // Rarg0: ptr
+        // Rarg1: old
+        // Rarg2: new
+        // MOV    Rarg1, Rtmp
+        // CASAL  Rtmp, (Rarg0), Rarg2
+        // CMP    Rarg1, Rtmp
+        // CSET   EQ, Rout
+        cas := arm64.ACASALD
+        cmp := arm64.ACMP
+        mov := arm64.AMOVD
+        if v.Op == ssa.OpARM64LoweredAtomicCas32Variant {
+            cas = arm64.ACASALW
+            cmp = arm64.ACMPW
+            mov = arm64.AMOVW
+        }
+        r0 := v.Args[0].Reg()
+        r1 := v.Args[1].Reg()
+        r2 := v.Args[2].Reg()
+        out := v.Reg0()
+
+        // MOV    Rarg1, Rtmp
+        p := s.Prog(mov)
+        p.From.Type = obj.TYPE_REG
+        p.From.Reg = r1
+        p.To.Type = obj.TYPE_REG
+        p.To.Reg = arm64.REGTMP
+
+        // CASAL  Rtmp, (Rarg0), Rarg2
+        p1 := s.Prog(cas)
+        p1.From.Type = obj.TYPE_REG
+        p1.From.Reg = arm64.REGTMP
+        p1.To.Type = obj.TYPE_MEM
+        p1.To.Reg = r0
+        p1.RegTo2 = r2
+
+        // CMP    Rarg1, Rtmp
+        p2 := s.Prog(cmp)
+        p2.From.Type = obj.TYPE_REG
+        p2.From.Reg = r1
+        p2.Reg = arm64.REGTMP
+
+        // CSET   EQ, Rout
+        p3 := s.Prog(arm64.ACSET)
+        p3.From.Type = obj.TYPE_REG
+        p3.From.Reg = arm64.COND_EQ
+        p3.To.Type = obj.TYPE_REG
+        p3.To.Reg = out
+
     case ssa.OpARM64LoweredAtomicAnd8,
         ssa.OpARM64LoweredAtomicAnd32,
         ssa.OpARM64LoweredAtomicOr8,
@@ -725,6 +793,63 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
         p3.From.Reg = arm64.REGTMP
         p3.To.Type = obj.TYPE_BRANCH
         gc.Patch(p3, p)
+    case ssa.OpARM64LoweredAtomicAnd8Variant,
+        ssa.OpARM64LoweredAtomicAnd32Variant:
+        atomic_clear := arm64.ALDCLRALW
+        if v.Op == ssa.OpARM64LoweredAtomicAnd8Variant {
+            atomic_clear = arm64.ALDCLRALB
+        }
+        r0 := v.Args[0].Reg()
+        r1 := v.Args[1].Reg()
+        out := v.Reg0()
+
+        // MVN    Rarg1, Rtmp
+        p := s.Prog(arm64.AMVN)
+        p.From.Type = obj.TYPE_REG
+        p.From.Reg = r1
+        p.To.Type = obj.TYPE_REG
+        p.To.Reg = arm64.REGTMP
+
+        // LDCLRALW  Rtmp, (Rarg0), Rout
+        p1 := s.Prog(atomic_clear)
+        p1.From.Type = obj.TYPE_REG
+        p1.From.Reg = arm64.REGTMP
+        p1.To.Type = obj.TYPE_MEM
+        p1.To.Reg = r0
+        p1.RegTo2 = out
+
+        // AND    Rarg1, Rout
+        p2 := s.Prog(arm64.AAND)
+        p2.From.Type = obj.TYPE_REG
+        p2.From.Reg = r1
+        p2.To.Type = obj.TYPE_REG
+        p2.To.Reg = out
+
+    case ssa.OpARM64LoweredAtomicOr8Variant,
+        ssa.OpARM64LoweredAtomicOr32Variant:
+        atomic_or := arm64.ALDORALW
+        if v.Op == ssa.OpARM64LoweredAtomicOr8Variant {
+            atomic_or = arm64.ALDORALB
+        }
+        r0 := v.Args[0].Reg()
+        r1 := v.Args[1].Reg()
+        out := v.Reg0()
+
+        // LDORALW  Rarg1, (Rarg0), Rout
+        p := s.Prog(atomic_or)
+        p.From.Type = obj.TYPE_REG
+        p.From.Reg = r1
+        p.To.Type = obj.TYPE_MEM
+        p.To.Reg = r0
+        p.RegTo2 = out
+
+        // ORR    Rarg1, Rout
+        p2 := s.Prog(arm64.AORR)
+        p2.From.Type = obj.TYPE_REG
+        p2.From.Reg = r1
+        p2.To.Type = obj.TYPE_REG
+        p2.To.Reg = out
+
     case ssa.OpARM64MOVBreg,
         ssa.OpARM64MOVBUreg,
         ssa.OpARM64MOVHreg,
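A note on the And lowering above: LSE provides no atomic AND, only LDCLR
(an atomic bit-clear, *p = *p &^ mask), so the code first complements the
operand with MVN and then issues LDCLRAL, relying on the identity
x & y == x &^ ^y. A small stand-alone check of that identity (ordinary
non-atomic Go, purely illustrative):

```go
package main

import "fmt"

func main() {
	// LDCLR atomically clears the bits given in mask: *p = *p &^ mask.
	// With mask = ^y it clears every bit not set in y, i.e. *p = *p & y.
	x, y := uint32(0b1100), uint32(0b1010)
	fmt.Printf("x&y = %04b, x&^(^y) = %04b\n", x&y, x&^(^y)) // both 1000
}
```

The trailing AND/ORR after the LD* instruction is needed because LDCLRAL
and LDORAL return the old memory contents, while these lowered ops must
leave the new value in Rout.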