diff options
author | Michael Munday <mike.munday@ibm.com> | 2019-10-23 06:43:23 -0700 |
---|---|---|
committer | Brad Fitzpatrick <bradfitz@golang.org> | 2019-11-11 15:23:59 +0000 |
commit | b3885dbc93ceae1b12f7e80edd2696baf566edec (patch) | |
tree | c7f9c66dbe89b891b512347b0e56cf69d253a387 /src/cmd/compile/internal/s390x | |
parent | 75c839af22a50cb027766ea54335e234dac32836 (diff) | |
download | go-b3885dbc93ceae1b12f7e80edd2696baf566edec.tar.gz go-b3885dbc93ceae1b12f7e80edd2696baf566edec.zip |
cmd/compile, runtime: intrinsify atomic And8 and Or8 on s390x
Intrinsify these functions to match other platforms. Update the
sequence of instructions used in the assembly implementations to
match the intrinsics.
Also, add a micro benchmark so we can more easily measure the
performance of these two functions:
name old time/op new time/op delta
And8-8 5.33ns ± 7% 2.55ns ± 8% -52.12% (p=0.000 n=20+20)
And8Parallel-8 7.39ns ± 5% 3.74ns ± 4% -49.34% (p=0.000 n=20+20)
Or8-8 4.84ns ±15% 2.64ns ±11% -45.50% (p=0.000 n=20+20)
Or8Parallel-8 7.27ns ± 3% 3.84ns ± 4% -47.10% (p=0.000 n=19+20)
By using a 'rotate then xor selected bits' instruction combined with
either a 'load and and' or a 'load and or' instruction we can
implement And8 and Or8 with far fewer instructions. Replacing
'compare and swap' with atomic instructions may also improve
performance when there is contention.
Change-Id: I28bb8032052b73ae8ccdf6e4c612d2877085fa01
Reviewed-on: https://go-review.googlesource.com/c/go/+/204277
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/cmd/compile/internal/s390x')
-rw-r--r-- | src/cmd/compile/internal/s390x/ssa.go | 34 |
1 file changed, 34 insertions(+), 0 deletions(-)
diff --git a/src/cmd/compile/internal/s390x/ssa.go b/src/cmd/compile/internal/s390x/ssa.go
index 885c14b33a..f1725bdda4 100644
--- a/src/cmd/compile/internal/s390x/ssa.go
+++ b/src/cmd/compile/internal/s390x/ssa.go
@@ -173,6 +173,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		if r != r1 {
 			p.Reg = r1
 		}
+	case ssa.OpS390XRXSBG:
+		r1 := v.Reg()
+		if r1 != v.Args[0].Reg() {
+			v.Fatalf("input[0] and output not in same register %s", v.LongString())
+		}
+		r2 := v.Args[1].Reg()
+		i := v.Aux.(s390x.RotateParams)
+		p := s.Prog(v.Op.Asm())
+		p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: int64(i.Start)}
+		p.RestArgs = []obj.Addr{
+			{Type: obj.TYPE_CONST, Offset: int64(i.End)},
+			{Type: obj.TYPE_CONST, Offset: int64(i.Amount)},
+			{Type: obj.TYPE_REG, Reg: r2},
+		}
+		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: r1}
 	case ssa.OpS390XADD, ssa.OpS390XADDW,
 		ssa.OpS390XSUB, ssa.OpS390XSUBW,
 		ssa.OpS390XAND, ssa.OpS390XANDW,
@@ -736,6 +751,25 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Type = obj.TYPE_MEM
 		p.To.Reg = v.Args[0].Reg()
 		gc.AddAux(&p.To, v)
+	case ssa.OpS390XLANfloor, ssa.OpS390XLAOfloor:
+		r := v.Args[0].Reg() // clobbered, assumed R1 in comments
+
+		// Round ptr down to nearest multiple of 4.
+		// ANDW $~3, R1
+		ptr := s.Prog(s390x.AANDW)
+		ptr.From.Type = obj.TYPE_CONST
+		ptr.From.Offset = 0xfffffffc
+		ptr.To.Type = obj.TYPE_REG
+		ptr.To.Reg = r
+
+		// Redirect output of LA(N|O) into R1 since it is clobbered anyway.
+		// LA(N|O) Rx, R1, 0(R1)
+		op := s.Prog(v.Op.Asm())
+		op.From.Type = obj.TYPE_REG
+		op.From.Reg = v.Args[1].Reg()
+		op.Reg = r
+		op.To.Type = obj.TYPE_MEM
+		op.To.Reg = r
 	case ssa.OpS390XLAA, ssa.OpS390XLAAG:
 		p := s.Prog(v.Op.Asm())
 		p.Reg = v.Reg0()