path: root/src/cmd/compile/internal/s390x
author    Michael Munday <mike.munday@ibm.com>    2019-10-23 06:43:23 -0700
committer Brad Fitzpatrick <bradfitz@golang.org>  2019-11-11 15:23:59 +0000
commit    b3885dbc93ceae1b12f7e80edd2696baf566edec (patch)
tree      c7f9c66dbe89b891b512347b0e56cf69d253a387 /src/cmd/compile/internal/s390x
parent    75c839af22a50cb027766ea54335e234dac32836 (diff)
download  go-b3885dbc93ceae1b12f7e80edd2696baf566edec.tar.gz
          go-b3885dbc93ceae1b12f7e80edd2696baf566edec.zip
cmd/compile, runtime: intrinsify atomic And8 and Or8 on s390x
Intrinsify these functions to match other platforms. Update the
sequence of instructions used in the assembly implementations to
match the intrinsics.

Also, add a micro benchmark so we can more easily measure the
performance of these two functions:

name            old time/op  new time/op  delta
And8-8          5.33ns ± 7%  2.55ns ± 8%  -52.12%  (p=0.000 n=20+20)
And8Parallel-8  7.39ns ± 5%  3.74ns ± 4%  -49.34%  (p=0.000 n=20+20)
Or8-8           4.84ns ±15%  2.64ns ±11%  -45.50%  (p=0.000 n=20+20)
Or8Parallel-8   7.27ns ± 3%  3.84ns ± 4%  -47.10%  (p=0.000 n=19+20)

By using a 'rotate then xor selected bits' instruction combined with
either a 'load and and' or a 'load and or' instruction we can
implement And8 and Or8 with far fewer instructions. Replacing
'compare and swap' with atomic instructions may also improve
performance when there is contention.

Change-Id: I28bb8032052b73ae8ccdf6e4c612d2877085fa01
Reviewed-on: https://go-review.googlesource.com/c/go/+/204277
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
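
The message above mentions adding a micro benchmark for And8 and Or8. Below is a minimal sketch of such a benchmark, assuming the runtime-internal atomic package that declares these functions (it is only importable from within the runtime tree); the file, package, and variable names are illustrative, not necessarily those added by this CL.

// bench_test.go (sketch); names here are illustrative.
package atomic_test

import (
	"testing"

	"runtime/internal/atomic" // only importable from within the runtime tree
)

var x [512]uint8 // keep the target byte away from unrelated data

func BenchmarkAnd8(b *testing.B) {
	for i := 0; i < b.N; i++ {
		atomic.And8(&x[255], uint8(i))
	}
}

func BenchmarkAnd8Parallel(b *testing.B) {
	b.RunParallel(func(pb *testing.PB) {
		i := uint8(0)
		for pb.Next() {
			atomic.And8(&x[255], i)
			i++
		}
	})
}

// BenchmarkOr8 and BenchmarkOr8Parallel follow the same pattern with atomic.Or8.
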
Diffstat (limited to 'src/cmd/compile/internal/s390x')
-rw-r--r--  src/cmd/compile/internal/s390x/ssa.go | 34
1 file changed, 34 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/s390x/ssa.go b/src/cmd/compile/internal/s390x/ssa.go
index 885c14b33a..f1725bdda4 100644
--- a/src/cmd/compile/internal/s390x/ssa.go
+++ b/src/cmd/compile/internal/s390x/ssa.go
@@ -173,6 +173,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		if r != r1 {
 			p.Reg = r1
 		}
+	case ssa.OpS390XRXSBG:
+		r1 := v.Reg()
+		if r1 != v.Args[0].Reg() {
+			v.Fatalf("input[0] and output not in same register %s", v.LongString())
+		}
+		r2 := v.Args[1].Reg()
+		i := v.Aux.(s390x.RotateParams)
+		p := s.Prog(v.Op.Asm())
+		p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: int64(i.Start)}
+		p.RestArgs = []obj.Addr{
+			{Type: obj.TYPE_CONST, Offset: int64(i.End)},
+			{Type: obj.TYPE_CONST, Offset: int64(i.Amount)},
+			{Type: obj.TYPE_REG, Reg: r2},
+		}
+		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: r1}
 	case ssa.OpS390XADD, ssa.OpS390XADDW,
 		ssa.OpS390XSUB, ssa.OpS390XSUBW,
 		ssa.OpS390XAND, ssa.OpS390XANDW,
@@ -736,6 +751,25 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Type = obj.TYPE_MEM
 		p.To.Reg = v.Args[0].Reg()
 		gc.AddAux(&p.To, v)
+	case ssa.OpS390XLANfloor, ssa.OpS390XLAOfloor:
+		r := v.Args[0].Reg() // clobbered, assumed R1 in comments
+
+		// Round ptr down to nearest multiple of 4.
+		// ANDW $~3, R1
+		ptr := s.Prog(s390x.AANDW)
+		ptr.From.Type = obj.TYPE_CONST
+		ptr.From.Offset = 0xfffffffc
+		ptr.To.Type = obj.TYPE_REG
+		ptr.To.Reg = r
+
+		// Redirect output of LA(N|O) into R1 since it is clobbered anyway.
+		// LA(N|O) Rx, R1, 0(R1)
+		op := s.Prog(v.Op.Asm())
+		op.From.Type = obj.TYPE_REG
+		op.From.Reg = v.Args[1].Reg()
+		op.Reg = r
+		op.To.Type = obj.TYPE_MEM
+		op.To.Reg = r
 	case ssa.OpS390XLAA, ssa.OpS390XLAAG:
 		p := s.Prog(v.Op.Asm())
 		p.Reg = v.Reg0()
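
The OpS390XLANfloor/OpS390XLAOfloor case above first rounds the pointer down to the nearest multiple of 4 so that the word-sized 'load and and'/'load and or' instruction covers the target byte. Below is a non-atomic Go sketch of that address arithmetic for Or8, assuming big-endian byte order as on s390x; or8Model is a hypothetical name, and the real sequence positions val with the 'rotate then xor selected bits' instruction described in the commit message and performs the OR atomically.

package main

import (
	"fmt"
	"unsafe"
)

// or8Model mirrors the effect of the emitted sequence without the atomicity:
// clear the low two address bits to find the containing 4-byte word ("floor"),
// shift the byte value into its lane within that word, then OR it in.
// The lane calculation is only correct on a big-endian machine such as s390x.
func or8Model(ptr *uint8, val uint8) {
	// Pointer arithmetic is kept in a single expression per the unsafe rules.
	word := (*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(ptr)) &^ 3))
	// On big-endian, byte i of a 4-byte word occupies bits (3-i)*8 and up.
	shift := (3 - uintptr(unsafe.Pointer(ptr))&3) * 8
	*word |= uint32(val) << shift // the real code performs this OR atomically (LAO)
}

func main() {
	var w uint32 // 4-byte aligned backing word
	b := (*[4]uint8)(unsafe.Pointer(&w))
	or8Model(&b[2], 0x81)
	fmt.Printf("%#x\n", w) // on big-endian s390x: 0x8100, i.e. byte 2 of the word is 0x81
}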