aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/compile/internal/ssa/opGen.go
diff options
context:
space:
mode:
authorMichael Munday <mike.munday@ibm.com>2019-10-23 06:43:23 -0700
committerBrad Fitzpatrick <bradfitz@golang.org>2019-11-11 15:23:59 +0000
commitb3885dbc93ceae1b12f7e80edd2696baf566edec (patch)
treec7f9c66dbe89b891b512347b0e56cf69d253a387 /src/cmd/compile/internal/ssa/opGen.go
parent75c839af22a50cb027766ea54335e234dac32836 (diff)
downloadgo-b3885dbc93ceae1b12f7e80edd2696baf566edec.tar.gz
go-b3885dbc93ceae1b12f7e80edd2696baf566edec.zip
cmd/compile, runtime: intrinsify atomic And8 and Or8 on s390x
Intrinsify these functions to match other platforms. Update the sequence of instructions used in the assembly implementations to match the intrinsics. Also, add a micro benchmark so we can more easily measure the performance of these two functions: name old time/op new time/op delta And8-8 5.33ns ± 7% 2.55ns ± 8% -52.12% (p=0.000 n=20+20) And8Parallel-8 7.39ns ± 5% 3.74ns ± 4% -49.34% (p=0.000 n=20+20) Or8-8 4.84ns ±15% 2.64ns ±11% -45.50% (p=0.000 n=20+20) Or8Parallel-8 7.27ns ± 3% 3.84ns ± 4% -47.10% (p=0.000 n=19+20) By using a 'rotate then xor selected bits' instruction combined with either a 'load and and' or a 'load and or' instruction we can implement And8 and Or8 with far fewer instructions. Replacing 'compare and swap' with atomic instructions may also improve performance when there is contention. Change-Id: I28bb8032052b73ae8ccdf6e4c612d2877085fa01 Reviewed-on: https://go-review.googlesource.com/c/go/+/204277 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/cmd/compile/internal/ssa/opGen.go')
-rw-r--r--src/cmd/compile/internal/ssa/opGen.go48
1 files changed, 48 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 047a2a5573..a5951dd4e1 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1982,6 +1982,7 @@ const (
OpS390XRLL
OpS390XRLLGconst
OpS390XRLLconst
+ OpS390XRXSBG
OpS390XNEG
OpS390XNEGW
OpS390XNOT
@@ -2081,6 +2082,8 @@ const (
OpS390XLAAG
OpS390XAddTupleFirst32
OpS390XAddTupleFirst64
+ OpS390XLAOfloor
+ OpS390XLANfloor
OpS390XLoweredAtomicCas32
OpS390XLoweredAtomicCas64
OpS390XLoweredAtomicExchange32
@@ -26502,6 +26505,23 @@ var opcodeTable = [...]opInfo{
},
},
{
+ name: "RXSBG",
+ auxType: auxArchSpecific,
+ argLen: 2,
+ resultInArg0: true,
+ clobberFlags: true,
+ asm: s390x.ARXSBG,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
+ {1, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
+ },
+ outputs: []outputInfo{
+ {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
+ },
+ },
+ },
+ {
name: "NEG",
argLen: 1,
clobberFlags: true,
@@ -27843,6 +27863,34 @@ var opcodeTable = [...]opInfo{
reg: regInfo{},
},
{
+ name: "LAOfloor",
+ argLen: 3,
+ clobberFlags: true,
+ hasSideEffects: true,
+ asm: s390x.ALAO,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2}, // R1
+ {1, 56319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 SP
+ },
+ clobbers: 2, // R1
+ },
+ },
+ {
+ name: "LANfloor",
+ argLen: 3,
+ clobberFlags: true,
+ hasSideEffects: true,
+ asm: s390x.ALAN,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2}, // R1
+ {1, 56319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 SP
+ },
+ clobbers: 2, // R1
+ },
+ },
+ {
name: "LoweredAtomicCas32",
auxType: auxSymOff,
argLen: 4,