diff options
author | Michael Munday <mike.munday@ibm.com> | 2019-10-23 06:43:23 -0700 |
---|---|---|
committer | Brad Fitzpatrick <bradfitz@golang.org> | 2019-11-11 15:23:59 +0000 |
commit | b3885dbc93ceae1b12f7e80edd2696baf566edec (patch) | |
tree | c7f9c66dbe89b891b512347b0e56cf69d253a387 /src/cmd/compile/internal/ssa/opGen.go | |
parent | 75c839af22a50cb027766ea54335e234dac32836 (diff) | |
download | go-b3885dbc93ceae1b12f7e80edd2696baf566edec.tar.gz go-b3885dbc93ceae1b12f7e80edd2696baf566edec.zip |
cmd/compile, runtime: intrinsify atomic And8 and Or8 on s390x
Intrinsify these functions to match other platforms. Update the
sequence of instructions used in the assembly implementations to
match the intrinsics.
Also, add a micro benchmark so we can more easily measure the
performance of these two functions:
name old time/op new time/op delta
And8-8 5.33ns ± 7% 2.55ns ± 8% -52.12% (p=0.000 n=20+20)
And8Parallel-8 7.39ns ± 5% 3.74ns ± 4% -49.34% (p=0.000 n=20+20)
Or8-8 4.84ns ±15% 2.64ns ±11% -45.50% (p=0.000 n=20+20)
Or8Parallel-8 7.27ns ± 3% 3.84ns ± 4% -47.10% (p=0.000 n=19+20)
By using a 'rotate then xor selected bits' instruction combined with
either a 'load and and' or a 'load and or' instruction we can
implement And8 and Or8 with far fewer instructions. Replacing
'compare and swap' with atomic instructions may also improve
performance when there is contention.
Change-Id: I28bb8032052b73ae8ccdf6e4c612d2877085fa01
Reviewed-on: https://go-review.googlesource.com/c/go/+/204277
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/cmd/compile/internal/ssa/opGen.go')
-rw-r--r-- | src/cmd/compile/internal/ssa/opGen.go | 48 |
1 files changed, 48 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 047a2a5573..a5951dd4e1 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1982,6 +1982,7 @@ const ( OpS390XRLL OpS390XRLLGconst OpS390XRLLconst + OpS390XRXSBG OpS390XNEG OpS390XNEGW OpS390XNOT @@ -2081,6 +2082,8 @@ const ( OpS390XLAAG OpS390XAddTupleFirst32 OpS390XAddTupleFirst64 + OpS390XLAOfloor + OpS390XLANfloor OpS390XLoweredAtomicCas32 OpS390XLoweredAtomicCas64 OpS390XLoweredAtomicExchange32 @@ -26502,6 +26505,23 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "RXSBG", + auxType: auxArchSpecific, + argLen: 2, + resultInArg0: true, + clobberFlags: true, + asm: s390x.ARXSBG, + reg: regInfo{ + inputs: []inputInfo{ + {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 + {1, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 + }, + outputs: []outputInfo{ + {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 + }, + }, + }, + { name: "NEG", argLen: 1, clobberFlags: true, @@ -27843,6 +27863,34 @@ var opcodeTable = [...]opInfo{ reg: regInfo{}, }, { + name: "LAOfloor", + argLen: 3, + clobberFlags: true, + hasSideEffects: true, + asm: s390x.ALAO, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2}, // R1 + {1, 56319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 SP + }, + clobbers: 2, // R1 + }, + }, + { + name: "LANfloor", + argLen: 3, + clobberFlags: true, + hasSideEffects: true, + asm: s390x.ALAN, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2}, // R1 + {1, 56319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 SP + }, + clobbers: 2, // R1 + }, + }, + { name: "LoweredAtomicCas32", auxType: auxSymOff, argLen: 4, |