author     Lynn Boger <laboger@linux.vnet.ibm.com>  2020-03-30 15:23:19 -0400
committer  Lynn Boger <laboger@linux.vnet.ibm.com>  2020-04-06 12:09:39 +0000
commit     815509ae31fc7eaf753def9deb9cafee968f92b3 (patch)
tree       bd27aa71683f030619f08c9441616cbdcf3d62fb
parent     5f3354d1bf2e6a61e4b9e1e31ee04b99dfe7de35 (diff)
cmd/compile: improve lowered moves and zeros for ppc64le
This change includes the following:

- Generate LXV/STXV sequences instead of LXVD2X/STXVD2X on power9.
  These instructions do not require an index register, which allows
  more loads and stores within a loop without initializing multiple
  index registers. The LoweredQuadXXX generate LXV/STXV.

- Create LoweredMoveXXXShort and LoweredZeroXXXShort for short moves
  that don't generate loops, and therefore don't clobber the address
  registers or flags.

- Use registers other than R3 and R4 to avoid conflicting with
  registers that have already been allocated to avoid unnecessary
  register moves.

- Eliminate the use of R14 as scratch register and use R31 instead.

- Add PCALIGN when the LoweredMoveXXX or LoweredZeroXXX generates a
  loop with more than 3 iterations.

This performance opportunity was noticed in github.com/golang/snappy
benchmarks.

Results on power9:

WordsDecode1e1    54.1ns ± 0%    53.8ns ± 0%    -0.51%  (p=0.029 n=4+4)
WordsDecode1e2     287ns ± 0%     282ns ± 1%    -1.83%  (p=0.029 n=4+4)
WordsDecode1e3    3.98µs ± 0%    3.64µs ± 0%    -8.52%  (p=0.029 n=4+4)
WordsDecode1e4    66.9µs ± 0%    67.0µs ± 0%    +0.20%  (p=0.029 n=4+4)
WordsDecode1e5     723µs ± 0%     723µs ± 0%    -0.01%  (p=0.200 n=4+4)
WordsDecode1e6    7.21ms ± 0%    7.21ms ± 0%    -0.02%  (p=1.000 n=4+4)
WordsEncode1e1    29.9ns ± 0%    29.4ns ± 0%    -1.51%  (p=0.029 n=4+4)
WordsEncode1e2    2.12µs ± 0%    1.75µs ± 0%   -17.70%  (p=0.029 n=4+4)
WordsEncode1e3    11.7µs ± 0%    11.2µs ± 0%    -4.61%  (p=0.029 n=4+4)
WordsEncode1e4     119µs ± 0%     120µs ± 0%    +0.36%  (p=0.029 n=4+4)
WordsEncode1e5    1.21ms ± 0%    1.22ms ± 0%    +0.41%  (p=0.029 n=4+4)
WordsEncode1e6    12.0ms ± 0%    12.0ms ± 0%    +0.57%  (p=0.029 n=4+4)
RandomEncode       286µs ± 0%     203µs ± 0%   -28.82%  (p=0.029 n=4+4)
ExtendMatch       47.4µs ± 0%    47.0µs ± 0%    -0.85%  (p=0.029 n=4+4)

Change-Id: Iecad3a39ae55280286e42760a5c9d5c1168f5858
Reviewed-on: https://go-review.googlesource.com/c/go/+/226539
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
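To see the new lowering in action, a small probe like the sketch below (illustrative only; the file and function names are arbitrary, not part of this change) can be built for GOARCH=ppc64le with GOPPC64=power9 and inspected with -gcflags=-S. The zeroing should select LoweredQuadZero and the copy LoweredQuadMove, emitting STXV/LXV rather than STXVD2X/LXVD2X:

    // probe.go - illustrative only, not part of this change.
    // Build: GOOS=linux GOARCH=ppc64le GOPPC64=power9 go build -gcflags=-S probe.go
    package main

    var a, b [96]byte
    var big [128]byte

    //go:noinline
    func zero128(p *[128]byte) { *p = [128]byte{} } // Zero [128] -> LoweredQuadZero on power9

    //go:noinline
    func move96(dst, src *[96]byte) { *dst = *src } // Move [96] -> LoweredQuadMove on power9

    func main() {
        zero128(&big)
        move96(&a, &b)
    }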
-rw-r--r--  src/cmd/compile/internal/ppc64/ssa.go         586
-rw-r--r--  src/cmd/compile/internal/ssa/gen/PPC64.rules   13
-rw-r--r--  src/cmd/compile/internal/ssa/gen/PPC64Ops.go   86
-rw-r--r--  src/cmd/compile/internal/ssa/opGen.go          98
-rw-r--r--  src/cmd/compile/internal/ssa/rewrite.go         4
-rw-r--r--  src/cmd/compile/internal/ssa/rewritePPC64.go   86
-rw-r--r--  test/codegen/copy.go                           34
7 files changed, 833 insertions, 74 deletions
diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go
index 0ab21604e5..50f595fe2f 100644
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@@ -850,39 +850,226 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt & 3
- case ssa.OpPPC64LoweredZero:
+ case ssa.OpPPC64LoweredQuadZero, ssa.OpPPC64LoweredQuadZeroShort:
+ // The LoweredQuad code generates
+ // STXV instructions on power9.
+ // The Short variation is used
+ // if no loop is generated.
- // unaligned data doesn't hurt performance
- // for these instructions on power8 or later
+ // sizes >= 128 generate a loop as follows:
- // for sizes >= 64 generate a loop as follows:
+ // Set up loop counter in CTR, used by BC
+ // XXLXOR clears VS32
+ // XXLXOR VS32,VS32,VS32
+ // MOVD len/64,REG_TMP
+ // MOVD REG_TMP,CTR
+ // loop:
+ // STXV VS32,0(R20)
+ // STXV VS32,16(R20)
+ // STXV VS32,32(R20)
+ // STXV VS32,48(R20)
+ // ADD $64,R20
+ // BC 16, 0, loop
+
+ // Loop count; each iteration zeroes 64 bytes
+ ctr := v.AuxInt / 64
+
+ // Remainder bytes
+ rem := v.AuxInt % 64
+
+ // Only generate a loop if there is more
+ // than 1 iteration.
+ if ctr > 1 {
+ // Set up VS32 (V0) to hold 0s
+ p := s.Prog(ppc64.AXXLXOR)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+ p.Reg = ppc64.REG_VS32
+
+ // Set up CTR loop counter
+ p = s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = ctr
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REGTMP
+
+ p = s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REGTMP
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_CTR
+
+ // Don't generate padding for
+ // loops with few iterations.
+ if ctr > 3 {
+ p = s.Prog(obj.APCALIGN)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+ }
+
+ // generate 4 STXVs to zero 64 bytes
+ var top *obj.Prog
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+
+ // Save the top of loop
+ if top == nil {
+ top = p
+ }
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = 16
- // set up loop counter in CTR, used by BC
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = 32
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = 48
+
+ // Increment address for the
+ // 64 bytes just zeroed.
+ p = s.Prog(ppc64.AADD)
+ p.Reg = v.Args[0].Reg()
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 64
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = v.Args[0].Reg()
+
+ // Branch back to top of loop
+ // based on CTR
+ // BC with BO_BCTR generates bdnz
+ p = s.Prog(ppc64.ABC)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = ppc64.BO_BCTR
+ p.Reg = ppc64.REG_R0
+ p.To.Type = obj.TYPE_BRANCH
+ gc.Patch(p, top)
+ }
+ // When ctr == 1 the loop was not generated but
+ // there are at least 64 bytes to clear, so add
+ // that to the remainder to generate the code
+ // to clear those bytes.
+ if ctr == 1 {
+ rem += 64
+ }
+
+ // Clear the remainder starting at offset zero
+ offset := int64(0)
+
+ if rem >= 16 && ctr <= 1 {
+ // If the XXLXOR hasn't already been
+ // generated, do it here to initialize
+ // VS32 (V0) to 0.
+ p := s.Prog(ppc64.AXXLXOR)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+ p.Reg = ppc64.REG_VS32
+ }
+ // Generate STXV for 32 or 64
+ // bytes.
+ for rem >= 32 {
+ p := s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = offset
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = offset + 16
+ offset += 32
+ rem -= 32
+ }
+ // Generate 16 bytes
+ if rem >= 16 {
+ p := s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = offset
+ offset += 16
+ rem -= 16
+ }
+
+ // first clear as many doublewords as possible
+ // then clear remaining sizes as available
+ for rem > 0 {
+ op, size := ppc64.AMOVB, int64(1)
+ switch {
+ case rem >= 8:
+ op, size = ppc64.AMOVD, 8
+ case rem >= 4:
+ op, size = ppc64.AMOVW, 4
+ case rem >= 2:
+ op, size = ppc64.AMOVH, 2
+ }
+ p := s.Prog(op)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_R0
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = offset
+ rem -= size
+ offset += size
+ }
+
+ case ssa.OpPPC64LoweredZero, ssa.OpPPC64LoweredZeroShort:
+
+ // Unaligned data doesn't hurt performance
+ // for these instructions on power8.
+
+ // For sizes >= 64 generate a loop as follows:
+
+ // Set up loop counter in CTR, used by BC
// XXLXOR VS32,VS32,VS32
// MOVD len/32,REG_TMP
// MOVD REG_TMP,CTR
// MOVD $16,REG_TMP
// loop:
- // STXVD2X VS32,(R0)(R3)
- // STXVD2X VS32,(R31)(R3)
- // ADD $32,R3
+ // STXVD2X VS32,(R0)(R20)
+ // STXVD2X VS32,(R31)(R20)
+ // ADD $32,R20
// BC 16, 0, loop
//
// any remainder is done as described below
// for sizes < 64 bytes, first clear as many doublewords as possible,
// then handle the remainder
- // MOVD R0,(R3)
- // MOVD R0,8(R3)
+ // MOVD R0,(R20)
+ // MOVD R0,8(R20)
// .... etc.
//
// the remainder bytes are cleared using one or more
// of the following instructions with the appropriate
// offsets depending which instructions are needed
//
- // MOVW R0,n1(R3) 4 bytes
- // MOVH R0,n2(R3) 2 bytes
- // MOVB R0,n3(R3) 1 byte
+ // MOVW R0,n1(R20) 4 bytes
+ // MOVH R0,n2(R20) 2 bytes
+ // MOVB R0,n3(R20) 1 byte
//
// 7 bytes: MOVW, MOVH, MOVB
// 6 bytes: MOVW, MOVH
@@ -926,10 +1113,19 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
+ // Don't add padding for alignment
+ // with few loop iterations.
+ if ctr > 3 {
+ p = s.Prog(obj.APCALIGN)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+ }
+
// generate 2 STXVD2Xs to store 16 bytes
// when this is a loop then the top must be saved
var top *obj.Prog
// This is the top of loop
+
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
@@ -940,7 +1136,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
if top == nil {
top = p
}
-
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
@@ -1001,8 +1196,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
offset += size
}
- case ssa.OpPPC64LoweredMove:
+ case ssa.OpPPC64LoweredMove, ssa.OpPPC64LoweredMoveShort:
+ bytesPerLoop := int64(32)
// This will be used when moving more
// than 8 bytes. Moves start with
// as many 8 byte moves as possible, then
@@ -1019,34 +1215,34 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// MOVD REG_TMP,CTR
// MOVD $16,REG_TMP
// top:
- // LXVD2X (R0)(R4),VS32
- // LXVD2X (R31)(R4),VS33
- // ADD $32,R4
- // STXVD2X VS32,(R0)(R3)
- // STXVD2X VS33,(R31)(R4)
- // ADD $32,R3
+ // LXVD2X (R0)(R21),VS32
+ // LXVD2X (R31)(R21),VS33
+ // ADD $32,R21
+ // STXVD2X VS32,(R0)(R20)
+ // STXVD2X VS33,(R31)(R20)
+ // ADD $32,R20
// BC 16,0,top
// Bytes not moved by this loop are moved
// with a combination of the following instructions,
// starting with the largest sizes and generating as
// many as needed, using the appropriate offset value.
- // MOVD n(R4),R14
- // MOVD R14,n(R3)
- // MOVW n1(R4),R14
- // MOVW R14,n1(R3)
- // MOVH n2(R4),R14
- // MOVH R14,n2(R3)
- // MOVB n3(R4),R14
- // MOVB R14,n3(R3)
+ // MOVD n(R21),R31
+ // MOVD R31,n(R20)
+ // MOVW n1(R21),R31
+ // MOVW R31,n1(R20)
+ // MOVH n2(R21),R31
+ // MOVH R31,n2(R20)
+ // MOVB n3(R21),R31
+ // MOVB R31,n3(R20)
// Each loop iteration moves 32 bytes
- ctr := v.AuxInt / 32
+ ctr := v.AuxInt / bytesPerLoop
// Remainder after the loop
- rem := v.AuxInt % 32
+ rem := v.AuxInt % bytesPerLoop
- dst_reg := v.Args[0].Reg()
- src_reg := v.Args[1].Reg()
+ dstReg := v.Args[0].Reg()
+ srcReg := v.Args[1].Reg()
// The set of registers used here, must match the clobbered reg list
// in PPC64Ops.go.
@@ -1076,57 +1272,65 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
+ // Don't add padding for
+ // alignment with small iteration
+ // counts.
+ if ctr > 3 {
+ p = s.Prog(obj.APCALIGN)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+ }
+
// Generate 16 byte loads and stores.
// Use temp register for index (16)
// on the second one.
+
p = s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
+ p.From.Reg = srcReg
p.From.Index = ppc64.REGZERO
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
-
if top == nil {
top = p
}
-
p = s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
+ p.From.Reg = srcReg
p.From.Index = ppc64.REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS33
// increment the src reg for next iteration
p = s.Prog(ppc64.AADD)
- p.Reg = src_reg
+ p.Reg = srcReg
p.From.Type = obj.TYPE_CONST
- p.From.Offset = 32
+ p.From.Offset = bytesPerLoop
p.To.Type = obj.TYPE_REG
- p.To.Reg = src_reg
+ p.To.Reg = srcReg
// generate 16 byte stores
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
p.To.Index = ppc64.REGZERO
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS33
p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
p.To.Index = ppc64.REGTMP
// increment the dst reg for next iteration
p = s.Prog(ppc64.AADD)
- p.Reg = dst_reg
+ p.Reg = dstReg
p.From.Type = obj.TYPE_CONST
- p.From.Offset = 32
+ p.From.Offset = bytesPerLoop
p.To.Type = obj.TYPE_REG
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
// BC with BO_BCTR generates bdnz to branch on nonzero CTR
// to loop top.
@@ -1137,7 +1341,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_BRANCH
gc.Patch(p, top)
- // src_reg and dst_reg were incremented in the loop, so
+ // srcReg and dstReg were incremented in the loop, so
// later instructions start with offset 0.
offset = int64(0)
}
@@ -1145,7 +1349,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// No loop was generated for one iteration, so
// add 32 bytes to the remainder to move those bytes.
if ctr == 1 {
- rem += 32
+ rem += bytesPerLoop
}
if rem >= 16 {
@@ -1154,7 +1358,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// on the second one.
p := s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
+ p.From.Reg = srcReg
p.From.Index = ppc64.REGZERO
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
@@ -1163,7 +1367,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
p.To.Index = ppc64.REGZERO
offset = 16
@@ -1171,18 +1375,15 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
if rem >= 16 {
// Use REGTMP as index reg
- p = s.Prog(ppc64.AMOVD)
+ p := s.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 16
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
- // Generate 16 byte loads and stores.
- // Use temp register for index (16)
- // on the second one.
p = s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
+ p.From.Reg = srcReg
p.From.Index = ppc64.REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
@@ -1191,7 +1392,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
p.To.Index = ppc64.REGTMP
offset = 32
@@ -1214,17 +1415,284 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// Load
p := s.Prog(op)
p.To.Type = obj.TYPE_REG
- p.To.Reg = ppc64.REG_R14
+ p.To.Reg = ppc64.REGTMP
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset
+
+ // Store
+ p = s.Prog(op)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REGTMP
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset
+ rem -= size
+ offset += size
+ }
+
+ case ssa.OpPPC64LoweredQuadMove, ssa.OpPPC64LoweredQuadMoveShort:
+ bytesPerLoop := int64(64)
+ // This is used when moving more
+ // than 8 bytes on power9. Moves start with
+ // as many 8 byte moves as possible, then
+ // 4, 2, or 1 byte(s) as remaining. This will
+ // work and be efficient for power8 or later.
+ // If there are 128 or more bytes, then a
+ // loop is generated to move 64 bytes and
+ // update the src and dst addresses on each
+ // iteration. When < 128 bytes, the appropriate
+ // number of moves are generated based on the
+ // size.
+ // When moving >= 128 bytes a loop is used
+ // MOVD len/64,REG_TMP
+ // MOVD REG_TMP,CTR
+ // top:
+ // LXV 0(R21),VS32
+ // LXV 16(R21),VS33
+ // STXV VS32,0(R20)
+ // STXV VS33,16(R20)
+ // LXV 32(R21),VS32
+ // LXV 48(R21),VS33
+ // STXV VS32,32(R20)
+ // STXV VS33,48(R20)
+ // ADD $64,R21
+ // ADD $64,R20
+ // BC 16,0,top
+ // Bytes not moved by this loop are moved
+ // with a combination of the following instructions,
+ // starting with the largest sizes and generating as
+ // many as needed, using the appropriate offset value.
+ // MOVD n(R21),R31
+ // MOVD R31,n(R20)
+ // MOVW n1(R21),R31
+ // MOVW R31,n1(R20)
+ // MOVH n2(R21),R31
+ // MOVH R31,n2(R20)
+ // MOVB n3(R21),R31
+ // MOVB R31,n3(R20)
+
+ // Each loop iteration moves 64 bytes
+ ctr := v.AuxInt / bytesPerLoop
+
+ // Remainder after the loop
+ rem := v.AuxInt % bytesPerLoop
+
+ dstReg := v.Args[0].Reg()
+ srcReg := v.Args[1].Reg()
+
+ offset := int64(0)
+
+ // top of the loop
+ var top *obj.Prog
+
+ // Only generate looping code when loop counter is > 1 (size >= 128 bytes)
+ if ctr > 1 {
+ // Set up the CTR
+ p := s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = ctr
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REGTMP
+
+ p = s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REGTMP
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_CTR
+
+ p = s.Prog(obj.APCALIGN)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+
+ // Generate 16 byte loads and stores.
+ p = s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+ if top == nil {
+ top = p
+ }
+ p = s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset + 16
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS33
+
+ // generate 16 byte stores
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS33
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset + 16
+
+ // Generate 16 byte loads and stores.
+ p = s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset + 32
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ p = s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset + 48
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS33
+
+ // generate 16 byte stores
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset + 32
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS33
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset + 48
+
+ // increment the src reg for next iteration
+ p = s.Prog(ppc64.AADD)
+ p.Reg = srcReg
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = bytesPerLoop
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = srcReg
+
+ // increment the dst reg for next iteration
+ p = s.Prog(ppc64.AADD)
+ p.Reg = dstReg
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = bytesPerLoop
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = dstReg
+
+ // BC with BO_BCTR generates bdnz to branch on nonzero CTR
+ // to loop top.
+ p = s.Prog(ppc64.ABC)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = ppc64.BO_BCTR
+ p.Reg = ppc64.REG_R0
+ p.To.Type = obj.TYPE_BRANCH
+ gc.Patch(p, top)
+
+ // srcReg and dstReg were incremented in the loop, so
+ // later instructions start with offset 0.
+ offset = int64(0)
+ }
+
+ // No loop was generated for one iteration, so
+ // add 64 bytes to the remainder to move those bytes.
+ if ctr == 1 {
+ rem += bytesPerLoop
+ }
+ if rem >= 32 {
+ p := s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ p = s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = 16
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS33
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS33
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = 16
+
+ offset = 32
+ rem -= 32
+ }
+
+ if rem >= 16 {
+ // Generate 16 byte loads and stores.
+ p := s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset
+
+ offset += 16
+ rem -= 16
+
+ if rem >= 16 {
+ p := s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset
+
+ offset += 16
+ rem -= 16
+ }
+ }
+ // Generate all the remaining load and store pairs, starting with
+ // as many 8 byte moves as possible, then 4, 2, 1.
+ for rem > 0 {
+ op, size := ppc64.AMOVB, int64(1)
+ switch {
+ case rem >= 8:
+ op, size = ppc64.AMOVD, 8
+ case rem >= 4:
+ op, size = ppc64.AMOVW, 4
+ case rem >= 2:
+ op, size = ppc64.AMOVH, 2
+ }
+ // Load
+ p := s.Prog(op)
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REGTMP
p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
+ p.From.Reg = srcReg
p.From.Offset = offset
// Store
p = s.Prog(op)
p.From.Type = obj.TYPE_REG
- p.From.Reg = ppc64.REG_R14
+ p.From.Reg = ppc64.REGTMP
p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
p.To.Offset = offset
rem -= size
offset += size
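All four lowered cases above share one decomposition: compute ctr and rem, emit a counted loop only when ctr > 1 (a single iteration is folded into the remainder), then emit straight-line 16/8/4/2/1 byte operations. A minimal sketch of that logic, using a hypothetical helper name:

    package main

    import "fmt"

    // decompose mirrors the ctr/rem logic in ssaGenValue: it returns the
    // loop iteration count and the straight-line operation sizes for a
    // move/zero of n bytes, where bytesPerLoop is 32 (power8) or 64 (power9).
    func decompose(n, bytesPerLoop int64) (loopIters int64, tail []int64) {
        ctr := n / bytesPerLoop
        rem := n % bytesPerLoop
        if ctr > 1 {
            loopIters = ctr
        } else if ctr == 1 {
            // A single iteration is cheaper as straight-line code.
            rem += bytesPerLoop
        }
        for _, size := range []int64{16, 8, 4, 2, 1} {
            for rem >= size {
                tail = append(tail, size)
                rem -= size
            }
        }
        return loopIters, tail
    }

    func main() {
        iters, tail := decompose(96, 64)
        fmt.Println(iters, tail) // 0 [16 16 16 16 16 16]: no loop, six 16 byte ops
    }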
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64.rules b/src/cmd/compile/internal/ssa/gen/PPC64.rules
index 0c182a6222..22086db592 100644
--- a/src/cmd/compile/internal/ssa/gen/PPC64.rules
+++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules
@@ -574,7 +574,12 @@
(MOVDstorezero [0] destptr mem))))
// Handle cases not handled above
-(Zero [s] ptr mem) -> (LoweredZero [s] ptr mem)
+// Lowered Short cases do not generate loops, and as a result don't clobber
+// the address registers or flags.
+(Zero [s] ptr mem) && objabi.GOPPC64 <= 8 && s < 64 -> (LoweredZeroShort [s] ptr mem)
+(Zero [s] ptr mem) && objabi.GOPPC64 <= 8 -> (LoweredZero [s] ptr mem)
+(Zero [s] ptr mem) && s < 128 && objabi.GOPPC64 >= 9 -> (LoweredQuadZeroShort [s] ptr mem)
+(Zero [s] ptr mem) && objabi.GOPPC64 >= 9 -> (LoweredQuadZero [s] ptr mem)
// moves
// Only the MOVD and MOVW instructions require 4 byte
@@ -608,8 +613,12 @@
// Large move uses a loop. Since the address is computed and the
// offset is zero, any alignment can be used.
-(Move [s] dst src mem) && s > 8 && logLargeCopy(v, s) ->
+(Move [s] dst src mem) && s > 8 && objabi.GOPPC64 <= 8 && logLargeCopy(v, s) ->
(LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s > 8 && s <= 64 && objabi.GOPPC64 >= 9 ->
+ (LoweredQuadMoveShort [s] dst src mem)
+(Move [s] dst src mem) && s > 8 && objabi.GOPPC64 >= 9 && logLargeCopy(v, s) ->
+ (LoweredQuadMove [s] dst src mem)
// Calls
// Lowering calls
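Ignoring the earlier small-constant-size rules, the net effect of the Zero rules above can be read as a selection function; a sketch (the real dispatch is the generated rewrite code further below):

    package main

    import "fmt"

    // selectZeroOp mirrors the four Zero rules above, applied in order.
    func selectZeroOp(s int64, goppc64 int) string {
        switch {
        case goppc64 <= 8 && s < 64:
            return "LoweredZeroShort"
        case goppc64 <= 8:
            return "LoweredZero"
        case s < 128:
            return "LoweredQuadZeroShort"
        default:
            return "LoweredQuadZero"
        }
    }

    func main() {
        fmt.Println(selectZeroOp(96, 9)) // LoweredQuadZeroShort
        fmt.Println(selectZeroOp(96, 8)) // LoweredZero
    }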
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
index 4509c48570..0199c8f713 100644
--- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -445,14 +445,49 @@ func init() {
aux: "Int64",
argLength: 2,
reg: regInfo{
- inputs: []regMask{buildReg("R3")},
- clobbers: buildReg("R3"),
+ inputs: []regMask{buildReg("R20")},
+ clobbers: buildReg("R20"),
},
clobberFlags: true,
typ: "Mem",
faultOnNilArg0: true,
unsafePoint: true,
},
+ {
+ name: "LoweredZeroShort",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp}},
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+ {
+ name: "LoweredQuadZeroShort",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp},
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+ {
+ name: "LoweredQuadZero",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20")},
+ clobbers: buildReg("R20"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+
// R31 is temp register
// Loop code:
// MOVD len/32,R31 set up loop ctr
@@ -491,8 +526,8 @@ func init() {
aux: "Int64",
argLength: 3,
reg: regInfo{
- inputs: []regMask{buildReg("R3"), buildReg("R4")},
- clobbers: buildReg("R3 R4 R14"),
+ inputs: []regMask{buildReg("R20"), buildReg("R21")},
+ clobbers: buildReg("R20 R21"),
},
clobberFlags: true,
typ: "Mem",
@@ -500,6 +535,49 @@ func init() {
faultOnNilArg1: true,
unsafePoint: true,
},
+ {
+ name: "LoweredMoveShort",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{gp, gp},
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
+
+ // The following is similar to the LoweredMove, but uses
+ // LXV instead of LXVD2X, which does not require an index
+ // register and will do 4 in a loop instead of only 2.
+ {
+ name: "LoweredQuadMove",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20"), buildReg("R21")},
+ clobbers: buildReg("R20 R21"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
+
+ {
+ name: "LoweredQuadMoveShort",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{gp, gp},
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
{name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index e8d1b841c8..ac0719ec0e 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1872,7 +1872,13 @@ const (
OpPPC64CALLclosure
OpPPC64CALLinter
OpPPC64LoweredZero
+ OpPPC64LoweredZeroShort
+ OpPPC64LoweredQuadZeroShort
+ OpPPC64LoweredQuadZero
OpPPC64LoweredMove
+ OpPPC64LoweredMoveShort
+ OpPPC64LoweredQuadMove
+ OpPPC64LoweredQuadMoveShort
OpPPC64LoweredAtomicStore8
OpPPC64LoweredAtomicStore32
OpPPC64LoweredAtomicStore64
@@ -24865,9 +24871,47 @@ var opcodeTable = [...]opInfo{
unsafePoint: true,
reg: regInfo{
inputs: []inputInfo{
- {0, 8}, // R3
+ {0, 1048576}, // R20
+ },
+ clobbers: 1048576, // R20
+ },
+ },
+ {
+ name: "LoweredZeroShort",
+ auxType: auxInt64,
+ argLen: 2,
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+ },
+ },
+ },
+ {
+ name: "LoweredQuadZeroShort",
+ auxType: auxInt64,
+ argLen: 2,
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+ },
+ },
+ },
+ {
+ name: "LoweredQuadZero",
+ auxType: auxInt64,
+ argLen: 2,
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1048576}, // R20
},
- clobbers: 8, // R3
+ clobbers: 1048576, // R20
},
},
{
@@ -24880,10 +24924,54 @@ var opcodeTable = [...]opInfo{
unsafePoint: true,
reg: regInfo{
inputs: []inputInfo{
- {0, 8}, // R3
- {1, 16}, // R4
+ {0, 1048576}, // R20
+ {1, 2097152}, // R21
+ },
+ clobbers: 3145728, // R20 R21
+ },
+ },
+ {
+ name: "LoweredMoveShort",
+ auxType: auxInt64,
+ argLen: 3,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+ {1, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+ },
+ },
+ },
+ {
+ name: "LoweredQuadMove",
+ auxType: auxInt64,
+ argLen: 3,
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1048576}, // R20
+ {1, 2097152}, // R21
+ },
+ clobbers: 3145728, // R20 R21
+ },
+ },
+ {
+ name: "LoweredQuadMoveShort",
+ auxType: auxInt64,
+ argLen: 3,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+ {1, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
- clobbers: 16408, // R3 R4 R14
},
},
{
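The regInfo constants in the generated table are plain bitmasks with one bit per register (bit n for Rn, as the inline comments indicate). The values above can be checked directly:

    package main

    import "fmt"

    func main() {
        r20 := uint64(1) << 20
        r21 := uint64(1) << 21
        fmt.Println(r20, r20|r21) // 1048576 3145728, i.e. "R20" and "R20 R21"

        // The gp mask covers R3-R12 and R14-R29; R0, R1, R2, R13, R30
        // and R31 are not allocatable general-purpose registers.
        var gp uint64
        for i := uint(3); i <= 29; i++ {
            if i != 13 {
                gp |= 1 << i
            }
        }
        fmt.Println(gp) // 1073733624
    }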
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index a7979b273f..27c6b169e5 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -1075,9 +1075,9 @@ func isInlinableMemmove(dst, src *Value, sz int64, c *Config) bool {
switch c.arch {
case "amd64":
return sz <= 16 || (sz < 1024 && disjoint(dst, sz, src, sz))
- case "386", "ppc64", "ppc64le", "arm64":
+ case "386", "arm64":
return sz <= 8
- case "s390x":
+ case "s390x", "ppc64", "ppc64le":
return sz <= 8 || disjoint(dst, sz, src, sz)
case "arm", "mips", "mips64", "mipsle", "mips64le":
return sz <= 4
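With ppc64 and ppc64le moved to the s390x rule, a memmove of any size can now be inlined when the compiler can prove the operands disjoint, for example copying a global into a fresh local. A sketch (the codegen tests below check this same pattern):

    package main

    var src [256]byte

    // dst is a new local, so the compiler proves it cannot overlap src and
    // inlines the copy as a LoweredMove/LoweredQuadMove instead of calling
    // runtime.memmove.
    func intoLocal() [256]byte {
        var dst [256]byte
        copy(dst[:], src[:])
        return dst
    }

    func main() { _ = intoLocal() }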
diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go
index a2ee60a86e..3f7ea3c222 100644
--- a/src/cmd/compile/internal/ssa/rewritePPC64.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64.go
@@ -3486,14 +3486,14 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
return true
}
// match: (Move [s] dst src mem)
- // cond: s > 8 && logLargeCopy(v, s)
+ // cond: s > 8 && objabi.GOPPC64 <= 8 && logLargeCopy(v, s)
// result: (LoweredMove [s] dst src mem)
for {
s := v.AuxInt
dst := v_0
src := v_1
mem := v_2
- if !(s > 8 && logLargeCopy(v, s)) {
+ if !(s > 8 && objabi.GOPPC64 <= 8 && logLargeCopy(v, s)) {
break
}
v.reset(OpPPC64LoweredMove)
@@ -3501,6 +3501,38 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
v.AddArg3(dst, src, mem)
return true
}
+ // match: (Move [s] dst src mem)
+ // cond: s > 8 && s <= 64 && objabi.GOPPC64 >= 9
+ // result: (LoweredQuadMoveShort [s] dst src mem)
+ for {
+ s := v.AuxInt
+ dst := v_0
+ src := v_1
+ mem := v_2
+ if !(s > 8 && s <= 64 && objabi.GOPPC64 >= 9) {
+ break
+ }
+ v.reset(OpPPC64LoweredQuadMoveShort)
+ v.AuxInt = s
+ v.AddArg3(dst, src, mem)
+ return true
+ }
+ // match: (Move [s] dst src mem)
+ // cond: s > 8 && objabi.GOPPC64 >= 9 && logLargeCopy(v, s)
+ // result: (LoweredQuadMove [s] dst src mem)
+ for {
+ s := v.AuxInt
+ dst := v_0
+ src := v_1
+ mem := v_2
+ if !(s > 8 && objabi.GOPPC64 >= 9 && logLargeCopy(v, s)) {
+ break
+ }
+ v.reset(OpPPC64LoweredQuadMove)
+ v.AuxInt = s
+ v.AddArg3(dst, src, mem)
+ return true
+ }
return false
}
func rewriteValuePPC64_OpNeq16(v *Value) bool {
@@ -14953,16 +14985,66 @@ func rewriteValuePPC64_OpZero(v *Value) bool {
return true
}
// match: (Zero [s] ptr mem)
+ // cond: objabi.GOPPC64 <= 8 && s < 64
+ // result: (LoweredZeroShort [s] ptr mem)
+ for {
+ s := v.AuxInt
+ ptr := v_0
+ mem := v_1
+ if !(objabi.GOPPC64 <= 8 && s < 64) {
+ break
+ }
+ v.reset(OpPPC64LoweredZeroShort)
+ v.AuxInt = s
+ v.AddArg2(ptr, mem)
+ return true
+ }
+ // match: (Zero [s] ptr mem)
+ // cond: objabi.GOPPC64 <= 8
// result: (LoweredZero [s] ptr mem)
for {
s := v.AuxInt
ptr := v_0
mem := v_1
+ if !(objabi.GOPPC64 <= 8) {
+ break
+ }
v.reset(OpPPC64LoweredZero)
v.AuxInt = s
v.AddArg2(ptr, mem)
return true
}
+ // match: (Zero [s] ptr mem)
+ // cond: s < 128 && objabi.GOPPC64 >= 9
+ // result: (LoweredQuadZeroShort [s] ptr mem)
+ for {
+ s := v.AuxInt
+ ptr := v_0
+ mem := v_1
+ if !(s < 128 && objabi.GOPPC64 >= 9) {
+ break
+ }
+ v.reset(OpPPC64LoweredQuadZeroShort)
+ v.AuxInt = s
+ v.AddArg2(ptr, mem)
+ return true
+ }
+ // match: (Zero [s] ptr mem)
+ // cond: objabi.GOPPC64 >= 9
+ // result: (LoweredQuadZero [s] ptr mem)
+ for {
+ s := v.AuxInt
+ ptr := v_0
+ mem := v_1
+ if !(objabi.GOPPC64 >= 9) {
+ break
+ }
+ v.reset(OpPPC64LoweredQuadZero)
+ v.AuxInt = s
+ v.AddArg2(ptr, mem)
+ return true
+ }
+ return false
}
func rewriteBlockPPC64(b *Block) bool {
switch b.Kind {
diff --git a/test/codegen/copy.go b/test/codegen/copy.go
index 46c2bde9ab..db75cde1c6 100644
--- a/test/codegen/copy.go
+++ b/test/codegen/copy.go
@@ -34,6 +34,8 @@ func movesmall7() {
func movesmall16() {
x := [...]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
// amd64:-".*memmove"
+ // ppc64:".*memmove"
+ // ppc64le:".*memmove"
copy(x[1:], x[:])
}
@@ -41,10 +43,34 @@ var x [256]byte
// Check that large disjoint copies are replaced with moves.
+func moveDisjointStack32() {
+ var s [32]byte
+ // ppc64:-".*memmove"
+ // ppc64le:-".*memmove"
+ // ppc64le/power8:"LXVD2X",-"ADD",-"BC"
+ // ppc64le/power9:"LXV",-"LXVD2X",-"ADD",-"BC"
+ copy(s[:], x[:32])
+ runtime.KeepAlive(&s)
+}
+
+func moveDisjointStack64() {
+ var s [96]byte
+ // ppc64:-".*memmove"
+ // ppc64le:-".*memmove"
+ // ppc64le/power8:"LXVD2X","ADD","BC"
+ // ppc64le/power9:"LXV",-"LXVD2X",-"ADD",-"BC"
+ copy(s[:], x[:96])
+ runtime.KeepAlive(&s)
+}
+
func moveDisjointStack() {
var s [256]byte
// s390x:-".*memmove"
// amd64:-".*memmove"
+ // ppc64:-".*memmove"
+ // ppc64le:-".*memmove"
+ // ppc64le/power8:"LXVD2X"
+ // ppc64le/power9:"LXV",-"LXVD2X"
copy(s[:], x[:])
runtime.KeepAlive(&s)
}
@@ -53,6 +79,10 @@ func moveDisjointArg(b *[256]byte) {
var s [256]byte
// s390x:-".*memmove"
// amd64:-".*memmove"
+ // ppc64:-".*memmove"
+ // ppc64le:-".*memmove"
+ // ppc64le/power8:"LXVD2X"
+ // ppc64le/power9:"LXV",-"LXVD2X"
copy(s[:], b[:])
runtime.KeepAlive(&s)
}
@@ -60,6 +90,10 @@ func moveDisjointArg(b *[256]byte) {
func moveDisjointNoOverlap(a *[256]byte) {
// s390x:-".*memmove"
// amd64:-".*memmove"
+ // ppc64:-".*memmove"
+ // ppc64le:-".*memmove"
+ // ppc64le/power8:"LXVD2X"
+ // ppc64le/power9:"LXV",-"LXVD2X"
copy(a[:], a[128:])
}
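These asmcheck annotations are verified by the codegen harness in the repository's test directory; the usual invocation is roughly "go run run.go -- codegen/copy.go" from <GOROOT>/test (the exact form may vary by Go version).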