aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/compile/internal/ppc64/ssa.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/cmd/compile/internal/ppc64/ssa.go')
-rw-r--r--src/cmd/compile/internal/ppc64/ssa.go586
1 files changed, 527 insertions, 59 deletions
diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go
index 0ab21604e5..50f595fe2f 100644
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@@ -850,39 +850,226 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt & 3
- case ssa.OpPPC64LoweredZero:
+ case ssa.OpPPC64LoweredQuadZero, ssa.OpPPC64LoweredQuadZeroShort:
+ // The LoweredQuad code generation
+ // generates STXV instructions on
+ // power9. The Short variation is used
+ // if no loop is generated.
- // unaligned data doesn't hurt performance
- // for these instructions on power8 or later
+ // sizes >= 128 (more than one 64-byte iteration) generate a loop as follows:
- // for sizes >= 64 generate a loop as follows:
+ // Set up loop counter in CTR, used by BC
+ // XXLXOR clears VS32
+ // XXLXOR VS32,VS32,VS32
+ // MOVD len/64,REG_TMP
+ // MOVD REG_TMP,CTR
+ // loop:
+ // STXV VS32,0(R20)
+ // STXV VS32,16(R20)
+ // STXV VS32,32(R20)
+ // STXV VS32,48(R20)
+ // ADD $64,R20
+ // BC 16, 0, loop
+
+ // Number of loop iterations (each iteration clears 64 bytes)
+ ctr := v.AuxInt / 64
+
+ // Remainder bytes
+ rem := v.AuxInt % 64
+
+ // Only generate a loop if there is more
+ // than 1 iteration.
+ if ctr > 1 {
+ // Set up VS32 (V0) to hold 0s
+ p := s.Prog(ppc64.AXXLXOR)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+ p.Reg = ppc64.REG_VS32
+
+ // Set up CTR loop counter
+ p = s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = ctr
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REGTMP
+
+ p = s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REGTMP
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_CTR
+
+ // Don't generate padding for
+ // loops with few iterations.
+ if ctr > 3 {
+ p = s.Prog(obj.APCALIGN)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+ }
+
+ // generate 4 STXVs to zero 64 bytes
+ var top *obj.Prog
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+
+ // Save the top of loop
+ if top == nil {
+ top = p
+ }
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = 16
- // set up loop counter in CTR, used by BC
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = 32
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = 48
+
+ // Increment address for the
+ // 64 bytes just zeroed.
+ p = s.Prog(ppc64.AADD)
+ p.Reg = v.Args[0].Reg()
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 64
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = v.Args[0].Reg()
+
+ // Branch back to top of loop
+ // based on CTR
+ // BC with BO_BCTR generates bdnz
+ p = s.Prog(ppc64.ABC)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = ppc64.BO_BCTR
+ p.Reg = ppc64.REG_R0
+ p.To.Type = obj.TYPE_BRANCH
+ gc.Patch(p, top)
+ }
+ // When ctr == 1 the loop was not generated but
+ // there are at least 64 bytes to clear, so add
+ // that to the remainder to generate the code
+ // to clear those doublewords
+ if ctr == 1 {
+ rem += 64
+ }
+
+ // Clear the remainder starting at offset zero
+ offset := int64(0)
+
+ if rem >= 16 && ctr <= 1 {
+ // If the XXLXOR hasn't already been
+ // generated, do it here to initialize
+ // VS32 (V0) to 0.
+ p := s.Prog(ppc64.AXXLXOR)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+ p.Reg = ppc64.REG_VS32
+ }
+ // Generate pairs of STXV, clearing 32
+ // bytes per pair, while at least 32 bytes remain.
+ for rem >= 32 {
+ p := s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = offset
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = offset + 16
+ offset += 32
+ rem -= 32
+ }
+ // Generate 16 bytes
+ if rem >= 16 {
+ p := s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = offset
+ offset += 16
+ rem -= 16
+ }
+
+ // first clear as many doublewords as possible
+ // then clear remaining sizes as available
+ for rem > 0 {
+ op, size := ppc64.AMOVB, int64(1)
+ switch {
+ case rem >= 8:
+ op, size = ppc64.AMOVD, 8
+ case rem >= 4:
+ op, size = ppc64.AMOVW, 4
+ case rem >= 2:
+ op, size = ppc64.AMOVH, 2
+ }
+ p := s.Prog(op)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_R0
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Offset = offset
+ rem -= size
+ offset += size
+ }
+
+ case ssa.OpPPC64LoweredZero, ssa.OpPPC64LoweredZeroShort:
+
+ // Unaligned data doesn't hurt performance
+ // for these instructions on power8.
+
+ // For sizes >= 64 generate a loop as follows:
+
+ // Set up loop counter in CTR, used by BC
// XXLXOR VS32,VS32,VS32
// MOVD len/32,REG_TMP
// MOVD REG_TMP,CTR
// MOVD $16,REG_TMP
// loop:
- // STXVD2X VS32,(R0)(R3)
- // STXVD2X VS32,(R31)(R3)
- // ADD $32,R3
+ // STXVD2X VS32,(R0)(R20)
+ // STXVD2X VS32,(R31)(R20)
+ // ADD $32,R20
// BC 16, 0, loop
//
// any remainder is done as described below
// for sizes < 64 bytes, first clear as many doublewords as possible,
// then handle the remainder
- // MOVD R0,(R3)
- // MOVD R0,8(R3)
+ // MOVD R0,(R20)
+ // MOVD R0,8(R20)
// .... etc.
//
// the remainder bytes are cleared using one or more
// of the following instructions with the appropriate
// offsets depending which instructions are needed
//
- // MOVW R0,n1(R3) 4 bytes
- // MOVH R0,n2(R3) 2 bytes
- // MOVB R0,n3(R3) 1 byte
+ // MOVW R0,n1(R20) 4 bytes
+ // MOVH R0,n2(R20) 2 bytes
+ // MOVB R0,n3(R20) 1 byte
//
// 7 bytes: MOVW, MOVH, MOVB
// 6 bytes: MOVW, MOVH
@@ -926,10 +1113,19 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
+ // Don't add padding for alignment
+ // with few loop iterations.
+ if ctr > 3 {
+ p = s.Prog(obj.APCALIGN)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+ }
+
// generate 2 STXVD2Xs to store 16 bytes
// when this is a loop then the top must be saved
var top *obj.Prog
// This is the top of loop
+
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
@@ -940,7 +1136,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
if top == nil {
top = p
}
-
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
@@ -1001,8 +1196,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
offset += size
}
- case ssa.OpPPC64LoweredMove:
+ case ssa.OpPPC64LoweredMove, ssa.OpPPC64LoweredMoveShort:
+ bytesPerLoop := int64(32)
// This will be used when moving more
// than 8 bytes. Moves start with
// as many 8 byte moves as possible, then
@@ -1019,34 +1215,34 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// MOVD REG_TMP,CTR
// MOVD $16,REG_TMP
// top:
- // LXVD2X (R0)(R4),VS32
- // LXVD2X (R31)(R4),VS33
- // ADD $32,R4
- // STXVD2X VS32,(R0)(R3)
- // STXVD2X VS33,(R31)(R4)
- // ADD $32,R3
+ // LXVD2X (R0)(R21),VS32
+ // LXVD2X (R31)(R21),VS33
+ // ADD $32,R21
+ // STXVD2X VS32,(R0)(R20)
+ // STXVD2X VS33,(R31)(R20)
+ // ADD $32,R20
// BC 16,0,top
// Bytes not moved by this loop are moved
// with a combination of the following instructions,
// starting with the largest sizes and generating as
// many as needed, using the appropriate offset value.
- // MOVD n(R4),R14
- // MOVD R14,n(R3)
- // MOVW n1(R4),R14
- // MOVW R14,n1(R3)
- // MOVH n2(R4),R14
- // MOVH R14,n2(R3)
- // MOVB n3(R4),R14
- // MOVB R14,n3(R3)
+ // MOVD n(R21),R31
+ // MOVD R31,n(R20)
+ // MOVW n1(R21),R31
+ // MOVW R31,n1(R20)
+ // MOVH n2(R21),R31
+ // MOVH R31,n2(R20)
+ // MOVB n3(R21),R31
+ // MOVB R31,n3(R20)
// Each loop iteration moves 32 bytes
- ctr := v.AuxInt / 32
+ ctr := v.AuxInt / bytesPerLoop
// Remainder after the loop
- rem := v.AuxInt % 32
+ rem := v.AuxInt % bytesPerLoop
- dst_reg := v.Args[0].Reg()
- src_reg := v.Args[1].Reg()
+ dstReg := v.Args[0].Reg()
+ srcReg := v.Args[1].Reg()
// The set of registers used here, must match the clobbered reg list
// in PPC64Ops.go.
@@ -1076,57 +1272,65 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
+ // Don't add padding for
+ // alignment with small iteration
+ // counts.
+ if ctr > 3 {
+ p = s.Prog(obj.APCALIGN)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+ }
+
// Generate 16 byte loads and stores.
// Use temp register for index (16)
// on the second one.
+
p = s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
+ p.From.Reg = srcReg
p.From.Index = ppc64.REGZERO
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
-
if top == nil {
top = p
}
-
p = s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
+ p.From.Reg = srcReg
p.From.Index = ppc64.REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS33
// increment the src reg for next iteration
p = s.Prog(ppc64.AADD)
- p.Reg = src_reg
+ p.Reg = srcReg
p.From.Type = obj.TYPE_CONST
- p.From.Offset = 32
+ p.From.Offset = bytesPerLoop
p.To.Type = obj.TYPE_REG
- p.To.Reg = src_reg
+ p.To.Reg = srcReg
// generate 16 byte stores
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
p.To.Index = ppc64.REGZERO
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS33
p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
p.To.Index = ppc64.REGTMP
// increment the dst reg for next iteration
p = s.Prog(ppc64.AADD)
- p.Reg = dst_reg
+ p.Reg = dstReg
p.From.Type = obj.TYPE_CONST
- p.From.Offset = 32
+ p.From.Offset = bytesPerLoop
p.To.Type = obj.TYPE_REG
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
// BC with BO_BCTR generates bdnz to branch on nonzero CTR
// to loop top.
@@ -1137,7 +1341,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_BRANCH
gc.Patch(p, top)
- // src_reg and dst_reg were incremented in the loop, so
+ // srcReg and dstReg were incremented in the loop, so
// later instructions start with offset 0.
offset = int64(0)
}
@@ -1145,7 +1349,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// No loop was generated for one iteration, so
// add 32 bytes to the remainder to move those bytes.
if ctr == 1 {
- rem += 32
+ rem += bytesPerLoop
}
if rem >= 16 {
@@ -1154,7 +1358,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// on the second one.
p := s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
+ p.From.Reg = srcReg
p.From.Index = ppc64.REGZERO
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
@@ -1163,7 +1367,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
p.To.Index = ppc64.REGZERO
offset = 16
@@ -1171,18 +1375,15 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
if rem >= 16 {
// Use REGTMP as index reg
- p = s.Prog(ppc64.AMOVD)
+ p := s.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 16
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
- // Generate 16 byte loads and stores.
- // Use temp register for index (16)
- // on the second one.
p = s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
+ p.From.Reg = srcReg
p.From.Index = ppc64.REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
@@ -1191,7 +1392,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
p.To.Index = ppc64.REGTMP
offset = 32
@@ -1214,17 +1415,284 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// Load
p := s.Prog(op)
p.To.Type = obj.TYPE_REG
- p.To.Reg = ppc64.REG_R14
+ p.To.Reg = ppc64.REGTMP
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset
+
+ // Store
+ p = s.Prog(op)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REGTMP
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset
+ rem -= size
+ offset += size
+ }
+
+ case ssa.OpPPC64LoweredQuadMove, ssa.OpPPC64LoweredQuadMoveShort:
+ bytesPerLoop := int64(64)
+ // This is used when moving more
+ // than 8 bytes on power9. Moves start with
+ // as many 8 byte moves as possible, then
+ // 4, 2, or 1 byte(s) as remaining. This will
+ // work and be efficient for power8 or later.
+ // If there are 64 or more bytes, then a
+ // loop is generated to move 64 bytes and
+ // update the src and dst addresses on each
+ // iteration. When < 64 bytes, the appropriate
+ // number of moves are generated based on the
+ // size.
+ // When moving >= 64 bytes a loop is used
+ // MOVD len/64,REG_TMP
+ // MOVD REG_TMP,CTR
+ // top:
+ // LXV 0(R21),VS32
+ // LXV 16(R21),VS33
+ // STXV VS32,0(R20)
+ // STXV VS33,16(R20)
+ // LXV 32(R21),VS32
+ // LXV 48(R21),VS33
+ // STXV VS32,32(R20)
+ // STXV VS33,48(R20)
+ // ADD $64,R21
+ // ADD $64,R20
+ // BC 16,0,top
+ // Bytes not moved by this loop are moved
+ // with a combination of the following instructions,
+ // starting with the largest sizes and generating as
+ // many as needed, using the appropriate offset value.
+ // MOVD n(R21),R31
+ // MOVD R31,n(R20)
+ // MOVW n1(R21),R31
+ // MOVW R31,n1(R20)
+ // MOVH n2(R21),R31
+ // MOVH R31,n2(R20)
+ // MOVB n3(R21),R31
+ // MOVB R31,n3(R20)
+
+ // Each loop iteration moves 64 bytes (bytesPerLoop)
+ ctr := v.AuxInt / bytesPerLoop
+
+ // Remainder after the loop
+ rem := v.AuxInt % bytesPerLoop
+
+ dstReg := v.Args[0].Reg()
+ srcReg := v.Args[1].Reg()
+
+ offset := int64(0)
+
+ // top of the loop
+ var top *obj.Prog
+
+ // Only generate looping code when loop counter is > 1 for >= 64 bytes
+ if ctr > 1 {
+ // Set up the CTR
+ p := s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = ctr
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REGTMP
+
+ p = s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REGTMP
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_CTR
+
+ p = s.Prog(obj.APCALIGN)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+
+ // Generate 16 byte loads and stores.
+ p = s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+ if top == nil {
+ top = p
+ }
+ p = s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset + 16
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS33
+
+ // generate 16 byte stores
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS33
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset + 16
+
+ // Generate 16 byte loads and stores.
+ p = s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset + 32
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ p = s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset + 48
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS33
+
+ // generate 16 byte stores
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset + 32
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS33
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset + 48
+
+ // increment the src reg for next iteration
+ p = s.Prog(ppc64.AADD)
+ p.Reg = srcReg
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = bytesPerLoop
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = srcReg
+
+ // increment the dst reg for next iteration
+ p = s.Prog(ppc64.AADD)
+ p.Reg = dstReg
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = bytesPerLoop
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = dstReg
+
+ // BC with BO_BCTR generates bdnz to branch on nonzero CTR
+ // to loop top.
+ p = s.Prog(ppc64.ABC)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = ppc64.BO_BCTR
+ p.Reg = ppc64.REG_R0
+ p.To.Type = obj.TYPE_BRANCH
+ gc.Patch(p, top)
+
+ // srcReg and dstReg were incremented in the loop, so
+ // later instructions start with offset 0.
+ offset = int64(0)
+ }
+
+ // No loop was generated for one iteration, so
+ // add 64 bytes (bytesPerLoop) to the remainder to move those bytes.
+ if ctr == 1 {
+ rem += bytesPerLoop
+ }
+ if rem >= 32 {
+ p := s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ p = s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = 16
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS33
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS33
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = 16
+
+ offset = 32
+ rem -= 32
+ }
+
+ if rem >= 16 {
+ // Generate 16 byte loads and stores.
+ p := s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset
+
+ offset += 16
+ rem -= 16
+
+ if rem >= 16 {
+ p := s.Prog(ppc64.ALXV)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = srcReg
+ p.From.Offset = offset
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ p = s.Prog(ppc64.ASTXV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dstReg
+ p.To.Offset = offset
+
+ offset += 16
+ rem -= 16
+ }
+ }
+ // Generate all the remaining load and store pairs, starting with
+ // as many 8 byte moves as possible, then 4, 2, 1.
+ for rem > 0 {
+ op, size := ppc64.AMOVB, int64(1)
+ switch {
+ case rem >= 8:
+ op, size = ppc64.AMOVD, 8
+ case rem >= 4:
+ op, size = ppc64.AMOVW, 4
+ case rem >= 2:
+ op, size = ppc64.AMOVH, 2
+ }
+ // Load
+ p := s.Prog(op)
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REGTMP
p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
+ p.From.Reg = srcReg
p.From.Offset = offset
// Store
p = s.Prog(op)
p.From.Type = obj.TYPE_REG
- p.From.Reg = ppc64.REG_R14
+ p.From.Reg = ppc64.REGTMP
p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
+ p.To.Reg = dstReg
p.To.Offset = offset
rem -= size
offset += size