diff options
Diffstat (limited to 'src/cmd/compile/internal/ppc64/ssa.go')
-rw-r--r-- | src/cmd/compile/internal/ppc64/ssa.go | 586 |
1 files changed, 527 insertions, 59 deletions
diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go index 0ab21604e5..50f595fe2f 100644 --- a/src/cmd/compile/internal/ppc64/ssa.go +++ b/src/cmd/compile/internal/ppc64/ssa.go @@ -850,39 +850,226 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.From.Type = obj.TYPE_CONST p.From.Offset = v.AuxInt & 3 - case ssa.OpPPC64LoweredZero: + case ssa.OpPPC64LoweredQuadZero, ssa.OpPPC64LoweredQuadZeroShort: + // The LoweredQuad code generation + // generates STXV instructions on + // power9. The Short variation is used + // if no loop is generated. - // unaligned data doesn't hurt performance - // for these instructions on power8 or later + // sizes >= 64 generate a loop as follows: - // for sizes >= 64 generate a loop as follows: + // Set up loop counter in CTR, used by BC + // XXLXOR clears VS32 + // XXLXOR VS32,VS32,VS32 + // MOVD len/64,REG_TMP + // MOVD REG_TMP,CTR + // loop: + // STXV VS32,0(R20) + // STXV VS32,16(R20) + // STXV VS32,32(R20) + // STXV VS32,48(R20) + // ADD $64,R20 + // BC 16, 0, loop + + // Bytes per iteration + ctr := v.AuxInt / 64 + + // Remainder bytes + rem := v.AuxInt % 64 + + // Only generate a loop if there is more + // than 1 iteration. + if ctr > 1 { + // Set up VS32 (V0) to hold 0s + p := s.Prog(ppc64.AXXLXOR) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + p.Reg = ppc64.REG_VS32 + + // Set up CTR loop counter + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ctr + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REGTMP + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_CTR + + // Don't generate padding for + // loops with few iterations. 
+ if ctr > 3 { + p = s.Prog(obj.APCALIGN) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + } + + // generate 4 STXVs to zero 64 bytes + var top *obj.Prog + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + + // Save the top of loop + if top == nil { + top = p + } + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = 16 - // set up loop counter in CTR, used by BC + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = 32 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = 48 + + // Increment address for the + // 64 bytes just zeroed. + p = s.Prog(ppc64.AADD) + p.Reg = v.Args[0].Reg() + p.From.Type = obj.TYPE_CONST + p.From.Offset = 64 + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Args[0].Reg() + + // Branch back to top of loop + // based on CTR + // BC with BO_BCTR generates bdnz + p = s.Prog(ppc64.ABC) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ppc64.BO_BCTR + p.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_BRANCH + gc.Patch(p, top) + } + // When ctr == 1 the loop was not generated but + // there are at least 64 bytes to clear, so add + // that to the remainder to generate the code + // to clear those doublewords + if ctr == 1 { + rem += 64 + } + + // Clear the remainder starting at offset zero + offset := int64(0) + + if rem >= 16 && ctr <= 1 { + // If the XXLXOR hasn't already been + // generated, do it here to initialize + // VS32 (V0) to 0. + p := s.Prog(ppc64.AXXLXOR) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + p.Reg = ppc64.REG_VS32 + } + // Generate STXV for 32 or 64 + // bytes. 
+ for rem >= 32 { + p := s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = offset + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = offset + 16 + offset += 32 + rem -= 32 + } + // Generate 16 bytes + if rem >= 16 { + p := s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = offset + offset += 16 + rem -= 16 + } + + // first clear as many doublewords as possible + // then clear remaining sizes as available + for rem > 0 { + op, size := ppc64.AMOVB, int64(1) + switch { + case rem >= 8: + op, size = ppc64.AMOVD, 8 + case rem >= 4: + op, size = ppc64.AMOVW, 4 + case rem >= 2: + op, size = ppc64.AMOVH, 2 + } + p := s.Prog(op) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = offset + rem -= size + offset += size + } + + case ssa.OpPPC64LoweredZero, ssa.OpPPC64LoweredZeroShort: + + // Unaligned data doesn't hurt performance + // for these instructions on power8. + + // For sizes >= 64 generate a loop as follows: + + // Set up loop counter in CTR, used by BC // XXLXOR VS32,VS32,VS32 // MOVD len/32,REG_TMP // MOVD REG_TMP,CTR // MOVD $16,REG_TMP // loop: - // STXVD2X VS32,(R0)(R3) - // STXVD2X VS32,(R31)(R3) - // ADD $32,R3 + // STXVD2X VS32,(R0)(R20) + // STXVD2X VS32,(R31)(R20) + // ADD $32,R20 // BC 16, 0, loop // // any remainder is done as described below // for sizes < 64 bytes, first clear as many doublewords as possible, // then handle the remainder - // MOVD R0,(R3) - // MOVD R0,8(R3) + // MOVD R0,(R20) + // MOVD R0,8(R20) // .... etc. 
// // the remainder bytes are cleared using one or more // of the following instructions with the appropriate // offsets depending which instructions are needed // - // MOVW R0,n1(R3) 4 bytes - // MOVH R0,n2(R3) 2 bytes - // MOVB R0,n3(R3) 1 byte + // MOVW R0,n1(R20) 4 bytes + // MOVH R0,n2(R20) 2 bytes + // MOVB R0,n3(R20) 1 byte // // 7 bytes: MOVW, MOVH, MOVB // 6 bytes: MOVW, MOVH @@ -926,10 +1113,19 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Type = obj.TYPE_REG p.To.Reg = ppc64.REGTMP + // Don't add padding for alignment + // with few loop iterations. + if ctr > 3 { + p = s.Prog(obj.APCALIGN) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + } + // generate 2 STXVD2Xs to store 16 bytes // when this is a loop then the top must be saved var top *obj.Prog // This is the top of loop + p = s.Prog(ppc64.ASTXVD2X) p.From.Type = obj.TYPE_REG p.From.Reg = ppc64.REG_VS32 @@ -940,7 +1136,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { if top == nil { top = p } - p = s.Prog(ppc64.ASTXVD2X) p.From.Type = obj.TYPE_REG p.From.Reg = ppc64.REG_VS32 @@ -1001,8 +1196,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { offset += size } - case ssa.OpPPC64LoweredMove: + case ssa.OpPPC64LoweredMove, ssa.OpPPC64LoweredMoveShort: + bytesPerLoop := int64(32) // This will be used when moving more // than 8 bytes. 
Moves start with // as many 8 byte moves as possible, then @@ -1019,34 +1215,34 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { // MOVD REG_TMP,CTR // MOVD $16,REG_TMP // top: - // LXVD2X (R0)(R4),VS32 - // LXVD2X (R31)(R4),VS33 - // ADD $32,R4 - // STXVD2X VS32,(R0)(R3) - // STXVD2X VS33,(R31)(R4) - // ADD $32,R3 + // LXVD2X (R0)(R21),VS32 + // LXVD2X (R31)(R21),VS33 + // ADD $32,R21 + // STXVD2X VS32,(R0)(R20) + // STXVD2X VS33,(R31)(R20) + // ADD $32,R20 // BC 16,0,top // Bytes not moved by this loop are moved // with a combination of the following instructions, // starting with the largest sizes and generating as // many as needed, using the appropriate offset value. - // MOVD n(R4),R14 - // MOVD R14,n(R3) - // MOVW n1(R4),R14 - // MOVW R14,n1(R3) - // MOVH n2(R4),R14 - // MOVH R14,n2(R3) - // MOVB n3(R4),R14 - // MOVB R14,n3(R3) + // MOVD n(R21),R31 + // MOVD R31,n(R20) + // MOVW n1(R21),R31 + // MOVW R31,n1(R20) + // MOVH n2(R21),R31 + // MOVH R31,n2(R20) + // MOVB n3(R21),R31 + // MOVB R31,n3(R20) // Each loop iteration moves 32 bytes - ctr := v.AuxInt / 32 + ctr := v.AuxInt / bytesPerLoop // Remainder after the loop - rem := v.AuxInt % 32 + rem := v.AuxInt % bytesPerLoop - dst_reg := v.Args[0].Reg() - src_reg := v.Args[1].Reg() + dstReg := v.Args[0].Reg() + srcReg := v.Args[1].Reg() // The set of registers used here, must match the clobbered reg list // in PPC64Ops.go. @@ -1076,57 +1272,65 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Type = obj.TYPE_REG p.To.Reg = ppc64.REGTMP + // Don't adding padding for + // alignment with small iteration + // counts. + if ctr > 3 { + p = s.Prog(obj.APCALIGN) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + } + // Generate 16 byte loads and stores. // Use temp register for index (16) // on the second one. 
+ p = s.Prog(ppc64.ALXVD2X) p.From.Type = obj.TYPE_MEM - p.From.Reg = src_reg + p.From.Reg = srcReg p.From.Index = ppc64.REGZERO p.To.Type = obj.TYPE_REG p.To.Reg = ppc64.REG_VS32 - if top == nil { top = p } - p = s.Prog(ppc64.ALXVD2X) p.From.Type = obj.TYPE_MEM - p.From.Reg = src_reg + p.From.Reg = srcReg p.From.Index = ppc64.REGTMP p.To.Type = obj.TYPE_REG p.To.Reg = ppc64.REG_VS33 // increment the src reg for next iteration p = s.Prog(ppc64.AADD) - p.Reg = src_reg + p.Reg = srcReg p.From.Type = obj.TYPE_CONST - p.From.Offset = 32 + p.From.Offset = bytesPerLoop p.To.Type = obj.TYPE_REG - p.To.Reg = src_reg + p.To.Reg = srcReg // generate 16 byte stores p = s.Prog(ppc64.ASTXVD2X) p.From.Type = obj.TYPE_REG p.From.Reg = ppc64.REG_VS32 p.To.Type = obj.TYPE_MEM - p.To.Reg = dst_reg + p.To.Reg = dstReg p.To.Index = ppc64.REGZERO p = s.Prog(ppc64.ASTXVD2X) p.From.Type = obj.TYPE_REG p.From.Reg = ppc64.REG_VS33 p.To.Type = obj.TYPE_MEM - p.To.Reg = dst_reg + p.To.Reg = dstReg p.To.Index = ppc64.REGTMP // increment the dst reg for next iteration p = s.Prog(ppc64.AADD) - p.Reg = dst_reg + p.Reg = dstReg p.From.Type = obj.TYPE_CONST - p.From.Offset = 32 + p.From.Offset = bytesPerLoop p.To.Type = obj.TYPE_REG - p.To.Reg = dst_reg + p.To.Reg = dstReg // BC with BO_BCTR generates bdnz to branch on nonzero CTR // to loop top. @@ -1137,7 +1341,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Type = obj.TYPE_BRANCH gc.Patch(p, top) - // src_reg and dst_reg were incremented in the loop, so + // srcReg and dstReg were incremented in the loop, so // later instructions start with offset 0. offset = int64(0) } @@ -1145,7 +1349,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { // No loop was generated for one iteration, so // add 32 bytes to the remainder to move those bytes. if ctr == 1 { - rem += 32 + rem += bytesPerLoop } if rem >= 16 { @@ -1154,7 +1358,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { // on the second one. 
p := s.Prog(ppc64.ALXVD2X) p.From.Type = obj.TYPE_MEM - p.From.Reg = src_reg + p.From.Reg = srcReg p.From.Index = ppc64.REGZERO p.To.Type = obj.TYPE_REG p.To.Reg = ppc64.REG_VS32 @@ -1163,7 +1367,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.From.Type = obj.TYPE_REG p.From.Reg = ppc64.REG_VS32 p.To.Type = obj.TYPE_MEM - p.To.Reg = dst_reg + p.To.Reg = dstReg p.To.Index = ppc64.REGZERO offset = 16 @@ -1171,18 +1375,15 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { if rem >= 16 { // Use REGTMP as index reg - p = s.Prog(ppc64.AMOVD) + p := s.Prog(ppc64.AMOVD) p.From.Type = obj.TYPE_CONST p.From.Offset = 16 p.To.Type = obj.TYPE_REG p.To.Reg = ppc64.REGTMP - // Generate 16 byte loads and stores. - // Use temp register for index (16) - // on the second one. p = s.Prog(ppc64.ALXVD2X) p.From.Type = obj.TYPE_MEM - p.From.Reg = src_reg + p.From.Reg = srcReg p.From.Index = ppc64.REGTMP p.To.Type = obj.TYPE_REG p.To.Reg = ppc64.REG_VS32 @@ -1191,7 +1392,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.From.Type = obj.TYPE_REG p.From.Reg = ppc64.REG_VS32 p.To.Type = obj.TYPE_MEM - p.To.Reg = dst_reg + p.To.Reg = dstReg p.To.Index = ppc64.REGTMP offset = 32 @@ -1214,17 +1415,284 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { // Load p := s.Prog(op) p.To.Type = obj.TYPE_REG - p.To.Reg = ppc64.REG_R14 + p.To.Reg = ppc64.REGTMP + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + + // Store + p = s.Prog(op) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REGTMP + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + rem -= size + offset += size + } + + case ssa.OpPPC64LoweredQuadMove, ssa.OpPPC64LoweredQuadMoveShort: + bytesPerLoop := int64(64) + // This is used when moving more + // than 8 bytes on power9. Moves start with + // as many 8 byte moves as possible, then + // 4, 2, or 1 byte(s) as remaining. This will + // work and be efficient for power8 or later. 
+ // If there are 64 or more bytes, then a + // loop is generated to move 64 bytes and + // update the src and dst addresses on each + // iteration. When < 64 bytes, the appropriate + // number of moves are generated based on the + // size. + // When moving >= 64 bytes a loop is used + // MOVD len/64,REG_TMP + // MOVD REG_TMP,CTR + // top: (LXV/STXV repeated for offsets 32 and 48) + // LXV 0(R21),VS32 + // LXV 16(R21),VS33 + // ADD $64,R21 + // STXV VS32,0(R20) + // STXV VS33,16(R20) + // ADD $64,R20 + // BC 16,0,top + // Bytes not moved by this loop are moved + // with a combination of the following instructions, + // starting with the largest sizes and generating as + // many as needed, using the appropriate offset value. + // MOVD n(R21),R31 + // MOVD R31,n(R20) + // MOVW n1(R21),R31 + // MOVW R31,n1(R20) + // MOVH n2(R21),R31 + // MOVH R31,n2(R20) + // MOVB n3(R21),R31 + // MOVB R31,n3(R20) + + // Each loop iteration moves 64 bytes + ctr := v.AuxInt / bytesPerLoop + + // Remainder after the loop + rem := v.AuxInt % bytesPerLoop + + dstReg := v.Args[0].Reg() + srcReg := v.Args[1].Reg() + + offset := int64(0) + + // top of the loop + var top *obj.Prog + + // Only generate looping code when loop counter is > 1 for >= 64 bytes + if ctr > 1 { + // Set up the CTR + p := s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ctr + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REGTMP + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_CTR + + p = s.Prog(obj.APCALIGN) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + + // Generate 16 byte loads and stores. 
+ p = s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + if top == nil { + top = p + } + p = s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + 16 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS33 + + // generate 16 byte stores + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS33 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + 16 + + // Generate 16 byte loads and stores. + p = s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + 32 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + + p = s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + 48 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS33 + + // generate 16 byte stores + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + 32 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS33 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + 48 + + // increment the src reg for next iteration + p = s.Prog(ppc64.AADD) + p.Reg = srcReg + p.From.Type = obj.TYPE_CONST + p.From.Offset = bytesPerLoop + p.To.Type = obj.TYPE_REG + p.To.Reg = srcReg + + // increment the dst reg for next iteration + p = s.Prog(ppc64.AADD) + p.Reg = dstReg + p.From.Type = obj.TYPE_CONST + p.From.Offset = bytesPerLoop + p.To.Type = obj.TYPE_REG + p.To.Reg = dstReg + + // BC with BO_BCTR generates bdnz to branch on nonzero CTR + // to loop top. 
+ p = s.Prog(ppc64.ABC) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ppc64.BO_BCTR + p.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_BRANCH + gc.Patch(p, top) + + // srcReg and dstReg were incremented in the loop, so + // later instructions start with offset 0. + offset = int64(0) + } + + // No loop was generated for one iteration, so + // add 32 bytes to the remainder to move those bytes. + if ctr == 1 { + rem += bytesPerLoop + } + if rem >= 32 { + p := s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + + p = s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = 16 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS33 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS33 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = 16 + + offset = 32 + rem -= 32 + } + + if rem >= 16 { + // Generate 16 byte loads and stores. + p := s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + + offset += 16 + rem -= 16 + + if rem >= 16 { + p := s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + + offset += 16 + rem -= 16 + } + } + // Generate all the remaining load and store pairs, starting with + // as many 8 byte moves as possible, then 4, 2, 1. 
+ for rem > 0 { + op, size := ppc64.AMOVB, int64(1) + switch { + case rem >= 8: + op, size = ppc64.AMOVD, 8 + case rem >= 4: + op, size = ppc64.AMOVW, 4 + case rem >= 2: + op, size = ppc64.AMOVH, 2 + } + // Load + p := s.Prog(op) + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP p.From.Type = obj.TYPE_MEM - p.From.Reg = src_reg + p.From.Reg = srcReg p.From.Offset = offset // Store p = s.Prog(op) p.From.Type = obj.TYPE_REG - p.From.Reg = ppc64.REG_R14 + p.From.Reg = ppc64.REGTMP p.To.Type = obj.TYPE_MEM - p.To.Reg = dst_reg + p.To.Reg = dstReg p.To.Offset = offset rem -= size offset += size |