4 files changed, 107 insertions, 92 deletions
diff --git a/src/cmd/internal/obj/ppc64/a.out.go b/src/cmd/internal/obj/ppc64/a.out.go
index e57beb3276..dda24a0b96 100644
--- a/src/cmd/internal/obj/ppc64/a.out.go
+++ b/src/cmd/internal/obj/ppc64/a.out.go
@@ -79,8 +79,10 @@ const (
 	REG_R30
 	REG_R31
 
-	/* F0=4128 ... F31=4159 */
-	REG_F0
+	/* Align FPR and VR register values such that when masked with 0x3F they
+	   produce the number of the VSX register they overlap. */
+	/* F0=4160 ... F31=4191 */
+	REG_F0 = obj.RBasePPC64 + iota + 32
 	REG_F1
 	REG_F2
 	REG_F3
@@ -113,7 +115,7 @@ const (
 	REG_F30
 	REG_F31
 
-	/* V0=4160 ... V31=4191 */
+	/* V0=4192 ... V31=4223 */
 	REG_V0
 	REG_V1
 	REG_V2
@@ -147,7 +149,7 @@ const (
 	REG_V30
 	REG_V31
 
-	/* VS0=4192 ... VS63=4255 */
+	/* VS0=4224 ... VS63=4287 */
 	REG_VS0
 	REG_VS1
 	REG_VS2
diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go
index e642413590..1d92c4866f 100644
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -428,15 +428,13 @@ var optab = []Optab{
 	{as: ASTXSIWX, a1: C_VSREG, a6: C_SOREG, type_: 86, size: 4}, /* vsx scalar as integer store, xx1-form */
 
 	/* VSX move from VSR */
-	{as: AMFVSRD, a1: C_VSREG, a6: C_REG, type_: 88, size: 4}, /* vsx move from vsr, xx1-form */
+	{as: AMFVSRD, a1: C_VSREG, a6: C_REG, type_: 88, size: 4},
 	{as: AMFVSRD, a1: C_FREG, a6: C_REG, type_: 88, size: 4},
-	{as: AMFVSRD, a1: C_VREG, a6: C_REG, type_: 88, size: 4},
 
 	/* VSX move to VSR */
-	{as: AMTVSRD, a1: C_REG, a6: C_VSREG, type_: 88, size: 4}, /* vsx move to vsr, xx1-form */
-	{as: AMTVSRD, a1: C_REG, a2: C_REG, a6: C_VSREG, type_: 88, size: 4},
-	{as: AMTVSRD, a1: C_REG, a6: C_FREG, type_: 88, size: 4},
-	{as: AMTVSRD, a1: C_REG, a6: C_VREG, type_: 88, size: 4},
+	{as: AMTVSRD, a1: C_REG, a6: C_VSREG, type_: 104, size: 4},
+	{as: AMTVSRD, a1: C_REG, a6: C_FREG, type_: 104, size: 4},
+	{as: AMTVSRDD, a1: C_REG, a2: C_REG, a6: C_VSREG, type_: 104, size: 4},
 
 	/* VSX logical */
 	{as: AXXLAND, a1: C_VSREG, a2: C_VSREG, a6: C_VSREG, type_: 90, size: 4}, /* vsx and, xx3-form */
@@ -1036,13 +1034,14 @@ func (c *ctxt9) oplook(p *obj.Prog) *Optab {
 	// c.ctxt.Logf("oplook %v %d %d %d %d\n", p, a1, a2, a3, a4, a5, a6)
 	ops := oprange[p.As&obj.AMask]
 	c1 := &xcmp[a1]
+	c2 := &xcmp[a2]
 	c3 := &xcmp[a3]
 	c4 := &xcmp[a4]
 	c5 := &xcmp[a5]
 	c6 := &xcmp[a6]
 	for i := range ops {
 		op := &ops[i]
-		if int(op.a2) == a2 && c1[op.a1] && c3[op.a3] && c4[op.a4] && c5[op.a5] && c6[op.a6] {
+		if c1[op.a1] && c2[op.a2] && c3[op.a3] && c4[op.a4] && c5[op.a5] && c6[op.a6] {
 			p.Optab = uint16(cap(optab) - cap(ops) + i + 1)
 			return op
 		}
@@ -1116,6 +1115,12 @@ func cmp(a int, b int) bool {
 			return r0iszero != 0 /*TypeKind(100016)*/
 		}
 
+	case C_VSREG:
+		/* Allow any VR argument as a VSR operand. */
+		if b == C_VREG {
+			return true
+		}
+
 	case C_ANY:
 		return true
 	}
@@ -1594,7 +1599,6 @@ func buildop(ctxt *obj.Link) {
 			opset(AMTVRD, r0)
 			opset(AMTVSRWA, r0)
 			opset(AMTVSRWZ, r0)
-			opset(AMTVSRDD, r0)
 			opset(AMTVSRWS, r0)
 
 		case AXXLAND: /* xxland, xxlandc, xxleqv, xxlnand */
@@ -1977,6 +1981,7 @@ func buildop(ctxt *obj.Link) {
 			ACMPEQB,
 			AECIWX,
 			ACLRLSLWI,
+			AMTVSRDD,
 			obj.ANOP,
 			obj.ATEXT,
 			obj.AUNDEF,
@@ -2075,50 +2080,32 @@ func AOP_IR(op uint32, d uint32, simm uint32) uint32 {
 }
 
 /* XX1-form 3-register operands, 1 VSR operand */
-func AOP_XX1(op uint32, d uint32, a uint32, b uint32) uint32 {
-	/* For the XX-form encodings, we need the VSX register number to be exactly */
-	/* between 0-63, so we can properly set the rightmost bits. */
-	r := d - REG_VS0
+func AOP_XX1(op uint32, r uint32, a uint32, b uint32) uint32 {
 	return op | (r&31)<<21 | (a&31)<<16 | (b&31)<<11 | (r&32)>>5
 }
 
 /* XX2-form 3-register operands, 2 VSR operands */
-func AOP_XX2(op uint32, d uint32, a uint32, b uint32) uint32 {
-	xt := d - REG_VS0
-	xb := b - REG_VS0
+func AOP_XX2(op uint32, xt uint32, a uint32, xb uint32) uint32 {
 	return op | (xt&31)<<21 | (a&3)<<16 | (xb&31)<<11 | (xb&32)>>4 | (xt&32)>>5
 }
 
 /* XX3-form 3 VSR operands */
-func AOP_XX3(op uint32, d uint32, a uint32, b uint32) uint32 {
-	xt := d - REG_VS0
-	xa := a - REG_VS0
-	xb := b - REG_VS0
+func AOP_XX3(op uint32, xt uint32, xa uint32, xb uint32) uint32 {
 	return op | (xt&31)<<21 | (xa&31)<<16 | (xb&31)<<11 | (xa&32)>>3 | (xb&32)>>4 | (xt&32)>>5
 }
 
 /* XX3-form 3 VSR operands + immediate */
-func AOP_XX3I(op uint32, d uint32, a uint32, b uint32, c uint32) uint32 {
-	xt := d - REG_VS0
-	xa := a - REG_VS0
-	xb := b - REG_VS0
+func AOP_XX3I(op uint32, xt uint32, xa uint32, xb uint32, c uint32) uint32 {
 	return op | (xt&31)<<21 | (xa&31)<<16 | (xb&31)<<11 | (c&3)<<8 | (xa&32)>>3 | (xb&32)>>4 | (xt&32)>>5
 }
 
 /* XX4-form, 4 VSR operands */
-func AOP_XX4(op uint32, d uint32, a uint32, b uint32, c uint32) uint32 {
-	xt := d - REG_VS0
-	xa := a - REG_VS0
-	xb := b - REG_VS0
-	xc := c - REG_VS0
+func AOP_XX4(op uint32, xt uint32, xa uint32, xb uint32, xc uint32) uint32 {
 	return op | (xt&31)<<21 | (xa&31)<<16 | (xb&31)<<11 | (xc&31)<<6 | (xc&32)>>2 | (xa&32)>>3 | (xb&32)>>4 | (xt&32)>>5
 }
 
 /* DQ-form, VSR register, register + offset operands */
-func AOP_DQ(op uint32, d uint32, a uint32, b uint32) uint32 {
-	/* For the DQ-form encodings, we need the VSX register number to be exactly */
-	/* between 0-63, so we can properly set the SX bit. */
-	r := d - REG_VS0
+func AOP_DQ(op uint32, xt uint32, a uint32, b uint32) uint32 {
 	/* The EA for this instruction form is (RA) + DQ << 4, where DQ is a 12-bit signed integer. */
 	/* In order to match the output of the GNU objdump (and make the usage in Go asm easier), the */
 	/* instruction is called using the sign extended value (i.e. a valid offset would be -32752 or 32752, */
@@ -2126,7 +2113,7 @@ func AOP_DQ(op uint32, d uint32, a uint32, b uint32) uint32 {
 	/* bits 0 to 3 in 'dq' need to be zero, otherwise this will generate an illegal instruction. */
 	/* If in doubt how this instruction form is encoded, refer to ISA 3.0b, pages 492 and 507. */
 	dq := b >> 4
-	return op | (r&31)<<21 | (a&31)<<16 | (dq&4095)<<4 | (r&32)>>2
+	return op | (xt&31)<<21 | (a&31)<<16 | (dq&4095)<<4 | (xt&32)>>2
 }
 
 /* Z23-form, 3-register operands + CY field */
@@ -3586,33 +3573,8 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		/* 3-register operand order: (RB)(RA*1), XT */
 		o1 = AOP_XX1(c.oploadx(p.As), uint32(p.To.Reg), uint32(p.From.Index), uint32(p.From.Reg))
 
-	case 88: /* VSX instructions, XX1-form */
-		/* reg reg none OR reg reg reg */
-		/* 3-register operand order: RA, RB, XT */
-		/* 2-register operand order: XS, RA or RA, XT */
-		xt := int32(p.To.Reg)
-		xs := int32(p.From.Reg)
-		/* We need to treat the special case of extended mnemonics that may have a FREG/VREG as an argument */
-		if REG_V0 <= xt && xt <= REG_V31 {
-			/* Convert V0-V31 to VS32-VS63 */
-			xt = xt + 64
-			o1 = AOP_XX1(c.oprrr(p.As), uint32(xt), uint32(p.From.Reg), uint32(p.Reg))
-		} else if REG_F0 <= xt && xt <= REG_F31 {
-			/* Convert F0-F31 to VS0-VS31 */
-			xt = xt + 64
-			o1 = AOP_XX1(c.oprrr(p.As), uint32(xt), uint32(p.From.Reg), uint32(p.Reg))
-		} else if REG_VS0 <= xt && xt <= REG_VS63 {
-			o1 = AOP_XX1(c.oprrr(p.As), uint32(xt), uint32(p.From.Reg), uint32(p.Reg))
-		} else if REG_V0 <= xs && xs <= REG_V31 {
-			/* Likewise for XS */
-			xs = xs + 64
-			o1 = AOP_XX1(c.oprrr(p.As), uint32(xs), uint32(p.To.Reg), uint32(p.Reg))
-		} else if REG_F0 <= xs && xs <= REG_F31 {
-			xs = xs + 64
-			o1 = AOP_XX1(c.oprrr(p.As), uint32(xs), uint32(p.To.Reg), uint32(p.Reg))
-		} else if REG_VS0 <= xs && xs <= REG_VS63 {
-			o1 = AOP_XX1(c.oprrr(p.As), uint32(xs), uint32(p.To.Reg), uint32(p.Reg))
-		}
+	case 88: /* VSX mfvsr* instructions, XX1-form XS,RA */
+		o1 = AOP_XX1(c.oprrr(p.As), uint32(p.From.Reg), uint32(p.To.Reg), uint32(p.Reg))
 
 	case 89: /* VSX instructions, XX2-form */
 		/* reg none reg OR reg imm reg */
@@ -3743,6 +3705,9 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		mb := uint32(c.regoff(&p.RestArgs[0].Addr))
 		me := uint32(c.regoff(&p.RestArgs[1].Addr))
 		o1 = OP_RLW(c.oprrr(p.As), uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), mb, me)
+
+	case 104: /* VSX mtvsr* instructions, XX1-form RA,RB,XT */
+		o1 = AOP_XX1(c.oprrr(p.As), uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg))
 	}
 
 	out[0] = o1
diff --git a/src/cmd/internal/obj/ppc64/asm_test.go b/src/cmd/internal/obj/ppc64/asm_test.go
index 70dabc2017..b851d3c86b 100644
--- a/src/cmd/internal/obj/ppc64/asm_test.go
+++ b/src/cmd/internal/obj/ppc64/asm_test.go
@@ -107,3 +107,44 @@ func TestPCalign(t *testing.T) {
 		t.Errorf("Invalid alignment not detected for PCALIGN\n")
 	}
 }
+
+// TestRegValueAlignment verifies register constants are correctly aligned. Much of the ppc64
+// assembler assumes masking out significant bits will produce a valid register number:
+// REG_Rx & 31 == x
+// REG_Fx & 31 == x
+// REG_Vx & 31 == x
+// REG_VSx & 63 == x
+// REG_SPRx & 1023 == x
+// REG_CRx & 7 == x
+//
+// VR and FPR disjointly overlap VSR; interpreted as VSR registers they should produce the overlapped VSR:
+// REG_Fx & 63 == x
+// REG_Vx & 63 == x + 32
+func TestRegValueAlignment(t *testing.T) {
+	tstFunc := func(rstart, rend, msk, rout int) {
+		for i := rstart; i <= rend; i++ {
+			if i&msk != rout {
+				t.Errorf("%v is not aligned to 0x%X (expected %d, got %d)\n", rconv(i), msk, rout, i&msk) // report the failing register's own masked value
+			}
+			rout++ // consecutive registers must mask to consecutive numbers
+		}
+	}
+	var testType = []struct {
+		rstart int // first register constant of the range (inclusive)
+		rend   int // last register constant of the range (inclusive)
+		msk    int // mask applied to every register constant in the range
+		rout   int // expected masked value for rstart
+	}{
+		{REG_VS0, REG_VS63, 63, 0},
+		{REG_R0, REG_R31, 31, 0},
+		{REG_F0, REG_F31, 31, 0},
+		{REG_V0, REG_V31, 31, 0},
+		{REG_V0, REG_V31, 63, 32},
+		{REG_F0, REG_F31, 63, 0},
+		{REG_SPR0, REG_SPR0 + 1023, 1023, 0},
+		{REG_CR0, REG_CR7, 7, 0},
+	}
+	for _, tc := range testType { // tc, not t, so *testing.T is not shadowed
+		tstFunc(tc.rstart, tc.rend, tc.msk, tc.rout)
+	}
+}
diff --git a/src/cmd/internal/obj/ppc64/obj9.go b/src/cmd/internal/obj/ppc64/obj9.go
index c2722b0afb..ee93fe048b 100644
--- a/src/cmd/internal/obj/ppc64/obj9.go
+++ b/src/cmd/internal/obj/ppc64/obj9.go
@@ -294,9 +294,9 @@ func (c *ctxt9) rewriteToUseGot(p *obj.Prog) {
 		//     BL (LR)
 		var sym *obj.LSym
 		if p.As == obj.ADUFFZERO {
-			sym = c.ctxt.Lookup("runtime.duffzero")
+			sym = c.ctxt.LookupABI("runtime.duffzero", obj.ABIInternal)
 		} else {
-			sym = c.ctxt.Lookup("runtime.duffcopy")
+			sym = c.ctxt.LookupABI("runtime.duffcopy", obj.ABIInternal)
 		}
 		offset := p.To.Offset
 		p.As = AMOVD
@@ -687,7 +687,6 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 					q.From.Reg = REG_LR
 					q.To.Type = obj.TYPE_REG
 					q.To.Reg = REGTMP
-
 					prologueEnd = q
 
 					q = obj.Appendp(q, c.newprog)
@@ -787,14 +786,14 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 				q.From.Reg = REGG
 				q.From.Offset = 4 * int64(c.ctxt.Arch.PtrSize) // G.panic
 				q.To.Type = obj.TYPE_REG
-				q.To.Reg = REG_R3
+				q.To.Reg = REG_R22
 
 				q = obj.Appendp(q, c.newprog)
 				q.As = ACMP
 				q.From.Type = obj.TYPE_REG
 				q.From.Reg = REG_R0
 				q.To.Type = obj.TYPE_REG
-				q.To.Reg = REG_R3
+				q.To.Reg = REG_R22
 
 				q = obj.Appendp(q, c.newprog)
 				q.As = ABEQ
@@ -804,10 +803,10 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 				q = obj.Appendp(q, c.newprog)
 				q.As = AMOVD
 				q.From.Type = obj.TYPE_MEM
-				q.From.Reg = REG_R3
+				q.From.Reg = REG_R22
 				q.From.Offset = 0 // Panic.argp
 				q.To.Type = obj.TYPE_REG
-				q.To.Reg = REG_R4
+				q.To.Reg = REG_R23
 
 				q = obj.Appendp(q, c.newprog)
 				q.As = AADD
@@ -815,14 +814,14 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 				q.From.Offset = int64(autosize) + c.ctxt.FixedFrameSize()
 				q.Reg = REGSP
 				q.To.Type = obj.TYPE_REG
-				q.To.Reg = REG_R5
+				q.To.Reg = REG_R24
 
 				q = obj.Appendp(q, c.newprog)
 				q.As = ACMP
 				q.From.Type = obj.TYPE_REG
-				q.From.Reg = REG_R4
+				q.From.Reg = REG_R23
 				q.To.Type = obj.TYPE_REG
-				q.To.Reg = REG_R5
+				q.To.Reg = REG_R24
 
 				q = obj.Appendp(q, c.newprog)
 				q.As = ABNE
@@ -835,14 +834,14 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 				q.From.Offset = c.ctxt.FixedFrameSize()
 				q.Reg = REGSP
 				q.To.Type = obj.TYPE_REG
-				q.To.Reg = REG_R6
+				q.To.Reg = REG_R25
 
 				q = obj.Appendp(q, c.newprog)
 				q.As = AMOVD
 				q.From.Type = obj.TYPE_REG
-				q.From.Reg = REG_R6
+				q.From.Reg = REG_R25
 				q.To.Type = obj.TYPE_MEM
-				q.To.Reg = REG_R3
+				q.To.Reg = REG_R22
 				q.To.Offset = 0 // Panic.argp
 
 				q = obj.Appendp(q, c.newprog)
@@ -1051,7 +1050,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 func (c *ctxt9) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
 	p0 := p // save entry point, but skipping the two instructions setting R2 in shared mode
 
-	// MOVD	g_stackguard(g), R3
+	// MOVD	g_stackguard(g), R22
 	p = obj.Appendp(p, c.newprog)
 
 	p.As = AMOVD
@@ -1062,7 +1061,7 @@ func (c *ctxt9) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
 		p.From.Offset = 3 * int64(c.ctxt.Arch.PtrSize) // G.stackguard1
 	}
 	p.To.Type = obj.TYPE_REG
-	p.To.Reg = REG_R3
+	p.To.Reg = REG_R22
 
 	// Mark the stack bound check and morestack call async nonpreemptible.
 	// If we get preempted here, when resumed the preemption request is
@@ -1078,7 +1077,7 @@ func (c *ctxt9) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
 
 		p.As = ACMPU
 		p.From.Type = obj.TYPE_REG
-		p.From.Reg = REG_R3
+		p.From.Reg = REG_R22
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = REGSP
 	} else {
@@ -1108,14 +1107,14 @@ func (c *ctxt9) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
 				p.From.Type = obj.TYPE_CONST
 				p.From.Offset = offset
 				p.To.Type = obj.TYPE_REG
-				p.To.Reg = REG_R4
+				p.To.Reg = REG_R23
 
 				p = obj.Appendp(p, c.newprog)
 				p.As = ACMPU
 				p.From.Type = obj.TYPE_REG
 				p.From.Reg = REGSP
 				p.To.Type = obj.TYPE_REG
-				p.To.Reg = REG_R4
+				p.To.Reg = REG_R23
 			}
 
 			p = obj.Appendp(p, c.newprog)
@@ -1134,14 +1133,14 @@ func (c *ctxt9) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
 		p.From.Offset = -offset
 		p.Reg = REGSP
 		p.To.Type = obj.TYPE_REG
-		p.To.Reg = REG_R4
+		p.To.Reg = REG_R23
 
 		p = obj.Appendp(p, c.newprog)
 		p.As = ACMPU
 		p.From.Type = obj.TYPE_REG
-		p.From.Reg = REG_R3
+		p.From.Reg = REG_R22
 		p.To.Type = obj.TYPE_REG
-		p.To.Reg = REG_R4
+		p.To.Reg = REG_R23
 	}
 
 	// q1: BLT	done
@@ -1151,17 +1150,25 @@ func (c *ctxt9) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
 	p.As = ABLT
 	p.To.Type = obj.TYPE_BRANCH
 
-	// MOVD	LR, R5
 	p = obj.Appendp(p, c.newprog)
+	p.As = obj.ANOP // zero-width place holder
+
+	if q != nil {
+		q.To.SetTarget(p)
+	}
+
+	// Spill the register args that could be clobbered by the
+	// morestack code.
 
+	spill := c.cursym.Func().SpillRegisterArgs(p, c.newprog)
+
+	// MOVD LR, R5
+	p = obj.Appendp(spill, c.newprog)
 	p.As = AMOVD
 	p.From.Type = obj.TYPE_REG
 	p.From.Reg = REG_LR
 	p.To.Type = obj.TYPE_REG
 	p.To.Reg = REG_R5
-	if q != nil {
-		q.To.SetTarget(p)
-	}
 
 	p = c.ctxt.EmitEntryStackMap(c.cursym, p, c.newprog)
 
@@ -1181,8 +1188,7 @@ func (c *ctxt9) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
 		// Fortunately, in shared mode, 8(SP) and 16(SP) are reserved in
 		// the caller's frame, but not used (0(SP) is caller's saved LR,
 		// 24(SP) is caller's saved R2). Use 8(SP) to save this function's R2.
-
-		// MOVD R12, 8(SP)
+		// MOVD R2, 8(SP)
 		p = obj.Appendp(p, c.newprog)
 		p.As = AMOVD
 		p.From.Type = obj.TYPE_REG
@@ -1249,7 +1255,8 @@ func (c *ctxt9) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
 		p.To.Reg = REG_R2
 	}
 
-	p = c.ctxt.EndUnsafePoint(p, c.newprog, -1)
+	unspill := c.cursym.Func().UnspillRegisterArgs(p, c.newprog)
+	p = c.ctxt.EndUnsafePoint(unspill, c.newprog, -1)
 
 	// BR	start
 	p = obj.Appendp(p, c.newprog)