1 files changed, 52 insertions, 32 deletions
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index df4bbbbd35..ee4a33eef4 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -260,8 +260,9 @@ func MOVCONST(d int64, s int, rt int) uint32 {
 const (
 	// Optab.flag
 	LFROM     = 1 << 0 // p.From uses constant pool
-	LTO       = 1 << 1 // p.To uses constant pool
-	NOTUSETMP = 1 << 2 // p expands to multiple instructions, but does NOT use REGTMP
+	LFROM3    = 1 << 1 // p.From3 uses constant pool
+	LTO       = 1 << 2 // p.To uses constant pool
+	NOTUSETMP = 1 << 3 // p expands to multiple instructions, but does NOT use REGTMP
 )
 
 var optab = []Optab{
@@ -391,12 +392,16 @@ var optab = []Optab{
 	{AMOVD, C_VCON, C_NONE, C_NONE, C_REG, 12, 16, 0, NOTUSETMP, 0},
 
 	{AMOVK, C_VCON, C_NONE, C_NONE, C_REG, 33, 4, 0, 0, 0},
-	{AMOVD, C_AACON, C_NONE, C_NONE, C_REG, 4, 4, REGFROM, 0, 0},
+	{AMOVD, C_AACON, C_NONE, C_NONE, C_RSP, 4, 4, REGFROM, 0, 0},
+	{AMOVD, C_AACON2, C_NONE, C_NONE, C_RSP, 4, 8, REGFROM, 0, 0},
 
-	// Move a large constant to a Vn.
-	{AFMOVQ, C_VCON, C_NONE, C_NONE, C_VREG, 101, 4, 0, LFROM, 0},
-	{AFMOVD, C_VCON, C_NONE, C_NONE, C_VREG, 101, 4, 0, LFROM, 0},
-	{AFMOVS, C_LCON, C_NONE, C_NONE, C_VREG, 101, 4, 0, LFROM, 0},
+	/* load long effective stack address (load int32 offset and add) */
+	{AMOVD, C_LACON, C_NONE, C_NONE, C_RSP, 34, 8, REGSP, LFROM, 0},
+
+	// Move a large constant to a vector register.
+	{AVMOVQ, C_VCON, C_NONE, C_VCON, C_VREG, 101, 4, 0, LFROM | LFROM3, 0},
+	{AVMOVD, C_VCON, C_NONE, C_NONE, C_VREG, 101, 4, 0, LFROM, 0},
+	{AVMOVS, C_LCON, C_NONE, C_NONE, C_VREG, 101, 4, 0, LFROM, 0},
 
 	/* jump operations */
 	{AB, C_NONE, C_NONE, C_NONE, C_SBRA, 5, 4, 0, 0, 0},
@@ -594,9 +599,6 @@ var optab = []Optab{
 	{AFMOVD, C_LAUTO, C_NONE, C_NONE, C_FREG, 31, 8, REGSP, LFROM, 0},
 	{AFMOVD, C_LOREG, C_NONE, C_NONE, C_FREG, 31, 8, 0, LFROM, 0},
 
-	/* load long effective stack address (load int32 offset and add) */
-	{AMOVD, C_LACON, C_NONE, C_NONE, C_REG, 34, 8, REGSP, LFROM, 0},
-
 	/* pre/post-indexed load (unscaled, signed 9-bit offset) */
 	{AMOVD, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPOST},
 	{AMOVW, C_LOREG, C_NONE, C_NONE, C_REG, 22, 4, 0, 0, C_XPOST},
@@ -949,13 +951,14 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 				c.ctxt.Diag("zero-width instruction\n%v", p)
 			}
 		}
-		switch o.flag & (LFROM | LTO) {
-		case LFROM:
+		if o.flag&LFROM != 0 {
 			c.addpool(p, &p.From)
-
-		case LTO:
+		}
+		if o.flag&LFROM3 != 0 {
+			c.addpool(p, p.GetFrom3())
+		}
+		if o.flag&LTO != 0 {
 			c.addpool(p, &p.To)
-			break
 		}
 
 		if p.As == AB || p.As == obj.ARET || p.As == AERET { /* TODO: other unconditional operations */
@@ -1173,8 +1176,8 @@ func (c *ctxt7) addpool(p *obj.Prog, a *obj.Addr) {
 	sz := 4
 
 	if a.Type == obj.TYPE_CONST {
-		if lit != int64(int32(lit)) && uint64(lit) != uint64(uint32(lit)) {
-			// out of range -0x80000000 ~ 0xffffffff, must store 64-bit
+		if (lit != int64(int32(lit)) && uint64(lit) != uint64(uint32(lit))) || p.As == AVMOVQ || p.As == AVMOVD {
+			// out of range -0x80000000 ~ 0xffffffff or VMOVQ or VMOVD operand, must store 64-bit.
 			t.As = ADWORD
 			sz = 8
 		} // else store 32-bit
@@ -1361,6 +1364,10 @@ func isaddcon(v int64) bool {
 	return v <= 0xFFF
 }
 
+func isaddcon2(v int64) bool {
+	return 0 <= v && v <= 0xFFFFFF
+}
+
 // isbitcon reports whether a constant can be encoded into a logical instruction.
 // bitcon has a binary form of repetition of a bit sequence of length 2, 4, 8, 16, 32, or 64,
 // which itself is a rotate (w.r.t. the length of the unit) of a sequence of ones.
@@ -1889,10 +1896,14 @@ func (c *ctxt7) aclass(a *obj.Addr) int {
 		default:
 			return C_GOK
 		}
-
-		if isaddcon(c.instoffset) {
+		cf := c.instoffset
+		if isaddcon(cf) || isaddcon(-cf) {
 			return C_AACON
 		}
+		if isaddcon2(cf) {
+			return C_AACON2
+		}
+
 		return C_LACON
 
 	case obj.TYPE_BRANCH:
@@ -2046,7 +2057,7 @@ func cmp(a int, b int) bool {
 		return cmp(C_LCON, b)
 
 	case C_LACON:
-		if b == C_AACON {
+		if b == C_AACON || b == C_AACON2 {
 			return true
 		}
 
@@ -2666,7 +2677,7 @@ func buildop(ctxt *obj.Link) {
 		case AFCSELD:
 			oprangeset(AFCSELS, t)
 
-		case AFMOVS, AFMOVD, AFMOVQ:
+		case AFMOVS, AFMOVD, AVMOVQ, AVMOVD, AVMOVS:
 			break
 
 		case AFCVTZSD:
@@ -3062,11 +3073,10 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		}
 		o1 |= (uint32(r&31) << 5) | uint32(rt&31)
 
-	case 4: /* mov $addcon, R; mov $recon, R; mov $racon, R */
-		o1 = c.opirr(p, p.As)
-
+	case 4: /* mov $addcon, R; mov $recon, R; mov $racon, R; mov $addcon2, R */
 		rt := int(p.To.Reg)
 		r := int(o.param)
+
 		if r == 0 {
 			r = REGZERO
 		} else if r == REGFROM {
@@ -3075,13 +3085,23 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		if r == 0 {
 			r = REGSP
 		}
+
 		v := int32(c.regoff(&p.From))
-		if (v & 0xFFF000) != 0 {
-			v >>= 12
-			o1 |= 1 << 22 /* shift, by 12 */
+		var op int32
+		if v < 0 {
+			v = -v
+			op = int32(c.opirr(p, ASUB))
+		} else {
+			op = int32(c.opirr(p, AADD))
+		}
+
+		if int(o.size) == 8 {
+			o1 = c.oaddi(p, op, v&0xfff000, r, REGTMP)
+			o2 = c.oaddi(p, op, v&0x000fff, REGTMP, rt)
+			break
 		}
 
-		o1 |= ((uint32(v) & 0xFFF) << 10) | (uint32(r&31) << 5) | uint32(rt&31)
+		o1 = c.oaddi(p, op, v, r, rt)
 
 	case 5: /* b s; bl s */
 		o1 = c.opbra(p, p.As)
@@ -5124,7 +5144,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = q<<30 | 0xe<<24 | len<<13
 		o1 |= (uint32(rf&31) << 16) | uint32(offset&31)<<5 | uint32(rt&31)
 
-	case 101: // FOMVQ/FMOVD $vcon, Vd -> load from constant pool.
+	case 101: // VMOVQ $vcon1, $vcon2, Vd or VMOVD|VMOVS $vcon, Vd -> FMOVQ/FMOVD/FMOVS pool(PC), Vd: load from constant pool.
 		o1 = c.omovlit(p.As, p, &p.From, int(p.To.Reg))
 
 	case 102: /* vushll, vushll2, vuxtl, vuxtl2 */
@@ -6654,15 +6674,15 @@ func (c *ctxt7) omovlit(as obj.As, p *obj.Prog, a *obj.Addr, dr int) uint32 {
 	} else {
 		fp, w := 0, 0
 		switch as {
-		case AFMOVS:
+		case AFMOVS, AVMOVS:
 			fp = 1
 			w = 0 /* 32-bit SIMD/FP */
 
-		case AFMOVD:
+		case AFMOVD, AVMOVD:
 			fp = 1
 			w = 1 /* 64-bit SIMD/FP */
 
-		case AFMOVQ:
+		case AVMOVQ:
 			fp = 1
 			w = 2 /* 128-bit SIMD/FP */