Diffstat (limited to 'src/cmd/compile/internal/ssa/gen/ARM64.rules')
-rw-r--r--  src/cmd/compile/internal/ssa/gen/ARM64.rules | 244
1 file changed, 239 insertions(+), 5 deletions(-)
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules
index 9290b3c186..9b80094f86 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -19,12 +19,21 @@
(Sub64F x y) -> (FSUBD x y)
(Mul64 x y) -> (MUL x y)
-(Mul32 x y) -> (MUL x y)
-(Mul16 x y) -> (MUL x y)
-(Mul8 x y) -> (MUL x y)
+(Mul32 x y) -> (MULW x y)
+(Mul16 x y) -> (MULW x y)
+(Mul8 x y) -> (MULW x y)
(Mul32F x y) -> (FMULS x y)
(Mul64F x y) -> (FMULD x y)
+(Hmul64 x y) -> (MULH x y)
+(Hmul64u x y) -> (UMULH x y)
+(Hmul32 x y) -> (SRAconst (MULL <config.fe.TypeInt64()> x y) [32])
+(Hmul32u x y) -> (SRAconst (UMULL <config.fe.TypeUInt64()> x y) [32])
+(Hmul16 x y) -> (SRAconst (MULW <config.fe.TypeInt32()> (SignExt16to32 x) (SignExt16to32 y)) [16])
+(Hmul16u x y) -> (SRLconst (MUL <config.fe.TypeUInt32()> (ZeroExt16to32 x) (ZeroExt16to32 y)) [16])
+(Hmul8 x y) -> (SRAconst (MULW <config.fe.TypeInt16()> (SignExt8to32 x) (SignExt8to32 y)) [8])
+(Hmul8u x y) -> (SRLconst (MUL <config.fe.TypeUInt16()> (ZeroExt8to32 x) (ZeroExt8to32 y)) [8])
+
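The Hmul ops produce the high half of a double-width product: 64-bit operands map directly to MULH/UMULH, while 32-bit and narrower operands are widened, multiplied in a wider register, and the product shifted right by the operand width. A minimal Go sketch of the identity the 32-bit rules rely on (function names are illustrative):

    // Hmul32: sign-extend, form the full 64-bit product, keep the high word.
    func hmul32(x, y int32) int32 {
        return int32((int64(x) * int64(y)) >> 32)
    }

    // Hmul32u: the same shape with zero extension and a logical shift.
    func hmul32u(x, y uint32) uint32 {
        return uint32((uint64(x) * uint64(y)) >> 32)
    }
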
(Div64 x y) -> (DIV x y)
(Div64u x y) -> (UDIV x y)
(Div32 x y) -> (DIVW x y)
@@ -45,6 +54,8 @@
(Mod8 x y) -> (MODW (SignExt8to32 x) (SignExt8to32 y))
(Mod8u x y) -> (UMODW (ZeroExt8to32 x) (ZeroExt8to32 y))
+(Avg64u <t> x y) -> (ADD (ADD <t> (SRLconst <t> x [1]) (SRLconst <t> y [1])) (AND <t> (AND <t> x y) (MOVDconst [1])))
+
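The Avg64u lowering computes the full-precision unsigned average without forming x+y, which could overflow 64 bits: it adds the two operands shifted right by one, then adds back the carry their low bits would have produced, using the identity (x+y)/2 == x/2 + y/2 + (x&y&1). In Go terms:

    // Unsigned average that never overflows, mirroring the Avg64u rule.
    func avg64u(x, y uint64) uint64 {
        return (x>>1 + y>>1) + (x & y & 1)
    }
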
(And64 x y) -> (AND x y)
(And32 x y) -> (AND x y)
(And16 x y) -> (AND x y)
@@ -73,12 +84,112 @@
(Com16 x) -> (MVN x)
(Com8 x) -> (MVN x)
+(Sqrt x) -> (FSQRTD x)
+
// boolean ops -- booleans are represented with 0=false, 1=true
(AndB x y) -> (AND x y)
(OrB x y) -> (OR x y)
-(EqB x y) -> (XORconst [1] (XOR <config.fe.TypeBool()> x y))
+(EqB x y) -> (XOR (MOVDconst [1]) (XOR <config.fe.TypeBool()> x y))
(NeqB x y) -> (XOR x y)
-(Not x) -> (XORconst [1] x)
+(Not x) -> (XOR (MOVDconst [1]) x)
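Since booleans are materialized as 0 or 1, equality and negation reduce to XOR with the constant 1: x == y is 1 ^ (x ^ y) and !x is 1 ^ x, which is what the EqB and Not rules emit. A tiny Go sketch on already-widened boolean values:

    // Boolean equality and negation on 0/1 values, as EqB and Not are lowered.
    func eqB(x, y uint8) uint8 { return 1 ^ (x ^ y) }
    func notB(x uint8) uint8   { return 1 ^ x }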
+
+// constant shifts
+(Lsh64x64 x (MOVDconst [c])) && uint64(c) < 64 -> (SLLconst x [c])
+(Rsh64x64 x (MOVDconst [c])) && uint64(c) < 64 -> (SRAconst x [c])
+(Rsh64Ux64 x (MOVDconst [c])) && uint64(c) < 64 -> (SRLconst x [c])
+(Lsh32x64 x (MOVDconst [c])) && uint64(c) < 32 -> (SLLconst x [c])
+(Rsh32x64 x (MOVDconst [c])) && uint64(c) < 32 -> (SRAconst (SignExt32to64 x) [c])
+(Rsh32Ux64 x (MOVDconst [c])) && uint64(c) < 32 -> (SRLconst (ZeroExt32to64 x) [c])
+(Lsh16x64 x (MOVDconst [c])) && uint64(c) < 16 -> (SLLconst x [c])
+(Rsh16x64 x (MOVDconst [c])) && uint64(c) < 16 -> (SRAconst (SignExt16to64 x) [c])
+(Rsh16Ux64 x (MOVDconst [c])) && uint64(c) < 16 -> (SRLconst (ZeroExt16to64 x) [c])
+(Lsh8x64 x (MOVDconst [c])) && uint64(c) < 8 -> (SLLconst x [c])
+(Rsh8x64 x (MOVDconst [c])) && uint64(c) < 8 -> (SRAconst (SignExt8to64 x) [c])
+(Rsh8Ux64 x (MOVDconst [c])) && uint64(c) < 8 -> (SRLconst (ZeroExt8to64 x) [c])
+
+// large constant shifts
+(Lsh64x64 _ (MOVDconst [c])) && uint64(c) >= 64 -> (MOVDconst [0])
+(Rsh64Ux64 _ (MOVDconst [c])) && uint64(c) >= 64 -> (MOVDconst [0])
+(Lsh32x64 _ (MOVDconst [c])) && uint64(c) >= 32 -> (MOVDconst [0])
+(Rsh32Ux64 _ (MOVDconst [c])) && uint64(c) >= 32 -> (MOVDconst [0])
+(Lsh16x64 _ (MOVDconst [c])) && uint64(c) >= 16 -> (MOVDconst [0])
+(Rsh16Ux64 _ (MOVDconst [c])) && uint64(c) >= 16 -> (MOVDconst [0])
+(Lsh8x64 _ (MOVDconst [c])) && uint64(c) >= 8 -> (MOVDconst [0])
+(Rsh8Ux64 _ (MOVDconst [c])) && uint64(c) >= 8 -> (MOVDconst [0])
+
+// large constant signed right shift, we leave the sign bit
+(Rsh64x64 x (MOVDconst [c])) && uint64(c) >= 64 -> (SRAconst x [63])
+(Rsh32x64 x (MOVDconst [c])) && uint64(c) >= 32 -> (SRAconst (SignExt32to64 x) [63])
+(Rsh16x64 x (MOVDconst [c])) && uint64(c) >= 16 -> (SRAconst (SignExt16to64 x) [63])
+(Rsh8x64 x (MOVDconst [c])) && uint64(c) >= 8 -> (SRAconst (SignExt8to64 x) [63])
+
+// shifts
+// hardware instruction uses only the low 6 bits of the shift
+// we compare to 64 to ensure Go semantics for large shifts
+(Lsh64x64 <t> x y) -> (CSELULT (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh64x32 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh64x16 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh64x8 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Lsh32x64 <t> x y) -> (CSELULT (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh32x32 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh32x16 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh32x8 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Lsh16x64 <t> x y) -> (CSELULT (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh16x32 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh16x16 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh16x8 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Lsh8x64 <t> x y) -> (CSELULT (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh8x32 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh8x16 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh8x8 <t> x y) -> (CSELULT (SLL <t> x (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh64Ux64 <t> x y) -> (CSELULT (SRL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh64Ux32 <t> x y) -> (CSELULT (SRL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh64Ux16 <t> x y) -> (CSELULT (SRL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh64Ux8 <t> x y) -> (CSELULT (SRL <t> x (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh32Ux64 <t> x y) -> (CSELULT (SRL <t> (ZeroExt32to64 x) y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh32Ux32 <t> x y) -> (CSELULT (SRL <t> (ZeroExt32to64 x) (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh32Ux16 <t> x y) -> (CSELULT (SRL <t> (ZeroExt32to64 x) (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh32Ux8 <t> x y) -> (CSELULT (SRL <t> (ZeroExt32to64 x) (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh16Ux64 <t> x y) -> (CSELULT (SRL <t> (ZeroExt16to64 x) y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh16Ux32 <t> x y) -> (CSELULT (SRL <t> (ZeroExt16to64 x) (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh16Ux16 <t> x y) -> (CSELULT (SRL <t> (ZeroExt16to64 x) (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh16Ux8 <t> x y) -> (CSELULT (SRL <t> (ZeroExt16to64 x) (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh8Ux64 <t> x y) -> (CSELULT (SRL <t> (ZeroExt8to64 x) y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh8Ux32 <t> x y) -> (CSELULT (SRL <t> (ZeroExt8to64 x) (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh8Ux16 <t> x y) -> (CSELULT (SRL <t> (ZeroExt8to64 x) (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh8Ux8 <t> x y) -> (CSELULT (SRL <t> (ZeroExt8to64 x) (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh64x64 x y) -> (SRA x (CSELULT <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh64x32 x y) -> (SRA x (CSELULT <y.Type> (ZeroExt32to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh64x16 x y) -> (SRA x (CSELULT <y.Type> (ZeroExt16to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh64x8 x y) -> (SRA x (CSELULT <y.Type> (ZeroExt8to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+(Rsh32x64 x y) -> (SRA (SignExt32to64 x) (CSELULT <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh32x32 x y) -> (SRA (SignExt32to64 x) (CSELULT <y.Type> (ZeroExt32to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh32x16 x y) -> (SRA (SignExt32to64 x) (CSELULT <y.Type> (ZeroExt16to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh32x8 x y) -> (SRA (SignExt32to64 x) (CSELULT <y.Type> (ZeroExt8to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+(Rsh16x64 x y) -> (SRA (SignExt16to64 x) (CSELULT <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh16x32 x y) -> (SRA (SignExt16to64 x) (CSELULT <y.Type> (ZeroExt32to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh16x16 x y) -> (SRA (SignExt16to64 x) (CSELULT <y.Type> (ZeroExt16to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh16x8 x y) -> (SRA (SignExt16to64 x) (CSELULT <y.Type> (ZeroExt8to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+(Rsh8x64 x y) -> (SRA (SignExt8to64 x) (CSELULT <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh8x32 x y) -> (SRA (SignExt8to64 x) (CSELULT <y.Type> (ZeroExt32to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh8x16 x y) -> (SRA (SignExt8to64 x) (CSELULT <y.Type> (ZeroExt16to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh8x8 x y) -> (SRA (SignExt8to64 x) (CSELULT <y.Type> (ZeroExt8to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
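Concretely, the CSELULT pattern implements Go's shift semantics on top of hardware that ignores all but the low six bits of the count: the count (zero-extended to 64 bits when narrower) is compared against 64, and the result is the machine shift when the count is in range and 0 otherwise; signed right shifts instead clamp the count to 63 so an out-of-range shift yields the sign fill. In ordinary Go terms (illustrative only):

    // What the Lsh64x64 lowering computes: zero once the count reaches the width.
    func lsh64(x, y uint64) uint64 {
        if y >= 64 {
            return 0
        }
        return x << y
    }

    // What the Rsh64x64 lowering computes: clamp the count so large shifts
    // fill with the sign bit.
    func rsh64(x int64, y uint64) int64 {
        if y >= 64 {
            y = 63
        }
        return x >> y
    }
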
+(Lrot64 x [c]) -> (RORconst x [64-c&63])
+(Lrot32 x [c]) -> (RORWconst x [32-c&31])
+(Lrot16 <t> x [c]) -> (OR (SLLconst <t> x [c&15]) (SRLconst <t> (ZeroExt16to64 x) [16-c&15]))
+(Lrot8 <t> x [c]) -> (OR (SLLconst <t> x [c&7]) (SRLconst <t> (ZeroExt8to64 x) [8-c&7]))
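The rotate rules use the hardware rotate for 64- and 32-bit operands (RORconst/RORWconst, rotating right by the complementary amount); ARM64 has no 16- or 8-bit rotate, so those widths are assembled from a left shift OR'd with a right shift of the zero-extended value. A 16-bit sketch of the same computation in Go:

    // 16-bit left rotate built from two shifts, as the Lrot16 rule does.
    func rotl16(x uint16, c uint) uint16 {
        c &= 15
        return x<<c | x>>(16-c)
    }
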
// constants
(Const64 [val]) -> (MOVDconst [val])
@@ -224,6 +335,129 @@
(Store [4] ptr val mem) && is32BitFloat(val.Type) -> (FMOVSstore ptr val mem)
(Store [8] ptr val mem) && is64BitFloat(val.Type) -> (FMOVDstore ptr val mem)
+// zeroing
+(Zero [s] _ mem) && SizeAndAlign(s).Size() == 0 -> mem
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore ptr (MOVDconst [0]) mem)
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 2 && SizeAndAlign(s).Align()%2 == 0 ->
+ (MOVHstore ptr (MOVDconst [0]) mem)
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 2 ->
+ (MOVBstore [1] ptr (MOVDconst [0])
+ (MOVBstore ptr (MOVDconst [0]) mem))
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 4 && SizeAndAlign(s).Align()%4 == 0 ->
+ (MOVWstore ptr (MOVDconst [0]) mem)
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 4 && SizeAndAlign(s).Align()%2 == 0 ->
+ (MOVHstore [2] ptr (MOVDconst [0])
+ (MOVHstore ptr (MOVDconst [0]) mem))
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 4 ->
+ (MOVBstore [3] ptr (MOVDconst [0])
+ (MOVBstore [2] ptr (MOVDconst [0])
+ (MOVBstore [1] ptr (MOVDconst [0])
+ (MOVBstore ptr (MOVDconst [0]) mem))))
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%8 == 0 ->
+ (MOVDstore ptr (MOVDconst [0]) mem)
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%4 == 0 ->
+ (MOVWstore [4] ptr (MOVDconst [0])
+ (MOVWstore ptr (MOVDconst [0]) mem))
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%2 == 0 ->
+ (MOVHstore [6] ptr (MOVDconst [0])
+ (MOVHstore [4] ptr (MOVDconst [0])
+ (MOVHstore [2] ptr (MOVDconst [0])
+ (MOVHstore ptr (MOVDconst [0]) mem))))
+
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 3 ->
+ (MOVBstore [2] ptr (MOVDconst [0])
+ (MOVBstore [1] ptr (MOVDconst [0])
+ (MOVBstore ptr (MOVDconst [0]) mem)))
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 6 && SizeAndAlign(s).Align()%2 == 0 ->
+ (MOVHstore [4] ptr (MOVDconst [0])
+ (MOVHstore [2] ptr (MOVDconst [0])
+ (MOVHstore ptr (MOVDconst [0]) mem)))
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 12 && SizeAndAlign(s).Align()%4 == 0 ->
+ (MOVWstore [8] ptr (MOVDconst [0])
+ (MOVWstore [4] ptr (MOVDconst [0])
+ (MOVWstore ptr (MOVDconst [0]) mem)))
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 16 && SizeAndAlign(s).Align()%8 == 0 ->
+ (MOVDstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 24 && SizeAndAlign(s).Align()%8 == 0 ->
+ (MOVDstore [16] ptr (MOVDconst [0])
+ (MOVDstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem)))
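These rules are tried in order, so for each size the widest store the known alignment permits wins: 8-byte alignment gets doubleword stores, 4-byte alignment word stores, and so on down to single bytes; the Move rules further below follow the same pattern. For example, the 4-byte, 2-aligned case becomes two halfword stores, roughly the following in Go (a byte array stands in for the target memory, illustrative only):

    // What the size-4, 2-aligned Zero expansion stores.
    func zero4align2(b *[4]byte) {
        b[2], b[3] = 0, 0 // MOVHstore [2] ptr (MOVDconst [0])
        b[0], b[1] = 0, 0 // MOVHstore ptr (MOVDconst [0])
    }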
+
+// medium zeroing uses a duff device
+// 4, 8, and 128 are magic constants, see runtime/mkduff.go
+(Zero [s] ptr mem)
+ && SizeAndAlign(s).Size()%8 == 0 && SizeAndAlign(s).Size() > 24 && SizeAndAlign(s).Size() <= 8*128
+ && SizeAndAlign(s).Align()%8 == 0 && !config.noDuffDevice ->
+ (DUFFZERO [4 * (128 - int64(SizeAndAlign(s).Size()/8))] ptr mem)
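The DUFFZERO argument is a byte offset into the runtime's Duff's device: the device holds 128 doubleword stores, each a single 4-byte instruction (see runtime/mkduff.go), so an n-byte region enters the device 128 - n/8 stores from the end, i.e. at byte offset 4*(128 - n/8). A small sketch of that arithmetic:

    // Entry offset into the zeroing Duff's device for an 8-aligned, n-byte region,
    // matching the DUFFZERO rule above (assumes n%8 == 0 and 24 < n <= 8*128).
    func duffZeroOffset(n int64) int64 {
        return 4 * (128 - n/8)
    }
    // e.g. n = 64 bytes -> 8 doublewords -> offset 4*(128-8) = 480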
+
+// large or unaligned zeroing uses a loop
+(Zero [s] ptr mem)
+ && (SizeAndAlign(s).Size() > 8*128 || config.noDuffDevice) || SizeAndAlign(s).Align()%8 != 0 ->
+ (LoweredZero [SizeAndAlign(s).Align()]
+ ptr
+ (ADDconst <ptr.Type> [SizeAndAlign(s).Size()-moveSize(SizeAndAlign(s).Align(), config)] ptr)
+ mem)
+
+// moves
+(Move [s] _ _ mem) && SizeAndAlign(s).Size() == 0 -> mem
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore dst (MOVBUload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 && SizeAndAlign(s).Align()%2 == 0 ->
+ (MOVHstore dst (MOVHUload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 ->
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 && SizeAndAlign(s).Align()%4 == 0 ->
+ (MOVWstore dst (MOVWUload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 && SizeAndAlign(s).Align()%2 == 0 ->
+ (MOVHstore [2] dst (MOVHUload [2] src mem)
+ (MOVHstore dst (MOVHUload src mem) mem))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 ->
+ (MOVBstore [3] dst (MOVBUload [3] src mem)
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem))))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%8 == 0 ->
+ (MOVDstore dst (MOVDload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%4 == 0 ->
+ (MOVWstore [4] dst (MOVWUload [4] src mem)
+ (MOVWstore dst (MOVWUload src mem) mem))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%2 == 0 ->
+ (MOVHstore [6] dst (MOVHUload [6] src mem)
+ (MOVHstore [4] dst (MOVHUload [4] src mem)
+ (MOVHstore [2] dst (MOVHUload [2] src mem)
+ (MOVHstore dst (MOVHUload src mem) mem))))
+
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 3 ->
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem)))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 6 && SizeAndAlign(s).Align()%2 == 0 ->
+ (MOVHstore [4] dst (MOVHUload [4] src mem)
+ (MOVHstore [2] dst (MOVHUload [2] src mem)
+ (MOVHstore dst (MOVHUload src mem) mem)))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 12 && SizeAndAlign(s).Align()%4 == 0 ->
+ (MOVWstore [8] dst (MOVWUload [8] src mem)
+ (MOVWstore [4] dst (MOVWUload [4] src mem)
+ (MOVWstore dst (MOVWUload src mem) mem)))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 16 && SizeAndAlign(s).Align()%8 == 0 ->
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 24 && SizeAndAlign(s).Align()%8 == 0 ->
+ (MOVDstore [16] dst (MOVDload [16] src mem)
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem)))
+
+// large or unaligned move uses a loop
+// DUFFCOPY is not implemented on ARM64 (TODO)
+(Move [s] dst src mem)
+ && SizeAndAlign(s).Size() > 24 || SizeAndAlign(s).Align()%8 != 0 ->
+ (LoweredMove [SizeAndAlign(s).Align()]
+ dst
+ src
+ (ADDconst <src.Type> src [SizeAndAlign(s).Size()-moveSize(SizeAndAlign(s).Align(), config)])
+ mem)
+
// calls
(StaticCall [argwid] {target} mem) -> (CALLstatic [argwid] {target} mem)
(ClosureCall [argwid] entry closure mem) -> (CALLclosure [argwid] entry closure mem)