1 files changed, 37 insertions, 5 deletions
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s
index dcbead8cf4..4b6b4965af 100644
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -22,7 +22,7 @@ check:
 	CMP	R3, R4
 	BLT	backward
 
-	// Copying forward proceeds by copying R7/8 words then copying R6 bytes.
+	// Copying forward proceeds by copying R7/32 blocks of 32 bytes, then R6 <= 31 tail bytes.
 	// R3 and R4 are advanced as we copy.
 
         // (There may be implementations of armv8 where copying by bytes until
@@ -30,11 +30,12 @@ check:
         // optimization, but the on the one tested so far (xgene) it did not
         // make a significance difference.)
 
-	CBZ	R7, noforwardlarge	// Do we need to do any doubleword-by-doubleword copying?
+	CBZ	R7, noforwardlarge	// Do we need to do any quadword copying?
 
 	ADD	R3, R7, R9	// R9 points just past where we copy by word
 
 forwardlargeloop:
+	// Copy 32 bytes at a time.
 	LDP.P	32(R4), (R8, R10)
 	STP.P	(R8, R10), 32(R3)
 	LDP	-16(R4), (R11, R12)
@@ -43,10 +44,26 @@ forwardlargeloop:
 	CBNZ	R7, forwardlargeloop
 
 noforwardlarge:
-	CBNZ	R6, forwardtail		// Do we need to do any byte-by-byte copying?
+	CBNZ	R6, forwardtail		// Do we need to copy any tail bytes?
 	RET
 
 forwardtail:
+	// There are R6 <= 31 bytes remaining to copy.
+	// This is large enough to still contain pointers,
+	// which must be copied atomically.
+	// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
+	TBZ	$4, R6, 3(PC)	// write 16 bytes if R6&16 != 0
+	LDP.P	16(R4), (R8, R10)
+	STP.P	(R8, R10), 16(R3)
+
+	TBZ	$3, R6, 3(PC)	// write 8 bytes if R6&8 != 0
+	MOVD.P	8(R4), R8
+	MOVD.P	R8, 8(R3)
+
+	AND	$7, R6
+	CBNZ	R6, 2(PC)
+	RET
+
 	ADD	R3, R6, R9	// R9 points just past the destination memory
 
 forwardtailloop:
@@ -90,7 +107,7 @@ copy1:
 	RET
 
 backward:
-	// Copying backwards proceeds by copying R6 bytes then copying R7/8 words.
+	// Copying backwards first copies R6 <= 31 tail bytes, then R7/32 blocks of 32 bytes.
 	// R3 and R4 are advanced to the end of the destination/source buffers
 	// respectively and moved back as we copy.
 
@@ -99,13 +116,28 @@ backward:
 
 	CBZ	R6, nobackwardtail	// Do we need to do any byte-by-byte copying?
 
-	SUB	R6, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
+	AND	$7, R6, R12
+	CBZ	R12, backwardtaillarge
+
+	SUB	R12, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
 backwardtailloop:
+	// Copy sub-pointer-size tail.
 	MOVBU.W	-1(R4), R8
 	MOVBU.W	R8, -1(R3)
 	CMP	R9, R3
 	BNE	backwardtailloop
 
+backwardtaillarge:
+	// Copy 8 bytes if R6&8 != 0, then 16 bytes if R6&16 != 0.
+	// See comment at forwardtail.
+	TBZ	$3, R6, 3(PC)
+	MOVD.W	-8(R4), R8
+	MOVD.W	R8, -8(R3)
+
+	TBZ	$4, R6, 3(PC)
+	LDP.W	-16(R4), (R8, R10)
+	STP.W	(R8, R10), -16(R3)
+
 nobackwardtail:
 	CBNZ     R7, backwardlarge	// Do we need to do any doubleword-by-doubleword copying?
 	RET