aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/memmove_arm64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/runtime/memmove_arm64.s')
-rw-r--r-- src/runtime/memmove_arm64.s | 42
1 files changed, 37 insertions, 5 deletions
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s
index dcbead8cf4..4b6b4965af 100644
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -22,7 +22,7 @@ check:
CMP R3, R4
BLT backward
- // Copying forward proceeds by copying R7/8 words then copying R6 bytes.
+ // Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
// R3 and R4 are advanced as we copy.
// (There may be implementations of armv8 where copying by bytes until
@@ -30,11 +30,12 @@ check:
// optimization, but on the one tested so far (xgene) it did not
// make a significant difference.)
- CBZ R7, noforwardlarge // Do we need to do any doubleword-by-doubleword copying?
+ CBZ R7, noforwardlarge // Do we need to do any quadword copying?
ADD R3, R7, R9 // R9 points just past where we copy by word
forwardlargeloop:
+ // Copy 32 bytes at a time.
LDP.P 32(R4), (R8, R10)
STP.P (R8, R10), 32(R3)
LDP -16(R4), (R11, R12)
@@ -43,10 +44,26 @@ forwardlargeloop:
CBNZ R7, forwardlargeloop
noforwardlarge:
- CBNZ R6, forwardtail // Do we need to do any byte-by-byte copying?
+ CBNZ R6, forwardtail // Do we need to copy any tail bytes?
RET
forwardtail:
+ // There are R6 <= 31 bytes remaining to copy.
+ // This is large enough to still contain pointers,
+ // which must be copied atomically.
+ // Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
+ TBZ $4, R6, 3(PC) // write 16 bytes if R6&16 != 0
+ LDP.P 16(R4), (R8, R10)
+ STP.P (R8, R10), 16(R3)
+
+ TBZ $3, R6, 3(PC) // write 8 bytes if R6&8 != 0
+ MOVD.P 8(R4), R8
+ MOVD.P R8, 8(R3)
+
+ AND $7, R6
+ CBNZ R6, 2(PC)
+ RET
+
ADD R3, R6, R9 // R9 points just past the destination memory
forwardtailloop:
@@ -90,7 +107,7 @@ copy1:
RET
backward:
- // Copying backwards proceeds by copying R6 bytes then copying R7/8 words.
+ // Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
// R3 and R4 are advanced to the end of the destination/source buffers
// respectively and moved back as we copy.
@@ -99,13 +116,28 @@ backward:
CBZ R6, nobackwardtail // Do we need to copy any tail bytes?
- SUB R6, R3, R9 // R9 points at the lowest destination byte that should be copied by byte.
+ AND $7, R6, R12
+ CBZ R12, backwardtaillarge
+
+ SUB R12, R3, R9 // R9 points at the lowest destination byte that should be copied by byte.
backwardtailloop:
+ // Copy sub-pointer-size tail.
MOVBU.W -1(R4), R8
MOVBU.W R8, -1(R3)
CMP R9, R3
BNE backwardtailloop
+backwardtaillarge:
+ // Do 8/16-byte write if possible.
+ // See comment at forwardtail.
+ TBZ $3, R6, 3(PC)
+ MOVD.W -8(R4), R8
+ MOVD.W R8, -8(R3)
+
+ TBZ $4, R6, 3(PC)
+ LDP.W -16(R4), (R8, R10)
+ STP.W (R8, R10), -16(R3)
+
nobackwardtail:
CBNZ R7, backwardlarge // Do we need to do any quadword copying?
RET