diff options
Diffstat (limited to 'src/runtime/memmove_arm64.s')
-rw-r--r-- | src/runtime/memmove_arm64.s | 42 |
1 files changed, 37 insertions, 5 deletions
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s index dcbead8cf4..4b6b4965af 100644 --- a/src/runtime/memmove_arm64.s +++ b/src/runtime/memmove_arm64.s @@ -22,7 +22,7 @@ check: CMP R3, R4 BLT backward - // Copying forward proceeds by copying R7/8 words then copying R6 bytes. + // Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes. // R3 and R4 are advanced as we copy. // (There may be implementations of armv8 where copying by bytes until @@ -30,11 +30,12 @@ check: // optimization, but on the one tested so far (xgene) it did not // make a significant difference.) - CBZ R7, noforwardlarge // Do we need to do any doubleword-by-doubleword copying? + CBZ R7, noforwardlarge // Do we need to do any quadword copying? ADD R3, R7, R9 // R9 points just past where we copy by word forwardlargeloop: + // Copy 32 bytes at a time. LDP.P 32(R4), (R8, R10) STP.P (R8, R10), 32(R3) LDP -16(R4), (R11, R12) @@ -43,10 +44,26 @@ forwardlargeloop: CBNZ R7, forwardlargeloop noforwardlarge: - CBNZ R6, forwardtail // Do we need to do any byte-by-byte copying? + CBNZ R6, forwardtail // Do we need to copy any tail bytes? RET forwardtail: + // There are R6 <= 31 bytes remaining to copy. + // This is large enough to still contain pointers, + // which must be copied atomically. + // Copy the next 16 bytes, then 8 bytes, then any remaining bytes. + TBZ $4, R6, 3(PC) // write 16 bytes if R6&16 != 0 + LDP.P 16(R4), (R8, R10) + STP.P (R8, R10), 16(R3) + + TBZ $3, R6, 3(PC) // write 8 bytes if R6&8 != 0 + MOVD.P 8(R4), R8 + MOVD.P R8, 8(R3) + + AND $7, R6 + CBNZ R6, 2(PC) + RET + ADD R3, R6, R9 // R9 points just past the destination memory forwardtailloop: @@ -90,7 +107,7 @@ copy1: RET backward: - // Copying backwards proceeds by copying R6 bytes then copying R7/8 words. + // Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords. 
// R3 and R4 are advanced to the end of the destination/source buffers // respectively and moved back as we copy. @@ -99,13 +116,28 @@ backward: CBZ R6, nobackwardtail // Do we need to copy any tail bytes? - SUB R6, R3, R9 // R9 points at the lowest destination byte that should be copied by byte. + AND $7, R6, R12 + CBZ R12, backwardtaillarge + + SUB R12, R3, R9 // R9 points at the lowest destination byte that should be copied by byte. backwardtailloop: + // Copy sub-pointer-size tail. MOVBU.W -1(R4), R8 MOVBU.W R8, -1(R3) CMP R9, R3 BNE backwardtailloop +backwardtaillarge: + // Do 8/16-byte write if possible. + // See comment at forwardtail. + TBZ $3, R6, 3(PC) + MOVD.W -8(R4), R8 + MOVD.W R8, -8(R3) + + TBZ $4, R6, 3(PC) + LDP.W -16(R4), (R8, R10) + STP.W (R8, R10), -16(R3) + nobackwardtail: CBNZ R7, backwardlarge // Do we need to do any quadword copying? RET |