path: root/src/runtime/memmove_arm64.s
author    Jonathan Swinney <jswinney@amazon.com>  2020-10-30 18:46:23 +0000
committer Cherry Zhang <cherryyz@google.com>      2020-11-02 15:23:43 +0000
commit    d5388e23b5c75bf8189b173051d24a0176a5a303 (patch)
tree      8d4313273ad941969eb0e8d5b23b323395e507c1 /src/runtime/memmove_arm64.s
parent    7be8358f70ff858f28b9aefe11986da25f1762bc (diff)
runtime: improve memmove performance on arm64
Replace the memmove implementation for moves of 17 bytes or larger
with an implementation from ARM optimized software. The moves of 16
bytes or fewer are unchanged, but the registers used are updated to
match the rest of the implementation.

This implementation makes use of new optimizations:

 - software pipelined loop for large (>128 byte) moves
 - medium size moves (17..128 bytes) have a new implementation
 - address realignment when src or dst is unaligned
 - preference for aligned src (loads) or dst (stores) depending on CPU

To support the preference for aligned loads or aligned stores, a new
CPU flag is added. This flag indicates that the detected
microarchitecture performs better with aligned loads. Some tested
CPUs did not exhibit a significant difference and are left with the
default behavior of realigning based on the destination address
(stores).

Neoverse N1 (Tested on Graviton 2)

name                        old time/op    new time/op    delta
Memmove/0-4                 1.88ns ± 1%    1.87ns ± 1%    -0.58%   (p=0.020 n=10+10)
Memmove/1-4                 4.40ns ± 0%    4.40ns ± 0%    ~        (all equal)
Memmove/8-4                 3.88ns ± 3%    3.80ns ± 0%    -1.97%   (p=0.001 n=10+9)
Memmove/16-4                3.90ns ± 3%    3.80ns ± 0%    -2.49%   (p=0.000 n=10+9)
Memmove/32-4                4.80ns ± 0%    4.40ns ± 0%    -8.33%   (p=0.000 n=9+8)
Memmove/64-4                5.86ns ± 0%    5.00ns ± 0%    -14.76%  (p=0.000 n=8+8)
Memmove/128-4               8.46ns ± 0%    8.06ns ± 0%    -4.62%   (p=0.000 n=10+10)
Memmove/256-4               12.4ns ± 0%    12.2ns ± 0%    -1.61%   (p=0.000 n=10+10)
Memmove/512-4               19.5ns ± 0%    19.1ns ± 0%    -2.05%   (p=0.000 n=10+10)
Memmove/1024-4              33.7ns ± 0%    33.5ns ± 0%    -0.59%   (p=0.000 n=10+10)
Memmove/2048-4              62.1ns ± 0%    59.0ns ± 0%    -4.99%   (p=0.000 n=10+10)
Memmove/4096-4               117ns ± 1%     110ns ± 0%    -5.66%   (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4    6.41ns ± 0%    5.62ns ± 0%    -12.32%  (p=0.000 n=10+7)
MemmoveUnalignedDst/128-4   9.40ns ± 0%    8.34ns ± 0%    -11.24%  (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4   12.8ns ± 0%    12.8ns ± 0%    ~        (all equal)
MemmoveUnalignedDst/512-4   20.4ns ± 0%    19.7ns ± 0%    -3.43%   (p=0.000 n=9+10)
MemmoveUnalignedDst/1024-4  34.1ns ± 0%    35.1ns ± 0%    +2.93%   (p=0.000 n=9+9)
MemmoveUnalignedDst/2048-4  61.5ns ± 0%    60.4ns ± 0%    -1.77%   (p=0.000 n=10+10)
MemmoveUnalignedDst/4096-4   122ns ± 0%     113ns ± 0%    -7.38%   (p=0.002 n=8+10)
MemmoveUnalignedSrc/64-4    7.25ns ± 1%    6.26ns ± 0%    -13.64%  (p=0.000 n=9+9)
MemmoveUnalignedSrc/128-4   10.5ns ± 0%     9.7ns ± 0%    -7.52%   (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4   17.1ns ± 0%    17.3ns ± 0%    +1.17%   (p=0.000 n=10+10)
MemmoveUnalignedSrc/512-4   27.0ns ± 0%    27.0ns ± 0%    ~        (all equal)
MemmoveUnalignedSrc/1024-4  46.7ns ± 0%    35.7ns ± 0%    -23.55%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/2048-4  85.2ns ± 0%    61.2ns ± 0%    -28.17%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/4096-4   162ns ± 0%     113ns ± 0%    -30.25%  (p=0.000 n=10+10)

name                        old speed      new speed      delta
Memmove/4096-4              35.2GB/s ± 0%  37.1GB/s ± 0%  +5.56%   (p=0.000 n=10+9)
MemmoveUnalignedSrc/1024-4  21.9GB/s ± 0%  28.7GB/s ± 0%  +30.90%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/2048-4  24.0GB/s ± 0%  33.5GB/s ± 0%  +39.18%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/4096-4  25.3GB/s ± 0%  36.2GB/s ± 0%  +43.50%  (p=0.000 n=10+7)

Cortex-A72 (Graviton 1)

name                        old time/op    new time/op    delta
Memmove/0-4                 3.06ns ± 3%    3.08ns ± 1%    ~        (p=0.958 n=10+9)
Memmove/1-4                 8.72ns ± 0%    7.85ns ± 0%    -9.98%   (p=0.002 n=8+10)
Memmove/8-4                 8.29ns ± 0%    8.29ns ± 0%    ~        (all equal)
Memmove/16-4                8.29ns ± 0%    8.29ns ± 0%    ~        (all equal)
Memmove/32-4                8.19ns ± 2%    8.29ns ± 0%    ~        (p=0.114 n=10+10)
Memmove/64-4                18.3ns ± 4%    10.0ns ± 0%    -45.36%  (p=0.000 n=10+10)
Memmove/128-4               14.8ns ± 0%    17.4ns ± 0%    +17.77%  (p=0.000 n=10+10)
Memmove/256-4               21.8ns ± 0%    23.1ns ± 0%    +5.96%   (p=0.000 n=10+10)
Memmove/512-4               35.8ns ± 0%    37.2ns ± 0%    +3.91%   (p=0.000 n=10+10)
Memmove/1024-4              63.7ns ± 0%    67.2ns ± 0%    +5.49%   (p=0.000 n=10+10)
Memmove/2048-4               126ns ± 0%     123ns ± 0%    -2.38%   (p=0.000 n=10+10)
Memmove/4096-4               238ns ± 1%     243ns ± 1%    +1.93%   (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4    19.3ns ± 1%    12.0ns ± 1%    -37.49%  (p=0.000 n=10+10)
MemmoveUnalignedDst/128-4   17.2ns ± 0%    17.4ns ± 0%    +1.16%   (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4   28.2ns ± 8%    29.2ns ± 0%    ~        (p=0.352 n=10+10)
MemmoveUnalignedDst/512-4   49.8ns ± 3%    48.9ns ± 0%    ~        (p=1.000 n=10+10)
MemmoveUnalignedDst/1024-4  89.5ns ± 0%    80.5ns ± 1%    -10.02%  (p=0.000 n=10+10)
MemmoveUnalignedDst/2048-4   180ns ± 0%     127ns ± 0%    -29.44%  (p=0.000 n=9+10)
MemmoveUnalignedDst/4096-4   347ns ± 0%     244ns ± 0%    -29.59%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/128-4   16.1ns ± 0%    21.8ns ± 0%    +35.40%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4   24.9ns ± 8%    26.6ns ± 0%    +6.70%   (p=0.015 n=10+10)
MemmoveUnalignedSrc/512-4   39.4ns ± 6%    40.6ns ± 0%    ~        (p=0.352 n=10+10)
MemmoveUnalignedSrc/1024-4  72.5ns ± 0%    83.0ns ± 1%    +14.44%  (p=0.000 n=9+10)
MemmoveUnalignedSrc/2048-4   129ns ± 1%     128ns ± 1%    ~        (p=0.179 n=10+10)
MemmoveUnalignedSrc/4096-4   241ns ± 0%     253ns ± 1%    +4.99%   (p=0.000 n=9+9)

Cortex-A53 (Raspberry Pi 3)

name                        old time/op    new time/op    delta
Memmove/0-4                 11.0ns ± 0%    11.0ns ± 1%    ~        (p=0.294 n=8+10)
Memmove/1-4                 29.6ns ± 0%    28.0ns ± 1%    -5.41%   (p=0.000 n=9+10)
Memmove/8-4                 23.5ns ± 0%    22.1ns ± 0%    -6.11%   (p=0.000 n=8+8)
Memmove/16-4                23.7ns ± 1%    22.1ns ± 0%    -6.59%   (p=0.000 n=10+8)
Memmove/32-4                27.9ns ± 0%    27.1ns ± 0%    -3.13%   (p=0.000 n=8+8)
Memmove/64-4                33.8ns ± 0%    31.5ns ± 1%    -6.99%   (p=0.000 n=8+10)
Memmove/128-4               45.6ns ± 0%    44.2ns ± 1%    -3.23%   (p=0.000 n=9+10)
Memmove/256-4               69.3ns ± 0%    69.3ns ± 0%    ~        (p=0.072 n=8+8)
Memmove/512-4                127ns ± 0%     110ns ± 0%    -13.39%  (p=0.000 n=8+8)
Memmove/1024-4               222ns ± 0%     205ns ± 1%    -7.66%   (p=0.000 n=7+10)
Memmove/2048-4               411ns ± 0%     366ns ± 0%    -10.98%  (p=0.000 n=8+9)
Memmove/4096-4               795ns ± 1%     695ns ± 1%    -12.63%  (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4    44.0ns ± 0%    40.5ns ± 0%    -7.93%   (p=0.000 n=8+8)
MemmoveUnalignedDst/128-4   59.6ns ± 0%    54.9ns ± 0%    -7.85%   (p=0.000 n=9+9)
MemmoveUnalignedDst/256-4   98.2ns ±11%    90.0ns ± 1%    ~        (p=0.130 n=10+10)
MemmoveUnalignedDst/512-4    161ns ± 2%     145ns ± 1%    -9.96%   (p=0.000 n=10+10)
MemmoveUnalignedDst/1024-4   281ns ± 0%     265ns ± 0%    -5.65%   (p=0.000 n=9+8)
MemmoveUnalignedDst/2048-4   528ns ± 0%     482ns ± 0%    -8.73%   (p=0.000 n=8+9)
MemmoveUnalignedDst/4096-4  1.02µs ± 1%    0.92µs ± 0%    -10.00%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/64-4    42.4ns ± 1%    40.5ns ± 0%    -4.39%   (p=0.000 n=10+8)
MemmoveUnalignedSrc/128-4   57.4ns ± 0%    57.0ns ± 1%    -0.75%   (p=0.048 n=9+10)
MemmoveUnalignedSrc/256-4   88.1ns ± 1%    89.6ns ± 0%    +1.70%   (p=0.000 n=9+8)
MemmoveUnalignedSrc/512-4    160ns ± 2%     144ns ± 0%    -9.89%   (p=0.000 n=10+8)
MemmoveUnalignedSrc/1024-4   286ns ± 0%     266ns ± 1%    -6.69%   (p=0.000 n=8+10)
MemmoveUnalignedSrc/2048-4   525ns ± 0%     483ns ± 1%    -7.96%   (p=0.000 n=9+10)
MemmoveUnalignedSrc/4096-4  1.01µs ± 0%    0.92µs ± 1%    -9.40%   (p=0.000 n=8+10)

Change-Id: Ia1144e9d4dfafdece6e167c5e576bf80f254c8ab
Reviewed-on: https://go-review.googlesource.com/c/go/+/243357
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Martin Möhrmann <moehrmann@google.com>
Reviewed-by: eric fang <eric.fang@arm.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
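The tables above are benchstat comparisons of the old and new implementations. For orientation, a benchmark of this shape can be written against the public testing API; the real benchmarks live in src/runtime/memmove_test.go, so the sketch below is illustrative only. It relies on the fact that the built-in copy on byte slices is lowered to a runtime memmove call.

package memmove_test

import (
	"fmt"
	"testing"
)

// BenchmarkMemmove mimics the shape of the Memmove/N results quoted above.
func BenchmarkMemmove(b *testing.B) {
	for _, n := range []int{0, 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096} {
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			dst := make([]byte, n)
			src := make([]byte, n)
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				copy(dst, src) // lowered to a runtime memmove call
			}
		})
	}
}

Two runs (for example, go test -run=NONE -bench=Memmove -count=10 before and after the change) can be fed to the benchstat tool to produce tables like those in the commit message.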
Diffstat (limited to 'src/runtime/memmove_arm64.s')
-rw-r--r--  src/runtime/memmove_arm64.s  332
1 file changed, 208 insertions(+), 124 deletions(-)
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s
index dbb7e9a28a..43d27629e5 100644
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -6,152 +6,236 @@
// See memmove Go doc for important implementation constraints.
+// Register map
+//
+// dstin R0
+// src R1
+// count R2
+// dst R3 (same as R0, but gets modified in unaligned cases)
+// srcend R4
+// dstend R5
+// data R6-R17
+// tmp1 R14
+
+// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+// copies of up to 128 bytes, and large copies. The overhead of the overlap
+// check is negligible since it is only required for large copies.
+//
+// Large copies use a software pipelined loop processing 64 bytes per iteration.
+// The destination pointer is 16-byte aligned to minimize unaligned accesses.
+// The loop tail is handled by always copying 64 bytes from the end.
+
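Before reading the assembly, the dispatch just described can be sketched at the Go level. pathFor is an illustrative helper, not part of the runtime; the thresholds come from the comparisons in the code below.

// pathFor reports which labeled routine below handles an n-byte move.
func pathFor(n int) string {
	switch {
	case n == 0:
		return "copy0"
	case n <= 16:
		return "copy16 (with copy7/copy3/copy1 for the shorter widths)"
	case n <= 32:
		return "small 17..32: two possibly overlapping 16-byte moves"
	case n <= 128:
		return "copy32_128 (copy128/copy96 for 65..128 bytes)"
	default:
		return "copy_long: overlap check, realignment, pipelined 64-byte loop"
	}
}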
// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
- MOVD to+0(FP), R3
- MOVD from+8(FP), R4
- MOVD n+16(FP), R5
- CBNZ R5, check
- RET
+ MOVD to+0(FP), R0
+ MOVD from+8(FP), R1
+ MOVD n+16(FP), R2
+ CBZ R2, copy0
-check:
- CMP $16, R5
+ // Small copies: 1..16 bytes
+ CMP $16, R2
BLE copy16
- AND $~31, R5, R7 // R7 is N&~31
- SUB R7, R5, R6 // R6 is N&31
-
- CMP R3, R4
- BLT backward
-
- // Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
- // R3 and R4 are advanced as we copy.
-
- // (There may be implementations of armv8 where copying by bytes until
- // at least one of source or dest is word aligned is a worthwhile
- // optimization, but on the one tested so far (xgene) it did not
- // make a significant difference.)
-
- CBZ R7, noforwardlarge // Do we need to do any quadword copying?
-
- ADD R3, R7, R9 // R9 points just past where we copy by word
-
-forwardlargeloop:
- // Copy 32 bytes at a time.
- LDP.P 32(R4), (R8, R10)
- STP.P (R8, R10), 32(R3)
- LDP -16(R4), (R11, R12)
- STP (R11, R12), -16(R3)
- SUB $32, R7, R7
- CBNZ R7, forwardlargeloop
-
-noforwardlarge:
- CBNZ R6, forwardtail // Do we need to copy any tail bytes?
+ // Large copies
+ CMP $128, R2
+ BHI copy_long
+ CMP $32, R2
+ BHI copy32_128
+
+ // Small copies: 17..32 bytes.
+ LDP (R1), (R6, R7)
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ LDP -16(R4), (R12, R13)
+ STP (R6, R7), (R0)
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ STP (R12, R13), -16(R5)
RET
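The 17..32-byte path above issues both LDP pairs before either STP executes, so it is correct even when the buffers overlap. A hedged Go model of the same trick (memmoveSmall is an illustrative name, not a runtime function):

func memmoveSmall(dst, src []byte) {
	n := len(src) // assume 17 <= n <= 32 and len(dst) >= n
	var a, b [16]byte
	copy(a[:], src[:16])   // Load A: the first 16 bytes
	copy(b[:], src[n-16:]) // load the last 16 bytes (may overlap A's range)
	copy(dst[:16], a[:])   // Store A
	copy(dst[n-16:], b[:]) // the second store covers whatever A missed
}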
-forwardtail:
- // There are R6 <= 31 bytes remaining to copy.
- // This is large enough to still contain pointers,
- // which must be copied atomically.
- // Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
- TBZ $4, R6, 3(PC) // write 16 bytes if R6&16 != 0
- LDP.P 16(R4), (R8, R10)
- STP.P (R8, R10), 16(R3)
-
- TBZ $3, R6, 3(PC) // write 8 bytes if R6&8 != 0
- MOVD.P 8(R4), R8
- MOVD.P R8, 8(R3)
-
- AND $7, R6
- CBNZ R6, 2(PC)
- RET
-
- ADD R3, R6, R9 // R9 points just past the destination memory
-
-forwardtailloop:
- MOVBU.P 1(R4), R8
- MOVBU.P R8, 1(R3)
- CMP R3, R9
- BNE forwardtailloop
- RET
-
- // Small copies: 1..16 bytes.
+// Small copies: 1..16 bytes.
copy16:
- ADD R4, R5, R8 // R8 points just past the last source byte
- ADD R3, R5, R9 // R9 points just past the last destination byte
- CMP $8, R5
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ CMP $8, R2
BLT copy7
- MOVD (R4), R6
- MOVD -8(R8), R7
- MOVD R6, (R3)
- MOVD R7, -8(R9)
+ MOVD (R1), R6
+ MOVD -8(R4), R7
+ MOVD R6, (R0)
+ MOVD R7, -8(R5)
RET
copy7:
- TBZ $2, R5, copy3
- MOVWU (R4), R6
- MOVWU -4(R8), R7
- MOVW R6, (R3)
- MOVW R7, -4(R9)
+ TBZ $2, R2, copy3
+ MOVWU (R1), R6
+ MOVWU -4(R4), R7
+ MOVW R6, (R0)
+ MOVW R7, -4(R5)
RET
copy3:
- TBZ $1, R5, copy1
- MOVHU (R4), R6
- MOVHU -2(R8), R7
- MOVH R6, (R3)
- MOVH R7, -2(R9)
+ TBZ $1, R2, copy1
+ MOVHU (R1), R6
+ MOVHU -2(R4), R7
+ MOVH R6, (R0)
+ MOVH R7, -2(R5)
RET
copy1:
- MOVBU (R4), R6
- MOVB R6, (R3)
- RET
-
-backward:
- // Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
- // R3 and R4 are advanced to the end of the destination/source buffers
- // respectively and moved back as we copy.
-
- ADD R4, R5, R4 // R4 points just past the last source byte
- ADD R3, R5, R3 // R3 points just past the last destination byte
-
- CBZ R6, nobackwardtail // Do we need to do any byte-by-byte copying?
+ MOVBU (R1), R6
+ MOVB R6, (R0)
- AND $7, R6, R12
- CBZ R12, backwardtaillarge
-
- SUB R12, R3, R9 // R9 points at the lowest destination byte that should be copied by byte.
-backwardtailloop:
- // Copy sub-pointer-size tail.
- MOVBU.W -1(R4), R8
- MOVBU.W R8, -1(R3)
- CMP R9, R3
- BNE backwardtailloop
-
-backwardtaillarge:
- // Do 8/16-byte write if possible.
- // See comment at forwardtail.
- TBZ $3, R6, 3(PC)
- MOVD.W -8(R4), R8
- MOVD.W R8, -8(R3)
+copy0:
+ RET
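The 1..16-byte chain (copy16, copy7, copy3, copy1) picks the largest power-of-two width not exceeding n, then writes one block of that width from each end; the two blocks may overlap and together cover all n bytes. The TBZ instructions test individual bits of n. A Go rendering under those assumptions (copyUpTo16 is an illustrative name):

func copyUpTo16(dst, src []byte) {
	n := len(src) // assume 1 <= n <= 16 and len(dst) >= n
	switch {
	case n >= 8: // copy16: one 8-byte word from each end
		var a, b [8]byte
		copy(a[:], src[:8])
		copy(b[:], src[n-8:])
		copy(dst[:8], a[:])
		copy(dst[n-8:], b[:])
	case n&4 != 0: // copy7: TBZ $2 — n is 4..7, one 4-byte word from each end
		var a, b [4]byte
		copy(a[:], src[:4])
		copy(b[:], src[n-4:])
		copy(dst[:4], a[:])
		copy(dst[n-4:], b[:])
	case n&2 != 0: // copy3: TBZ $1 — n is 2..3, one 2-byte word from each end
		var a, b [2]byte
		copy(a[:], src[:2])
		copy(b[:], src[n-2:])
		copy(dst[:2], a[:])
		copy(dst[n-2:], b[:])
	default: // copy1: a single byte
		dst[0] = src[0]
	}
}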
- TBZ $4, R6, 3(PC)
- LDP.W -16(R4), (R8, R10)
- STP.W (R8, R10), -16(R3)
+ // Medium copies: 33..128 bytes.
+copy32_128:
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ LDP (R1), (R6, R7)
+ LDP 16(R1), (R8, R9)
+ LDP -32(R4), (R10, R11)
+ LDP -16(R4), (R12, R13)
+ CMP $64, R2
+ BHI copy128
+ STP (R6, R7), (R0)
+ STP (R8, R9), 16(R0)
+ STP (R10, R11), -32(R5)
+ STP (R12, R13), -16(R5)
+ RET
-nobackwardtail:
- CBNZ R7, backwardlarge // Do we need to do any doubleword-by-doubleword copying?
+ // Copy 65..128 bytes.
+copy128:
+ LDP 32(R1), (R14, R15)
+ LDP 48(R1), (R16, R17)
+ CMP $96, R2
+ BLS copy96
+ LDP -64(R4), (R2, R3)
+ LDP -48(R4), (R1, R4)
+ STP (R2, R3), -64(R5)
+ STP (R1, R4), -48(R5)
+
+copy96:
+ STP (R6, R7), (R0)
+ STP (R8, R9), 16(R0)
+ STP (R14, R15), 32(R0)
+ STP (R16, R17), 48(R0)
+ STP (R10, R11), -32(R5)
+ STP (R12, R13), -16(R5)
RET
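copy32_128 and copy128 above load everything they will store before the first store lands, which is why no explicit overlap check is needed for 33..128 bytes. A hedged Go model of that staging (memmoveMedium is an illustrative name):

func memmoveMedium(dst, src []byte) {
	n := len(src) // assume 33 <= n <= 128 and len(dst) >= n
	var head, mid, tail [32]byte
	copy(head[:], src[:32])   // Loads A, B: bytes 0..31
	copy(tail[:], src[n-32:]) // loads of the last 32 bytes
	if n <= 64 {              // head and tail overlap in the middle
		copy(dst[:32], head[:])
		copy(dst[n-32:], tail[:])
		return
	}
	copy(mid[:], src[32:64]) // copy128: bytes 32..63
	if n > 96 {              // also need bytes n-64..n-33
		var t2 [32]byte
		copy(t2[:], src[n-64:n-32])
		copy(dst[n-64:n-32], t2[:])
	}
	copy(dst[:32], head[:]) // copy96: store everything staged above
	copy(dst[32:64], mid[:])
	copy(dst[n-32:], tail[:])
}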
-backwardlarge:
- SUB R7, R3, R9 // R9 points at the lowest destination byte
+ // Copy more than 128 bytes.
+copy_long:
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ MOVD ZR, R7
+ MOVD ZR, R8
+
+ CMP $1024, R2
+ BLT backward_check
+ // feature detect to decide how to align
+ MOVBU runtime·arm64UseAlignedLoads(SB), R6
+ CBNZ R6, use_aligned_loads
+ MOVD R0, R7
+ MOVD R5, R8
+ B backward_check
+use_aligned_loads:
+ MOVD R1, R7
+ MOVD R4, R8
+ // R7 and R8 are used here for the realignment calculation. In
+ // the use_aligned_loads case, R7 is the src pointer and R8 is
+ // srcend pointer, which is used in the backward copy case.
+ // When doing aligned stores, R7 is the dst pointer and R8 is
+ // the dstend pointer.
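As the code above shows, the flag is only consulted for moves of at least 1024 bytes. A sketch of how such a feature flag might be consumed; useAlignedLoads and alignBase are illustrative stand-ins for the runtime's arm64UseAlignedLoads, which is set once during startup from CPU detection:

var useAlignedLoads bool // stand-in for runtime·arm64UseAlignedLoads

// alignBase picks which pointer pair the realignment below targets.
func alignBase(dst, dstend, src, srcend uintptr) (base, end uintptr) {
	if useAlignedLoads {
		return src, srcend // realign so the LDPs land on 16-byte boundaries
	}
	return dst, dstend // default: realign so the STPs are 16-byte aligned
}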
+
+backward_check:
+ // Use backward copy if there is an overlap.
+ SUB R1, R0, R14
+ CBZ R14, copy0
+ CMP R2, R14
+ BCC copy_long_backward
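The overlap test compresses two comparisons into one by exploiting unsigned wraparound: dst-src is computed modulo 2^64, so a single unsigned compare against the length distinguishes "safe to copy forward" from "dst lands inside the source range". In Go terms, as a sketch (needsBackward is an illustrative name; it assumes import "unsafe"):

// needsBackward reports whether an n-byte move from src to dst must run
// backward: true exactly when dst falls inside (src, src+n). The assembly
// returns early when delta == 0, since the move is then a no-op.
func needsBackward(dst, src unsafe.Pointer, n uintptr) bool {
	delta := uintptr(dst) - uintptr(src) // unsigned subtraction wraps when dst < src
	return delta != 0 && delta < n
}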
+
+ // Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
+ LDP (R1), (R12, R13) // Load A
+ AND $15, R7, R14 // Calculate the realignment offset
+ SUB R14, R1, R1
+ SUB R14, R0, R3 // move dst back same amount as src
+ ADD R14, R2, R2
+ LDP 16(R1), (R6, R7) // Load B
+ STP (R12, R13), (R0) // Store A
+ LDP 32(R1), (R8, R9) // Load C
+ LDP 48(R1), (R10, R11) // Load D
+ LDP.W 64(R1), (R12, R13) // Load E
+ // 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
+ SUBS $144, R2, R2
+ BLS copy64_from_end
+
+loop64:
+ STP (R6, R7), 16(R3) // Store B
+ LDP 16(R1), (R6, R7) // Load B (next iteration)
+ STP (R8, R9), 32(R3) // Store C
+ LDP 32(R1), (R8, R9) // Load C
+ STP (R10, R11), 48(R3) // Store D
+ LDP 48(R1), (R10, R11) // Load D
+ STP.W (R12, R13), 64(R3) // Store E
+ LDP.W 64(R1), (R12, R13) // Load E
+ SUBS $64, R2, R2
+ BHI loop64
+
+ // Write the last iteration and copy 64 bytes from the end.
+copy64_from_end:
+ LDP -64(R4), (R14, R15) // Load F
+ STP (R6, R7), 16(R3) // Store B
+ LDP -48(R4), (R6, R7) // Load G
+ STP (R8, R9), 32(R3) // Store C
+ LDP -32(R4), (R8, R9) // Load H
+ STP (R10, R11), 48(R3) // Store D
+ LDP -16(R4), (R10, R11) // Load I
+ STP (R12, R13), 64(R3) // Store E
+ STP (R14, R15), -64(R5) // Store F
+ STP (R6, R7), -48(R5) // Store G
+ STP (R8, R9), -32(R5) // Store H
+ STP (R10, R11), -16(R5) // Store I
+ RET
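loop64 and copy64_from_end implement the software pipelining and the tail trick described at the top of the file: each iteration's loads issue before the previous iteration's stores retire, and instead of a byte-granular tail, the final 64 bytes are always copied from the end, overlapping whatever the loop already wrote. A Go model of the forward case, hedged as a sketch (copyLongForward is illustrative, and it ignores the 16-byte realignment prologue):

func copyLongForward(dst, src []byte) {
	n := len(src) // assume n > 128 and no backward-overlap hazard
	var last [64]byte
	copy(last[:], src[n-64:]) // stage the final 64 bytes up front (Loads F..I)
	var block [64]byte
	copy(block[:], src[:64]) // prologue: Loads A..E
	off := 0
	for rem := n - 128; rem > 0; rem -= 64 {
		var next [64]byte
		copy(next[:], src[off+64:off+128]) // load the next block...
		copy(dst[off:off+64], block[:])    // ...while storing the current one
		block, off = next, off+64
	}
	copy(dst[off:off+64], block[:]) // store the last staged block
	copy(dst[n-64:], last[:])       // copy 64 bytes from the end (overlap is fine)
}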
-backwardlargeloop:
- LDP -16(R4), (R8, R10)
- STP (R8, R10), -16(R3)
- LDP.W -32(R4), (R11, R12)
- STP.W (R11, R12), -32(R3)
- CMP R9, R3
- BNE backwardlargeloop
+ // Large backward copy for overlapping copies.
+ // Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
+copy_long_backward:
+ LDP -16(R4), (R12, R13)
+ AND $15, R8, R14
+ SUB R14, R4, R4
+ SUB R14, R2, R2
+ LDP -16(R4), (R6, R7)
+ STP (R12, R13), -16(R5)
+ LDP -32(R4), (R8, R9)
+ LDP -48(R4), (R10, R11)
+ LDP.W -64(R4), (R12, R13)
+ SUB R14, R5, R5
+ SUBS $128, R2, R2
+ BLS copy64_from_start
+
+loop64_backward:
+ STP (R6, R7), -16(R5)
+ LDP -16(R4), (R6, R7)
+ STP (R8, R9), -32(R5)
+ LDP -32(R4), (R8, R9)
+ STP (R10, R11), -48(R5)
+ LDP -48(R4), (R10, R11)
+ STP.W (R12, R13), -64(R5)
+ LDP.W -64(R4), (R12, R13)
+ SUBS $64, R2, R2
+ BHI loop64_backward
+
+ // Write the last iteration and copy 64 bytes from the start.
+copy64_from_start:
+ LDP 48(R1), (R2, R3)
+ STP (R6, R7), -16(R5)
+ LDP 32(R1), (R6, R7)
+ STP (R8, R9), -32(R5)
+ LDP 16(R1), (R8, R9)
+ STP (R10, R11), -48(R5)
+ LDP (R1), (R10, R11)
+ STP (R12, R13), -64(R5)
+ STP (R2, R3), 48(R0)
+ STP (R6, R7), 32(R0)
+ STP (R8, R9), 16(R0)
+ STP (R10, R11), (R0)
RET
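copy_long_backward mirrors the forward path: it realigns on dstend or srcend, stages the first 64 bytes instead of the last, runs the pipelined loop downward, and finishes with an overlapping copy of 64 bytes from the start. The same sketch, mirrored (copyLongBackward is illustrative):

func copyLongBackward(dst, src []byte) {
	n := len(src) // assume n > 128; safe even when dst overlaps src from above
	var first [64]byte
	copy(first[:], src[:64]) // stage the first 64 bytes up front
	var block [64]byte
	end := n
	copy(block[:], src[end-64:]) // prologue load at the top of the buffer
	for rem := n - 128; rem > 0; rem -= 64 {
		var next [64]byte
		copy(next[:], src[end-128:end-64]) // load downward...
		copy(dst[end-64:end], block[:])    // ...while storing above it
		block, end = next, end-64
	}
	copy(dst[end-64:end], block[:]) // store the last staged block
	copy(dst[:64], first[:])        // copy 64 bytes from the start (overlap is fine)
}

This backward direction is what keeps in-place shifts such as copy(b[1:], b) correct, since there the destination starts inside the source range.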