// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
//
// Copies n bytes from 'from' to 'to'. The two regions may overlap:
// the copy runs forward when from >= to and backward when from < to,
// so source bytes are always read before they can be overwritten.
//
// Register roles:
//	R3	destination cursor (to)
//	R4	source cursor (from)
//	R5	n
//	R6	n & 31, the tail copied in 16/8/1-byte steps
//	R7	n &^ 31, the bytes copied in 32-byte blocks
//	R9	end-of-range pointer for the byte loops and the backward block loop
//	R8, R10, R11, R12	data scratch
//
// n and the two addresses are unsigned quantities, so every ordering
// branch on them uses an unsigned condition (BLS/BLO).
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	MOVD	to+0(FP), R3
	MOVD	from+8(FP), R4
	MOVD	n+16(FP), R5
	CBNZ	R5, check
	RET				// n == 0: nothing to copy

check:
	CMP	$16, R5
	BLS	copy16			// n <= 16 (unsigned): small-copy path

	AND	$~31, R5, R7		// R7 is N&~31
	SUB	R7, R5, R6		// R6 is N&31

	CMP	R3, R4
	BLO	backward		// from < to (unsigned): copy from the top down

	// Copying forward proceeds by copying R7/32 blocks of 32 bytes,
	// then R6 <= 31 tail bytes.
	// R3 and R4 are advanced as we copy.

	// (There may be implementations of armv8 where copying by bytes until
	// at least one of source or dest is word aligned is a worthwhile
	// optimization, but on the one tested so far (xgene) it did not
	// make a significant difference.)

	CBZ	R7, noforwardlarge	// Are there any 32-byte blocks to copy?

forwardlargeloop:
	// Copy 32 bytes at a time as two 16-byte pairs.
	// The .P forms access memory at the current address and then add 32,
	// so the second pair is addressed at -16 from the advanced pointers.
	// Because to <= from here, storing the first pair cannot overwrite
	// the second pair before it is loaded.
	LDP.P	32(R4), (R8, R10)
	STP.P	(R8, R10), 32(R3)
	LDP	-16(R4), (R11, R12)
	STP	(R11, R12), -16(R3)
	SUB	$32, R7, R7		// R7 counts the block bytes still to copy
	CBNZ	R7, forwardlargeloop

noforwardlarge:
	CBNZ	R6, forwardtail		// Do we need to copy any tail bytes?
	RET

forwardtail:
	// There are R6 <= 31 bytes remaining to copy.
	// This is large enough to still contain pointers,
	// which must be copied atomically.
	// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
	TBZ	$4, R6, 3(PC)		// write 16 bytes if R6&16 != 0
	LDP.P	16(R4), (R8, R10)
	STP.P	(R8, R10), 16(R3)

	TBZ	$3, R6, 3(PC)		// write 8 bytes if R6&8 != 0
	MOVD.P	8(R4), R8
	MOVD.P	R8, 8(R3)

	AND	$7, R6			// R6 is now the 0..7 bytes left
	CBNZ	R6, 2(PC)		// skip the RET when bytes remain
	RET

	ADD	R3, R6, R9		// R9 points just past the destination memory

forwardtailloop:
	MOVBU.P	1(R4), R8
	MOVBU.P	R8, 1(R3)
	CMP	R3, R9
	BNE	forwardtailloop
	RET

	// Small copies: 1..16 bytes.
	// Each case loads one value from the start of the source and one
	// ending at its last byte, and only then stores both. The two
	// accesses may overlap each other, and because every load precedes
	// every store the result is correct for overlapping buffers too.
copy16:
	ADD	R4, R5, R8		// R8 points just past the last source byte
	ADD	R3, R5, R9		// R9 points just past the last destination byte
	CMP	$8, R5
	BLO	copy7			// n < 8
	MOVD	(R4), R6		// n is 8..16: two 8-byte moves
	MOVD	-8(R8), R7
	MOVD	R6, (R3)
	MOVD	R7, -8(R9)
	RET

copy7:
	TBZ	$2, R5, copy3		// bit 2 clear: n < 4
	MOVWU	(R4), R6		// n is 4..7: two 4-byte moves
	MOVWU	-4(R8), R7
	MOVW	R6, (R3)
	MOVW	R7, -4(R9)
	RET

copy3:
	TBZ	$1, R5, copy1		// bit 1 clear: n == 1
	MOVHU	(R4), R6		// n is 2..3: two 2-byte moves
	MOVHU	-2(R8), R7
	MOVH	R6, (R3)
	MOVH	R7, -2(R9)
	RET

copy1:
	MOVBU	(R4), R6		// n == 1
	MOVB	R6, (R3)
	RET

backward:
	// Copying backwards first copies R6 <= 31 tail bytes, then R7/32
	// blocks of 32 bytes.
	// R3 and R4 are advanced to the end of the destination/source buffers
	// respectively and moved back as we copy.

	ADD	R4, R5, R4		// R4 points just past the last source byte
	ADD	R3, R5, R3		// R3 points just past the last destination byte

	CBZ	R6, nobackwardtail	// Is there any tail (n&31) to copy?

	AND	$7, R6, R12		// R12 is the 0..7 bytes copied one at a time
	CBZ	R12, backwardtaillarge

	SUB	R12, R3, R9		// R9 points at the lowest destination byte that should be copied by byte.
backwardtailloop:
	// Copy sub-pointer-size tail.
	// The .W forms subtract from the pointer first, then access memory.
	MOVBU.W	-1(R4), R8
	MOVBU.W	R8, -1(R3)
	CMP	R9, R3
	BNE	backwardtailloop

backwardtaillarge:
	// Do 8/16-byte write if possible.
	// As in forwardtail, these chunks may hold pointers and are moved
	// with single 8-byte or paired 16-byte accesses.
	TBZ	$3, R6, 3(PC)		// write 8 bytes if R6&8 != 0
	MOVD.W	-8(R4), R8
	MOVD.W	R8, -8(R3)

	TBZ	$4, R6, 3(PC)		// write 16 bytes if R6&16 != 0
	LDP.W	-16(R4), (R8, R10)
	STP.W	(R8, R10), -16(R3)

nobackwardtail:
	CBNZ	R7, backwardlarge	// Are there any 32-byte blocks to copy?
	RET

backwardlarge:
	SUB	R7, R3, R9		// R9 points at the lowest destination byte

backwardlargeloop:
	// Copy 32 bytes at a time, upper 16-byte pair first, then step both
	// pointers down by 32 while moving the lower pair.
	// Because from < to here, storing the upper pair cannot overwrite
	// the lower pair before it is loaded.
	LDP	-16(R4), (R8, R10)
	STP	(R8, R10), -16(R3)
	LDP.W	-32(R4), (R11, R12)
	STP.W	(R11, R12), -32(R3)
	CMP	R9, R3
	BNE	backwardlargeloop
	RET