src/runtime/memmove_arm64.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	MOVD	to+0(FP), R3
	MOVD	from+8(FP), R4
	MOVD	n+16(FP), R5
	CBNZ	R5, check
	RET

check:
	CMP	$16, R5
	BLE	copy16

	AND	$~31, R5, R7	// R7 is N&~31
	SUB	R7, R5, R6	// R6 is N&31

	CMP	R3, R4
	BLT	backward

	// Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
	// R3 and R4 are advanced as we copy.

	// (There may be implementations of armv8 where copying by bytes until
	// at least one of source or dest is word aligned is a worthwhile
	// optimization, but the on the one tested so far (xgene) it did not
	// make a significance difference.)

	CBZ	R7, noforwardlarge	// Do we need to do any quadword copying?

	ADD	R3, R7, R9	// R9 points just past where we copy by word

forwardlargeloop:
	// Copy 32 bytes at a time.
	LDP.P	32(R4), (R8, R10)
	STP.P	(R8, R10), 32(R3)
	LDP	-16(R4), (R11, R12)
	STP	(R11, R12), -16(R3)
	SUB 	$32, R7, R7
	CBNZ	R7, forwardlargeloop

noforwardlarge:
	CBNZ	R6, forwardtail		// Do we need to copy any tail bytes?
	RET

forwardtail:
	// There are R6 <= 31 bytes remaining to copy.
	// This is large enough to still contain pointers,
	// which must be copied atomically.
	// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
	TBZ	$4, R6, 3(PC)	// write 16 bytes if R6&16 != 0
	LDP.P	16(R4), (R8, R10)
	STP.P	(R8, R10), 16(R3)

	TBZ	$3, R6, 3(PC)	// write 8 bytes if R6&8 != 0
	MOVD.P	8(R4), R8
	MOVD.P	R8, 8(R3)

	AND	$7, R6
	CBNZ	R6, 2(PC)
	RET

	ADD	R3, R6, R9	// R9 points just past the destination memory

forwardtailloop:
	MOVBU.P 1(R4), R8
	MOVBU.P	R8, 1(R3)
	CMP	R3, R9
	BNE	forwardtailloop
	RET

	// Small copies: 1..16 bytes.
copy16:
	ADD	R4, R5, R8	// R8 points just past the last source byte
	ADD	R3, R5, R9	// R9 points just past the last destination byte
	CMP	$8, R5
	BLT	copy7
	MOVD	(R4), R6
	MOVD	-8(R8), R7
	MOVD	R6, (R3)
	MOVD	R7, -8(R9)
	RET

copy7:
	TBZ	$2, R5, copy3
	MOVWU	(R4), R6
	MOVWU	-4(R8), R7
	MOVW	R6, (R3)
	MOVW	R7, -4(R9)
	RET

copy3:
	TBZ	$1, R5, copy1
	MOVHU	(R4), R6
	MOVHU	-2(R8), R7
	MOVH	R6, (R3)
	MOVH	R7, -2(R9)
	RET

copy1:
	MOVBU	(R4), R6
	MOVB	R6, (R3)
	RET

backward:
	// Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
	// R3 and R4 are advanced to the end of the destination/source buffers
	// respectively and moved back as we copy.

	ADD	R4, R5, R4	// R4 points just past the last source byte
	ADD	R3, R5, R3	// R3 points just past the last destination byte

	CBZ	R6, nobackwardtail	// Do we need to do any byte-by-byte copying?

	AND	$7, R6, R12
	CBZ	R12, backwardtaillarge

	SUB	R12, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
backwardtailloop:
	// Copy sub-pointer-size tail.
	MOVBU.W	-1(R4), R8
	MOVBU.W	R8, -1(R3)
	CMP	R9, R3
	BNE	backwardtailloop

backwardtaillarge:
	// Do 8/16-byte write if possible.
	// See comment at forwardtail.
	TBZ	$3, R6, 3(PC)
	MOVD.W	-8(R4), R8
	MOVD.W	R8, -8(R3)

	TBZ	$4, R6, 3(PC)
	LDP.W	-16(R4), (R8, R10)
	STP.W	(R8, R10), -16(R3)

nobackwardtail:
	CBNZ     R7, backwardlarge	// Do we need to do any doubleword-by-doubleword copying?
	RET

backwardlarge:
	SUB	R7, R3, R9	// R9 points at the lowest destination byte

backwardlargeloop:
	LDP	-16(R4), (R8, R10)
	STP	(R8, R10), -16(R3)
	LDP.W	-32(R4), (R11, R12)
	STP.W	(R11, R12), -32(R3)
	CMP	R9, R3
	BNE	backwardlargeloop
	RET