1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
// Register roles (live across the whole routine):
//   R3 = to   (destination pointer, advanced/retreated as we copy)
//   R4 = from (source pointer, advanced/retreated as we copy)
//   R5 = n    (total byte count)
//   R7 = n &^ 31 (bytes handled by the 32-byte bulk loops)
//   R6 = n & 31  (tail bytes)
MOVD to+0(FP), R3
MOVD from+8(FP), R4
MOVD n+16(FP), R5
CBNZ R5, check // n == 0: nothing to copy
RET
check:
CMP $16, R5
BLE copy16 // n <= 16: two possibly-overlapping load/store pairs suffice
AND $~31, R5, R7 // R7 is N&~31
SUB R7, R5, R6 // R6 is N&31
// Direction choice: if from < to the buffers may overlap with the
// destination above the source, so copy backward; otherwise a
// forward copy never reads a byte it has already overwritten.
CMP R3, R4
BLT backward
// Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
// R3 and R4 are advanced as we copy.
// (There may be implementations of armv8 where copying by bytes until
// at least one of source or dest is word aligned is a worthwhile
// optimization, but on the one tested so far (xgene) it did not
// make a significant difference.)
CBZ R7, noforwardlarge // Do we need to do any quadword copying?
ADD R3, R7, R9 // R9 points just past where we copy by word
forwardlargeloop:
// Copy 32 bytes at a time: one post-indexed LDP/STP pair advances
// the pointers by 32; the second pair fills in the upper 16 bytes.
LDP.P 32(R4), (R8, R10)
STP.P (R8, R10), 32(R3)
LDP -16(R4), (R11, R12)
STP (R11, R12), -16(R3)
SUB $32, R7, R7
CBNZ R7, forwardlargeloop
noforwardlarge:
CBNZ R6, forwardtail // Do we need to copy any tail bytes?
RET
forwardtail:
// There are R6 <= 31 bytes remaining to copy.
// This is large enough to still contain pointers,
// which must be copied atomically.
// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
TBZ $4, R6, 3(PC) // write 16 bytes if R6&16 != 0 (skip the next two instructions otherwise)
LDP.P 16(R4), (R8, R10)
STP.P (R8, R10), 16(R3)
TBZ $3, R6, 3(PC) // write 8 bytes if R6&8 != 0
MOVD.P 8(R4), R8
MOVD.P R8, 8(R3)
AND $7, R6 // R6 = sub-pointer-size bytes left (0..7); safe to copy byte-by-byte
CBNZ R6, 2(PC)
RET
ADD R3, R6, R9 // R9 points just past the destination memory
forwardtailloop:
MOVBU.P 1(R4), R8
MOVBU.P R8, 1(R3)
CMP R3, R9
BNE forwardtailloop
RET
// Small copies: 1..16 bytes.
// Strategy: one access from the start of the buffer plus one access
// ending at its last byte, at the largest size that fits. The two
// accesses may overlap; both loads complete before either store, so
// this is correct even for overlapping src/dst.
copy16:
ADD R4, R5, R8 // R8 points just past the last source byte
ADD R3, R5, R9 // R9 points just past the last destination byte
CMP $8, R5
BLT copy7
MOVD (R4), R6
MOVD -8(R8), R7
MOVD R6, (R3)
MOVD R7, -8(R9)
RET
copy7:
TBZ $2, R5, copy3 // R5&4 == 0: fewer than 4 bytes
MOVWU (R4), R6
MOVWU -4(R8), R7
MOVW R6, (R3)
MOVW R7, -4(R9)
RET
copy3:
TBZ $1, R5, copy1 // R5&2 == 0: fewer than 2 bytes
MOVHU (R4), R6
MOVHU -2(R8), R7
MOVH R6, (R3)
MOVH R7, -2(R9)
RET
copy1:
// Exactly 1 byte (n != 0 was established at entry).
MOVBU (R4), R6
MOVB R6, (R3)
RET
backward:
// Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
// R3 and R4 are advanced to the end of the destination/source buffers
// respectively and moved back as we copy.
ADD R4, R5, R4 // R4 points just past the last source byte
ADD R3, R5, R3 // R3 points just past the last destination byte
CBZ R6, nobackwardtail // Do we need to do any byte-by-byte copying?
AND $7, R6, R12 // R12 = sub-pointer-size bytes in the tail (0..7)
CBZ R12, backwardtaillarge
SUB R12, R3, R9 // R9 points at the lowest destination byte that should be copied by byte.
backwardtailloop:
// Copy sub-pointer-size tail, one byte at a time, pre-decrementing.
MOVBU.W -1(R4), R8
MOVBU.W R8, -1(R3)
CMP R9, R3
BNE backwardtailloop
backwardtaillarge:
// Do 8/16-byte write if possible.
// See comment at forwardtail (pointer-size chunks must stay atomic).
TBZ $3, R6, 3(PC) // write 8 bytes if R6&8 != 0
MOVD.W -8(R4), R8
MOVD.W R8, -8(R3)
TBZ $4, R6, 3(PC) // write 16 bytes if R6&16 != 0
LDP.W -16(R4), (R8, R10)
STP.W (R8, R10), -16(R3)
nobackwardtail:
CBNZ R7, backwardlarge // Do we need to do any doubleword-by-doubleword copying?
RET
backwardlarge:
SUB R7, R3, R9 // R9 points at the lowest destination byte
backwardlargeloop:
// Copy 32 bytes at a time, moving backward: plain LDP/STP cover the
// upper 16 bytes, then a pre-indexed pair steps the pointers back 32.
LDP -16(R4), (R8, R10)
STP (R8, R10), -16(R3)
LDP.W -32(R4), (R11, R12)
STP.W (R11, R12), -32(R3)
CMP R9, R3
BNE backwardlargeloop
RET
|