aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/memmove_ppc64x.s
blob: fd16ad8129b3013010ffec54806304a3fc0949f8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le
// +build ppc64 ppc64le

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)

// Symbolic names for the registers used by memmove below.

// target (destination) address; moved along as the copy progresses
#define TGT R3
// source address; moved in step with TGT
#define SRC R4
// length to move, in bytes
#define LEN R5
// number of doublewords (LEN >> 3)
#define DWORDS R6
// number of bytes < 8 (LEN & 7)
#define BYTES R7
// const 16 used as index for the second 16-byte half of a 32-byte chunk
#define IDX16 R8
// temp used for copies, etc.
#define TMP R9
// number of 32 byte chunks (DWORDS >> 2)
#define QWORDS R10

// memmove copies n bytes from "from" to "to". The two ranges may overlap.
//
// Direction:
//   If (to - from), compared as an unsigned value, is less than n, the
//   destination starts inside the source range and the copy runs backward
//   from the end of both buffers. Otherwise it runs forward.
//
// Granularity (both directions):
//   32-byte chunks with two 16-byte VSX load/store pairs, then doublewords,
//   plus the n&7 leftover bytes (forward: moved last as word/halfword/byte;
//   backward: moved first, one byte at a time, because they sit at the end).
//
// Condition register fields:
//   CR0  scratch (ANDCC / SRDCC / CMP results)
//   CR1  DWORDS compared with 0
//   CR2  overlap test (to - from) vs n
//   CR3  copy of CR0 from "n & 7"; EQ means there are no leftover bytes
//
// Registers written: R3-R10, CTR, VS32, VS33, CR0-CR3.
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	MOVD	to+0(FP), TGT
	MOVD	from+8(FP), SRC
	MOVD	n+16(FP), LEN

	// Determine if there are doublewords to
	// copy so a more efficient move can be done
check:
	ANDCC	$7, LEN, BYTES	// R7: bytes to copy; CR0[EQ] set if none
	SRD	$3, LEN, DWORDS	// R6: double words to copy
	MOVFL	CR0, CR3	// save CR from ANDCC; CR0 is reused below
	CMP	DWORDS, $0, CR1	// CR1[EQ] set if no double words to copy

	// Determine overlap by subtracting dest - src and comparing against the
	// length.  This catches the cases where src and dest are in different types
	// of storage such as stack and static to avoid doing backward move when not
	// necessary.

	SUB	SRC, TGT, TMP	// dest - src
	CMPU	TMP, LEN, CR2	// < len?
	BC	12, 8, backward	// BLT CR2 backward

	// Copying forward if no overlap.

	BC	12, 6, checkbytes	// BEQ CR1, checkbytes
	SRDCC	$2, DWORDS, QWORDS	// 32 byte chunks?
	BEQ	lt32gt8			// < 32 bytes

	// Prepare for moves of 32 bytes at a time.

forward32setup:
	DCBTST	(TGT)			// prepare data cache
	DCBT	(SRC)
	MOVD	QWORDS, CTR		// Number of 32 byte chunks
	MOVD	$16, IDX16		// 16 for index

forward32:
	LXVD2X	(R0)(SRC), VS32		// load 16 bytes
	LXVD2X	(IDX16)(SRC), VS33	// load 16 bytes
	ADD	$32, SRC
	STXVD2X	VS32, (R0)(TGT)		// store 16 bytes
	STXVD2X	VS33, (IDX16)(TGT)
	ADD	$32, TGT		// bump up for next set
	BC	16, 0, forward32	// bdnz: continue while chunks remain
	ANDCC	$3, DWORDS		// remaining doublewords
	BEQ	checkbytes		// only bytes remain

lt32gt8:
	// At this point >= 8 and < 32 bytes of doublewords remain
	// Move 16 bytes if possible
	CMP	DWORDS, $2
	BLT	lt16
	LXVD2X	(R0)(SRC), VS32
	ADD	$-2, DWORDS
	STXVD2X	VS32, (R0)(TGT)
	ADD	$16, SRC
	ADD	$16, TGT

lt16:	// Move 8 bytes if possible
	CMP	DWORDS, $1
	BLT	checkbytes
	MOVD	0(SRC), TMP
	ADD	$8, SRC
	MOVD	TMP, 0(TGT)
	ADD	$8, TGT
checkbytes:
	BC	12, 14, LR		// BEQ CR3 lr: no leftover bytes, done
lt8:	// Move word if possible
	CMP	BYTES, $4
	BLT	lt4
	MOVWZ	0(SRC), TMP
	ADD	$-4, BYTES
	MOVW	TMP, 0(TGT)
	ADD	$4, SRC
	ADD	$4, TGT
lt4:	// Move halfword if possible
	CMP	BYTES, $2
	BLT	lt2
	MOVHZ	0(SRC), TMP
	ADD	$-2, BYTES
	MOVH	TMP, 0(TGT)
	ADD	$2, SRC
	ADD	$2, TGT
lt2:	// Move last byte if 1 left
	CMP	BYTES, $1
	BC	12, 0, LR		// blt lr: return if BYTES < 1
	MOVBZ	0(SRC), TMP
	MOVBZ	TMP, 0(TGT)
	RET

backward:
	// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
	// R3 and R4 are advanced to the end of the destination/source buffers
	// respectively and moved back as we copy.

	ADD	LEN, SRC, SRC		// end of source
	ADD	TGT, LEN, TGT		// end of dest

	BEQ	nobackwardtail		// earlier condition: CR0[EQ] from LEN & 7

	MOVD	BYTES, CTR		// bytes to move

backwardtailloop:
	MOVBZ	-1(SRC), TMP		// point to last byte
	SUB	$1, SRC
	MOVBZ	TMP, -1(TGT)
	SUB	$1, TGT
	BC	16, 0, backwardtailloop	// bdnz

nobackwardtail:
	BC	4, 5, LR		// ble CR1 lr: no doublewords, done

backwardlarge:
	MOVD	DWORDS, CTR
	// The distance must be dest - src: on this path dest >= src, so
	// src - dest would never be positive and the VSX loop below could
	// never be selected.
	SUB	SRC, TGT, TMP		// Use vsx if moving
	CMP	TMP, $32		// at least 32 byte chunks
	BLT	backwardlargeloop	// and distance >= 32
	SRDCC	$2, DWORDS, QWORDS	// 32 byte chunks
	BNE	backward32setup

backwardlargeloop:
	// Moves CTR doublewords, highest address first.
	MOVD	-8(SRC), TMP
	SUB	$8, SRC
	MOVD	TMP, -8(TGT)
	SUB	$8, TGT
	BC	16, 0, backwardlargeloop	// bdnz
	RET

backward32setup:
	MOVD	QWORDS, CTR		// set up loop ctr
	MOVD	$16, IDX16		// 32 bytes at a time

backward32loop:
	SUB	$32, TGT
	SUB	$32, SRC
	LXVD2X	(R0)(SRC), VS32		// load 32 bytes from the source
	LXVD2X	(IDX16)(SRC), VS33
	STXVD2X	VS32, (R0)(TGT)		// store 32 bytes to the destination
	STXVD2X	VS33, (IDX16)(TGT)
	BC	16, 0, backward32loop	// bdnz
	// The 32-byte loop consumed QWORDS*4 doublewords; only DWORDS & 3
	// are left. Without this reduction the doubleword loop would move
	// the full count again and run past the start of both buffers.
	ANDCC	$3, DWORDS		// remaining doublewords
	BC	12, 2, LR		// BEQ CR0 lr: nothing left, done
	MOVD	DWORDS, CTR
	BR	backwardlargeloop