// src/hash/crc32/crc32_s390x.s

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// Vector register range containing CRC-32 constants
//
// vectorizedBody fills V9..V14 with a single VLM from the start of the
// selected constant pool (·crclecons or ·crcclecons), so the order of these
// aliases mirrors the order of the 16-byte entries in those pools.

#define CONST_PERM_LE2BE        V9    // pool bytes  0..15: byte-permute mask (LE-to-BE)
#define CONST_R2R1              V10   // pool bytes 16..31: R2, R1
#define CONST_R4R3              V11   // pool bytes 32..47: R4, R3
#define CONST_R5                V12   // pool bytes 48..63: 0, R5
#define CONST_RU_POLY           V13   // pool bytes 64..79: 0, u'
#define CONST_CRC_POLY          V14   // pool bytes 80..95: 0, P'(x) << 1


// The CRC-32 constant block contains reduction constants to fold and
// process particular chunks of the input data stream in parallel.
//
// Note that the constant definitions below are extended in order to compute
// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
// The rightmost doubleword can be 0 to prevent contribution to the result or
// can be multiplied by 1 to perform an XOR without the need for a separate
// VECTOR EXCLUSIVE OR instruction.
//
// The polynomials used are bit-reflected:
//
//            IEEE: P'(x) = 0x0edb88320
//      Castagnoli: P'(x) = 0x082f63b78


// IEEE polynomial constants
//
// Layout: six 16-byte entries (96 initialized bytes), each written as two
// 8-byte DATA directives. vectorizedBody loads the six entries with one VLM
// from offset 0 into V9..V14 (CONST_PERM_LE2BE .. CONST_CRC_POLY), so the
// order of the entries below must match the order of those register aliases.
DATA    ·crclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908       // LE-to-BE mask
DATA    ·crclecons+8(SB)/8,  $0x0706050403020100
DATA    ·crclecons+16(SB)/8, $0x00000001c6e41596       // R2
DATA    ·crclecons+24(SB)/8, $0x0000000154442bd4       // R1
DATA    ·crclecons+32(SB)/8, $0x00000000ccaa009e       // R4
DATA    ·crclecons+40(SB)/8, $0x00000001751997d0       // R3
DATA    ·crclecons+48(SB)/8, $0x0000000000000000       // first doubleword of the R5 entry is zero
DATA    ·crclecons+56(SB)/8, $0x0000000163cd6124       // R5
DATA    ·crclecons+64(SB)/8, $0x0000000000000000       // first doubleword of the u' entry is zero
DATA    ·crclecons+72(SB)/8, $0x00000001F7011641       // u'
DATA    ·crclecons+80(SB)/8, $0x0000000000000000       // first doubleword of the polynomial entry is zero
DATA    ·crclecons+88(SB)/8, $0x00000001DB710641       // P'(x) << 1

// NOTE(review): the symbol is declared with 144 bytes although the DATA
// directives above initialize only the first 96 and the VLM in
// vectorizedBody reads only those 96 — confirm whether the extra space is
// intentional.
GLOBL    ·crclecons(SB),RODATA, $144

// Castagnoli polynomial constants
//
// Same layout as the IEEE pool: six 16-byte entries (96 initialized bytes)
// that vectorizedBody loads with one VLM from offset 0 into V9..V14
// (CONST_PERM_LE2BE .. CONST_CRC_POLY). The order of the entries must match
// the order of those register aliases.
DATA    ·crcclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908      // LE-to-BE mask
DATA    ·crcclecons+8(SB)/8,  $0x0706050403020100
DATA    ·crcclecons+16(SB)/8, $0x000000009e4addf8      // R2
DATA    ·crcclecons+24(SB)/8, $0x00000000740eef02      // R1
DATA    ·crcclecons+32(SB)/8, $0x000000014cd00bd6      // R4
DATA    ·crcclecons+40(SB)/8, $0x00000000f20c0dfe      // R3
DATA    ·crcclecons+48(SB)/8, $0x0000000000000000      // first doubleword of the R5 entry is zero
DATA    ·crcclecons+56(SB)/8, $0x00000000dd45aab8      // R5
DATA    ·crcclecons+64(SB)/8, $0x0000000000000000      // first doubleword of the u' entry is zero
DATA    ·crcclecons+72(SB)/8, $0x00000000dea713f1      // u'
DATA    ·crcclecons+80(SB)/8, $0x0000000000000000      // first doubleword of the polynomial entry is zero
DATA    ·crcclecons+88(SB)/8, $0x0000000105ec76f0      // P'(x) << 1

// NOTE(review): declared with 144 bytes although only the first 96 are
// initialized above and read by vectorizedBody — confirm whether the extra
// space is intentional.
GLOBL   ·crcclecons(SB),RODATA, $144

// func hasVectorFacility() bool
//
// Reports whether vector instructions can be used. Two conditions must hold:
//   1. the facility list stored by STFLE has bit 0x40 set in its byte 16
//      (facility bit 129), and
//   2. a value inserted into a vector register can be read back unchanged.
//
// The 24-byte frame holds the three doublewords written by STFLE.
TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
	// Zero the facility buffer, then ask for 3 doublewords (R0 = count - 1).
	MOVD    $facilities-24(SP), R1
	XC      $24, 0(R1), 0(R1)
	MOVD    $2, R0
	WORD    $0xB2B01000       // STFLE 0(R1)
	XOR     R0, R0            // put R0 back to zero after using it as the STFLE operand

	// Third doubleword starts 8 bytes below the top of the frame; test
	// bit 0x40 of its first byte.
	MOVBZ   third-8(SP), R1
	AND     $0x40, R1
	BEQ     unavailable

	// Facility reported; check that the vector instructions have been enabled
	// by writing 0xF into byte element 0 of V16 and reading it back.
	VLEIB   $0, $0xF, V16
	VLGVB   $0, V16, R1
	CMPBNE  R1, $0xF, unavailable

	MOVB    $1, ret+0(FP)     // vector facility usable
	RET

unavailable:
	MOVB    $0, ret+0(FP)     // vector facility not usable
	RET


// The CRC-32 function(s) use these calling conventions:
//
// Parameters:
//
//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
//      R3:    Input buffer pointer, performance might be improved if the
//             buffer is on a doubleword boundary.
//      R4:    Length of the buffer, must be 64 bytes or greater.
//
// Register usage:
//
//      R5:     CRC-32 constant pool base pointer.
//      V0:     Initial CRC value and intermediate constants and results.
//      V1..V4: Data for CRC computation.
//      V5..V8: Next data chunks that are fetched from the input buffer.
//
//      V9..V14: CRC-32 constants.

// func vectorizedIEEE(crc uint32, p []byte) uint32
//
// Sets up the register arguments of vectorizedBody for the IEEE polynomial
// and branches to it. The branch is a jump rather than a call, so
// vectorizedBody stores the result to ret+32(FP) of this frame and returns
// directly to this function's caller.
TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
	MOVD    $·crclecons(SB), R5   // R5 = IEEE constant pool base
	MOVD    p_len+16(FP), R4      // R4 = len(p)
	MOVD    p+8(FP), R3           // R3 = pointer to the first byte of p
	MOVWZ   crc+0(FP), R2         // R2 = incoming CRC, zero-extended

	BR      vectorizedBody<>(SB)

// func vectorizedCastagnoli(crc uint32, p []byte) uint32
//
// Sets up the register arguments of vectorizedBody for the Castagnoli
// polynomial and branches to it. The branch is a jump rather than a call, so
// vectorizedBody stores the result to ret+32(FP) of this frame and returns
// directly to this function's caller.
TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
	MOVD    $·crcclecons(SB), R5  // R5 = Castagnoli constant pool base
	MOVD    p_len+16(FP), R4      // R4 = len(p)
	MOVD    p+8(FP), R3           // R3 = pointer to the first byte of p
	MOVWZ   crc+0(FP), R2         // R2 = incoming CRC, zero-extended

	BR      vectorizedBody<>(SB)

// vectorizedBody is the CRC-32 kernel shared by vectorizedIEEE and
// vectorizedCastagnoli. It is entered with BR (not a call), so the FP
// reference at the end addresses the argument frame of those functions.
//
// On entry:
//      R2: initial CRC value
//      R3: input buffer pointer
//      R4: length of the buffer in bytes; must be 64 or greater. A shorter
//          length makes the routine fault deliberately (see crash below)
//          instead of reading past the end of the buffer.
//      R5: CRC-32 constant pool base pointer
//
// Only whole 16-byte chunks are consumed: after the first 64 bytes the input
// is folded 64 bytes at a time, then 16 bytes at a time; a trailing remainder
// of fewer than 16 bytes is not read.
//
// The result is stored to ret+32(FP).
TEXT vectorizedBody<>(SB),NOSPLIT,$0
	XOR     $0xffffffff, R2 // NOTW R2
	VLM     0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY

	// Load the initial CRC value into the rightmost word of V0
	VZERO   V0
	VLVGF   $3, R2, V0

	// The first load below reads 64 bytes unconditionally, so refuse
	// anything shorter rather than read beyond the buffer.
	CMP     R4, $64
	BLT     crash

	// Load a 64-byte data chunk and XOR with CRC
	VLM     0(R3), V1, V4    // 64-bytes into V1..V4

	// Reflect the data if the CRC operation is in the bit-reflected domain
	VPERM   V1, V1, CONST_PERM_LE2BE, V1
	VPERM   V2, V2, CONST_PERM_LE2BE, V2
	VPERM   V3, V3, CONST_PERM_LE2BE, V3
	VPERM   V4, V4, CONST_PERM_LE2BE, V4

	VX      V0, V1, V1     // V1 ^= CRC
	ADD     $64, R3        // BUF = BUF + 64
	ADD     $(-64), R4     // LEN = LEN - 64

	// Check remaining buffer size and jump to proper folding method
	CMP     R4, $64
	BLT     less_than_64bytes

fold_64bytes_loop:
	// Load the next 64-byte data chunk into V5 to V8
	VLM     0(R3), V5, V8
	VPERM   V5, V5, CONST_PERM_LE2BE, V5
	VPERM   V6, V6, CONST_PERM_LE2BE, V6
	VPERM   V7, V7, CONST_PERM_LE2BE, V7
	VPERM   V8, V8, CONST_PERM_LE2BE, V8


	// Perform a GF(2) multiplication of the doublewords in V1 with
	// the reduction constants in CONST_R2R1.  The intermediate result
	// is then folded (accumulated) with the next data chunk in V5 and
	// stored in V1.  Repeat this step for the register contents
	// in V2, V3, and V4 respectively.

	VGFMAG  CONST_R2R1, V1, V5, V1
	VGFMAG  CONST_R2R1, V2, V6, V2
	VGFMAG  CONST_R2R1, V3, V7, V3
	VGFMAG  CONST_R2R1, V4, V8, V4

	// Adjust buffer pointer and length for next loop
	ADD     $64, R3                  // BUF = BUF + 64
	ADD     $(-64), R4               // LEN = LEN - 64

	CMP     R4, $64
	BGE     fold_64bytes_loop

less_than_64bytes:
	// Fold V1 to V4 into a single 128-bit value in V1
	VGFMAG  CONST_R4R3, V1, V2, V1
	VGFMAG  CONST_R4R3, V1, V3, V1
	VGFMAG  CONST_R4R3, V1, V4, V1

	// Check whether at least one more 16-byte chunk remains to be folded
	CMP     R4, $16
	BLT     final_fold

fold_16bytes_loop:
	VL      0(R3), V2               // Load next data chunk
	VPERM   V2, V2, CONST_PERM_LE2BE, V2

	VGFMAG  CONST_R4R3, V1, V2, V1  // Fold next data chunk

	// Adjust buffer pointer and size for folding next data chunk
	ADD     $16, R3
	ADD     $-16, R4

	// Process remaining data chunks
	CMP     R4, $16
	BGE     fold_16bytes_loop

final_fold:
	// From here on V9 is reused as the byte-shift control for VSRLB.
	// This overwrites CONST_PERM_LE2BE, which is no longer needed because
	// no further input is loaded.

	// Build V0 from CONST_R4R3 shifted right by bytes (shift control 0x40
	// in byte element 7 of V9), then set doubleword element 0 of V0 to 1.
	VLEIB   $7, $0x40, V9
	VSRLB   V9, CONST_R4R3, V0
	VLEIG   $0, $1, V0

	VGFMG   V0, V1, V1

	VLEIB   $7, $0x20, V9         // Shift by words
	VSRLB   V9, V1, V2            // Store remaining bits in V2
	VUPLLF  V1, V1                // Split rightmost doubleword
	VGFMAG  CONST_R5, V1, V2, V1  // V1 = (V1 * R5) XOR V2


	// The input values to the Barrett reduction are the degree-63 polynomial
	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
	// constant u.  The Barrett reduction result is the CRC value of R(x) mod
	// P(x).
	//
	// The Barrett reduction algorithm is defined as:
	//
	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
	//    3. C(x)  = R(x) XOR T2(x) mod x^32
	//
	// Note: To compensate the division by x^32, use the vector unpack
	// instruction to move the leftmost word into the leftmost doubleword
	// of the vector register.  The rightmost doubleword is multiplied
	// with zero to not contribute to the intermediate results.


	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
	VUPLLF  V1, V2
	VGFMG   CONST_RU_POLY, V2, V2


	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY
	// with T1(x) in V2 and XOR the intermediate result, T2(x), with the
	// value in V1.  The final result is read from word element 2 of V2.

	VUPLLF  V2, V2
	VGFMAG  CONST_CRC_POLY, V2, V1, V2

done:
	VLGVF   $2, V2, R2
	XOR     $0xffffffff, R2 // NOTW R2
	MOVWZ   R2, ret + 32(FP)
	RET

crash:
	// Input size is less than 64 bytes.  Fault on purpose by storing
	// through R0 (expected to hold zero, i.e. a store to address 0)
	// rather than returning a CRC computed from out-of-bounds data.
	MOVD    $0, (R0)