aboutsummaryrefslogtreecommitdiff
path: root/src/hash
diff options
context:
space:
mode:
authorChris Zou <chriszou@ca.ibm.com>2016-04-18 19:30:17 -0400
committerBrad Fitzpatrick <bradfitz@golang.org>2016-04-22 18:07:15 +0000
commit5833d843de4d774f6b76e982b170f42b8e94a198 (patch)
treeb4e4cd0ddc3ec1676993df6a399955f10e47330d /src/hash
parent7879e9193b39e6455ae03f2baace9c41f6393ee4 (diff)
downloadgo-5833d843de4d774f6b76e982b170f42b8e94a198.tar.gz
go-5833d843de4d774f6b76e982b170f42b8e94a198.zip
hash/crc32: use vector instructions on s390x
The input buffer is aligned to a doubleword boundary to improve performance of the vector instructions. The pure Go implementation is used to align the input data, and is also used when the vector instructions are not available or the data length is less than 64 bytes. Change-Id: Ie259a5f2f1562bcc17961c99e5776c99091d6bed Reviewed-on: https://go-review.googlesource.com/22201 Reviewed-by: Michael Munday <munday@ca.ibm.com> Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Bill O'Farrell <billotosyr@gmail.com> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/hash')
-rw-r--r--src/hash/crc32/crc32_generic.go2
-rw-r--r--src/hash/crc32/crc32_s390x.go101
-rw-r--r--src/hash/crc32/crc32_s390x.s245
3 files changed, 347 insertions, 1 deletions
diff --git a/src/hash/crc32/crc32_generic.go b/src/hash/crc32/crc32_generic.go
index 62fa72028c..10a6367bc9 100644
--- a/src/hash/crc32/crc32_generic.go
+++ b/src/hash/crc32/crc32_generic.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!amd64p32
+// +build !amd64,!amd64p32,!s390x
package crc32
diff --git a/src/hash/crc32/crc32_s390x.go b/src/hash/crc32/crc32_s390x.go
new file mode 100644
index 0000000000..2f20690389
--- /dev/null
+++ b/src/hash/crc32/crc32_s390x.go
@@ -0,0 +1,101 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package crc32
+
+import (
+ "unsafe"
+)
+
+const (
+ vxMinLen = 64
+ vxAlignment = 16
+ vxAlignMask = vxAlignment - 1
+)
+
+// hasVectorFacility reports whether the machine has the z/Architecture
+// vector facility installed and enabled.
+func hasVectorFacility() bool
+
+var hasVX = hasVectorFacility()
+
+// vectorizedCastagnoli implements CRC32 using vector instructions.
+// It is defined in crc32_s390x.s.
+//go:noescape
+func vectorizedCastagnoli(crc uint32, p []byte) uint32
+
+// vectorizedIEEE implements CRC32 using vector instructions.
+// It is defined in crc32_s390x.s.
+//go:noescape
+func vectorizedIEEE(crc uint32, p []byte) uint32
+
+func genericCastagnoli(crc uint32, p []byte) uint32 {
+ // Use slicing-by-8 on larger inputs.
+ if len(p) >= sliceBy8Cutoff {
+ return updateSlicingBy8(crc, castagnoliTable8, p)
+ }
+ return update(crc, castagnoliTable, p)
+}
+
+func genericIEEE(crc uint32, p []byte) uint32 {
+ // Use slicing-by-8 on larger inputs.
+ if len(p) >= sliceBy8Cutoff {
+ ieeeTable8Once.Do(func() {
+ ieeeTable8 = makeTable8(IEEE)
+ })
+ return updateSlicingBy8(crc, ieeeTable8, p)
+ }
+ return update(crc, IEEETable, p)
+}
+
+// updateCastagnoli calculates the checksum of p using genericCastagnoli to
+// align the data appropriately for vectorCastagnoli. It avoids using
+// vectorCastagnoli entirely if the length of p is less than or equal to
+// vxMinLen.
+func updateCastagnoli(crc uint32, p []byte) uint32 {
+ // Use vectorized function if vector facility is available and
+ // data length is above threshold.
+ if hasVX && len(p) > vxMinLen {
+ pAddr := uintptr(unsafe.Pointer(&p[0]))
+ if pAddr&vxAlignMask != 0 {
+ prealign := vxAlignment - int(pAddr&vxAlignMask)
+ crc = genericCastagnoli(crc, p[:prealign])
+ p = p[prealign:]
+ }
+ aligned := len(p) & ^vxAlignMask
+ crc = vectorizedCastagnoli(crc, p[:aligned])
+ p = p[aligned:]
+ // process remaining data
+ if len(p) > 0 {
+ crc = genericCastagnoli(crc, p)
+ }
+ return crc
+ }
+ return genericCastagnoli(crc, p)
+}
+
+// updateIEEE calculates the checksum of p using genericIEEE to align the data
+// appropriately for vectorIEEE. It avoids using vectorIEEE entirely if the length
+// of p is less than or equal to vxMinLen.
+func updateIEEE(crc uint32, p []byte) uint32 {
+ // Use vectorized function if vector facility is available and
+ // data length is above threshold.
+ if hasVX && len(p) > vxMinLen {
+ pAddr := uintptr(unsafe.Pointer(&p[0]))
+ if pAddr&vxAlignMask != 0 {
+ prealign := vxAlignment - int(pAddr&vxAlignMask)
+ crc = genericIEEE(crc, p[:prealign])
+ p = p[prealign:]
+ }
+ aligned := len(p) & ^vxAlignMask
+ crc = vectorizedIEEE(crc, p[:aligned])
+ p = p[aligned:]
+ // process remaining data
+ if len(p) > 0 {
+ crc = genericIEEE(crc, p)
+ }
+ return crc
+ }
+ return genericIEEE(crc, p)
+}
diff --git a/src/hash/crc32/crc32_s390x.s b/src/hash/crc32/crc32_s390x.s
new file mode 100644
index 0000000000..f8d39f3df9
--- /dev/null
+++ b/src/hash/crc32/crc32_s390x.s
@@ -0,0 +1,245 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// Vector register range containing CRC-32 constants
+
+#define CONST_PERM_LE2BE V9
+#define CONST_R2R1 V10
+#define CONST_R4R3 V11
+#define CONST_R5 V12
+#define CONST_RU_POLY V13
+#define CONST_CRC_POLY V14
+
+
+// The CRC-32 constant block contains reduction constants to fold and
+// process particular chunks of the input data stream in parallel.
+//
+// Note that the constant definitions below are extended in order to compute
+// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
+// The rightmost doubleword can be 0 to prevent contribution to the result or
+// can be multiplied by 1 to perform an XOR without the need for a separate
+// VECTOR EXCLUSIVE OR instruction.
+//
+// The polynomials used are bit-reflected:
+//
+// IEEE: P'(x) = 0x0edb88320
+// Castagnoli: P'(x) = 0x082f63b78
+
+
+// IEEE polynomial constants
+DATA ·crclecons+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
+DATA ·crclecons+8(SB)/8, $0x0706050403020100
+DATA ·crclecons+16(SB)/8, $0x00000001c6e41596 // R2
+DATA ·crclecons+24(SB)/8, $0x0000000154442bd4 // R1
+DATA ·crclecons+32(SB)/8, $0x00000000ccaa009e // R4
+DATA ·crclecons+40(SB)/8, $0x00000001751997d0 // R3
+DATA ·crclecons+48(SB)/8, $0x0000000000000000
+DATA ·crclecons+56(SB)/8, $0x0000000163cd6124 // R5
+DATA ·crclecons+64(SB)/8, $0x0000000000000000
+DATA ·crclecons+72(SB)/8, $0x00000001F7011641 // u'
+DATA ·crclecons+80(SB)/8, $0x0000000000000000
+DATA ·crclecons+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
+
+GLOBL ·crclecons(SB),RODATA, $144
+
+// Castagonli Polynomial constants
+DATA ·crcclecons+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
+DATA ·crcclecons+8(SB)/8, $0x0706050403020100
+DATA ·crcclecons+16(SB)/8, $0x000000009e4addf8 // R2
+DATA ·crcclecons+24(SB)/8, $0x00000000740eef02 // R1
+DATA ·crcclecons+32(SB)/8, $0x000000014cd00bd6 // R4
+DATA ·crcclecons+40(SB)/8, $0x00000000f20c0dfe // R3
+DATA ·crcclecons+48(SB)/8, $0x0000000000000000
+DATA ·crcclecons+56(SB)/8, $0x00000000dd45aab8 // R5
+DATA ·crcclecons+64(SB)/8, $0x0000000000000000
+DATA ·crcclecons+72(SB)/8, $0x00000000dea713f1 // u'
+DATA ·crcclecons+80(SB)/8, $0x0000000000000000
+DATA ·crcclecons+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
+
+GLOBL ·crcclecons(SB),RODATA, $144
+
+// func hasVectorFacility() bool
+TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
+ MOVD $x-24(SP), R1
+ XC $24, 0(R1), 0(R1) // clear the storage
+ MOVD $2, R0 // R0 is the number of double words stored -1
+ WORD $0xB2B01000 // STFLE 0(R1)
+ XOR R0, R0 // reset the value of R0
+ MOVBZ z-8(SP), R1
+ AND $0x40, R1
+ BEQ novector
+vectorinstalled:
+ // check if the vector instruction has been enabled
+ VLEIB $0, $0xF, V16
+ VLGVB $0, V16, R1
+ CMPBNE R1, $0xF, novector
+ MOVB $1, ret+0(FP) // have vx
+ RET
+novector:
+ MOVB $0, ret+0(FP) // no vx
+ RET
+
+
+// The CRC-32 function(s) use these calling conventions:
+//
+// Parameters:
+//
+// R2: Initial CRC value, typically ~0; and final CRC (return) value.
+// R3: Input buffer pointer, performance might be improved if the
+// buffer is on a doubleword boundary.
+// R4: Length of the buffer, must be 64 bytes or greater.
+//
+// Register usage:
+//
+// R5: CRC-32 constant pool base pointer.
+// V0: Initial CRC value and intermediate constants and results.
+// V1..V4: Data for CRC computation.
+// V5..V8: Next data chunks that are fetched from the input buffer.
+//
+// V9..V14: CRC-32 constants.
+
+// func vectorizedIEEE(crc uint32, p []byte) uint32
+TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
+ MOVWZ crc+0(FP), R2 // R2 stores the CRC value
+ MOVD p+8(FP), R3 // data pointer
+ MOVD p_len+16(FP), R4 // len(p)
+
+ MOVD $·crclecons(SB), R5
+ BR vectorizedBody<>(SB)
+
+// func vectorizedCastagnoli(crc uint32, p []byte) uint32
+TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
+ MOVWZ crc+0(FP), R2 // R2 stores the CRC value
+ MOVD p+8(FP), R3 // data pointer
+ MOVD p_len+16(FP), R4 // len(p)
+
+ // R5: crc-32 constant pool base pointer, constant is used to reduce crc
+ MOVD $·crcclecons(SB), R5
+ BR vectorizedBody<>(SB)
+
+TEXT vectorizedBody<>(SB),NOSPLIT,$0
+ XOR $0xffffffff, R2 // NOTW R2
+ VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
+
+ // Load the initial CRC value into the rightmost word of V0
+ VZERO V0
+ VLVGF $3, R2, V0
+
+ // Load a 64-byte data chunk and XOR with CRC
+ VLM 0(R3), V1, V4 // 64-bytes into V1..V4
+
+ // Reflect the data if the CRC operation is in the bit-reflected domain
+ VPERM V1, V1, CONST_PERM_LE2BE, V1
+ VPERM V2, V2, CONST_PERM_LE2BE, V2
+ VPERM V3, V3, CONST_PERM_LE2BE, V3
+ VPERM V4, V4, CONST_PERM_LE2BE, V4
+
+ VX V0, V1, V1 // V1 ^= CRC
+ ADD $64, R3 // BUF = BUF + 64
+ ADD $(-64), R4
+
+ // Check remaining buffer size and jump to proper folding method
+ CMP R4, $64
+ BLT less_than_64bytes
+
+fold_64bytes_loop:
+ // Load the next 64-byte data chunk into V5 to V8
+ VLM 0(R3), V5, V8
+ VPERM V5, V5, CONST_PERM_LE2BE, V5
+ VPERM V6, V6, CONST_PERM_LE2BE, V6
+ VPERM V7, V7, CONST_PERM_LE2BE, V7
+ VPERM V8, V8, CONST_PERM_LE2BE, V8
+
+
+ // Perform a GF(2) multiplication of the doublewords in V1 with
+ // the reduction constants in V0. The intermediate result is
+ // then folded (accumulated) with the next data chunk in V5 and
+ // stored in V1. Repeat this step for the register contents
+ // in V2, V3, and V4 respectively.
+
+ VGFMAG CONST_R2R1, V1, V5, V1
+ VGFMAG CONST_R2R1, V2, V6, V2
+ VGFMAG CONST_R2R1, V3, V7, V3
+ VGFMAG CONST_R2R1, V4, V8 ,V4
+
+ // Adjust buffer pointer and length for next loop
+ ADD $64, R3 // BUF = BUF + 64
+ ADD $(-64), R4 // LEN = LEN - 64
+
+ CMP R4, $64
+ BGE fold_64bytes_loop
+
+less_than_64bytes:
+ // Fold V1 to V4 into a single 128-bit value in V1
+ VGFMAG CONST_R4R3, V1, V2, V1
+ VGFMAG CONST_R4R3, V1, V3, V1
+ VGFMAG CONST_R4R3, V1, V4, V1
+
+ // Check whether to continue with 64-bit folding
+ CMP R4, $16
+ BLT final_fold
+
+fold_16bytes_loop:
+ VL 0(R3), V2 // Load next data chunk
+ VPERM V2, V2, CONST_PERM_LE2BE, V2
+
+ VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
+
+ // Adjust buffer pointer and size for folding next data chunk
+ ADD $16, R3
+ ADD $-16, R4
+
+ // Process remaining data chunks
+ CMP R4 ,$16
+ BGE fold_16bytes_loop
+
+final_fold:
+ VLEIB $7, $0x40, V9
+ VSRLB V9, CONST_R4R3, V0
+ VLEIG $0, $1, V0
+
+ VGFMG V0, V1, V1
+
+ VLEIB $7, $0x20, V9 // Shift by words
+ VSRLB V9, V1, V2 // Store remaining bits in V2
+ VUPLLF V1, V1 // Split rightmost doubleword
+ VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
+
+
+ // The input values to the Barret reduction are the degree-63 polynomial
+ // in V1 (R(x)), degree-32 generator polynomial, and the reduction
+ // constant u. The Barret reduction result is the CRC value of R(x) mod
+ // P(x).
+ //
+ // The Barret reduction algorithm is defined as:
+ //
+ // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ // 3. C(x) = R(x) XOR T2(x) mod x^32
+ //
+ // Note: To compensate the division by x^32, use the vector unpack
+ // instruction to move the leftmost word into the leftmost doubleword
+ // of the vector register. The rightmost doubleword is multiplied
+ // with zero to not contribute to the intermedate results.
+
+
+ // T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ VUPLLF V1, V2
+ VGFMG CONST_RU_POLY, V2, V2
+
+
+ // Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
+ // V2 and XOR the intermediate result, T2(x), with the value in V1.
+ // The final result is in the rightmost word of V2.
+
+ VUPLLF V2 , V2
+ VGFMAG CONST_CRC_POLY, V2, V1, V2
+
+done:
+ VLGVF $2, V2, R2
+ XOR $0xffffffff, R2 // NOTW R2
+ MOVWZ R2, ret + 32(FP)
+ RET