aboutsummaryrefslogtreecommitdiff
path: root/src/hash
diff options
context:
space:
mode:
authorKlaus Post <klauspost@gmail.com>2016-03-08 15:57:12 +0100
committerBrad Fitzpatrick <bradfitz@golang.org>2016-03-08 16:46:24 +0000
commitb212c68b9007ad328da81b3d589c032ba0de3434 (patch)
treeb244bf89d112bdb4258debf1fb304a10d7550dd5 /src/hash
parent39214275d6dd89f91ee2b5162698777a97cd6e72 (diff)
downloadgo-b212c68b9007ad328da81b3d589c032ba0de3434.tar.gz
go-b212c68b9007ad328da81b3d589c032ba0de3434.zip
hash/crc32: use slicing by 8 for Castagnoli and smaller sizes
This adds "slicing by 8" optimization to Castagnoli tables which will speed up CRC32 calculation on systems without asssembler, which are all but AMD64. In my tests, it is faster to use "slicing by 8" for sizes all down to 16 bytes, so the switchover point has been adjusted. There are no benchmarks for small sizes, so I have added one for 40 bytes, as well as one for bigger sizes (32KB). Castagnoli, No assembler, 40 Byte payload: (before, after) BenchmarkCastagnoli40B-4 10000000 161 ns/op 246.94 MB/s BenchmarkCastagnoli40B-4 20000000 100 ns/op 398.01 MB/s Castagnoli, No assembler, 32KB payload: (before, after) BenchmarkCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s BenchmarkCastagnoli32KB-4 30000 45171 ns/op 725.41 MB/s IEEE, No assembler, 1KB payload: (before, after) BenchmarkCrc1KB-4 500000 3604 ns/op 284.10 MB/s BenchmarkCrc1KB-4 1000000 1463 ns/op 699.79 MB/s Compared: benchmark old ns/op new ns/op delta BenchmarkCastagnoli40B-4 161 100 -37.89% BenchmarkCastagnoli32KB-4 115426 45171 -60.87% BenchmarkCrc1KB-4 3604 1463 -59.41% benchmark old MB/s new MB/s speedup BenchmarkCastagnoli40B-4 246.94 398.01 1.61x BenchmarkCastagnoli32KB-4 283.89 725.41 2.56x BenchmarkCrc1KB-4 284.10 699.79 2.46x Change-Id: I303e4ec84e8d4dafd057d64c0e43deb2b498e968 Reviewed-on: https://go-review.googlesource.com/19335 Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/hash')
-rw-r--r--src/hash/crc32/crc32.go10
-rw-r--r--src/hash/crc32/crc32_amd64.go8
-rw-r--r--src/hash/crc32/crc32_amd64p32.go8
-rw-r--r--src/hash/crc32/crc32_generic.go12
-rw-r--r--src/hash/crc32/crc32_test.go63
5 files changed, 62 insertions, 39 deletions
diff --git a/src/hash/crc32/crc32.go b/src/hash/crc32/crc32.go
index dc5994885f..c3ac7b80c3 100644
--- a/src/hash/crc32/crc32.go
+++ b/src/hash/crc32/crc32.go
@@ -20,6 +20,9 @@ import (
// The size of a CRC-32 checksum in bytes.
const Size = 4
+// Use "slice by 8" when payload >= this value.
+const sliceBy8Cutoff = 16
+
// Predefined polynomials.
const (
// IEEE is by far and away the most common CRC-32 polynomial.
@@ -45,10 +48,12 @@ type Table [256]uint32
// Castagnoli table so we can compare against it to find when the caller is
// using this polynomial.
var castagnoliTable *Table
+var castagnoliTable8 *slicing8Table
var castagnoliOnce sync.Once
func castagnoliInit() {
castagnoliTable = makeTable(Castagnoli)
+ castagnoliTable8 = makeTable8(Castagnoli)
}
// IEEETable is the table for the IEEE polynomial.
@@ -146,6 +151,9 @@ func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 {
p = p[8:]
}
crc = ^crc
+ if len(p) == 0 {
+ return crc
+ }
return update(crc, &tab[0], p)
}
@@ -178,4 +186,4 @@ func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }
// ChecksumIEEE returns the CRC-32 checksum of data
// using the IEEE polynomial.
-func ChecksumIEEE(data []byte) uint32 { return Update(0, IEEETable, data) }
+func ChecksumIEEE(data []byte) uint32 { return updateIEEE(0, data) }
diff --git a/src/hash/crc32/crc32_amd64.go b/src/hash/crc32/crc32_amd64.go
index ab4e2b8c8c..a0180a12de 100644
--- a/src/hash/crc32/crc32_amd64.go
+++ b/src/hash/crc32/crc32_amd64.go
@@ -30,6 +30,10 @@ func updateCastagnoli(crc uint32, p []byte) uint32 {
if sse42 {
return castagnoliSSE42(crc, p)
}
+ // Use slicing-by-8 on larger inputs.
+ if len(p) >= sliceBy8Cutoff {
+ return updateSlicingBy8(crc, castagnoliTable8, p)
+ }
return update(crc, castagnoliTable, p)
}
@@ -44,8 +48,8 @@ func updateIEEE(crc uint32, p []byte) uint32 {
return crc
}
- // only use slicing-by-8 when input is >= 4KB
- if len(p) >= 4096 {
+ // Use slicing-by-8 on larger inputs.
+ if len(p) >= sliceBy8Cutoff {
ieeeTable8Once.Do(func() {
ieeeTable8 = makeTable8(IEEE)
})
diff --git a/src/hash/crc32/crc32_amd64p32.go b/src/hash/crc32/crc32_amd64p32.go
index 067fbb162f..1f6cd34643 100644
--- a/src/hash/crc32/crc32_amd64p32.go
+++ b/src/hash/crc32/crc32_amd64p32.go
@@ -22,12 +22,16 @@ func updateCastagnoli(crc uint32, p []byte) uint32 {
if sse42 {
return castagnoliSSE42(crc, p)
}
+ // Use slicing-by-8 on larger inputs.
+ if len(p) >= sliceBy8Cutoff {
+ return updateSlicingBy8(crc, castagnoliTable8, p)
+ }
return update(crc, castagnoliTable, p)
}
func updateIEEE(crc uint32, p []byte) uint32 {
- // only use slicing-by-8 when input is >= 4KB
- if len(p) >= 4096 {
+ // Use slicing-by-8 on larger inputs.
+ if len(p) >= sliceBy8Cutoff {
ieeeTable8Once.Do(func() {
ieeeTable8 = makeTable8(IEEE)
})
diff --git a/src/hash/crc32/crc32_generic.go b/src/hash/crc32/crc32_generic.go
index 8fc11a75db..08988f4b38 100644
--- a/src/hash/crc32/crc32_generic.go
+++ b/src/hash/crc32/crc32_generic.go
@@ -6,16 +6,20 @@
package crc32
-// The file contains the generic version of updateCastagnoli which just calls
-// the software implementation.
+// This file contains the generic version of updateCastagnoli which does
+// slicing-by-8, or uses the fallback for very small sizes.
func updateCastagnoli(crc uint32, p []byte) uint32 {
+ // Use slicing-by-8 on larger inputs.
+ if len(p) >= sliceBy8Cutoff {
+ return updateSlicingBy8(crc, castagnoliTable8, p)
+ }
return update(crc, castagnoliTable, p)
}
func updateIEEE(crc uint32, p []byte) uint32 {
- // only use slicing-by-8 when input is >= 4KB
- if len(p) >= 4096 {
+ // Use slicing-by-8 on larger inputs.
+ if len(p) >= sliceBy8Cutoff {
ieeeTable8Once.Do(func() {
ieeeTable8 = makeTable8(IEEE)
})
diff --git a/src/hash/crc32/crc32_test.go b/src/hash/crc32/crc32_test.go
index 1ca3ac2a27..e2b3557828 100644
--- a/src/hash/crc32/crc32_test.go
+++ b/src/hash/crc32/crc32_test.go
@@ -5,6 +5,7 @@
package crc32
import (
+ "hash"
"io"
"testing"
)
@@ -81,49 +82,51 @@ func TestGolden(t *testing.T) {
}
}
-func BenchmarkIEEECrc1KB(b *testing.B) {
- b.SetBytes(1024)
- data := make([]byte, 1024)
- for i := range data {
- data[i] = byte(i)
- }
- h := NewIEEE()
- in := make([]byte, 0, h.Size())
+func BenchmarkIEEECrc40B(b *testing.B) {
+ benchmark(b, NewIEEE(), 40)
+}
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- h.Reset()
- h.Write(data)
- h.Sum(in)
- }
+func BenchmarkIEEECrc1KB(b *testing.B) {
+ benchmark(b, NewIEEE(), 1<<10)
}
func BenchmarkIEEECrc4KB(b *testing.B) {
- b.SetBytes(4096)
- data := make([]byte, 4096)
- for i := range data {
- data[i] = byte(i)
- }
- h := NewIEEE()
- in := make([]byte, 0, h.Size())
+ benchmark(b, NewIEEE(), 4<<10)
+}
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- h.Reset()
- h.Write(data)
- h.Sum(in)
- }
+func BenchmarkIEEECrc32KB(b *testing.B) {
+ benchmark(b, NewIEEE(), 32<<10)
+}
+
+func BenchmarkCastagnoliCrc40B(b *testing.B) {
+ benchmark(b, New(MakeTable(Castagnoli)), 40)
}
func BenchmarkCastagnoliCrc1KB(b *testing.B) {
- b.SetBytes(1024)
- data := make([]byte, 1024)
+ benchmark(b, New(MakeTable(Castagnoli)), 1<<10)
+}
+
+func BenchmarkCastagnoliCrc4KB(b *testing.B) {
+ benchmark(b, New(MakeTable(Castagnoli)), 4<<10)
+}
+
+func BenchmarkCastagnoliCrc32KB(b *testing.B) {
+ benchmark(b, New(MakeTable(Castagnoli)), 32<<10)
+}
+
+func benchmark(b *testing.B, h hash.Hash32, n int64) {
+ b.SetBytes(n)
+ data := make([]byte, n)
for i := range data {
data[i] = byte(i)
}
- h := New(MakeTable(Castagnoli))
in := make([]byte, 0, h.Size())
+ // Warm up
+ h.Reset()
+ h.Write(data)
+ h.Sum(in)
+
b.ResetTimer()
for i := 0; i < b.N; i++ {
h.Reset()