aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEvan Shaw <chickencha@gmail.com>2010-11-08 17:33:53 -0800
committerRob Pike <r@golang.org>2010-11-08 17:33:53 -0800
commit49fdfe21dd5d1a08b1edaac5c0caeedf492d32e2 (patch)
tree66bc5da9747df76e04fbc5e273ca3f77c4acaa5f
parente9c901dbf4673443cc151d2e53100c0ebff48a44 (diff)
downloadgo-49fdfe21dd5d1a08b1edaac5c0caeedf492d32e2.tar.gz
go-49fdfe21dd5d1a08b1edaac5c0caeedf492d32e2.zip
bytes: SSE for bytes.IndexByte on amd64
Performance on 2.8 GHz Intel Core i7: Before: BenchmarkIndexByte4K 1000000 2997 ns/op 1366.70 MB/s BenchmarkIndexByte4M 500 3049772 ns/op 1375.28 MB/s BenchmarkIndexByte64M 50 49582280 ns/op 1353.48 MB/s After: BenchmarkIndexByte4K 10000000 298 ns/op 13744.97 MB/s BenchmarkIndexByte4M 10000 285993 ns/op 14665.76 MB/s BenchmarkIndexByte64M 500 4618172 ns/op 14531.48 MB/s R=rsc, PeterGo, r2, r CC=golang-dev https://golang.org/cl/2888041
-rw-r--r--src/pkg/bytes/asm_amd64.s93
-rw-r--r--src/pkg/bytes/bytes_test.go53
2 files changed, 137 insertions, 9 deletions
diff --git a/src/pkg/bytes/asm_amd64.s b/src/pkg/bytes/asm_amd64.s
index 7e78700ecf..c6793cbdcc 100644
--- a/src/pkg/bytes/asm_amd64.s
+++ b/src/pkg/bytes/asm_amd64.s
@@ -3,15 +3,90 @@
// license that can be found in the LICENSE file.
TEXT ·IndexByte(SB),7,$0
- MOVQ p+0(FP), SI
- MOVL len+8(FP), CX
- MOVB b+16(FP), AL
- MOVQ SI, DI
+ MOVQ p+0(FP), SI
+ MOVL len+8(FP), BX
+ MOVB b+16(FP), AL
+ MOVQ SI, DI
+
+ CMPL BX, $16
+ JLT small
+
+ // round up to first 16-byte boundary
+ TESTQ $15, SI
+ JZ aligned
+ MOVQ SI, CX
+ ANDQ $~15, CX
+ ADDQ $16, CX
+
+ // search the beginning
+ SUBQ SI, CX
+ REPN; SCASB
+ JZ success
+
+// DI is 16-byte aligned; get ready to search using SSE instructions
+aligned:
+ // round down to last 16-byte boundary
+ MOVQ BX, R11
+ ADDQ SI, R11
+ ANDQ $~15, R11
+
+ // shuffle X0 around so that each byte contains c
+ MOVD AX, X0
+ PUNPCKLBW X0, X0
+ PUNPCKLBW X0, X0
+ PSHUFL $0, X0, X0
+ JMP condition
+
+sse:
+ // move the next 16-byte chunk of the buffer into X1
+ MOVO (DI), X1
+ // compare bytes in X0 to X1
+ PCMPEQB X0, X1
+ // take the top bit of each byte in X1 and put the result in DX
+ PMOVMSKB X1, DX
+ TESTL DX, DX
+ JNZ ssesuccess
+ ADDQ $16, DI
+
+condition:
+ CMPQ DI, R11
+ JLT sse
+
+ // search the end
+ MOVQ SI, CX
+ ADDQ BX, CX
+ SUBQ R11, CX
+ // if CX == 0, the zero flag will be set and we'll end up
+ // returning a false success
+ JZ failure
REPN; SCASB
- JZ 3(PC)
- MOVL $-1, ret+24(FP)
+ JZ success
+
+failure:
+ MOVL $-1, ret+24(FP)
+ RET
+
+// handle for lengths < 16
+small:
+ MOVL BX, CX
+ REPN; SCASB
+ JZ success
+ MOVL $-1, ret+24(FP)
RET
- SUBQ SI, DI
- SUBL $1, DI
- MOVL DI, ret+24(FP)
+
+// we've found the chunk containing the byte
+// now just figure out which specific byte it is
+ssesuccess:
+ // get the index of the least significant set bit
+ BSFW DX, DX
+ SUBQ SI, DI
+ ADDQ DI, DX
+ MOVL DX, ret+24(FP)
+ RET
+
+success:
+ SUBQ SI, DI
+ SUBL $1, DI
+ MOVL DI, ret+24(FP)
RET
+
diff --git a/src/pkg/bytes/bytes_test.go b/src/pkg/bytes/bytes_test.go
index 6f42338eb8..f3ca371f83 100644
--- a/src/pkg/bytes/bytes_test.go
+++ b/src/pkg/bytes/bytes_test.go
@@ -93,6 +93,9 @@ var indexTests = []BinOpTest{
{"abc", "b", 1},
{"abc", "c", 2},
{"abc", "x", -1},
+ {"barfoobarfooyyyzzzyyyzzzyyyzzzyyyxxxzzzyyy", "x", 33},
+ {"foofyfoobarfoobar", "y", 4},
+ {"oooooooooooooooooooooo", "r", -1},
}
var lastIndexTests = []BinOpTest{
@@ -177,6 +180,56 @@ func TestIndexByte(t *testing.T) {
}
}
+// test a larger buffer with different sizes and alignments
+func TestIndexByteBig(t *testing.T) {
+ const n = 1024
+ b := make([]byte, n)
+ for i := 0; i < n; i++ {
+ // different start alignments
+ b1 := b[i:]
+ for j := 0; j < len(b1); j++ {
+ b1[j] = 'x'
+ pos := IndexByte(b1, 'x')
+ if pos != j {
+ t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+ }
+ b1[j] = 0
+ pos = IndexByte(b1, 'x')
+ if pos != -1 {
+ t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+ }
+ }
+ // different end alignments
+ b1 = b[:i]
+ for j := 0; j < len(b1); j++ {
+ b1[j] = 'x'
+ pos := IndexByte(b1, 'x')
+ if pos != j {
+ t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+ }
+ b1[j] = 0
+ pos = IndexByte(b1, 'x')
+ if pos != -1 {
+ t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+ }
+ }
+ // different start and end alignments
+ b1 = b[i/2 : n-(i+1)/2]
+ for j := 0; j < len(b1); j++ {
+ b1[j] = 'x'
+ pos := IndexByte(b1, 'x')
+ if pos != j {
+ t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+ }
+ b1[j] = 0
+ pos = IndexByte(b1, 'x')
+ if pos != -1 {
+ t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+ }
+ }
+ }
+}
+
func TestIndexRune(t *testing.T) {
for _, tt := range indexRuneTests {
a := []byte(tt.a)