aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobert Griesemer <gri@golang.org>2011-09-20 14:36:19 -0700
committerRobert Griesemer <gri@golang.org>2011-09-20 14:36:19 -0700
commit5ee7ef90cddf2d5b3fa7fd9092a86f47fc5d56ee (patch)
treeda9a1f5f9f3bada636ab3bd929e5153803242260
parent86e65bac5c9368fc807f8c0356fe5162fb68b09e (diff)
downloadgo-5ee7ef90cddf2d5b3fa7fd9092a86f47fc5d56ee.tar.gz
go-5ee7ef90cddf2d5b3fa7fd9092a86f47fc5d56ee.zip
suffixarray: improved serialization code
Use gobs to serialize indexes instead of encoding/binary. Even with gobs, serialize data in slices instead of applying gob to the entire data structure at once, to reduce the amount of extra buffer memory needed inside gob. 7x faster Write/Read for new BenchmarkSaveRestore compared to old code; possibly because encoding/binary is more expensive for int32 slice elements (interface call to get little/big endian encoding), while gob's encoding is fixed (unconfirmed). new (using gobs): suffixarray.BenchmarkSaveRestore 1 2153604000 ns/op old (using encoding/binary): suffixarray.BenchmarkSaveRestore 1 15118322000 ns/op The actual serialized data is slightly larger then using the old code for very large indices because full 32bit indices require 5bytes using gobs instead of 4bytes (encoding/binary) in serialized form. R=r CC=golang-dev https://golang.org/cl/5087041
-rw-r--r--src/pkg/index/suffixarray/suffixarray.go64
-rw-r--r--src/pkg/index/suffixarray/suffixarray_test.go19
2 files changed, 65 insertions, 18 deletions
diff --git a/src/pkg/index/suffixarray/suffixarray.go b/src/pkg/index/suffixarray/suffixarray.go
index c2a999483b..cff7daa9d5 100644
--- a/src/pkg/index/suffixarray/suffixarray.go
+++ b/src/pkg/index/suffixarray/suffixarray.go
@@ -18,8 +18,8 @@ package suffixarray
import (
"bytes"
- "encoding/binary"
"exp/regexp"
+ "gob"
"io"
"os"
"sort"
@@ -37,13 +37,18 @@ func New(data []byte) *Index {
return &Index{data, qsufsort(data)}
}
+// Read and Write slice the data into successive portions of length gobN,
+// so gob can allocate smaller buffers for its I/O.
+const gobN = 1 << 16 // slightly better than say 1 << 20 (BenchmarkSaveRestore)
+
// Read reads the index from r into x; x must not be nil.
func (x *Index) Read(r io.Reader) os.Error {
- var n int32
- if err := binary.Read(r, binary.LittleEndian, &n); err != nil {
+ d := gob.NewDecoder(r)
+ var n int
+ if err := d.Decode(&n); err != nil {
return err
}
- if 2*n < int32(cap(x.data)) || int32(cap(x.data)) < n {
+ if 2*n < cap(x.data) || cap(x.data) < n {
// new data is significantly smaller or larger then
// existing buffers - allocate new ones
x.data = make([]byte, n)
@@ -53,28 +58,51 @@ func (x *Index) Read(r io.Reader) os.Error {
x.data = x.data[0:n]
x.sa = x.sa[0:n]
}
-
- if err := binary.Read(r, binary.LittleEndian, x.data); err != nil {
- return err
- }
- if err := binary.Read(r, binary.LittleEndian, x.sa); err != nil {
- return err
+ for i := 0; i < n; {
+ j := i + gobN
+ if j > n {
+ j = n
+ }
+ // data holds next piece of x.data; its length is updated by Decode
+ data := x.data[i:j]
+ if err := d.Decode(&data); err != nil {
+ return err
+ }
+ if len(data) != j-i {
+ return os.NewError("suffixarray.Read: inconsistent data format")
+ }
+ // sa holds next piece of x.data; its length is updated by Decode
+ sa := x.sa[i:j]
+ if err := d.Decode(&sa); err != nil {
+ return err
+ }
+ if len(sa) != j-i {
+ return os.NewError("suffixarray.Read: inconsistent data format")
+ }
+ i = j
}
-
return nil
}
// Write writes the index x to w.
func (x *Index) Write(w io.Writer) os.Error {
- n := int32(len(x.data))
- if err := binary.Write(w, binary.LittleEndian, n); err != nil {
- return err
- }
- if err := binary.Write(w, binary.LittleEndian, x.data); err != nil {
+ e := gob.NewEncoder(w)
+ n := len(x.data)
+ if err := e.Encode(n); err != nil {
return err
}
- if err := binary.Write(w, binary.LittleEndian, x.sa); err != nil {
- return err
+ for i := 0; i < n; {
+ j := i + gobN
+ if j > n {
+ j = n
+ }
+ if err := e.Encode(x.data[i:j]); err != nil {
+ return err
+ }
+ if err := e.Encode(x.sa[i:j]); err != nil {
+ return err
+ }
+ i = j
}
return nil
}
diff --git a/src/pkg/index/suffixarray/suffixarray_test.go b/src/pkg/index/suffixarray/suffixarray_test.go
index cffedfba0f..9b4d89f42e 100644
--- a/src/pkg/index/suffixarray/suffixarray_test.go
+++ b/src/pkg/index/suffixarray/suffixarray_test.go
@@ -7,6 +7,7 @@ package suffixarray
import (
"bytes"
"exp/regexp"
+ "rand"
"sort"
"strings"
"testing"
@@ -255,3 +256,21 @@ func TestIndex(t *testing.T) {
testLookups(t, &tc, x, -1)
}
}
+
+func BenchmarkSaveRestore(b *testing.B) {
+ b.StopTimer()
+ r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence
+ data := make([]byte, 10<<20) // 10MB index data
+ for i := range data {
+ data[i] = byte(r.Intn(256))
+ }
+ x := New(data)
+ testSaveRestore(nil, nil, x) // verify correctness
+ buf := bytes.NewBuffer(make([]byte, len(data))) // avoid frequent growing
+ b.StartTimer()
+ for i := 0; i < b.N; i++ {
+ x.Write(buf)
+ var y Index
+ y.Read(buf)
+ }
+}