diff options
author | Rob Pike <r@golang.org> | 2011-01-03 11:31:51 -0800 |
---|---|---|
committer | Rob Pike <r@golang.org> | 2011-01-03 11:31:51 -0800 |
commit | c0d0d4ef05378c6a297e36c60b7af2b799f5653f (patch) | |
tree | be0f9f1c8a5f3b8197fe30a122b180882a62f048 | |
parent | 84fc1e20f17097c1b6710f79cdd03dafeb908eaf (diff) | |
download | go-c0d0d4ef05378c6a297e36c60b7af2b799f5653f.tar.gz go-c0d0d4ef05378c6a297e36c60b7af2b799f5653f.zip |
regexp: fix performance bug, make anchored searches fail fast.
The bug was that for an anchored pattern such as ^x, the prefix
scan ignored the anchor, and could scan the whole file if there was
no x present. The fix is to do prefix matching after the anchor;
the cost miniscule; the speedups huge.
R=rsc, gri
CC=golang-dev
https://golang.org/cl/3837042
-rw-r--r-- | src/pkg/regexp/all_test.go | 46 | ||||
-rw-r--r-- | src/pkg/regexp/regexp.go | 35 |
2 files changed, 74 insertions, 7 deletions
diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go index 8f115aa49c..3b2c489bce 100644 --- a/src/pkg/regexp/all_test.go +++ b/src/pkg/regexp/all_test.go @@ -377,3 +377,49 @@ func BenchmarkReplaceAll(b *testing.B) { re.ReplaceAllString(x, "") } } + +func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) + } + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredShortMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLongMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) + } + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index ef6a8aa0ba..4d13fad8b3 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -571,15 +571,20 @@ func (re *Regexp) doParse() { } } -// Extract regular text from the beginning of the pattern. +// Extract regular text from the beginning of the pattern, +// possibly after a leading iBOT. // That text can be used by doExecute to speed up matching. func (re *Regexp) setPrefix() { var b []byte var utf = make([]byte, utf8.UTFMax) var inst *instr - // First instruction is start; skip that. + // First instruction is start; skip that. Also skip any initial iBOT. + inst = re.inst[0].next + for inst.kind == iBOT { + inst = inst.next + } Loop: - for inst = re.inst[0].next; inst.kind != iEnd; inst = inst.next { + for ; inst.kind != iEnd; inst = inst.next { // stop if this is not a char if inst.kind != iChar { break @@ -748,14 +753,30 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { if bytestr != nil { end = len(bytestr) } + anchored := re.inst[0].next.kind == iBOT + if anchored && pos > 0 { + return nil + } // fast check for initial plain substring prefixed := false // has this iteration begun by skipping a prefix? if re.prefix != "" { - var advance int - if bytestr == nil { - advance = strings.Index(str[pos:], re.prefix) + advance := 0 + if anchored { + if bytestr == nil { + if !strings.HasPrefix(str, re.prefix) { + return nil + } + } else { + if !bytes.HasPrefix(bytestr, re.prefixBytes) { + return nil + } + } } else { - advance = bytes.Index(bytestr[pos:], re.prefixBytes) + if bytestr == nil { + advance = strings.Index(str[pos:], re.prefix) + } else { + advance = bytes.Index(bytestr[pos:], re.prefixBytes) + } } if advance == -1 { return nil |