aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRob Pike <r@golang.org>2011-01-03 11:31:51 -0800
committerRob Pike <r@golang.org>2011-01-03 11:31:51 -0800
commitc0d0d4ef05378c6a297e36c60b7af2b799f5653f (patch)
treebe0f9f1c8a5f3b8197fe30a122b180882a62f048
parent84fc1e20f17097c1b6710f79cdd03dafeb908eaf (diff)
downloadgo-c0d0d4ef05378c6a297e36c60b7af2b799f5653f.tar.gz
go-c0d0d4ef05378c6a297e36c60b7af2b799f5653f.zip
regexp: fix performance bug, make anchored searches fail fast.
The bug was that for an anchored pattern such as ^x, the prefix scan ignored the anchor, and could scan the whole file if there was no x present. The fix is to do prefix matching after the anchor; the cost miniscule; the speedups huge. R=rsc, gri CC=golang-dev https://golang.org/cl/3837042
-rw-r--r--src/pkg/regexp/all_test.go46
-rw-r--r--src/pkg/regexp/regexp.go35
2 files changed, 74 insertions, 7 deletions
diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go
index 8f115aa49c..3b2c489bce 100644
--- a/src/pkg/regexp/all_test.go
+++ b/src/pkg/regexp/all_test.go
@@ -377,3 +377,49 @@ func BenchmarkReplaceAll(b *testing.B) {
re.ReplaceAllString(x, "")
}
}
+
+func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
+ b.StopTimer()
+ x := []byte("abcdefghijklmnopqrstuvwxyz")
+ re := MustCompile("^zbc(d|e)")
+ b.StartTimer()
+ for i := 0; i < b.N; i++ {
+ re.Match(x)
+ }
+}
+
+func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) {
+ b.StopTimer()
+ x := []byte("abcdefghijklmnopqrstuvwxyz")
+ for i := 0; i < 15; i++ {
+ x = append(x, x...)
+ }
+ re := MustCompile("^zbc(d|e)")
+ b.StartTimer()
+ for i := 0; i < b.N; i++ {
+ re.Match(x)
+ }
+}
+
+func BenchmarkAnchoredShortMatch(b *testing.B) {
+ b.StopTimer()
+ x := []byte("abcdefghijklmnopqrstuvwxyz")
+ re := MustCompile("^.bc(d|e)")
+ b.StartTimer()
+ for i := 0; i < b.N; i++ {
+ re.Match(x)
+ }
+}
+
+func BenchmarkAnchoredLongMatch(b *testing.B) {
+ b.StopTimer()
+ x := []byte("abcdefghijklmnopqrstuvwxyz")
+ for i := 0; i < 15; i++ {
+ x = append(x, x...)
+ }
+ re := MustCompile("^.bc(d|e)")
+ b.StartTimer()
+ for i := 0; i < b.N; i++ {
+ re.Match(x)
+ }
+}
diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go
index ef6a8aa0ba..4d13fad8b3 100644
--- a/src/pkg/regexp/regexp.go
+++ b/src/pkg/regexp/regexp.go
@@ -571,15 +571,20 @@ func (re *Regexp) doParse() {
}
}
-// Extract regular text from the beginning of the pattern.
+// Extract regular text from the beginning of the pattern,
+// possibly after a leading iBOT.
// That text can be used by doExecute to speed up matching.
func (re *Regexp) setPrefix() {
var b []byte
var utf = make([]byte, utf8.UTFMax)
var inst *instr
- // First instruction is start; skip that.
+ // First instruction is start; skip that. Also skip any initial iBOT.
+ inst = re.inst[0].next
+ for inst.kind == iBOT {
+ inst = inst.next
+ }
Loop:
- for inst = re.inst[0].next; inst.kind != iEnd; inst = inst.next {
+ for ; inst.kind != iEnd; inst = inst.next {
// stop if this is not a char
if inst.kind != iChar {
break
@@ -748,14 +753,30 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
if bytestr != nil {
end = len(bytestr)
}
+ anchored := re.inst[0].next.kind == iBOT
+ if anchored && pos > 0 {
+ return nil
+ }
// fast check for initial plain substring
prefixed := false // has this iteration begun by skipping a prefix?
if re.prefix != "" {
- var advance int
- if bytestr == nil {
- advance = strings.Index(str[pos:], re.prefix)
+ advance := 0
+ if anchored {
+ if bytestr == nil {
+ if !strings.HasPrefix(str, re.prefix) {
+ return nil
+ }
+ } else {
+ if !bytes.HasPrefix(bytestr, re.prefixBytes) {
+ return nil
+ }
+ }
} else {
- advance = bytes.Index(bytestr[pos:], re.prefixBytes)
+ if bytestr == nil {
+ advance = strings.Index(str[pos:], re.prefix)
+ } else {
+ advance = bytes.Index(bytestr[pos:], re.prefixBytes)
+ }
}
if advance == -1 {
return nil