regexp: fix performance bug, make anchored searches fail fast.

The bug was that for an anchored pattern such as ^x, the prefix scan ignored the anchor, and could scan the whole file if there was no x present. The fix is to do prefix matching after the anchor; the cost miniscule; the speedups huge. R=rsc, gri CC=golang-dev https://golang.org/cl/3837042
author: Rob Pike <r@golang.org> 2011-01-03 11:31:51 -0800
committer: Rob Pike <r@golang.org> 2011-01-03 11:31:51 -0800
commit: c0d0d4ef05378c6a297e36c60b7af2b799f5653f (patch)
tree: be0f9f1c8a5f3b8197fe30a122b180882a62f048
parent: 84fc1e20f17097c1b6710f79cdd03dafeb908eaf (diff)
download: go-c0d0d4ef05378c6a297e36c60b7af2b799f5653f.tar.gz
go-c0d0d4ef05378c6a297e36c60b7af2b799f5653f.zip
2 files changed, 74 insertions, 7 deletions
diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go
index 8f115aa49c..3b2c489bce 100644
--- a/src/pkg/regexp/all_test.go
+++ b/src/pkg/regexp/all_test.go
@@ -377,3 +377,49 @@ func BenchmarkReplaceAll(b *testing.B) {
 		re.ReplaceAllString(x, "")
 	}
 }
+
+func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
+	b.StopTimer()
+	x := []byte("abcdefghijklmnopqrstuvwxyz")
+	re := MustCompile("^zbc(d|e)")
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		re.Match(x)
+	}
+}
+
+func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) {
+	b.StopTimer()
+	x := []byte("abcdefghijklmnopqrstuvwxyz")
+	for i := 0; i < 15; i++ {
+		x = append(x, x...)
+	}
+	re := MustCompile("^zbc(d|e)")
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		re.Match(x)
+	}
+}
+
+func BenchmarkAnchoredShortMatch(b *testing.B) {
+	b.StopTimer()
+	x := []byte("abcdefghijklmnopqrstuvwxyz")
+	re := MustCompile("^.bc(d|e)")
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		re.Match(x)
+	}
+}
+
+func BenchmarkAnchoredLongMatch(b *testing.B) {
+	b.StopTimer()
+	x := []byte("abcdefghijklmnopqrstuvwxyz")
+	for i := 0; i < 15; i++ {
+		x = append(x, x...)
+	}
+	re := MustCompile("^.bc(d|e)")
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		re.Match(x)
+	}
+}
diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go
index ef6a8aa0ba..4d13fad8b3 100644
--- a/src/pkg/regexp/regexp.go
+++ b/src/pkg/regexp/regexp.go
@@ -571,15 +571,20 @@ func (re *Regexp) doParse() {
 	}
 }
 
-// Extract regular text from the beginning of the pattern.
+// Extract regular text from the beginning of the pattern,
+// possibly after a leading iBOT.
 // That text can be used by doExecute to speed up matching.
 func (re *Regexp) setPrefix() {
 	var b []byte
 	var utf = make([]byte, utf8.UTFMax)
 	var inst *instr
-	// First instruction is start; skip that.
+	// First instruction is start; skip that.  Also skip any initial iBOT.
+	inst = re.inst[0].next
+	for inst.kind == iBOT {
+		inst = inst.next
+	}
 Loop:
-	for inst = re.inst[0].next; inst.kind != iEnd; inst = inst.next {
+	for ; inst.kind != iEnd; inst = inst.next {
 		// stop if this is not a char
 		if inst.kind != iChar {
 			break
@@ -748,14 +753,30 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
 	if bytestr != nil {
 		end = len(bytestr)
 	}
+	anchored := re.inst[0].next.kind == iBOT
+	if anchored && pos > 0 {
+		return nil
+	}
 	// fast check for initial plain substring
 	prefixed := false // has this iteration begun by skipping a prefix?
 	if re.prefix != "" {
-		var advance int
-		if bytestr == nil {
-			advance = strings.Index(str[pos:], re.prefix)
+		advance := 0
+		if anchored {
+			if bytestr == nil {
+				if !strings.HasPrefix(str, re.prefix) {
+					return nil
+				}
+			} else {
+				if !bytes.HasPrefix(bytestr, re.prefixBytes) {
+					return nil
+				}
+			}
 		} else {
-			advance = bytes.Index(bytestr[pos:], re.prefixBytes)
+			if bytestr == nil {
+				advance = strings.Index(str[pos:], re.prefix)
+			} else {
+				advance = bytes.Index(bytestr[pos:], re.prefixBytes)
+			}
 		}
 		if advance == -1 {
 			return nil
author	Rob Pike <r@golang.org>	2011-01-03 11:31:51 -0800
committer	Rob Pike <r@golang.org>	2011-01-03 11:31:51 -0800
commit	c0d0d4ef05378c6a297e36c60b7af2b799f5653f (patch)
tree	be0f9f1c8a5f3b8197fe30a122b180882a62f048
parent	84fc1e20f17097c1b6710f79cdd03dafeb908eaf (diff)
download	go-c0d0d4ef05378c6a297e36c60b7af2b799f5653f.tar.gz go-c0d0d4ef05378c6a297e36c60b7af2b799f5653f.zip