aboutsummaryrefslogtreecommitdiff
path: root/src/regexp
diff options
context:
space:
mode:
authorRuss Cox <rsc@golang.org>2021-05-11 10:52:44 -0400
committerRuss Cox <rsc@golang.org>2021-05-13 14:52:20 +0000
commit2a61b3c59088115245d084d5ae07dd4be5fbe1b0 (patch)
treeedebf0990b16428f7da90f36c007b87e627fd16f /src/regexp
parentfd4631e24f53cf836a67b00e82e2159854ec31d0 (diff)
downloadgo-2a61b3c59088115245d084d5ae07dd4be5fbe1b0.tar.gz
go-2a61b3c59088115245d084d5ae07dd4be5fbe1b0.zip
regexp: fix repeat of preferred empty match
In Perl mode, (|a)* should match an empty string at the start of the input. Instead it matches as many a's as possible. Because (|a)+ is handled correctly, matching only an empty string, this leads to the paradox that e* can match more text than e+ (for e = (|a)) and that e+ is sometimes different from ee*. This is a very old bug that ultimately derives from the picture I drew for e* in https://swtch.com/~rsc/regexp/regexp1.html. The picture is correct for longest-match (POSIX) regexps but subtly wrong for preferred-match (Perl) regexps in the case where e has a preferred empty match. Pointed out by Andrew Gallant in private mail. The current code treats e* and e+ as the same structure, with different entry points. In the case of e* the preference list ends up not quite in the right order, in part because the “before e” and “after e” states are the same state. Splitting them apart fixes the preference list, and that can be done by compiling e* as if it were (e+)?. Like with any bug fix, there is a very low chance of breaking a program that accidentally depends on the buggy behavior. RE2, Go, and Rust all have this bug, and we've all agreed to fix it, to keep the implementations in sync. Fixes #46123. Change-Id: I70e742e71e0a23b626593b16ddef3c1e73b413b0 Reviewed-on: https://go-review.googlesource.com/c/go/+/318750 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Rob Pike <r@golang.org> TryBot-Result: Go Bot <gobot@golang.org>
Diffstat (limited to 'src/regexp')
-rw-r--r--src/regexp/find_test.go1
-rw-r--r--src/regexp/onepass_test.go2
-rw-r--r--src/regexp/syntax/compile.go29
-rw-r--r--src/regexp/syntax/prog_test.go15
-rw-r--r--src/regexp/testdata/basic.dat12
-rw-r--r--src/regexp/testdata/nullsubexpr.dat18
-rw-r--r--src/regexp/testdata/re2-exhaustive.txt.bz2bin394016 -> 428262 bytes
-rw-r--r--src/regexp/testdata/re2-search.txt145
8 files changed, 176 insertions, 46 deletions
diff --git a/src/regexp/find_test.go b/src/regexp/find_test.go
index 87c49b074f..64c2239d90 100644
--- a/src/regexp/find_test.go
+++ b/src/regexp/find_test.go
@@ -97,6 +97,7 @@ var findTests = []FindTest{
{`\B`, "xx", build(1, 1, 1)},
{`\B`, "x y", nil},
{`\B`, "xx yy", build(2, 1, 1, 4, 4)},
+ {`(|a)*`, "aa", build(3, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2)},
// RE2 tests
{`[^\S\s]`, "abcd", nil},
diff --git a/src/regexp/onepass_test.go b/src/regexp/onepass_test.go
index 32264d5f1e..6a42eda391 100644
--- a/src/regexp/onepass_test.go
+++ b/src/regexp/onepass_test.go
@@ -142,7 +142,7 @@ var onePassTests = []struct {
{`^(?:(a)|(?:a*))$`, false},
{`^(?:(?:(?:.(?:$))?))$`, true},
{`^abcd$`, true},
- {`^(?:(?:a{0,})*?)$`, true},
+ {`^(?:(?:a{0,})*?)$`, false},
{`^(?:(?:a+)*)$`, true},
{`^(?:(?:a|(?:aa)))$`, true},
{`^(?:[^\s\S])$`, true},
diff --git a/src/regexp/syntax/compile.go b/src/regexp/syntax/compile.go
index 7524d628fe..c9f9fa024b 100644
--- a/src/regexp/syntax/compile.go
+++ b/src/regexp/syntax/compile.go
@@ -57,8 +57,9 @@ func (l1 patchList) append(p *Prog, l2 patchList) patchList {
// A frag represents a compiled program fragment.
type frag struct {
- i uint32 // index of first instruction
- out patchList // where to record end instruction
+ i uint32 // index of first instruction
+ out patchList // where to record end instruction
+ nullable bool // whether fragment can match empty string
}
type compiler struct {
@@ -159,7 +160,7 @@ func (c *compiler) compile(re *Regexp) frag {
func (c *compiler) inst(op InstOp) frag {
// TODO: impose length limit
- f := frag{i: uint32(len(c.p.Inst))}
+ f := frag{i: uint32(len(c.p.Inst)), nullable: true}
c.p.Inst = append(c.p.Inst, Inst{Op: op})
return f
}
@@ -194,7 +195,7 @@ func (c *compiler) cat(f1, f2 frag) frag {
// TODO: elide nop
f1.out.patch(c.p, f2.i)
- return frag{f1.i, f2.out}
+ return frag{f1.i, f2.out, f1.nullable && f2.nullable}
}
func (c *compiler) alt(f1, f2 frag) frag {
@@ -211,6 +212,7 @@ func (c *compiler) alt(f1, f2 frag) frag {
i.Out = f1.i
i.Arg = f2.i
f.out = f1.out.append(c.p, f2.out)
+ f.nullable = f1.nullable || f2.nullable
return f
}
@@ -228,7 +230,12 @@ func (c *compiler) quest(f1 frag, nongreedy bool) frag {
return f
}
-func (c *compiler) star(f1 frag, nongreedy bool) frag {
+// loop returns the fragment for the main loop of a plus or star.
+// For plus, it can be used after changing the entry to f1.i.
+// For star, it can be used directly when f1 can't match an empty string.
+// (When f1 can match an empty string, f1* must be implemented as (f1+)?
+// to get the priority match order correct.)
+func (c *compiler) loop(f1 frag, nongreedy bool) frag {
f := c.inst(InstAlt)
i := &c.p.Inst[f.i]
if nongreedy {
@@ -242,8 +249,17 @@ func (c *compiler) star(f1 frag, nongreedy bool) frag {
return f
}
+func (c *compiler) star(f1 frag, nongreedy bool) frag {
+ if f1.nullable {
+ // Use (f1+)? to get priority match order correct.
+ // See golang.org/issue/46123.
+ return c.quest(c.plus(f1, nongreedy), nongreedy)
+ }
+ return c.loop(f1, nongreedy)
+}
+
func (c *compiler) plus(f1 frag, nongreedy bool) frag {
- return frag{f1.i, c.star(f1, nongreedy).out}
+ return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable}
}
func (c *compiler) empty(op EmptyOp) frag {
@@ -255,6 +271,7 @@ func (c *compiler) empty(op EmptyOp) frag {
func (c *compiler) rune(r []rune, flags Flags) frag {
f := c.inst(InstRune)
+ f.nullable = false
i := &c.p.Inst[f.i]
i.Rune = r
flags &= FoldCase // only relevant flag is FoldCase
diff --git a/src/regexp/syntax/prog_test.go b/src/regexp/syntax/prog_test.go
index 50bfa3d4be..5603aea228 100644
--- a/src/regexp/syntax/prog_test.go
+++ b/src/regexp/syntax/prog_test.go
@@ -89,6 +89,21 @@ var compileTests = []struct {
2 anynotnl -> 3
3 match
`},
+ {"(?:|a)+", ` 0 fail
+ 1 nop -> 4
+ 2 rune1 "a" -> 4
+ 3* alt -> 1, 2
+ 4 alt -> 3, 5
+ 5 match
+`},
+ {"(?:|a)*", ` 0 fail
+ 1 nop -> 4
+ 2 rune1 "a" -> 4
+ 3 alt -> 1, 2
+ 4 alt -> 3, 6
+ 5* alt -> 3, 6
+ 6 match
+`},
}
func TestCompile(t *testing.T) {
diff --git a/src/regexp/testdata/basic.dat b/src/regexp/testdata/basic.dat
index 7859290ba1..1776b1ff96 100644
--- a/src/regexp/testdata/basic.dat
+++ b/src/regexp/testdata/basic.dat
@@ -124,24 +124,20 @@ E ((a)) abc (0,1)(0,1)(0,1)
E (a)b(c) abc (0,3)(0,1)(2,3)
E a+b+c aabbabc (4,7)
E a* aaa (0,3)
-#E (a*)* - (0,0)(0,0)
-E (a*)* - (0,0)(?,?) RE2/Go
+E (a*)* - (0,0)(0,0)
E (a*)+ - (0,0)(0,0)
-#E (a*|b)* - (0,0)(0,0)
-E (a*|b)* - (0,0)(?,?) RE2/Go
+E (a*|b)* - (0,0)(0,0)
E (a+|b)* ab (0,2)(1,2)
E (a+|b)+ ab (0,2)(1,2)
E (a+|b)? ab (0,1)(0,1)
BE [^ab]* cde (0,3)
-#E (^)* - (0,0)(0,0)
-E (^)* - (0,0)(?,?) RE2/Go
+E (^)* - (0,0)(0,0)
BE a* NULL (0,0)
E ([abc])*d abbbcd (0,6)(4,5)
E ([abc])*bcd abcd (0,4)(0,1)
E a|b|c|d|e e (0,1)
E (a|b|c|d|e)f ef (0,2)(0,1)
-#E ((a*|b))* - (0,0)(0,0)(0,0)
-E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
+E ((a*|b))* - (0,0)(0,0)(0,0)
BE abcd*efg abcdefg (0,7)
BE ab* xabyabbbz (1,3)
BE ab* xayabbbz (1,2)
diff --git a/src/regexp/testdata/nullsubexpr.dat b/src/regexp/testdata/nullsubexpr.dat
index 2e18fbb917..68d9c99996 100644
--- a/src/regexp/testdata/nullsubexpr.dat
+++ b/src/regexp/testdata/nullsubexpr.dat
@@ -1,8 +1,7 @@
NOTE null subexpression matches : 2002-06-06
E (a*)* a (0,1)(0,1)
-#E SAME x (0,0)(0,0)
-E SAME x (0,0)(?,?) RE2/Go
+E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a*)+ a (0,1)(0,1)
@@ -19,8 +18,7 @@ E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)* a (0,1)(0,1)
-#E SAME x (0,0)(0,0)
-E SAME x (0,0)(?,?) RE2/Go
+E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)+ a (0,1)(0,1)
@@ -28,8 +26,7 @@ E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([^b]*)* a (0,1)(0,1)
-#E SAME b (0,0)(0,0)
-E SAME b (0,0)(?,?) RE2/Go
+E SAME b (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaab (0,6)(0,6)
E ([ab]*)* a (0,1)(0,1)
@@ -41,11 +38,9 @@ E SAME bbbbbb (0,6)(0,6)
E SAME aaaabcde (0,5)(0,5)
E ([^a]*)* b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
-#E SAME aaaaaa (0,0)(0,0)
-E SAME aaaaaa (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,0)(0,0)
E ([^ab]*)* ccccxx (0,6)(0,6)
-#E SAME ababab (0,0)(0,0)
-E SAME ababab (0,0)(?,?) RE2/Go
+E SAME ababab (0,0)(0,0)
E ((z)+|a)* zabcde (0,2)(1,2)
@@ -65,8 +60,7 @@ B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
-#E (a*)*(x) x (0,1)(0,0)(0,1)
-E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
+E (a*)*(x) x (0,1)(0,0)(0,1)
E (a*)*(x) ax (0,2)(0,1)(1,2)
E (a*)*(x) axa (0,2)(0,1)(1,2)
diff --git a/src/regexp/testdata/re2-exhaustive.txt.bz2 b/src/regexp/testdata/re2-exhaustive.txt.bz2
index a357f28016..6638476dec 100644
--- a/src/regexp/testdata/re2-exhaustive.txt.bz2
+++ b/src/regexp/testdata/re2-exhaustive.txt.bz2
Binary files differ
diff --git a/src/regexp/testdata/re2-search.txt b/src/regexp/testdata/re2-search.txt
index 4d02e9cebd..8c4098a4f1 100644
--- a/src/regexp/testdata/re2-search.txt
+++ b/src/regexp/testdata/re2-search.txt
@@ -1,5 +1,5 @@
# RE2 basic search tests built by make log
-# Thu Sep 8 13:43:43 EDT 2011
+# Wed May 12 12:13:22 EDT 2021
Regexp.SearchTests
strings
""
@@ -227,22 +227,6 @@ regexps
0-0;0-0;0-0;0-0
strings
""
-""
-regexps
-"a*"
-0-0;0-0;0-0;0-0
-0-0;0-0;0-0;0-0
-"^(?:a*)$"
-0-0;0-0;0-0;0-0
-0-0;0-0;0-0;0-0
-"^(?:a*)"
-0-0;0-0;0-0;0-0
-0-0;0-0;0-0;0-0
-"(?:a*)$"
-0-0;0-0;0-0;0-0
-0-0;0-0;0-0;0-0
-strings
-""
"xabcdx"
regexps
"ab|cd"
@@ -3651,6 +3635,86 @@ regexps
0-1;0-1;0-1;0-1
strings
""
+"a"
+regexps
+"a\\C+"
+-;-;-;-
+-;-;-;-
+"^(?:a\\C+)$"
+-;-;-;-
+-;-;-;-
+"^(?:a\\C+)"
+-;-;-;-
+-;-;-;-
+"(?:a\\C+)$"
+-;-;-;-
+-;-;-;-
+strings
+""
+"a"
+regexps
+"a\\C?"
+-;-;-;-
+0-1;0-1;0-1;0-1
+"^(?:a\\C?)$"
+-;-;-;-
+0-1;0-1;0-1;0-1
+"^(?:a\\C?)"
+-;-;-;-
+0-1;0-1;0-1;0-1
+"(?:a\\C?)$"
+-;-;-;-
+0-1;0-1;0-1;0-1
+strings
+""
+"a"
+regexps
+"a\\C*?"
+-;-;-;-
+0-1;0-1;0-1;0-1
+"^(?:a\\C*?)$"
+-;-;-;-
+0-1;0-1;0-1;0-1
+"^(?:a\\C*?)"
+-;-;-;-
+0-1;0-1;0-1;0-1
+"(?:a\\C*?)$"
+-;-;-;-
+0-1;0-1;0-1;0-1
+strings
+""
+"a"
+regexps
+"a\\C+?"
+-;-;-;-
+-;-;-;-
+"^(?:a\\C+?)$"
+-;-;-;-
+-;-;-;-
+"^(?:a\\C+?)"
+-;-;-;-
+-;-;-;-
+"(?:a\\C+?)$"
+-;-;-;-
+-;-;-;-
+strings
+""
+"a"
+regexps
+"a\\C??"
+-;-;-;-
+0-1;0-1;0-1;0-1
+"^(?:a\\C??)$"
+-;-;-;-
+0-1;0-1;0-1;0-1
+"^(?:a\\C??)"
+-;-;-;-
+0-1;0-1;0-1;0-1
+"(?:a\\C??)$"
+-;-;-;-
+0-1;0-1;0-1;0-1
+strings
+""
"baba"
regexps
"a\\C*|ba\\C"
@@ -3666,7 +3730,50 @@ regexps
-;-;-;-
-;1-4;-;1-4
strings
-"abc"
+""
+"Inc."
regexps
-"a.*?c|a.*?b"
+"\\w*I\\w*"
+-;-;-;-
+-;0-3;-;0-3
+"^(?:\\w*I\\w*)$"
+-;-;-;-
+-;-;-;-
+"^(?:\\w*I\\w*)"
+-;-;-;-
+-;0-3;-;0-3
+"(?:\\w*I\\w*)$"
+-;-;-;-
+-;-;-;-
+strings
+""
+"aaa"
+regexps
+"(?:|a)*"
+0-0;0-0;0-0;0-0
+0-3;0-0;0-3;0-3
+"^(?:(?:|a)*)$"
+0-0;0-0;0-0;0-0
+0-3;0-3;0-3;0-3
+"^(?:(?:|a)*)"
+0-0;0-0;0-0;0-0
+0-3;0-0;0-3;0-3
+"(?:(?:|a)*)$"
+0-0;0-0;0-0;0-0
+0-3;0-3;0-3;0-3
+strings
+""
+"aaa"
+regexps
+"(?:|a)+"
+0-0;0-0;0-0;0-0
+0-3;0-0;0-3;0-3
+"^(?:(?:|a)+)$"
+0-0;0-0;0-0;0-0
+0-3;0-3;0-3;0-3
+"^(?:(?:|a)+)"
+0-0;0-0;0-0;0-0
+0-3;0-0;0-3;0-3
+"(?:(?:|a)+)$"
+0-0;0-0;0-0;0-0
0-3;0-3;0-3;0-3