aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRuss Cox <rsc@golang.org>2011-06-29 00:55:37 -0400
committerRuss Cox <rsc@golang.org>2011-06-29 00:55:37 -0400
commit7e1a3e9f209d33eff36eb6876e0505a300be9ba6 (patch)
tree311c012b97aa2122afe4cbca4ff3f3f660496a1d
parent3379414b210d1c03c27b4b340c3a12da430ec2e9 (diff)
downloadgo-7e1a3e9f209d33eff36eb6876e0505a300be9ba6.tar.gz
go-7e1a3e9f209d33eff36eb6876e0505a300be9ba6.zip
exp/regexp/syntax: incremental concat, alternate
Also reuse of *Regexp nodes. I believe this is the end of the parser. The only non-execution code that remains is the code to expand x{3,5} into simpler operations. R=sam.thorogood, r CC=golang-dev https://golang.org/cl/4629078
-rw-r--r--src/pkg/exp/regexp/syntax/parse.go297
-rw-r--r--src/pkg/exp/regexp/syntax/parse_test.go241
-rw-r--r--src/pkg/exp/regexp/syntax/regexp.go2
3 files changed, 365 insertions, 175 deletions
diff --git a/src/pkg/exp/regexp/syntax/parse.go b/src/pkg/exp/regexp/syntax/parse.go
index cbde6c6041..ae40d5fc94 100644
--- a/src/pkg/exp/regexp/syntax/parse.go
+++ b/src/pkg/exp/regexp/syntax/parse.go
@@ -79,29 +79,110 @@ const (
type parser struct {
flags Flags // parse mode flags
stack []*Regexp // stack of parsed expressions
- numCap int // number of capturing groups seen
+ free *Regexp
+ numCap int // number of capturing groups seen
wholeRegexp string
tmpClass []int // temporary char class work space
}
+func (p *parser) newRegexp(op Op) *Regexp {
+ re := p.free
+ if re != nil {
+ p.free = re.Sub0[0]
+ *re = Regexp{}
+ } else {
+ re = new(Regexp)
+ }
+ re.Op = op
+ return re
+}
+
+func (p *parser) reuse(re *Regexp) {
+ re.Sub0[0] = p.free
+ p.free = re
+}
+
// Parse stack manipulation.
// push pushes the regexp re onto the parse stack and returns the regexp.
func (p *parser) push(re *Regexp) *Regexp {
- // TODO: automatic concatenation
- // TODO: turn character class into literal
// TODO: compute simple
+ if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
+ // Single rune.
+ if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
+ return nil
+ }
+ re.Op = OpLiteral
+ re.Rune = re.Rune[:1]
+ re.Flags = p.flags &^ FoldCase
+ } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
+ re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
+ unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
+ unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
+ re.Op == OpCharClass && len(re.Rune) == 2 &&
+ re.Rune[0]+1 == re.Rune[1] &&
+ unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
+ unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
+ // Case-insensitive rune like [Aa] or [Δδ].
+ if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
+ return nil
+ }
+
+ // Rewrite as (case-insensitive) literal.
+ re.Op = OpLiteral
+ re.Rune = re.Rune[:1]
+ re.Flags = p.flags | FoldCase
+ } else {
+ // Incremental concatenation.
+ p.maybeConcat(-1, 0)
+ }
+
p.stack = append(p.stack, re)
return re
}
-// newLiteral returns a new OpLiteral Regexp with the given flags
-func newLiteral(r int, flags Flags) *Regexp {
- re := &Regexp{
- Op: OpLiteral,
- Flags: flags,
+// maybeConcat implements incremental concatenation
+// of literal runes into string nodes. The parser calls this
+// before each push, so only the top fragment of the stack
+// might need processing. Since this is called before a push,
+// the topmost literal is no longer subject to operators like *
+// (Otherwise ab* would turn into (ab)*.)
+// If r >= 0 and there's a node left over, maybeConcat uses it
+// to push r with the given flags.
+// maybeConcat reports whether r was pushed.
+func (p *parser) maybeConcat(r int, flags Flags) bool {
+ n := len(p.stack)
+ if n < 2 {
+ return false
+ }
+
+ re1 := p.stack[n-1]
+ re2 := p.stack[n-2]
+ if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
+ return false
+ }
+
+ // Push re1 into re2.
+ re2.Rune = append(re2.Rune, re1.Rune...)
+
+ // Reuse re1 if possible.
+ if r >= 0 {
+ re1.Rune = re1.Rune0[:1]
+ re1.Rune[0] = r
+ re1.Flags = flags
+ return true
}
+
+ p.stack = p.stack[:n-1]
+ p.reuse(re1)
+ return false // did not push r
+}
+
+// newLiteral returns a new OpLiteral Regexp with the given flags
+func (p *parser) newLiteral(r int, flags Flags) *Regexp {
+ re := p.newRegexp(OpLiteral)
+ re.Flags = flags
re.Rune0[0] = r
re.Rune = re.Rune0[:1]
return re
@@ -109,14 +190,16 @@ func newLiteral(r int, flags Flags) *Regexp {
// literal pushes a literal regexp for the rune r on the stack
// and returns that regexp.
-func (p *parser) literal(r int) *Regexp {
- return p.push(newLiteral(r, p.flags))
+func (p *parser) literal(r int) {
+ p.push(p.newLiteral(r, p.flags))
}
// op pushes a regexp with the given op onto the stack
// and returns that regexp.
func (p *parser) op(op Op) *Regexp {
- return p.push(&Regexp{Op: op, Flags: p.flags})
+ re := p.newRegexp(op)
+ re.Flags = p.flags
+ return p.push(re)
}
// repeat replaces the top stack element with itself repeated
@@ -140,12 +223,10 @@ func (p *parser) repeat(op Op, min, max int, opstr, t, lastRepeat string) (strin
return "", &Error{ErrMissingRepeatArgument, opstr}
}
sub := p.stack[n-1]
- re := &Regexp{
- Op: op,
- Min: min,
- Max: max,
- Flags: flags,
- }
+ re := p.newRegexp(op)
+ re.Min = min
+ re.Max = max
+ re.Flags = flags
re.Sub = re.Sub0[:1]
re.Sub[0] = sub
p.stack[n-1] = re
@@ -154,60 +235,97 @@ func (p *parser) repeat(op Op, min, max int, opstr, t, lastRepeat string) (strin
// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
func (p *parser) concat() *Regexp {
- // TODO: Flatten concats.
+ p.maybeConcat(-1, 0)
// Scan down to find pseudo-operator | or (.
i := len(p.stack)
for i > 0 && p.stack[i-1].Op < opPseudo {
i--
}
- sub := p.stack[i:]
+ subs := p.stack[i:]
p.stack = p.stack[:i]
- var re *Regexp
- switch len(sub) {
- case 0:
- re = &Regexp{Op: OpEmptyMatch}
- case 1:
- re = sub[0]
- default:
- re = &Regexp{Op: OpConcat}
- re.Sub = append(re.Sub0[:0], sub...)
+ // Empty concatenation is special case.
+ if len(subs) == 0 {
+ return p.push(p.newRegexp(OpEmptyMatch))
}
- return p.push(re)
+
+ return p.collapse(subs, OpConcat)
}
// alternate replaces the top of the stack (above the topmost '(') with its alternation.
func (p *parser) alternate() *Regexp {
- // TODO: Flatten alternates.
-
// Scan down to find pseudo-operator (.
// There are no | above (.
i := len(p.stack)
for i > 0 && p.stack[i-1].Op < opPseudo {
i--
}
- sub := p.stack[i:]
+ subs := p.stack[i:]
p.stack = p.stack[:i]
- var re *Regexp
- switch len(sub) {
- case 0:
- re = &Regexp{Op: OpNoMatch}
- case 1:
- re = sub[0]
- default:
- re = &Regexp{Op: OpAlternate}
- re.Sub = append(re.Sub0[:0], sub...)
+ // Make sure top class is clean.
+ // All the others already are (see swapVerticalBar).
+ if len(subs) > 0 {
+ cleanAlt(subs[len(subs)-1])
+ }
+
+ // Empty alternate is special case
+ // (shouldn't happen but easy to handle).
+ if len(subs) == 0 {
+ return p.push(p.newRegexp(OpNoMatch))
+ }
+
+ return p.collapse(subs, OpAlternate)
+}
+
+// cleanAlt cleans re for eventual inclusion in an alternation.
+func cleanAlt(re *Regexp) {
+ switch re.Op {
+ case OpCharClass:
+ re.Rune = cleanClass(&re.Rune)
+ if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
+ re.Rune = nil
+ re.Op = OpAnyChar
+ return
+ }
+ if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
+ re.Rune = nil
+ re.Op = OpAnyCharNotNL
+ return
+ }
+ if cap(re.Rune)-len(re.Rune) > 100 {
+ // re.Rune will not grow any more.
+ // Make a copy or inline to reclaim storage.
+ re.Rune = append(re.Rune0[:0], re.Rune...)
+ }
+ }
+}
+
+// collapse pushes the result of applying op to sub
+// onto the stack. If sub contains op nodes, they all
+// get flattened into a single node.
+// sub points into p.stack so it cannot be kept.
+func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
+ if len(subs) == 1 {
+ return p.push(subs[0])
+ }
+ re := p.newRegexp(op)
+ re.Sub = re.Sub0[:0]
+ for _, sub := range subs {
+ if sub.Op == op {
+ re.Sub = append(re.Sub, sub.Sub...)
+ p.reuse(sub)
+ } else {
+ re.Sub = append(re.Sub, sub)
+ }
}
return p.push(re)
}
func literalRegexp(s string, flags Flags) *Regexp {
- re := &Regexp{
- Op: OpLiteral,
- Flags: flags,
- }
+ re := &Regexp{Op: OpLiteral}
+ re.Flags = flags
re.Rune = re.Rune0[:0] // use local storage for small strings
for _, c := range s {
if len(re.Rune) >= cap(re.Rune) {
@@ -265,7 +383,6 @@ func Parse(s string, flags Flags) (*Regexp, os.Error) {
p.op(opLeftParen).Cap = p.numCap
t = t[1:]
case '|':
- p.concat()
if err = p.parseVerticalBar(); err != nil {
return nil, err
}
@@ -361,7 +478,8 @@ func Parse(s string, flags Flags) (*Regexp, os.Error) {
}
}
- re := &Regexp{Op: OpCharClass, Flags: p.flags}
+ re := p.newRegexp(OpCharClass)
+ re.Flags = p.flags
// Look for Unicode character group like \p{Han}
if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
@@ -381,12 +499,10 @@ func Parse(s string, flags Flags) (*Regexp, os.Error) {
if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
re.Rune = r
t = rest
- // TODO: Handle FoldCase flag.
p.push(re)
break BigSwitch
}
-
- // TODO: Give re back to parser's pool.
+ p.reuse(re)
// Ordinary single-character escape.
if c, t, err = p.parseEscape(t); err != nil {
@@ -592,6 +708,35 @@ func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
return
}
+// can this be represented as a character class?
+// single-rune literal string, char class, ., and .|\n.
+func isCharClass(re *Regexp) bool {
+ return re.Op == OpLiteral && len(re.Rune) == 1 ||
+ re.Op == OpCharClass ||
+ re.Op == OpAnyCharNotNL ||
+ re.Op == OpAnyChar
+}
+
+// does re match r?
+func matchRune(re *Regexp, r int) bool {
+ switch re.Op {
+ case OpLiteral:
+ return len(re.Rune) == 1 && re.Rune[0] == r
+ case OpCharClass:
+ for i := 0; i < len(re.Rune); i += 2 {
+ if re.Rune[i] <= r && r <= re.Rune[i+1] {
+ return true
+ }
+ }
+ return false
+ case OpAnyCharNotNL:
+ return r != '\n'
+ case OpAnyChar:
+ return true
+ }
+ return false
+}
+
// parseVerticalBar handles a | in the input.
func (p *parser) parseVerticalBar() os.Error {
p.concat()
@@ -611,10 +756,55 @@ func (p *parser) parseVerticalBar() os.Error {
// swapVerticalBar swaps the two and returns true.
// Otherwise it returns false.
func (p *parser) swapVerticalBar() bool {
- if n := len(p.stack); n >= 2 {
+ // If above and below vertical bar are literal or char class,
+ // can merge into a single char class.
+ n := len(p.stack)
+ if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
+ re1 := p.stack[n-1]
+ re3 := p.stack[n-3]
+ // Make re3 the more complex of the two.
+ if re1.Op > re3.Op {
+ re1, re3 = re3, re1
+ p.stack[n-3] = re3
+ }
+ switch re3.Op {
+ case OpAnyChar:
+ // re1 doesn't add anything.
+ case OpAnyCharNotNL:
+ // re1 might add \n
+ if matchRune(re1, '\n') {
+ re3.Op = OpAnyChar
+ }
+ case OpCharClass:
+ // re1 is simpler, so either literal or char class
+ if re1.Op == OpLiteral {
+ re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0])
+ } else {
+ re3.Rune = appendClass(re3.Rune, re1.Rune)
+ }
+ case OpLiteral:
+ // both literal
+ if re1.Rune[0] == re3.Rune[0] {
+ break
+ }
+ re3.Op = OpCharClass
+ re3.Rune = append(re3.Rune, re3.Rune[0])
+ re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0])
+ }
+ p.reuse(re1)
+ p.stack = p.stack[:n-1]
+ return true
+ }
+
+ if n >= 2 {
re1 := p.stack[n-1]
re2 := p.stack[n-2]
if re2.Op == opVerticalBar {
+ if n >= 3 {
+ // Now out of reach.
+ // Clean opportunistically.
+ cleanAlt(p.stack[n-3])
+ }
p.stack[n-2] = re1
p.stack[n-1] = re2
return true
@@ -937,7 +1127,8 @@ func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, e
// and pushes it onto the parse stack.
func (p *parser) parseClass(s string) (rest string, err os.Error) {
t := s[1:] // chop [
- re := &Regexp{Op: OpCharClass, Flags: p.flags}
+ re := p.newRegexp(OpCharClass)
+ re.Flags = p.flags
re.Rune = re.Rune0[:0]
sign := +1
@@ -1017,8 +1208,6 @@ func (p *parser) parseClass(s string) (rest string, err os.Error) {
}
t = t[1:] // chop ]
- // TODO: Handle FoldCase flag.
-
// Use &re.Rune instead of &class to avoid allocation.
re.Rune = class
class = cleanClass(&re.Rune)
diff --git a/src/pkg/exp/regexp/syntax/parse_test.go b/src/pkg/exp/regexp/syntax/parse_test.go
index 4938069794..51856b613e 100644
--- a/src/pkg/exp/regexp/syntax/parse_test.go
+++ b/src/pkg/exp/regexp/syntax/parse_test.go
@@ -16,148 +16,146 @@ var parseTests = []struct {
Dump string
}{
// Base cases
- {"a", "lit{a}"},
- {"a.", "cat{lit{a}dot{}}"},
- {"a.b", "cat{lit{a}dot{}lit{b}}"},
- // { "ab", "str{ab}" },
- {"ab", "cat{lit{a}lit{b}}"},
- {"a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}"},
- // { "abc", "str{abc}" },
- {"abc", "cat{lit{a}lit{b}lit{c}}"},
- {"a|^", "alt{lit{a}bol{}}"},
- // { "a|b", "cc{0x61-0x62}" },
- {"a|b", "alt{lit{a}lit{b}}"},
- {"(a)", "cap{lit{a}}"},
- {"(a)|b", "alt{cap{lit{a}}lit{b}}"},
- {"a*", "star{lit{a}}"},
- {"a+", "plus{lit{a}}"},
- {"a?", "que{lit{a}}"},
- {"a{2}", "rep{2,2 lit{a}}"},
- {"a{2,3}", "rep{2,3 lit{a}}"},
- {"a{2,}", "rep{2,-1 lit{a}}"},
- {"a*?", "nstar{lit{a}}"},
- {"a+?", "nplus{lit{a}}"},
- {"a??", "nque{lit{a}}"},
- {"a{2}?", "nrep{2,2 lit{a}}"},
- {"a{2,3}?", "nrep{2,3 lit{a}}"},
- {"a{2,}?", "nrep{2,-1 lit{a}}"},
- {"", "emp{}"},
- // { "|", "emp{}" }, // alt{emp{}emp{}} but got factored
- {"|", "alt{emp{}emp{}}"},
- {"|x|", "alt{emp{}lit{x}emp{}}"},
- {".", "dot{}"},
- {"^", "bol{}"},
- {"$", "eol{}"},
- {"\\|", "lit{|}"},
- {"\\(", "lit{(}"},
- {"\\)", "lit{)}"},
- {"\\*", "lit{*}"},
- {"\\+", "lit{+}"},
- {"\\?", "lit{?}"},
- {"{", "lit{{}"},
- {"}", "lit{}}"},
- {"\\.", "lit{.}"},
- {"\\^", "lit{^}"},
- {"\\$", "lit{$}"},
- {"\\\\", "lit{\\}"},
- {"[ace]", "cc{0x61 0x63 0x65}"},
- {"[abc]", "cc{0x61-0x63}"},
- {"[a-z]", "cc{0x61-0x7a}"},
- // { "[a]", "lit{a}" },
- {"[a]", "cc{0x61}"},
- {"\\-", "lit{-}"},
- {"-", "lit{-}"},
- {"\\_", "lit{_}"},
+ {`a`, `lit{a}`},
+ {`a.`, `cat{lit{a}dot{}}`},
+ {`a.b`, `cat{lit{a}dot{}lit{b}}`},
+ {`ab`, `str{ab}`},
+ {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
+ {`abc`, `str{abc}`},
+ {`a|^`, `alt{lit{a}bol{}}`},
+ {`a|b`, `cc{0x61-0x62}`},
+ {`(a)`, `cap{lit{a}}`},
+ {`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
+ {`a*`, `star{lit{a}}`},
+ {`a+`, `plus{lit{a}}`},
+ {`a?`, `que{lit{a}}`},
+ {`a{2}`, `rep{2,2 lit{a}}`},
+ {`a{2,3}`, `rep{2,3 lit{a}}`},
+ {`a{2,}`, `rep{2,-1 lit{a}}`},
+ {`a*?`, `nstar{lit{a}}`},
+ {`a+?`, `nplus{lit{a}}`},
+ {`a??`, `nque{lit{a}}`},
+ {`a{2}?`, `nrep{2,2 lit{a}}`},
+ {`a{2,3}?`, `nrep{2,3 lit{a}}`},
+ {`a{2,}?`, `nrep{2,-1 lit{a}}`},
+ {``, `emp{}`},
+ // { `|`, `emp{}` }, // alt{emp{}emp{}} but got factored
+ {`|`, `alt{emp{}emp{}}`},
+ {`|x|`, `alt{emp{}lit{x}emp{}}`},
+ {`.`, `dot{}`},
+ {`^`, `bol{}`},
+ {`$`, `eol{}`},
+ {`\|`, `lit{|}`},
+ {`\(`, `lit{(}`},
+ {`\)`, `lit{)}`},
+ {`\*`, `lit{*}`},
+ {`\+`, `lit{+}`},
+ {`\?`, `lit{?}`},
+ {`{`, `lit{{}`},
+ {`}`, `lit{}}`},
+ {`\.`, `lit{.}`},
+ {`\^`, `lit{^}`},
+ {`\$`, `lit{$}`},
+ {`\\`, `lit{\}`},
+ {`[ace]`, `cc{0x61 0x63 0x65}`},
+ {`[abc]`, `cc{0x61-0x63}`},
+ {`[a-z]`, `cc{0x61-0x7a}`},
+ {`[a]`, `lit{a}`},
+ {`\-`, `lit{-}`},
+ {`-`, `lit{-}`},
+ {`\_`, `lit{_}`},
// Posix and Perl extensions
- {"[[:lower:]]", "cc{0x61-0x7a}"},
- {"[a-z]", "cc{0x61-0x7a}"},
- {"[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
- {"[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
- {"(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
- {"(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
- {"(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
- {"(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
- {"\\d", "cc{0x30-0x39}"},
- {"\\D", "cc{0x0-0x2f 0x3a-0x10ffff}"},
- {"\\s", "cc{0x9-0xa 0xc-0xd 0x20}"},
- {"\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}"},
- {"\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}"},
- {"\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}"},
- {"(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}"},
- {"(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
- {"[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}"},
- // { "\\C", "byte{}" },
+ {`[[:lower:]]`, `cc{0x61-0x7a}`},
+ {`[a-z]`, `cc{0x61-0x7a}`},
+ {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
+ {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
+ {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+ {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+ {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
+ {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
+ {`\d`, `cc{0x30-0x39}`},
+ {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
+ {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
+ {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
+ {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
+ {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
+ {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
+ {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
+ {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
+ // { `\C`, `byte{}` }, // probably never
// Unicode, negatives, and a double negative.
- {"\\p{Braille}", "cc{0x2800-0x28ff}"},
- {"\\P{Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}"},
- {"\\p{^Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}"},
- {"\\P{^Braille}", "cc{0x2800-0x28ff}"},
- {"\\pZ", "cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}"},
- {"[\\p{Braille}]", "cc{0x2800-0x28ff}"},
- {"[\\P{Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"},
- {"[\\p{^Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"},
- {"[\\P{^Braille}]", "cc{0x2800-0x28ff}"},
- {"[\\pZ]", "cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}"},
- {"\\p{Lu}", mkCharClass(unicode.IsUpper)},
- {"[\\p{Lu}]", mkCharClass(unicode.IsUpper)},
- {"(?i)[\\p{Lu}]", mkCharClass(isUpperFold)},
+ {`\p{Braille}`, `cc{0x2800-0x28ff}`},
+ {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+ {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+ {`\P{^Braille}`, `cc{0x2800-0x28ff}`},
+ {`\pZ`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
+ {`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
+ {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+ {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+ {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
+ {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
+ {`\p{Lu}`, mkCharClass(unicode.IsUpper)},
+ {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
+ {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
// Hex, octal.
- {"[\\012-\\234]\\141", "cat{cc{0xa-0x9c}lit{a}}"},
- {"[\\x{41}-\\x7a]\\x61", "cat{cc{0x41-0x7a}lit{a}}"},
+ {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
+ {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},
// More interesting regular expressions.
- // { "a{,2}", "str{a{,2}}" },
- // { "\\.\\^\\$\\\\", "str{.^$\\}" },
- {"[a-zABC]", "cc{0x41-0x43 0x61-0x7a}"},
- {"[^a]", "cc{0x0-0x60 0x62-0x10ffff}"},
- {"[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}"}, // utf-8
- {"a*{", "cat{star{lit{a}}lit{{}}"},
+ {`a{,2}`, `str{a{,2}}`},
+ {`\.\^\$\\`, `str{.^$\}`},
+ {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
+ {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
+ {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
+ {`a*{`, `cat{star{lit{a}}lit{{}}`},
// Test precedences
- // { "(?:ab)*", "star{str{ab}}" },
- // { "(ab)*", "star{cap{str{ab}}}" },
- // { "ab|cd", "alt{str{ab}str{cd}}" },
- // { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
- {"(?:ab)*", "star{cat{lit{a}lit{b}}}"},
- {"(ab)*", "star{cap{cat{lit{a}lit{b}}}}"},
- {"ab|cd", "alt{cat{lit{a}lit{b}}cat{lit{c}lit{d}}}"},
- {"a(b|c)d", "cat{lit{a}cap{alt{lit{b}lit{c}}}lit{d}}"},
+ {`(?:ab)*`, `star{str{ab}}`},
+ {`(ab)*`, `star{cap{str{ab}}}`},
+ {`ab|cd`, `alt{str{ab}str{cd}}`},
+ {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},
// Test flattening.
- {"(?:a)", "lit{a}"},
- // { "(?:ab)(?:cd)", "str{abcd}" },
- // { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
- // { "a|.", "dot{}" },
- // { ".|a", "dot{}" },
+ {`(?:a)`, `lit{a}`},
+ {`(?:ab)(?:cd)`, `str{abcd}`},
+ {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
+ {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
+ {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
+ {`a|.`, `dot{}`},
+ {`.|a`, `dot{}`},
+ {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
+ {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},
// Test Perl quoted literals
- {"\\Q+|*?{[\\E", "str{+|*?{[}"},
- {"\\Q+\\E+", "plus{lit{+}}"},
- {"\\Q\\\\E", "lit{\\}"},
- {"\\Q\\\\\\E", "str{\\\\}"},
+ {`\Q+|*?{[\E`, `str{+|*?{[}`},
+ {`\Q+\E+`, `plus{lit{+}}`},
+ {`\Q\\E`, `lit{\}`},
+ {`\Q\\\E`, `str{\\}`},
// Test Perl \A and \z
- {"(?m)^", "bol{}"},
- {"(?m)$", "eol{}"},
- {"(?-m)^", "bot{}"},
- {"(?-m)$", "eot{}"},
- {"(?m)\\A", "bot{}"},
- {"(?m)\\z", "eot{\\z}"},
- {"(?-m)\\A", "bot{}"},
- {"(?-m)\\z", "eot{\\z}"},
+ {`(?m)^`, `bol{}`},
+ {`(?m)$`, `eol{}`},
+ {`(?-m)^`, `bot{}`},
+ {`(?-m)$`, `eot{}`},
+ {`(?m)\A`, `bot{}`},
+ {`(?m)\z`, `eot{\z}`},
+ {`(?-m)\A`, `bot{}`},
+ {`(?-m)\z`, `eot{\z}`},
// Test named captures
- {"(?P<name>a)", "cap{name:lit{a}}"},
+ {`(?P<name>a)`, `cap{name:lit{a}}`},
// Case-folded literals
- // { "[Aa]", "litfold{a}" },
+ {`[Aa]`, `litfold{A}`},
+ {`[\x{100}\x{101}]`, `litfold{Ā}`},
+ {`[Δδ]`, `litfold{Δ}`},
// Strings
- // { "abcde", "str{abcde}" },
- // { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
+ {`abcde`, `str{abcde}`},
+ {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
}
const testFlags = MatchNL | PerlX | UnicodeGroups
@@ -230,8 +228,9 @@ func dumpRegexp(b *bytes.Buffer, re *Regexp) {
}
if re.Flags&FoldCase != 0 {
for _, r := range re.Rune {
- if unicode.ToUpper(r) != r {
+ if unicode.SimpleFold(r) != r {
b.WriteString("fold")
+ break
}
}
}
diff --git a/src/pkg/exp/regexp/syntax/regexp.go b/src/pkg/exp/regexp/syntax/regexp.go
index a0c465967f..248ace503c 100644
--- a/src/pkg/exp/regexp/syntax/regexp.go
+++ b/src/pkg/exp/regexp/syntax/regexp.go
@@ -33,6 +33,8 @@ type Regexp struct {
type Op uint8
// Operators are listed in precedence order, tightest binding to weakest.
+// Character class operators are listed simplest to most complex
+// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
const (
OpNoMatch Op = 1 + iota // matches no strings