diff options
author | Robert Griesemer <gri@golang.org> | 2016-03-04 17:09:08 -0800 |
---|---|---|
committer | Matthew Dempsky <mdempsky@google.com> | 2016-08-18 21:33:38 +0000 |
commit | c8683ff7977c526fb48ae007971fed16ef32ff62 (patch) | |
tree | 408437654d4fd529b864b27ff66068f646bde746 /src/cmd/compile/internal/syntax/scanner.go | |
parent | 3b967be4219c789ef9c47aa5e9607cab3005e1cd (diff) | |
download | go-c8683ff7977c526fb48ae007971fed16ef32ff62.tar.gz go-c8683ff7977c526fb48ae007971fed16ef32ff62.zip |
cmd/compile/internal/syntax: fast Go syntax trees, initial commit.
Syntax tree nodes, scanner, parser, basic printers.
Builds syntax trees for entire Go std lib at a rate of ~1.8M lines/s
in warmed up state (MacMini, 2.3 GHz Intel Core i7, 8GB RAM):
$ go test -run StdLib -fast
parsed 1074617 lines (2832 files) in 579.66364ms (1853863 lines/s)
allocated 282.212Mb (486.854Mb/s)
PASS
Change-Id: Ie26d9a7bf4e5ff07457aedfcc9b89f0eba72ae3f
Reviewed-on: https://go-review.googlesource.com/27195
Run-TryBot: Matthew Dempsky <mdempsky@google.com>
Reviewed-by: Robert Griesemer <gri@golang.org>
Diffstat (limited to 'src/cmd/compile/internal/syntax/scanner.go')
-rw-r--r-- | src/cmd/compile/internal/syntax/scanner.go | 651 |
1 files changed, 651 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/syntax/scanner.go b/src/cmd/compile/internal/syntax/scanner.go new file mode 100644 index 0000000000..0f0f1ead9a --- /dev/null +++ b/src/cmd/compile/internal/syntax/scanner.go @@ -0,0 +1,651 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import ( + "fmt" + "io" + "strings" + "unicode" + "unicode/utf8" +) + +type scanner struct { + source + nlsemi bool // if set '\n' and EOF translate to ';' + + // current token, valid after calling next() + pos, line int + tok token + lit string // valid if tok is _Name or _Literal + kind LitKind // valid if tok is _Literal + op Operator // valid if tok is _Operator, _AssignOp, or _IncOp + prec int // valid if tok is _Operator, _AssignOp, or _IncOp + + pragmas []Pragma +} + +func (s *scanner) init(src io.Reader, errh ErrorHandler) { + s.source.init(src, errh) + s.nlsemi = false +} + +func (s *scanner) next() { + nlsemi := s.nlsemi + s.nlsemi = false + +redo: + // skip white space + c := s.getr() + for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' { + c = s.getr() + } + + // token start + s.pos, s.line = s.source.pos0(), s.source.line0 + + if isLetter(c) || c >= utf8.RuneSelf && unicode.IsLetter(c) { + s.ident() + return + } + + switch c { + case -1: + if nlsemi { + s.tok = _Semi + break + } + s.tok = _EOF + + case '\n': + s.tok = _Semi + + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + s.number(c) + + case '"': + s.stdString() + + case '`': + s.rawString() + + case '\'': + s.rune() + + case '(': + s.tok = _Lparen + + case '[': + s.tok = _Lbrack + + case '{': + s.tok = _Lbrace + + case ',': + s.tok = _Comma + + case ';': + s.tok = _Semi + + case ')': + s.nlsemi = true + s.tok = _Rparen + + case ']': + s.nlsemi = true + s.tok = _Rbrack + + case '}': + s.nlsemi = true + s.tok = _Rbrace + + case ':': + if s.getr() == '=' { + s.tok = _Define + break + } + s.ungetr() + s.tok = _Colon + + case '.': + c = s.getr() + if isDigit(c) { + s.ungetr() + s.source.r0-- // make sure '.' is part of literal (line cannot have changed) + s.number('.') + break + } + if c == '.' { + c = s.getr() + if c == '.' { + s.tok = _DotDotDot + break + } + s.ungetr() + s.source.r0-- // make next ungetr work (line cannot have changed) + } + s.ungetr() + s.tok = _Dot + + case '+': + s.op, s.prec = Add, precAdd + c = s.getr() + if c != '+' { + goto assignop + } + s.nlsemi = true + s.tok = _IncOp + + case '-': + s.op, s.prec = Sub, precAdd + c = s.getr() + if c != '-' { + goto assignop + } + s.nlsemi = true + s.tok = _IncOp + + case '*': + s.op, s.prec = Mul, precMul + // don't goto assignop - want _Star token + if s.getr() == '=' { + s.tok = _AssignOp + break + } + s.ungetr() + s.tok = _Star + + case '/': + c = s.getr() + if c == '/' { + s.lineComment() + goto redo + } + if c == '*' { + s.fullComment() + if s.source.line > s.line && nlsemi { + // A multi-line comment acts like a newline; + // it translates to a ';' if nlsemi is set. + s.tok = _Semi + break + } + goto redo + } + s.op, s.prec = Div, precMul + goto assignop + + case '%': + s.op, s.prec = Rem, precMul + c = s.getr() + goto assignop + + case '&': + c = s.getr() + if c == '&' { + s.op, s.prec = AndAnd, precAndAnd + s.tok = _Operator + break + } + s.op, s.prec = And, precMul + if c == '^' { + s.op = AndNot + c = s.getr() + } + goto assignop + + case '|': + c = s.getr() + if c == '|' { + s.op, s.prec = OrOr, precOrOr + s.tok = _Operator + break + } + s.op, s.prec = Or, precAdd + goto assignop + + case '~': + s.error("bitwise complement operator is ^") + fallthrough + + case '^': + s.op, s.prec = Xor, precAdd + c = s.getr() + goto assignop + + case '<': + c = s.getr() + if c == '=' { + s.op, s.prec = Leq, precCmp + s.tok = _Operator + break + } + if c == '<' { + s.op, s.prec = Shl, precMul + c = s.getr() + goto assignop + } + if c == '-' { + s.tok = _Arrow + break + } + s.ungetr() + s.op, s.prec = Lss, precCmp + s.tok = _Operator + + case '>': + c = s.getr() + if c == '=' { + s.op, s.prec = Geq, precCmp + s.tok = _Operator + break + } + if c == '>' { + s.op, s.prec = Shr, precMul + c = s.getr() + goto assignop + } + s.ungetr() + s.op, s.prec = Gtr, precCmp + s.tok = _Operator + + case '=': + if s.getr() == '=' { + s.op, s.prec = Eql, precCmp + s.tok = _Operator + break + } + s.ungetr() + s.tok = _Assign + + case '!': + if s.getr() == '=' { + s.op, s.prec = Neq, precCmp + s.tok = _Operator + break + } + s.ungetr() + s.op, s.prec = Not, 0 + s.tok = _Operator + + default: + s.tok = 0 + s.error(fmt.Sprintf("invalid rune %q", c)) + goto redo + } + + return + +assignop: + if c == '=' { + s.tok = _AssignOp + return + } + s.ungetr() + s.tok = _Operator +} + +func isLetter(c rune) bool { + return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_' +} + +func isDigit(c rune) bool { + return '0' <= c && c <= '9' +} + +func (s *scanner) ident() { + s.startLit() + + // accelerate common case (7bit ASCII) + c := s.getr() + for isLetter(c) || isDigit(c) { + c = s.getr() + } + + // general case + if c >= utf8.RuneSelf { + for unicode.IsLetter(c) || c == '_' || unicode.IsDigit(c) { + c = s.getr() + } + } + s.ungetr() + + lit := s.stopLit() + + // possibly a keyword + if len(lit) >= 2 { + if tok := keywordMap[hash(lit)]; tok != 0 && strbyteseql(tokstrings[tok], lit) { + s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok) + s.tok = tok + return + } + } + + s.nlsemi = true + s.lit = string(lit) + s.tok = _Name +} + +// hash is a perfect hash function for keywords. +// It assumes that s has at least length 2. +func hash(s []byte) uint { + return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1) +} + +func strbyteseql(s string, b []byte) bool { + if len(s) == len(b) { + for i, b := range b { + if s[i] != b { + return false + } + } + return true + } + return false +} + +var keywordMap [1 << 6]token // size must be power of two + +func init() { + // populate keywordMap + for tok := _Break; tok <= _Var; tok++ { + h := hash([]byte(tokstrings[tok])) + if keywordMap[h] != 0 { + panic("imperfect hash") + } + keywordMap[h] = tok + } +} + +func (s *scanner) number(c rune) { + s.startLit() + + if c != '.' { + s.kind = IntLit // until proven otherwise + if c == '0' { + c = s.getr() + if c == 'x' || c == 'X' { + // hex + c = s.getr() + hasDigit := false + for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { + c = s.getr() + hasDigit = true + } + if !hasDigit { + s.error("malformed hex constant") + } + goto done + } + + // decimal 0, octal, or float + has8or9 := false + for isDigit(c) { + if c > '7' { + has8or9 = true + } + c = s.getr() + } + if c != '.' && c != 'e' && c != 'E' && c != 'i' { + // octal + if has8or9 { + s.error("malformed octal constant") + } + goto done + } + + } else { + // decimal or float + for isDigit(c) { + c = s.getr() + } + } + } + + // float + if c == '.' { + s.kind = FloatLit + c = s.getr() + for isDigit(c) { + c = s.getr() + } + } + + // exponent + if c == 'e' || c == 'E' { + s.kind = FloatLit + c = s.getr() + if c == '-' || c == '+' { + c = s.getr() + } + if !isDigit(c) { + s.error("malformed floating-point constant exponent") + } + for isDigit(c) { + c = s.getr() + } + } + + // complex + if c == 'i' { + s.kind = ImagLit + s.getr() + } + +done: + s.ungetr() + s.nlsemi = true + s.lit = string(s.stopLit()) + s.tok = _Literal +} + +func (s *scanner) stdString() { + s.startLit() + + for { + r := s.getr() + if r == '"' { + break + } + if r == '\\' { + s.escape('"') + continue + } + if r == '\n' { + s.ungetr() // assume newline is not part of literal + s.error("newline in string") + break + } + if r < 0 { + s.error_at(s.pos, s.line, "string not terminated") + break + } + } + + s.nlsemi = true + s.lit = string(s.stopLit()) + s.kind = StringLit + s.tok = _Literal +} + +func (s *scanner) rawString() { + s.startLit() + + for { + r := s.getr() + if r == '`' { + break + } + if r < 0 { + s.error_at(s.pos, s.line, "string not terminated") + break + } + } + // We leave CRs in the string since they are part of the + // literal (even though they are not part of the literal + // value). + + s.nlsemi = true + s.lit = string(s.stopLit()) + s.kind = StringLit + s.tok = _Literal +} + +func (s *scanner) rune() { + s.startLit() + + r := s.getr() + if r == '\'' { + s.error("empty character literal") + } else if r == '\n' { + s.ungetr() // assume newline is not part of literal + s.error("newline in character literal") + } else { + ok := true + if r == '\\' { + ok = s.escape('\'') + } + r = s.getr() + if r != '\'' { + // only report error if we're ok so far + if ok { + s.error("missing '") + } + s.ungetr() + } + } + + s.nlsemi = true + s.lit = string(s.stopLit()) + s.kind = RuneLit + s.tok = _Literal +} + +func (s *scanner) lineComment() { + // recognize pragmas + var prefix string + r := s.getr() + switch r { + case 'g': + prefix = "go:" + case 'l': + prefix = "line " + default: + goto skip + } + + s.startLit() + for _, m := range prefix { + if r != m { + s.stopLit() + goto skip + } + r = s.getr() + } + + for r >= 0 { + if r == '\n' { + s.ungetr() + break + } + r = s.getr() + } + s.pragmas = append(s.pragmas, Pragma{ + Line: s.line, + Text: strings.TrimSuffix(string(s.stopLit()), "\r"), + }) + return + +skip: + // consume line + for r != '\n' && r >= 0 { + r = s.getr() + } + s.ungetr() // don't consume '\n' - needed for nlsemi logic +} + +func (s *scanner) fullComment() { + for { + r := s.getr() + for r == '*' { + r = s.getr() + if r == '/' { + return + } + } + if r < 0 { + s.error_at(s.pos, s.line, "comment not terminated") + return + } + } +} + +func (s *scanner) escape(quote rune) bool { + var n int + var base, max uint32 + + c := s.getr() + switch c { + case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: + return true + case '0', '1', '2', '3', '4', '5', '6', '7': + n, base, max = 3, 8, 255 + case 'x': + c = s.getr() + n, base, max = 2, 16, 255 + case 'u': + c = s.getr() + n, base, max = 4, 16, unicode.MaxRune + case 'U': + c = s.getr() + n, base, max = 8, 16, unicode.MaxRune + default: + if c < 0 { + return true // complain in caller about EOF + } + s.error("unknown escape sequence") + return false + } + + var x uint32 + for i := n; i > 0; i-- { + d := base + switch { + case isDigit(c): + d = uint32(c) - '0' + case 'a' <= c && c <= 'f': + d = uint32(c) - ('a' - 10) + case 'A' <= c && c <= 'F': + d = uint32(c) - ('A' - 10) + } + if d >= base { + if c < 0 { + return true // complain in caller about EOF + } + if c != quote { + s.error(fmt.Sprintf("illegal character %#U in escape sequence", c)) + } else { + s.error("escape sequence incomplete") + } + s.ungetr() + return false + } + // d < base + x = x*base + d + c = s.getr() + } + s.ungetr() + + if x > max && n == 3 { + s.error(fmt.Sprintf("octal escape value > 255: %d", x)) + return false + } + + if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ { + s.error("escape sequence is invalid Unicode code point") + return false + } + + return true +} |