diff options
author | Robert Griesemer <gri@golang.org> | 2016-06-07 22:49:20 -0700 |
---|---|---|
committer | Matthew Dempsky <mdempsky@google.com> | 2016-08-16 14:32:07 -0700 |
commit | f363ad2a55bd61c6cce5c3af107b11cc3fbeeb83 (patch) | |
tree | 7c3c00a9fc44c68576fffdae9a31616bfcd78888 | |
parent | fe52bcc8e6a40c38ec554456d570405d673d080b (diff) | |
download | go-f363ad2a55bd61c6cce5c3af107b11cc3fbeeb83.tar.gz go-f363ad2a55bd61c6cce5c3af107b11cc3fbeeb83.zip |
cmd/compile/internal/syntax: fix source.getr
The source's rune reader logic was overly clever and tried to
determine if an invalid rune was due to an actual source error
or due to not enough bytes in the buffer.
Replaced with slightly optimized version of bufio Reader's
ReadRune method for now. Performance may be a tiny bit lower
but we can always optimize later.
Fixes endless loops when parsing certain source files.
Added test case to ScanError test.
-rw-r--r-- | src/cmd/compile/internal/syntax/parser.go | 4 | ||||
-rw-r--r-- | src/cmd/compile/internal/syntax/parser_test.go | 4 | ||||
-rw-r--r-- | src/cmd/compile/internal/syntax/scanner_test.go | 3 | ||||
-rw-r--r-- | src/cmd/compile/internal/syntax/source.go | 82 |
4 files changed, 54 insertions, 39 deletions
diff --git a/src/cmd/compile/internal/syntax/parser.go b/src/cmd/compile/internal/syntax/parser.go index 5960b45fbb..86d2d6a503 100644 --- a/src/cmd/compile/internal/syntax/parser.go +++ b/src/cmd/compile/internal/syntax/parser.go @@ -26,11 +26,11 @@ type parser struct { func (p *parser) init(src io.Reader, errh ErrorHandler) { p.scanner.init(src, func(pos, line int, msg string) { p.nerrors++ - if errh != nil { + if !debug && errh != nil { errh(pos, line, msg) return } - fmt.Printf("%d: %s\n", line, msg) + panic(fmt.Sprintf("%d: %s\n", line, msg)) }) p.fnest = 0 diff --git a/src/cmd/compile/internal/syntax/parser_test.go b/src/cmd/compile/internal/syntax/parser_test.go index 1cf26c4c47..12fc019414 100644 --- a/src/cmd/compile/internal/syntax/parser_test.go +++ b/src/cmd/compile/internal/syntax/parser_test.go @@ -29,6 +29,10 @@ func TestParse(t *testing.T) { } func TestStdLib(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode") + } + var m1 runtime.MemStats runtime.ReadMemStats(&m1) start := time.Now() diff --git a/src/cmd/compile/internal/syntax/scanner_test.go b/src/cmd/compile/internal/syntax/scanner_test.go index 7c8545c3f6..89ba04022e 100644 --- a/src/cmd/compile/internal/syntax/scanner_test.go +++ b/src/cmd/compile/internal/syntax/scanner_test.go @@ -275,6 +275,9 @@ func TestScanErrors(t *testing.T) { // {`"\x`, "escape sequence not terminated"}, {`"\x"`, "illegal character U+0022 '\"' in escape sequence"}, {`"\Uffffffff"`, "escape sequence is invalid Unicode code point"}, + + // former problem cases + {"\xef", "invalid UTF-8 encoding"}, } { var s scanner hasError := false diff --git a/src/cmd/compile/internal/syntax/source.go b/src/cmd/compile/internal/syntax/source.go index a04c089367..38692c33be 100644 --- a/src/cmd/compile/internal/syntax/source.go +++ b/src/cmd/compile/internal/syntax/source.go @@ -66,50 +66,58 @@ func (s *source) ungetr() { } func (s *source) getr() rune { - for { - s.r0, s.line0 = s.r, s.line - - // common case: ASCII and enough bytes - if b := s.buf[s.r]; b < utf8.RuneSelf { - s.r++ - if b == 0 { - s.error("invalid NUL character") - continue - } - if b == '\n' { - s.line++ - } - return rune(b) - } +redo: + s.r0, s.line0 = s.r, s.line - // uncommon case: not ASCII or not enough bytes - r, w := utf8.DecodeRune(s.buf[s.r:s.w]) // optimistically assume valid rune - if r != utf8.RuneError || w > 1 { - s.r += w - // BOM's are only allowed as the first character in a file - const BOM = 0xfeff - if r == BOM && s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1) - s.error("invalid BOM in the middle of the file") - continue - } - return r - } + // TODO(gri) We could avoid at least one test that is always taken + // in the for loop below by duplicating the common case code (ASCII) + // here since we always have at least the sentinel (utf8.RuneSelf) + // in the buffer. Measure and optimize eventually. - if w == 0 && s.err != nil { - if s.err != io.EOF { - s.error(s.err.Error()) - } - return -1 + // make sure we have at least one rune in buffer, or we are at EOF + for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.err == nil && s.w-s.r < len(s.buf) { + s.fill() // s.w-s.r < len(s.buf) => buffer is not full + } + + // common case: ASCII and enough bytes + // (invariant: s.buf[s.w] == utf8.RuneSelf) + if b := s.buf[s.r]; b < utf8.RuneSelf { + s.r++ + if b == 0 { + s.error("invalid NUL character") + goto redo } + if b == '\n' { + s.line++ + } + return rune(b) + } - if w == 1 && (s.r+utf8.UTFMax <= s.w || utf8.FullRune(s.buf[s.r:s.w])) { - s.r++ - s.error("invalid UTF-8 encoding") - continue + // EOF + if s.r == s.w { + if s.err != io.EOF { + s.error(s.err.Error()) } + return -1 + } + + // uncommon case: not ASCII + r, w := utf8.DecodeRune(s.buf[s.r:s.w]) + s.r += w - s.fill() + if r == utf8.RuneError && w == 1 { + s.error("invalid UTF-8 encoding") + goto redo } + + // BOM's are only allowed as the first character in a file + const BOM = 0xfeff + if r == BOM && s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1) + s.error("invalid BOM in the middle of the file") + goto redo + } + + return r } func (s *source) fill() { |