cmd/compile/internal/syntax: fix source.getr

The source's rune reader logic was overly clever and tried to determine if an invalid rune was due to an actual source error or due to not enough bytes in the buffer. Replaced with slightly optimized version of bufio Reader's ReadRune method for now. Performance may be a tiny bit lower but we can always optimize later. Fixes endless loops when parsing certain source files. Added test case to ScanError test.
author: Robert Griesemer <gri@golang.org> 2016-06-07 22:49:20 -0700
committer: Matthew Dempsky <mdempsky@google.com> 2016-08-16 14:32:07 -0700
commit: f363ad2a55bd61c6cce5c3af107b11cc3fbeeb83 (patch)
tree: 7c3c00a9fc44c68576fffdae9a31616bfcd78888
parent: fe52bcc8e6a40c38ec554456d570405d673d080b (diff)
download: go-f363ad2a55bd61c6cce5c3af107b11cc3fbeeb83.tar.gz
go-f363ad2a55bd61c6cce5c3af107b11cc3fbeeb83.zip
4 files changed, 54 insertions, 39 deletions
diff --git a/src/cmd/compile/internal/syntax/parser.go b/src/cmd/compile/internal/syntax/parser.go
index 5960b45fbb..86d2d6a503 100644
--- a/src/cmd/compile/internal/syntax/parser.go
+++ b/src/cmd/compile/internal/syntax/parser.go
@@ -26,11 +26,11 @@ type parser struct {
 func (p *parser) init(src io.Reader, errh ErrorHandler) {
 	p.scanner.init(src, func(pos, line int, msg string) {
 		p.nerrors++
-		if errh != nil {
+		if !debug && errh != nil {
 			errh(pos, line, msg)
 			return
 		}
-		fmt.Printf("%d: %s\n", line, msg)
+		panic(fmt.Sprintf("%d: %s\n", line, msg))
 	})
 
 	p.fnest = 0
diff --git a/src/cmd/compile/internal/syntax/parser_test.go b/src/cmd/compile/internal/syntax/parser_test.go
index 1cf26c4c47..12fc019414 100644
--- a/src/cmd/compile/internal/syntax/parser_test.go
+++ b/src/cmd/compile/internal/syntax/parser_test.go
@@ -29,6 +29,10 @@ func TestParse(t *testing.T) {
 }
 
 func TestStdLib(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode")
+	}
+
 	var m1 runtime.MemStats
 	runtime.ReadMemStats(&m1)
 	start := time.Now()
diff --git a/src/cmd/compile/internal/syntax/scanner_test.go b/src/cmd/compile/internal/syntax/scanner_test.go
index 7c8545c3f6..89ba04022e 100644
--- a/src/cmd/compile/internal/syntax/scanner_test.go
+++ b/src/cmd/compile/internal/syntax/scanner_test.go
@@ -275,6 +275,9 @@ func TestScanErrors(t *testing.T) {
 		// {`"\x`, "escape sequence not terminated"},
 		{`"\x"`, "illegal character U+0022 '\"' in escape sequence"},
 		{`"\Uffffffff"`, "escape sequence is invalid Unicode code point"},
+
+		// former problem cases
+		{"\xef", "invalid UTF-8 encoding"},
 	} {
 		var s scanner
 		hasError := false
diff --git a/src/cmd/compile/internal/syntax/source.go b/src/cmd/compile/internal/syntax/source.go
index a04c089367..38692c33be 100644
--- a/src/cmd/compile/internal/syntax/source.go
+++ b/src/cmd/compile/internal/syntax/source.go
@@ -66,50 +66,58 @@ func (s *source) ungetr() {
 }
 
 func (s *source) getr() rune {
-	for {
-		s.r0, s.line0 = s.r, s.line
-
-		// common case: ASCII and enough bytes
-		if b := s.buf[s.r]; b < utf8.RuneSelf {
-			s.r++
-			if b == 0 {
-				s.error("invalid NUL character")
-				continue
-			}
-			if b == '\n' {
-				s.line++
-			}
-			return rune(b)
-		}
+redo:
+	s.r0, s.line0 = s.r, s.line
 
-		// uncommon case: not ASCII or not enough bytes
-		r, w := utf8.DecodeRune(s.buf[s.r:s.w]) // optimistically assume valid rune
-		if r != utf8.RuneError || w > 1 {
-			s.r += w
-			// BOM's are only allowed as the first character in a file
-			const BOM = 0xfeff
-			if r == BOM && s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
-				s.error("invalid BOM in the middle of the file")
-				continue
-			}
-			return r
-		}
+	// TODO(gri) We could avoid at least one test that is always taken
+	// in the for loop below by duplicating the common case code (ASCII)
+	// here since we always have at least the sentinel (utf8.RuneSelf)
+	// in the buffer. Measure and optimize eventually.
 
-		if w == 0 && s.err != nil {
-			if s.err != io.EOF {
-				s.error(s.err.Error())
-			}
-			return -1
+	// make sure we have at least one rune in buffer, or we are at EOF
+	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.err == nil && s.w-s.r < len(s.buf) {
+		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
+	}
+
+	// common case: ASCII and enough bytes
+	// (invariant: s.buf[s.w] == utf8.RuneSelf)
+	if b := s.buf[s.r]; b < utf8.RuneSelf {
+		s.r++
+		if b == 0 {
+			s.error("invalid NUL character")
+			goto redo
 		}
+		if b == '\n' {
+			s.line++
+		}
+		return rune(b)
+	}
 
-		if w == 1 && (s.r+utf8.UTFMax <= s.w || utf8.FullRune(s.buf[s.r:s.w])) {
-			s.r++
-			s.error("invalid UTF-8 encoding")
-			continue
+	// EOF
+	if s.r == s.w {
+		if s.err != io.EOF {
+			s.error(s.err.Error())
 		}
+		return -1
+	}
+
+	// uncommon case: not ASCII
+	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
+	s.r += w
 
-		s.fill()
+	if r == utf8.RuneError && w == 1 {
+		s.error("invalid UTF-8 encoding")
+		goto redo
 	}
+
+	// BOM's are only allowed as the first character in a file
+	const BOM = 0xfeff
+	if r == BOM && s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
+		s.error("invalid BOM in the middle of the file")
+		goto redo
+	}
+
+	return r
 }
 
 func (s *source) fill() {
author	Robert Griesemer <gri@golang.org>	2016-06-07 22:49:20 -0700
committer	Matthew Dempsky <mdempsky@google.com>	2016-08-16 14:32:07 -0700
commit	f363ad2a55bd61c6cce5c3af107b11cc3fbeeb83 (patch)
tree	7c3c00a9fc44c68576fffdae9a31616bfcd78888
parent	fe52bcc8e6a40c38ec554456d570405d673d080b (diff)
download	go-f363ad2a55bd61c6cce5c3af107b11cc3fbeeb83.tar.gz go-f363ad2a55bd61c6cce5c3af107b11cc3fbeeb83.zip