scanner: match go/scanner and disallow NUL character;
also check for illegal UTF-8 sequences

R=rsc
CC=golang-dev
https://golang.org/cl/218061
author: Robert Griesemer <gri@golang.org> 2010-02-22 14:21:59 -0800
committer: Robert Griesemer <gri@golang.org> 2010-02-22 14:21:59 -0800
commit: 22e960547f5f14caf2dd401b20ebfe64749fa7b2 (patch)
tree: 1bb86f4fd921c3b5b82fbb57b28a0953309a1c6b
parent: 0485a999ff078c760a9cd4013b0f21b6ed90ffda (diff)
download: go-22e960547f5f14caf2dd401b20ebfe64749fa7b2.tar.gz go-22e960547f5f14caf2dd401b20ebfe64749fa7b2.zip
2 files changed, 17 insertions, 6 deletions
diff --git a/src/pkg/scanner/scanner.go b/src/pkg/scanner/scanner.go
index c4233aa581..c9b46f0ea3 100644
--- a/src/pkg/scanner/scanner.go
+++ b/src/pkg/scanner/scanner.go
@@ -2,9 +2,10 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// A general-purpose scanner for text. Takes an io.Reader
-// providing the source which then can be tokenized through
-// repeated calls to the Scan function.
+// A general-purpose scanner for UTF-8 encoded text. Takes an io.Reader
+// providing the source which then can be tokenized through repeated
+// calls to the Scan function. For compatibility with existing tools,
+// the NUL character is not allowed (implementation restriction).
 //
 // By default, a Scanner skips white space and comments and
 // recognizes literals as defined by the Go language spec.
@@ -245,13 +246,20 @@ func (s *Scanner) next() int {
 			// uncommon case: not ASCII
 			var width int
 			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
+			if ch == utf8.RuneError && width == 1 {
+				s.error("illegal UTF-8 encoding")
+			}
 			s.srcPos += width - 1
 		}
 	}
 
 	s.srcPos++
 	s.column++
-	if ch == '\n' {
+	switch ch {
+	case 0:
+		// implementation restriction for compatibility with other tools
+		s.error("illegal character NUL")
+	case '\n':
 		s.line++
 		s.column = 0
 	}
diff --git a/src/pkg/scanner/scanner_test.go b/src/pkg/scanner/scanner_test.go
index 926048010f..563ceea0cc 100644
--- a/src/pkg/scanner/scanner_test.go
+++ b/src/pkg/scanner/scanner_test.go
@@ -226,7 +226,7 @@ var tokenList = []token{
 	token{String, "`" + f100 + "`"},
 
 	token{Comment, "// individual characters\n"},
-	token{'\x00', "\x00"},
+	// NUL character is not allowed
 	token{'\x01', "\x01"},
 	token{' ' - 1, string(' ' - 1)},
 	token{'+', "+"},
@@ -390,7 +390,8 @@ func TestScanNext(t *testing.T) {
 func TestScanWhitespace(t *testing.T) {
 	var buf bytes.Buffer
 	var ws uint64
-	for ch := byte(0); ch < ' '; ch++ {
+	// start at 1, NUL character is not allowed
+	for ch := byte(1); ch < ' '; ch++ {
 		buf.WriteByte(ch)
 		ws |= 1 << ch
 	}
@@ -442,6 +443,8 @@ func TestError(t *testing.T) {
 	testError(t, "`abc", "literal not terminated", String)
 	testError(t, `//`, "comment not terminated", EOF)
 	testError(t, `/*/`, "comment not terminated", EOF)
+	testError(t, `"abc`+"\x00"+`def"`, "illegal character NUL", String)
+	testError(t, `"abc`+"\xff"+`def"`, "illegal UTF-8 encoding", String)
 }
author	Robert Griesemer <gri@golang.org>	2010-02-22 14:21:59 -0800
committer	Robert Griesemer <gri@golang.org>	2010-02-22 14:21:59 -0800
commit	22e960547f5f14caf2dd401b20ebfe64749fa7b2 (patch)
tree	1bb86f4fd921c3b5b82fbb57b28a0953309a1c6b
parent	0485a999ff078c760a9cd4013b0f21b6ed90ffda (diff)
download	go-22e960547f5f14caf2dd401b20ebfe64749fa7b2.tar.gz go-22e960547f5f14caf2dd401b20ebfe64749fa7b2.zip