From 3a7da56582c4909803ba72cdc3503d6c25cb9c62 Mon Sep 17 00:00:00 2001 From: Robert Griesemer Date: Wed, 8 Jun 2016 16:08:34 -0700 Subject: cmd/compile/internal/syntax: fix many string/rune literal corner cases + many more test cases --- src/cmd/compile/internal/syntax/scanner.go | 68 ++++++++++----- src/cmd/compile/internal/syntax/scanner_test.go | 108 ++++++++++++++++++------ src/cmd/compile/internal/syntax/source.go | 2 +- 3 files changed, 126 insertions(+), 52 deletions(-) diff --git a/src/cmd/compile/internal/syntax/scanner.go b/src/cmd/compile/internal/syntax/scanner.go index aaf254f051..b9d586d89f 100644 --- a/src/cmd/compile/internal/syntax/scanner.go +++ b/src/cmd/compile/internal/syntax/scanner.go @@ -450,10 +450,12 @@ func (s *scanner) number(c rune) { func (s *scanner) stdString() { s.startLit() + for { r := s.getr() - if r == '\\' && !s.escape('"') { - continue // error already reported + if r == '\\' { + s.escape('"') + continue } if r == '"' { break @@ -463,11 +465,13 @@ func (s *scanner) stdString() { break } } + s.lit = string(s.stopLit()) } func (s *scanner) rawString() { s.startLit() + for { r := s.getr() if r == '`' { @@ -477,22 +481,37 @@ func (s *scanner) rawString() { s.error("string not terminated") break } - // TODO(gri) deal with CRs (or don't?) } + // We leave CRs in the string since they are part of the + // literal (even though they are not part of the literal + // value). + s.lit = string(s.stopLit()) } func (s *scanner) rune() { s.startLit() + r := s.getr() - if r == '\\' && !s.escape('\'') { - panic(0) - } - c := s.getr() - if c != '\'' { - panic(c) + if r != '\'' { + ok := true + if r == '\\' { + ok = s.escape('\'') + } + r = s.getr() + if r != '\'' { + // only report error if we're ok so far + if ok { + s.error("missing '") + } + s.ungetr() + } + } else { + s.error("empty character literal") } + s.lit = string(s.stopLit()) + return } func (s *scanner) lineComment() { @@ -574,19 +593,15 @@ func (s *scanner) escape(quote rune) bool { c = s.getr() n, base, max = 8, 16, unicode.MaxRune default: - var msg string - if c >= 0 { - msg = "unknown escape sequence" - } else { - msg = "escape sequence not terminated" + if c < 0 { + return true // complain in caller about EOF } - s.error(msg) + s.error("unknown escape sequence") return false } var x uint32 -loop: - for ; n > 0; n-- { + for i := n; i > 0; i-- { d := base switch { case isDigit(c): @@ -597,14 +612,16 @@ loop: d = uint32(c) - ('A' - 10) } if d >= base { - var msg string - if c >= 0 { - msg = fmt.Sprintf("illegal character %#U in escape sequence", c) + if c < 0 { + return true // complain in caller about EOF + } + if c != quote { + s.error(fmt.Sprintf("illegal character %#U in escape sequence", c)) } else { - msg = "escape sequence not terminated" + s.error("escape sequence incomplete") } - s.error(msg) - break loop + s.ungetr() + return false } // d < base x = x*base + d @@ -612,6 +629,11 @@ loop: } s.ungetr() + if x > max && n == 3 { + s.error(fmt.Sprintf("octal escape value > 255: %d", x)) + return false + } + if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ { s.error("escape sequence is invalid Unicode code point") return false diff --git a/src/cmd/compile/internal/syntax/scanner_test.go b/src/cmd/compile/internal/syntax/scanner_test.go index 89ba04022e..297b61428a 100644 --- a/src/cmd/compile/internal/syntax/scanner_test.go +++ b/src/cmd/compile/internal/syntax/scanner_test.go @@ -255,44 +255,96 @@ var sampleTokens = [...]struct { func TestScanErrors(t *testing.T) { for _, test := range []struct { - src, msg string + src, msg string + pos, line int }{ + // Note: Positions for lexical errors are the earliest position + // where the error is apparent, not the beginning of the respective + // token. + // rune-level errors - {"fo\x00o", "invalid NUL character"}, - {"fo\ufeffo", "invalid BOM in the middle of the file"}, - {"\xff", "invalid UTF-8 encoding"}, + {"fo\x00o", "invalid NUL character", 2, 1}, + {"foo\n\ufeff bar", "invalid BOM in the middle of the file", 4, 2}, + {"foo\n\n\xff ", "invalid UTF-8 encoding", 5, 3}, // token-level errors - {"~", "bitwise complement operator is ^"}, - {"$", "invalid rune '$'"}, - {"0xyz", "malformed hex constant"}, - {"08", "malformed octal constant"}, - {"1.0e+x", "malformed floating-point constant exponent"}, - {`"foo`, "string not terminated"}, - {"`foo", "string not terminated"}, - {"/* foo", "comment not terminated"}, - {`"foo\z"`, "unknown escape sequence"}, - // {`"\x`, "escape sequence not terminated"}, - {`"\x"`, "illegal character U+0022 '\"' in escape sequence"}, - {`"\Uffffffff"`, "escape sequence is invalid Unicode code point"}, + {"x + ~y", "bitwise complement operator is ^", 4, 1}, + {"foo$bar = 0", "invalid rune '$'", 3, 1}, + {"const x = 0xyz", "malformed hex constant", 12, 1}, + {"0123456789", "malformed octal constant", 10, 1}, + {"0123456789. /*", "comment not terminated", 14, 1}, // valid float constant + {"0123456789e0 /*", "comment not terminated", 15, 1}, // valid float constant + {"var a, b = 08, 07\n", "malformed octal constant", 13, 1}, + {"(x + 1.0e+x)", "malformed floating-point constant exponent", 10, 1}, + + {`''`, "empty character literal", 1, 1}, + {`'\`, "missing '", 2, 1}, + {`'\'`, "missing '", 3, 1}, + {`'\x`, "missing '", 3, 1}, + {`'\x'`, "escape sequence incomplete", 3, 1}, + {`'\y'`, "unknown escape sequence", 2, 1}, + {`'\x0'`, "escape sequence incomplete", 4, 1}, + {`'\00'`, "escape sequence incomplete", 4, 1}, + {`'\377' /*`, "comment not terminated", 9, 1}, // valid octal escape + {`'\378`, "illegal character U+0038 '8' in escape sequence", 4, 1}, + {`'\400'`, "octal escape value > 255: 256", 5, 1}, + {`'xx`, "missing '", 2, 1}, + + {`"`, "string not terminated", 1, 1}, + {`"foo`, "string not terminated", 4, 1}, + {"`", "string not terminated", 1, 1}, + {"`foo", "string not terminated", 4, 1}, + {"/*/", "comment not terminated", 3, 1}, + {"/*\n\nfoo", "comment not terminated", 7, 3}, + {`"\`, "string not terminated", 2, 1}, + {`"\"`, "string not terminated", 3, 1}, + {`"\x`, "string not terminated", 3, 1}, + {`"\x"`, "escape sequence incomplete", 3, 1}, + {`"\y"`, "unknown escape sequence", 2, 1}, + {`"\x0"`, "escape sequence incomplete", 4, 1}, + {`"\00"`, "escape sequence incomplete", 4, 1}, + {`"\377" /*`, "comment not terminated", 9, 1}, // valid octal escape + {`"\378"`, "illegal character U+0038 '8' in escape sequence", 4, 1}, + {`"\400"`, "octal escape value > 255: 256", 5, 1}, + + {`s := "foo\z"`, "unknown escape sequence", 10, 1}, + {`s := "foo\z00\nbar"`, "unknown escape sequence", 10, 1}, + {`"\x`, "string not terminated", 3, 1}, + {`"\x"`, "escape sequence incomplete", 3, 1}, + {`var s string = "\x"`, "escape sequence incomplete", 18, 1}, + {`return "\Uffffffff"`, "escape sequence is invalid Unicode code point", 18, 1}, // former problem cases - {"\xef", "invalid UTF-8 encoding"}, + {"package p\n\n\xef", "invalid UTF-8 encoding", 11, 3}, } { var s scanner - hasError := false - s.init(&bytesReader{[]byte(test.src)}, func(_, line int, msg string) { - hasError = true - // TODO(gri) test exact position as well - if line != 1 { - t.Errorf("got line = %d; want 1", line) - } - if msg != test.msg { - t.Errorf("got msg = %q; want %q", msg, test.msg) + nerrors := 0 + s.init(&bytesReader{[]byte(test.src)}, func(pos, line int, msg string) { + nerrors++ + // only check the first error + if nerrors == 1 { + if msg != test.msg { + t.Errorf("%q: got msg = %q; want %q", test.src, msg, test.msg) + } + if pos != test.pos { + t.Errorf("%q: got pos = %d; want %d", test.src, pos, test.pos) + } + if line != test.line { + t.Errorf("%q: got line = %d; want %d", test.src, line, test.line) + } + } else if nerrors > 1 { + t.Errorf("%q: got unexpected %q at pos = %d, line = %d", test.src, msg, pos, line) } }) - s.next() - if !hasError { + + for { + s.next() + if s.tok == _EOF { + break + } + } + + if nerrors == 0 { t.Errorf("%q: got no error; want %q", test.src, test.msg) } } diff --git a/src/cmd/compile/internal/syntax/source.go b/src/cmd/compile/internal/syntax/source.go index 38692c33be..c72389b171 100644 --- a/src/cmd/compile/internal/syntax/source.go +++ b/src/cmd/compile/internal/syntax/source.go @@ -58,7 +58,7 @@ func (s *source) error_at(pos, line int, msg string) { } func (s *source) pos() int { - return s.offs + s.r + return s.offs + s.r0 } func (s *source) ungetr() { -- cgit v1.2.3-54-g00ecf