// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file implements source, a buffered rune reader
// specialized for scanning Go code: Reading
// ASCII characters, maintaining current (line, col)
// position information, and recording of the most
// recently read source segment are highly optimized.
// This file is self-contained (go tool compile source.go
// compiles) and thus could be made into its own package.

package syntax

import (
	"io"
	"unicode/utf8"
)

// The source buffer is accessed using three indices b (begin),
// r (read), and e (end):
//
//   - If b >= 0, it points to the beginning of a segment of most
//     recently read characters (typically a Go literal).
//
//   - r points to the byte immediately following the most recently
//     read character ch, which starts at r-chw.
//
//   - e points to the byte immediately following the last byte that
//     was read into the buffer.
//
// The buffer content is terminated at buf[e] with the sentinel
// character utf8.RuneSelf. This makes it possible to test for
// the common case of ASCII characters with a single 'if' (see
// nextch method).
//
//	               +------ content in use -------+
//	               v                             v
//	buf [...read...|...segment...|ch|...unread...|s|...free...]
// ^ ^ ^ ^ // | | | | // b r-chw r e // // Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel type source struct { in io.Reader errh func(line, col uint, msg string) buf []byte // source buffer ioerr error // pending I/O error, or nil b, r, e int // buffer indices (see comment above) line, col uint // source position of ch (0-based) ch rune // most recently read character chw int // width of ch } const sentinel = utf8.RuneSelf func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) { s.in = in s.errh = errh if s.buf == nil { s.buf = make([]byte, nextSize(0)) } s.buf[0] = sentinel s.ioerr = nil s.b, s.r, s.e = -1, 0, 0 s.line, s.col = 0, 0 s.ch = ' ' s.chw = 0 } // starting points for line and column numbers const linebase = 1 const colbase = 1 // pos returns the (line, col) source position of s.ch. func (s *source) pos() (line, col uint) { return linebase + s.line, colbase + s.col } // error reports the error msg at source position s.pos(). func (s *source) error(msg string) { line, col := s.pos() s.errh(line, col, msg) } // start starts a new active source segment (including s.ch). // As long as stop has not been called, the active segment's // bytes (excluding s.ch) may be retrieved by calling segment. func (s *source) start() { s.b = s.r - s.chw } func (s *source) stop() { s.b = -1 } func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] } // rewind rewinds the scanner's read position and character s.ch // to the start of the currently active segment, which must not // contain any newlines (otherwise position information will be // incorrect). Currently, rewind is only needed for handling the // source sequence ".."; it must not be called outside an active // segment. 
func (s *source) rewind() { // ok to verify precondition - rewind is rarely called if s.b < 0 { panic("no active segment") } s.col -= uint(s.r - s.b) s.r = s.b s.nextch() } func (s *source) nextch() { redo: s.col += uint(s.chw) if s.ch == '\n' { s.line++ s.col = 0 } // fast common case: at least one ASCII character if s.ch = rune(s.buf[s.r]); s.ch < sentinel { s.r++ s.chw = 1 if s.ch == 0 { s.error("invalid NUL character") goto redo } return } // slower general case: add more bytes to buffer if we don't have a full rune for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil { s.fill() } // EOF if s.r == s.e { if s.ioerr != io.EOF { // ensure we never start with a '/' (e.g., rooted path) in the error message s.error("I/O error: " + s.ioerr.Error()) s.ioerr = nil } s.ch = -1 s.chw = 0 return } s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e]) s.r += s.chw if s.ch == utf8.RuneError && s.chw == 1 { s.error("invalid UTF-8 encoding") goto redo } // BOM's are only allowed as the first character in a file const BOM = 0xfeff if s.ch == BOM { if s.line > 0 || s.col > 0 { s.error("invalid BOM in the middle of the file") } goto redo } } // fill reads more source bytes into s.buf. // It returns with at least one more byte in the buffer, or with s.ioerr != nil. 
func (s *source) fill() { // determine content to preserve b := s.r if s.b >= 0 { b = s.b s.b = 0 // after buffer has grown or content has been moved down } content := s.buf[b:s.e] // grow buffer or move content down if len(content)*2 > len(s.buf) { s.buf = make([]byte, nextSize(len(s.buf))) copy(s.buf, content) } else if b > 0 { copy(s.buf, content) } s.r -= b s.e -= b // read more data: try a limited number of times for i := 0; i < 10; i++ { var n int n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel if n < 0 { panic("negative read") // incorrect underlying io.Reader implementation } if n > 0 || s.ioerr != nil { s.e += n s.buf[s.e] = sentinel return } // n == 0 } s.buf[s.e] = sentinel s.ioerr = io.ErrNoProgress } // nextSize returns the next bigger size for a buffer of a given size. func nextSize(size int) int { const min = 4 << 10 // 4K: minimum buffer size const max = 1 << 20 // 1M: maximum buffer size which is still doubled if size < min { return min } if size <= max { return size << 1 } return size + max }