1// Copyright 2016 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// This file implements source, a buffered rune reader 6// specialized for scanning Go code: Reading 7// ASCII characters, maintaining current (line, col) 8// position information, and recording of the most 9// recently read source segment are highly optimized. 10// This file is self-contained (go tool compile source.go 11// compiles) and thus could be made into its own package. 12 13package syntax 14 15import ( 16 "io" 17 "unicode/utf8" 18) 19 20// The source buffer is accessed using three indices b (begin), 21// r (read), and e (end): 22// 23// - If b >= 0, it points to the beginning of a segment of most 24// recently read characters (typically a Go literal). 25// 26// - r points to the byte immediately following the most recently 27// read character ch, which starts at r-chw. 28// 29// - e points to the byte immediately following the last byte that 30// was read into the buffer. 31// 32// The buffer content is terminated at buf[e] with the sentinel 33// character utf8.RuneSelf. This makes it possible to test for 34// the common case of ASCII characters with a single 'if' (see 35// nextch method). 36// 37// +------ content in use -------+ 38// v v 39// buf [...read...|...segment...|ch|...unread...|s|...free...] 40// ^ ^ ^ ^ 41// | | | | 42// b r-chw r e 43// 44// Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel 45 46type source struct { 47 in io.Reader 48 errh func(line, col uint, msg string) 49 50 buf []byte // source buffer 51 ioerr error // pending I/O error, or nil 52 b, r, e int // buffer indices (see comment above) 53 line, col uint // source position of ch (0-based) 54 ch rune // most recently read character 55 chw int // width of ch 56} 57 58const sentinel = utf8.RuneSelf 59 60func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) { 61 s.in = in 62 s.errh = errh 63 64 if s.buf == nil { 65 s.buf = make([]byte, nextSize(0)) 66 } 67 s.buf[0] = sentinel 68 s.ioerr = nil 69 s.b, s.r, s.e = -1, 0, 0 70 s.line, s.col = 0, 0 71 s.ch = ' ' 72 s.chw = 0 73} 74 75// starting points for line and column numbers 76const linebase = 1 77const colbase = 1 78 79// pos returns the (line, col) source position of s.ch. 80func (s *source) pos() (line, col uint) { 81 return linebase + s.line, colbase + s.col 82} 83 84// error reports the error msg at source position s.pos(). 85func (s *source) error(msg string) { 86 line, col := s.pos() 87 s.errh(line, col, msg) 88} 89 90// start starts a new active source segment (including s.ch). 91// As long as stop has not been called, the active segment's 92// bytes (excluding s.ch) may be retrieved by calling segment. 93func (s *source) start() { s.b = s.r - s.chw } 94func (s *source) stop() { s.b = -1 } 95func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] } 96 97// rewind rewinds the scanner's read position and character s.ch 98// to the start of the currently active segment, which must not 99// contain any newlines (otherwise position information will be 100// incorrect). Currently, rewind is only needed for handling the 101// source sequence ".."; it must not be called outside an active 102// segment. 103func (s *source) rewind() { 104 // ok to verify precondition - rewind is rarely called 105 if s.b < 0 { 106 panic("no active segment") 107 } 108 s.col -= uint(s.r - s.b) 109 s.r = s.b 110 s.nextch() 111} 112 113func (s *source) nextch() { 114redo: 115 s.col += uint(s.chw) 116 if s.ch == '\n' { 117 s.line++ 118 s.col = 0 119 } 120 121 // fast common case: at least one ASCII character 122 if s.ch = rune(s.buf[s.r]); s.ch < sentinel { 123 s.r++ 124 s.chw = 1 125 if s.ch == 0 { 126 s.error("invalid NUL character") 127 goto redo 128 } 129 return 130 } 131 132 // slower general case: add more bytes to buffer if we don't have a full rune 133 for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil { 134 s.fill() 135 } 136 137 // EOF 138 if s.r == s.e { 139 if s.ioerr != io.EOF { 140 // ensure we never start with a '/' (e.g., rooted path) in the error message 141 s.error("I/O error: " + s.ioerr.Error()) 142 s.ioerr = nil 143 } 144 s.ch = -1 145 s.chw = 0 146 return 147 } 148 149 s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e]) 150 s.r += s.chw 151 152 if s.ch == utf8.RuneError && s.chw == 1 { 153 s.error("invalid UTF-8 encoding") 154 goto redo 155 } 156 157 // BOM's are only allowed as the first character in a file 158 const BOM = 0xfeff 159 if s.ch == BOM { 160 if s.line > 0 || s.col > 0 { 161 s.error("invalid BOM in the middle of the file") 162 } 163 goto redo 164 } 165} 166 167// fill reads more source bytes into s.buf. 168// It returns with at least one more byte in the buffer, or with s.ioerr != nil. 169func (s *source) fill() { 170 // determine content to preserve 171 b := s.r 172 if s.b >= 0 { 173 b = s.b 174 s.b = 0 // after buffer has grown or content has been moved down 175 } 176 content := s.buf[b:s.e] 177 178 // grow buffer or move content down 179 if len(content)*2 > len(s.buf) { 180 s.buf = make([]byte, nextSize(len(s.buf))) 181 copy(s.buf, content) 182 } else if b > 0 { 183 copy(s.buf, content) 184 } 185 s.r -= b 186 s.e -= b 187 188 // read more data: try a limited number of times 189 for i := 0; i < 10; i++ { 190 var n int 191 n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel 192 if n < 0 { 193 panic("negative read") // incorrect underlying io.Reader implementation 194 } 195 if n > 0 || s.ioerr != nil { 196 s.e += n 197 s.buf[s.e] = sentinel 198 return 199 } 200 // n == 0 201 } 202 203 s.buf[s.e] = sentinel 204 s.ioerr = io.ErrNoProgress 205} 206 207// nextSize returns the next bigger size for a buffer of a given size. 208func nextSize(size int) int { 209 const min = 4 << 10 // 4K: minimum buffer size 210 const max = 1 << 20 // 1M: maximum buffer size which is still doubled 211 if size < min { 212 return min 213 } 214 if size <= max { 215 return size << 1 216 } 217 return size + max 218} 219