1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file implements source, a buffered rune reader
6// specialized for scanning Go code: Reading
7// ASCII characters, maintaining current (line, col)
8// position information, and recording of the most
9// recently read source segment are highly optimized.
10// This file is self-contained (go tool compile source.go
11// compiles) and thus could be made into its own package.
12
13package syntax
14
15import (
16	"io"
17	"unicode/utf8"
18)
19
20// The source buffer is accessed using three indices b (begin),
21// r (read), and e (end):
22//
23// - If b >= 0, it points to the beginning of a segment of most
24//   recently read characters (typically a Go literal).
25//
26// - r points to the byte immediately following the most recently
27//   read character ch, which starts at r-chw.
28//
29// - e points to the byte immediately following the last byte that
30//   was read into the buffer.
31//
32// The buffer content is terminated at buf[e] with the sentinel
33// character utf8.RuneSelf. This makes it possible to test for
34// the common case of ASCII characters with a single 'if' (see
35// nextch method).
36//
37//                +------ content in use -------+
38//                v                             v
39// buf [...read...|...segment...|ch|...unread...|s|...free...]
40//                ^             ^  ^            ^
41//                |             |  |            |
42//                b         r-chw  r            e
43//
44// Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel
45
46type source struct {
47	in   io.Reader
48	errh func(line, col uint, msg string)
49
50	buf       []byte // source buffer
51	ioerr     error  // pending I/O error, or nil
52	b, r, e   int    // buffer indices (see comment above)
53	line, col uint   // source position of ch (0-based)
54	ch        rune   // most recently read character
55	chw       int    // width of ch
56}
57
58const sentinel = utf8.RuneSelf
59
60func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) {
61	s.in = in
62	s.errh = errh
63
64	if s.buf == nil {
65		s.buf = make([]byte, nextSize(0))
66	}
67	s.buf[0] = sentinel
68	s.ioerr = nil
69	s.b, s.r, s.e = -1, 0, 0
70	s.line, s.col = 0, 0
71	s.ch = ' '
72	s.chw = 0
73}
74
75// starting points for line and column numbers
76const linebase = 1
77const colbase = 1
78
79// pos returns the (line, col) source position of s.ch.
80func (s *source) pos() (line, col uint) {
81	return linebase + s.line, colbase + s.col
82}
83
84// error reports the error msg at source position s.pos().
85func (s *source) error(msg string) {
86	line, col := s.pos()
87	s.errh(line, col, msg)
88}
89
90// start starts a new active source segment (including s.ch).
91// As long as stop has not been called, the active segment's
92// bytes (excluding s.ch) may be retrieved by calling segment.
93func (s *source) start()          { s.b = s.r - s.chw }
94func (s *source) stop()           { s.b = -1 }
95func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] }
96
97// rewind rewinds the scanner's read position and character s.ch
98// to the start of the currently active segment, which must not
99// contain any newlines (otherwise position information will be
100// incorrect). Currently, rewind is only needed for handling the
101// source sequence ".."; it must not be called outside an active
102// segment.
103func (s *source) rewind() {
104	// ok to verify precondition - rewind is rarely called
105	if s.b < 0 {
106		panic("no active segment")
107	}
108	s.col -= uint(s.r - s.b)
109	s.r = s.b
110	s.nextch()
111}
112
113func (s *source) nextch() {
114redo:
115	s.col += uint(s.chw)
116	if s.ch == '\n' {
117		s.line++
118		s.col = 0
119	}
120
121	// fast common case: at least one ASCII character
122	if s.ch = rune(s.buf[s.r]); s.ch < sentinel {
123		s.r++
124		s.chw = 1
125		if s.ch == 0 {
126			s.error("invalid NUL character")
127			goto redo
128		}
129		return
130	}
131
132	// slower general case: add more bytes to buffer if we don't have a full rune
133	for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil {
134		s.fill()
135	}
136
137	// EOF
138	if s.r == s.e {
139		if s.ioerr != io.EOF {
140			// ensure we never start with a '/' (e.g., rooted path) in the error message
141			s.error("I/O error: " + s.ioerr.Error())
142			s.ioerr = nil
143		}
144		s.ch = -1
145		s.chw = 0
146		return
147	}
148
149	s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e])
150	s.r += s.chw
151
152	if s.ch == utf8.RuneError && s.chw == 1 {
153		s.error("invalid UTF-8 encoding")
154		goto redo
155	}
156
157	// BOM's are only allowed as the first character in a file
158	const BOM = 0xfeff
159	if s.ch == BOM {
160		if s.line > 0 || s.col > 0 {
161			s.error("invalid BOM in the middle of the file")
162		}
163		goto redo
164	}
165}
166
167// fill reads more source bytes into s.buf.
168// It returns with at least one more byte in the buffer, or with s.ioerr != nil.
169func (s *source) fill() {
170	// determine content to preserve
171	b := s.r
172	if s.b >= 0 {
173		b = s.b
174		s.b = 0 // after buffer has grown or content has been moved down
175	}
176	content := s.buf[b:s.e]
177
178	// grow buffer or move content down
179	if len(content)*2 > len(s.buf) {
180		s.buf = make([]byte, nextSize(len(s.buf)))
181		copy(s.buf, content)
182	} else if b > 0 {
183		copy(s.buf, content)
184	}
185	s.r -= b
186	s.e -= b
187
188	// read more data: try a limited number of times
189	for i := 0; i < 10; i++ {
190		var n int
191		n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel
192		if n < 0 {
193			panic("negative read") // incorrect underlying io.Reader implementation
194		}
195		if n > 0 || s.ioerr != nil {
196			s.e += n
197			s.buf[s.e] = sentinel
198			return
199		}
200		// n == 0
201	}
202
203	s.buf[s.e] = sentinel
204	s.ioerr = io.ErrNoProgress
205}
206
207// nextSize returns the next bigger size for a buffer of a given size.
208func nextSize(size int) int {
209	const min = 4 << 10 // 4K: minimum buffer size
210	const max = 1 << 20 // 1M: maximum buffer size which is still doubled
211	if size < min {
212		return min
213	}
214	if size <= max {
215		return size << 1
216	}
217	return size + max
218}
219