xref: /aosp_15_r20/external/emboss/compiler/front_end/lr1.py (revision 99e0aae7469b87d12f0ad23e61142c2d74c1ef70)
1*99e0aae7SDavid Rees# Copyright 2019 Google LLC
2*99e0aae7SDavid Rees#
3*99e0aae7SDavid Rees# Licensed under the Apache License, Version 2.0 (the "License");
4*99e0aae7SDavid Rees# you may not use this file except in compliance with the License.
5*99e0aae7SDavid Rees# You may obtain a copy of the License at
6*99e0aae7SDavid Rees#
7*99e0aae7SDavid Rees#     https://www.apache.org/licenses/LICENSE-2.0
8*99e0aae7SDavid Rees#
9*99e0aae7SDavid Rees# Unless required by applicable law or agreed to in writing, software
10*99e0aae7SDavid Rees# distributed under the License is distributed on an "AS IS" BASIS,
11*99e0aae7SDavid Rees# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*99e0aae7SDavid Rees# See the License for the specific language governing permissions and
13*99e0aae7SDavid Rees# limitations under the License.
14*99e0aae7SDavid Rees
15*99e0aae7SDavid Rees"""LR(1) parser generator.
16*99e0aae7SDavid Rees
17*99e0aae7SDavid ReesThe primary class in this module, Grammar, takes a list of context-free grammar
18*99e0aae7SDavid Reesproductions, and produces the corresponding LR(1) shift-reduce parser.  This is
19*99e0aae7SDavid Reesan implementation of the algorithm on pages 221 and 261-265 of "Compilers:
20*99e0aae7SDavid ReesPrinciples, Techniques, & Tools" (Second Edition) by Aho, Lam, Sethi, and
21*99e0aae7SDavid ReesUllman, also known as "The Dragon Book," hereafter referred to as "ALSU."
22*99e0aae7SDavid Rees
23*99e0aae7SDavid ReesThis module only implements the LR(1) algorithms; unlike tools such as yacc, it
24*99e0aae7SDavid Reesdoes not implement the various bits of glue necessary to actually use a parser.
25*99e0aae7SDavid ReesClients are expected to provide their own tokenizers and handle turning a raw
26*99e0aae7SDavid Reesparse tree into an intermediate representation on their own.
27*99e0aae7SDavid Rees"""
28*99e0aae7SDavid Rees
29*99e0aae7SDavid Reesimport collections
30*99e0aae7SDavid Rees
31*99e0aae7SDavid Reesfrom compiler.util import parser_types
32*99e0aae7SDavid Rees
33*99e0aae7SDavid Rees
class Item(collections.namedtuple("Item", ["production", "dot", "terminal",
                                           "next_symbol"])):
  """An LR(1) Item: a production, a cursor location, and a lookahead terminal.

  An Item represents a partially-parsed production together with a lookahead
  symbol; the position of the dot marks how much of the production has been
  recognized so far.  Items are mostly an internal implementation detail, but
  they can be useful elsewhere, particularly for debugging.

  Attributes:
    production: The Production this Item covers.
    dot: The index of the "dot" in production's rhs.
    terminal: The terminal lookahead symbol that follows the production in the
        input stream.
    next_symbol: The rhs symbol immediately after the dot, or None when the
        dot sits at the end of the rhs.
  """

  def __str__(self):
    """Formats this Item in ALSU (Dragon Book) notation."""
    dotted_rhs = (self.production.rhs[:self.dot] + (".",) +
                  self.production.rhs[self.dot:])
    rhs_text = " ".join(str(symbol) for symbol in dotted_rhs)
    return (str(self.production.lhs) + " -> " + rhs_text + ", " +
            str(self.terminal))

  @staticmethod
  def parse(text):
    """Parses an Item in ALSU notation.

    Parses an Item from notation like:

       symbol -> foo . bar baz, qux

    where "symbol -> foo bar baz" is taken as the production, the position of
    the "." becomes "dot" (here 1), and the symbol after "," becomes the
    "terminal".  The following are also valid items:

       sym -> ., foo
       sym -> . foo bar, baz
       sym -> foo bar ., baz

    Symbols on the right-hand side of the production should be separated by
    whitespace.

    Arguments:
      text: The text to parse into an Item.

    Returns:
      An Item.
    """
    production_text, terminal_text = text.split(",")
    terminal = terminal_text.strip()
    if terminal == "$":
      terminal = END_OF_INPUT
    lhs_text, rhs_text = production_text.split("->")
    lhs = lhs_text.strip()
    if lhs == "S'":
      lhs = START_PRIME
    before_dot, after_dot = rhs_text.split(".")
    handle = before_dot.split()
    tail = after_dot.split()
    # The dot index equals the number of symbols already consumed.
    return make_item(parser_types.Production(lhs, tuple(handle) + tuple(tail)),
                     len(handle), terminal)
94*99e0aae7SDavid Rees
95*99e0aae7SDavid Rees
def make_item(production, dot, symbol):
  """Builds an Item, precomputing the next_symbol field.

  Arguments:
    production: The Production for the new Item.
    dot: The index of the dot within production's rhs.
    symbol: The lookahead terminal for the new Item.

  Returns:
    An Item whose next_symbol is the rhs element at the dot, or None when the
    dot is at (or past) the end of the rhs.
  """
  if dot < len(production.rhs):
    next_symbol = production.rhs[dot]
  else:
    next_symbol = None
  return Item(production, dot, symbol, next_symbol)
99*99e0aae7SDavid Rees
100*99e0aae7SDavid Rees
class Conflict(
    collections.namedtuple("Conflict", ["state", "symbol", "actions"])
):
  """Conflict represents a parse conflict.

  Attributes:
    state: The parser state in which the conflict occurs.
    symbol: The lookahead symbol for which multiple actions apply.
    actions: The set of conflicting actions.
  """

  def __str__(self):
    action_text = " vs ".join(str(action) for action in self.actions)
    return "Conflict for {} in state {}: {}".format(
        self.symbol, self.state, action_text)
109*99e0aae7SDavid Rees
110*99e0aae7SDavid Rees
# The possible entries in the parser's ACTION table (see Grammar.parser()):
#
# Shift: consume the lookahead token and move to state `state`; `items` is the
# LR(1) item set for that state, carried along for debugging.
Shift = collections.namedtuple("Shift", ["state", "items"])
# Reduce: replace the symbols of `rule`'s rhs on the stack with its lhs.
Reduce = collections.namedtuple("Reduce", ["rule"])
# Accept: the input has been fully and successfully parsed.
Accept = collections.namedtuple("Accept", [])
# Error: the parse failed; `code` identifies the specific error.
Error = collections.namedtuple("Error", ["code"])

# Symbol wraps a single grammar symbol.  NOTE(review): its use is not visible
# in this portion of the file — presumably a marker used during parsing;
# confirm against the Parser implementation.
Symbol = collections.namedtuple("Symbol", ["symbol"])

# START_PRIME is the implicit 'real' root symbol for the grammar.
START_PRIME = "S'"

# END_OF_INPUT is the implicit symbol at the end of input.
END_OF_INPUT = "$"

# ANY_TOKEN is used by mark_error as a "wildcard" token that should be replaced
# by every other token.  The fresh object() sentinel guarantees its symbol
# cannot collide with any real token symbol.
ANY_TOKEN = parser_types.Token(object(), "*",
                               parser_types.parse_location("0:0-0:0"))
128*99e0aae7SDavid Rees
129*99e0aae7SDavid Rees
class Reduction(collections.namedtuple(
    "Reduction", ["symbol", "children", "production", "source_location"])):
  """A Reduction is a non-leaf node in a parse tree.

  Attributes:
    symbol: The name of this element in the parse.
    children: The child elements of this parse.
    production: The grammar production to which this reduction corresponds.
    source_location: If known, the range in the source text corresponding to
      the tokens from which this reduction was parsed.  May be 'None' if this
      reduction was produced from no symbols, or if the tokens fed to `parse`
      did not include source_location.
  """
145*99e0aae7SDavid Rees
146*99e0aae7SDavid Rees
class Grammar(object):
  """Grammar is an LR(1) context-free grammar.

  Attributes:
    start: The start symbol for the grammar.
    productions: A list of productions in the grammar, including the S' -> start
      production.
    symbols: A set of all symbols in the grammar, including $ and S'.
    nonterminals: A set of all nonterminal symbols in the grammar, including S'.
    terminals: A set of all terminal symbols in the grammar, including $.
  """

  def __init__(self, start_symbol, productions):
    """Constructs a Grammar object.

    Arguments:
      start_symbol: The start symbol for the grammar.
      productions: A list of productions (not including the "S' -> start_symbol"
          production).
    """
    object.__init__(self)
    self.start = start_symbol
    self._seed_production = parser_types.Production(START_PRIME, (self.start,))
    self.productions = productions + [self._seed_production]

    self._single_level_closure_of_item_cache = {}
    self._closure_of_item_cache = {}
    # Order matters here: _compute_symbols populates self.terminals and
    # self.nonterminals, which _compute_seed_firsts and _populate_item_cache
    # both iterate over.
    self._compute_symbols()
    self._compute_seed_firsts()
    self._set_productions_by_lhs()
    self._populate_item_cache()

  def _set_productions_by_lhs(self):
    """Builds a map from each lhs symbol to the list of its productions."""
    # Prepopulating _productions_by_lhs speeds up _closure_of_item by about 30%,
    # which is significant on medium-to-large grammars.
    self._productions_by_lhs = {}
    for production in self.productions:
      self._productions_by_lhs.setdefault(production.lhs, list()).append(
          production)

  def _populate_item_cache(self):
    """Precomputes every possible Item, keyed by (production, dot, terminal)."""
    # There are a relatively small number of possible Items for a grammar, and
    # the algorithm needs to get Items from their constituent components very
    # frequently.  As it turns out, pre-caching all possible Items results in a
    # ~35% overall speedup to Grammar.parser().
    self._item_cache = {}
    for symbol in self.terminals:
      for production in self.productions:
        for dot in range(len(production.rhs) + 1):
          self._item_cache[production, dot, symbol] = make_item(
              production, dot, symbol)

  def _compute_symbols(self):
    """Finds all grammar symbols, and sorts them into terminal and non-terminal.

    Nonterminal symbols are those which appear on the left side of any
    production.  Terminal symbols are those which do not.

    _compute_symbols is used during __init__.
    """
    # END_OF_INPUT never appears as an lhs, so it always lands in terminals;
    # START_PRIME is the lhs of the seed production, so it is a nonterminal.
    self.symbols = {END_OF_INPUT}
    self.nonterminals = set()
    for production in self.productions:
      self.symbols.add(production.lhs)
      self.nonterminals.add(production.lhs)
      for symbol in production.rhs:
        self.symbols.add(symbol)
    self.terminals = self.symbols - self.nonterminals

  def _compute_seed_firsts(self):
    """Computes FIRST (ALSU p221) for all terminal and nonterminal symbols.

    The algorithm for computing FIRST is an iterative one that terminates when
    it reaches a fixed point (that is, when further iterations stop changing
    state).  _compute_seed_firsts computes the fixed point for all single-symbol
    strings, by repeatedly calling _first and updating the internal _firsts
    table with the results.

    Once _compute_seed_firsts has completed, _first will return correct results
    for both single- and multi-symbol strings.

    _compute_seed_firsts is used during __init__.
    """
    self.firsts = {}
    # FIRST for a terminal symbol is always just that terminal symbol.
    for terminal in self.terminals:
      self.firsts[terminal] = set([terminal])
    for nonterminal in self.nonterminals:
      self.firsts[nonterminal] = set()
    while True:
      # The first iteration picks up all the productions that start with
      # terminal symbols.  The second iteration picks up productions that start
      # with nonterminals that the first iteration picked up.  The third
      # iteration picks up nonterminals that the first and second picked up, and
      # so on.
      #
      # This is guaranteed to end, in the worst case, when every terminal
      # symbol and epsilon has been added to the _firsts set for every
      # nonterminal symbol.  This would be slow, but requires a pathological
      # grammar; useful grammars should complete in only a few iterations.
      firsts_to_add = {}
      for production in self.productions:
        for first in self._first(production.rhs):
          if first not in self.firsts[production.lhs]:
            if production.lhs not in firsts_to_add:
              firsts_to_add[production.lhs] = set()
            firsts_to_add[production.lhs].add(first)
      if not firsts_to_add:
        break
      for symbol in firsts_to_add:
        self.firsts[symbol].update(firsts_to_add[symbol])

  def _first(self, symbols):
    """The FIRST function from ALSU p221.

    _first takes a string of symbols (both terminals and nonterminals) and
    returns the set of terminal symbols which could be the first terminal symbol
    of a string produced by the given list of symbols.

    _first will not give fully-correct results until _compute_seed_firsts
    finishes, but is called by _compute_seed_firsts, and must provide partial
    results during that method's execution.

    Args:
      symbols: A list of symbols.

    Returns:
      A set of terminals which could be the first terminal in "symbols."
    """
    result = set()
    all_contain_epsilon = True
    for symbol in symbols:
      for first in self.firsts[symbol]:
        # Skip epsilon (None) here; it is only added to the result below, when
        # *every* symbol in the string can produce epsilon.
        if first:
          result.add(first)
      if None not in self.firsts[symbol]:
        all_contain_epsilon = False
        break
    if all_contain_epsilon:
      # "None" seems like a Pythonic way of representing epsilon (no symbol).
      result.add(None)
    return result

  def _closure_of_item(self, root_item):
    """Modified implementation of CLOSURE from ALSU p261.

    _closure_of_item performs the CLOSURE function with a single seed item, with
    memoization.  In the algorithm as presented in ALSU, CLOSURE is called with
    a different set of items every time, which is unhelpful for memoization.
    Instead, we let _parallel_goto merge the sets returned by _closure_of_item,
    which results in a ~40% speedup.

    CLOSURE, roughly, computes the set of LR(1) Items which might be active when
    a "seed" set of Items is active.

    Technically, it is the epsilon-closure of the NFA states represented by
    "items," where an epsilon transition (a transition that does not consume any
    symbols) occurs from a->Z.bY,q to b->.X,p when p is in FIRST(Yq).  (a and b
    are nonterminals, X, Y, and Z are arbitrary strings of symbols, and p and q
    are terminals.)  That is, it is the set of all NFA states which can be
    reached from "items" without consuming any input.  This set corresponds to a
    single DFA state.

    Args:
      root_item: The initial LR(1) Item.

    Returns:
      A set of LR(1) items which may be active at the time when the provided
      item is active.
    """
    if root_item in self._closure_of_item_cache:
      return self._closure_of_item_cache[root_item]
    item_set = set([root_item])
    item_list = [root_item]
    i = 0
    # Each newly-added Item may trigger the addition of further Items, so
    # iterate until no new Items are added.  In the worst case, a new Item will
    # be added for each production.
    #
    # This algorithm is really looking for "next" nonterminals in the existing
    # items, and adding new items corresponding to their productions.
    while i < len(item_list):
      item = item_list[i]
      i += 1
      # A complete item (dot at the end) cannot pull in any new items.
      if not item.next_symbol:
        continue
      # If _closure_of_item_cache contains the full closure of item, then we can
      # add its full closure to the result set, and skip checking any of its
      # items: any item that would be added by any item in the cached result
      # will already be in the _closure_of_item_cache entry.
      if item in self._closure_of_item_cache:
        item_set |= self._closure_of_item_cache[item]
        continue
      # Even if we don't have the full closure of item, we may have the
      # immediate closure of item.  It turns out that memoizing just this step
      # speeds up this function by about 50%, even after the
      # _closure_of_item_cache check.
      if item not in self._single_level_closure_of_item_cache:
        new_items = set()
        for production in self._productions_by_lhs.get(item.next_symbol, []):
          for terminal in self._first(item.production.rhs[item.dot + 1:] +
                                      (item.terminal,)):
            new_items.add(self._item_cache[production, 0, terminal])
        self._single_level_closure_of_item_cache[item] = new_items
      for new_item in self._single_level_closure_of_item_cache[item]:
        if new_item not in item_set:
          item_set.add(new_item)
          item_list.append(new_item)
    self._closure_of_item_cache[root_item] = item_set
    # Typically, _closure_of_item() will be called on items whose closures
    # bring in the greatest number of additional items, then on items which
    # close over fewer and fewer other items.  Since items are not added to
    # _closure_of_item_cache unless _closure_of_item() is called directly on
    # them, this means that it is unlikely that items brought in will (without
    # intervention) have entries in _closure_of_item_cache, which slows down the
    # computation of the larger closures.
    #
    # Although it is not guaranteed, items added to item_list last will tend to
    # close over fewer items, and therefore be easier to compute.  By forcibly
    # re-calculating closures from last to first, and adding the results to
    # _closure_of_item_cache at each step, we get a modest performance
    # improvement: roughly 50% less time spent in _closure_of_item, which
    # translates to about 5% less time in parser().
    for item in item_list[::-1]:
      self._closure_of_item(item)
    return item_set

  def _parallel_goto(self, items):
    """The GOTO function from ALSU p261, executed on all symbols.

    _parallel_goto takes a set of Items, and returns a dict from every symbol in
    self.symbols to the set of Items that would be active after a shift
    operation (if symbol is a terminal) or after a reduction operation (if
    symbol is a nonterminal).

    _parallel_goto is used in lieu of the single-symbol GOTO from ALSU because
    it eliminates the outer loop over self.terminals, and thereby reduces the
    number of next_symbol calls by a factor of len(self.terminals).

    Args:
      items: The set of items representing the initial DFA state.

    Returns:
      A dict from symbols to sets of items representing the new DFA states.
    """
    results = collections.defaultdict(set)
    for item in items:
      next_symbol = item.next_symbol
      if next_symbol is None:
        continue
      # Advance the dot over next_symbol; the item cache avoids re-allocating.
      item = self._item_cache[item.production, item.dot + 1, item.terminal]
      # Inlining the cache check results in a ~25% speedup in this function, and
      # about 10% overall speedup to parser().
      if item in self._closure_of_item_cache:
        closure = self._closure_of_item_cache[item]
      else:
        closure = self._closure_of_item(item)
      # _closure will add newly-started Items (Items with dot=0) to the result
      # set.  After this operation, the result set will correspond to the new
      # state.
      results[next_symbol].update(closure)
    return results

  def _items(self):
    """The items function from ALSU p261.

    _items computes the set of sets of LR(1) items for a shift-reduce parser
    that matches the grammar.  Each set of LR(1) items corresponds to a single
    DFA state.

    Returns:
      A tuple.

      The first element of the tuple is a list of sets of LR(1) items (each set
      corresponding to a DFA state).

      The second element of the tuple is a dictionary from (int, symbol) pairs
      to ints, where all the ints are indexes into the list of sets of LR(1)
      items.  This dictionary is based on the results of the _Goto function,
      where item_sets[dict[i, sym]] == self._Goto(item_sets[i], sym).
    """
    # The list of states is seeded with the marker S' production.
    item_list = [
        frozenset(self._closure_of_item(
            self._item_cache[self._seed_production, 0, END_OF_INPUT]))
    ]
    items = {item_list[0]: 0}
    goto_table = {}
    i = 0
    # For each state, figure out what the new state is when each symbol is
    # added to the top of the parsing stack (see the comments in
    # parser._parse).  See _Goto for an explanation of how that is actually
    # computed.
    while i < len(item_list):
      item_set = item_list[i]
      gotos = self._parallel_goto(item_set)
      for symbol, goto in gotos.items():
        # frozenset so the item set can be used as a dict key for dedup.
        goto = frozenset(goto)
        if goto not in items:
          items[goto] = len(item_list)
          item_list.append(goto)
        goto_table[i, symbol] = items[goto]
      i += 1
    return item_list, goto_table

  def parser(self):
    """parser returns an LR(1) parser for the Grammar.

    This implements the Canonical LR(1) ("LR(1)") parser algorithm ("Algorithm
    4.56", ALSU p265), rather than the more common Lookahead LR(1) ("LALR(1)")
    algorithm.  LALR(1) produces smaller tables, but is more complex and does
    not cover all LR(1) grammars.  When the LR(1) and LALR(1) algorithms were
    invented, table sizes were an important consideration; now, the difference
    between a few hundred and a few thousand entries is unlikely to matter.

    At this time, Grammar does not handle ambiguous grammars, which are commonly
    used to handle precedence, associativity, and the "dangling else" problem.
    Formally, these can always be handled by an unambiguous grammar, though
    doing so can be cumbersome, particularly for expression languages with many
    levels of precedence.  ALSU section 4.8 (pp278-287) contains some techniques
    for handling these kinds of ambiguity.

    Returns:
      A Parser.
    """
    item_sets, goto = self._items()
    action = {}
    conflicts = set()
    end_item = self._item_cache[self._seed_production, 1, END_OF_INPUT]
    for i in range(len(item_sets)):
      for item in item_sets[i]:
        new_action = None
        # A complete item (other than the seed) yields a Reduce on its
        # lookahead terminal.
        if (item.next_symbol is None and
            item.production != self._seed_production):
          terminal = item.terminal
          new_action = Reduce(item.production)
        # An item whose next symbol is a terminal yields a Shift to the GOTO
        # state for that terminal.
        elif item.next_symbol in self.terminals:
          terminal = item.next_symbol
          assert goto[i, terminal] is not None
          new_action = Shift(goto[i, terminal], item_sets[goto[i, terminal]])
        if new_action:
          # Two distinct actions for the same (state, terminal) is a
          # shift/reduce or reduce/reduce conflict: record it, keeping the
          # newest action in the table.
          if (i, terminal) in action and action[i, terminal] != new_action:
            conflicts.add(
                Conflict(i, terminal,
                         frozenset([action[i, terminal], new_action])))
          action[i, terminal] = new_action
        # The completed seed production yields Accept on end-of-input.
        if item == end_item:
          new_action = Accept()
          assert (i, END_OF_INPUT
                 ) not in action or action[i, END_OF_INPUT] == new_action
          action[i, END_OF_INPUT] = new_action
    # The GOTO table handed to the Parser only needs nonterminal transitions;
    # terminal transitions are already encoded in the Shift actions.
    trimmed_goto = {}
    for k in goto:
      if k[1] in self.nonterminals:
        trimmed_goto[k] = goto[k]
    # expected[state] is the set of terminals with any action in that state,
    # for use in error messages.
    expected = {}
    for state, terminal in action:
      if state not in expected:
        expected[state] = set()
      expected[state].add(terminal)
    return Parser(item_sets, trimmed_goto, action, expected, conflicts,
                  self.terminals, self.nonterminals, self.productions)
508*99e0aae7SDavid Rees
509*99e0aae7SDavid Rees
510*99e0aae7SDavid ReesParseError = collections.namedtuple("ParseError", ["code", "index", "token",
511*99e0aae7SDavid Rees                                                   "state", "expected_tokens"])
512*99e0aae7SDavid ReesParseResult = collections.namedtuple("ParseResult", ["parse_tree", "error"])
513*99e0aae7SDavid Rees
514*99e0aae7SDavid Rees
515*99e0aae7SDavid Reesclass Parser(object):
516*99e0aae7SDavid Rees  """Parser is a shift-reduce LR(1) parser.
517*99e0aae7SDavid Rees
518*99e0aae7SDavid Rees  Generally, clients will want to get a Parser from a Grammar, rather than
519*99e0aae7SDavid Rees  directly instantiating one.
520*99e0aae7SDavid Rees
521*99e0aae7SDavid Rees  Parser exposes the raw tables needed to feed into a Shift-Reduce parser,
522*99e0aae7SDavid Rees  but can also be used directly for parsing.
523*99e0aae7SDavid Rees
524*99e0aae7SDavid Rees  Attributes:
525*99e0aae7SDavid Rees    item_sets: A list of item sets which correspond to the state numbers in
526*99e0aae7SDavid Rees      the action and goto tables.  This is not necessary for parsing, but is
527*99e0aae7SDavid Rees      useful for debugging parsers.
528*99e0aae7SDavid Rees    goto: The GOTO table for this parser.
529*99e0aae7SDavid Rees    action: The ACTION table for this parser.
530*99e0aae7SDavid Rees    expected: A table of terminal symbols that are expected (that is, that
531*99e0aae7SDavid Rees      have a non-Error action) for each state.  This can be used to provide
532*99e0aae7SDavid Rees      more helpful error messages for parse errors.
533*99e0aae7SDavid Rees    conflicts: A set of unresolved conflicts found during table generation.
534*99e0aae7SDavid Rees    terminals: A set of terminal symbols in the grammar.
535*99e0aae7SDavid Rees    nonterminals: A set of nonterminal symbols in the grammar.
536*99e0aae7SDavid Rees    productions: A list of productions in the grammar.
537*99e0aae7SDavid Rees    default_errors: A dict of states to default error codes to use when
538*99e0aae7SDavid Rees      encountering an error in that state, when a more-specific Error for the
539*99e0aae7SDavid Rees      state/terminal pair has not been set.
540*99e0aae7SDavid Rees  """
541*99e0aae7SDavid Rees
542*99e0aae7SDavid Rees  def __init__(self, item_sets, goto, action, expected, conflicts, terminals,
543*99e0aae7SDavid Rees               nonterminals, productions):
544*99e0aae7SDavid Rees    super(Parser, self).__init__()
545*99e0aae7SDavid Rees    self.item_sets = item_sets
546*99e0aae7SDavid Rees    self.goto = goto
547*99e0aae7SDavid Rees    self.action = action
548*99e0aae7SDavid Rees    self.expected = expected
549*99e0aae7SDavid Rees    self.conflicts = conflicts
550*99e0aae7SDavid Rees    self.terminals = terminals
551*99e0aae7SDavid Rees    self.nonterminals = nonterminals
552*99e0aae7SDavid Rees    self.productions = productions
553*99e0aae7SDavid Rees    self.default_errors = {}
554*99e0aae7SDavid Rees
555*99e0aae7SDavid Rees  def _parse(self, tokens):
556*99e0aae7SDavid Rees    """_parse implements Shift-Reduce parsing algorithm.
557*99e0aae7SDavid Rees
558*99e0aae7SDavid Rees    _parse implements the standard shift-reduce algorithm outlined on ASLU
559*99e0aae7SDavid Rees    pp236-237.
560*99e0aae7SDavid Rees
561*99e0aae7SDavid Rees    Arguments:
562*99e0aae7SDavid Rees      tokens: the list of token objects to parse.
563*99e0aae7SDavid Rees
564*99e0aae7SDavid Rees    Returns:
565*99e0aae7SDavid Rees      A ParseResult.
566*99e0aae7SDavid Rees    """
567*99e0aae7SDavid Rees    # The END_OF_INPUT token is explicitly added to avoid explicit "cursor <
568*99e0aae7SDavid Rees    # len(tokens)" checks.
569*99e0aae7SDavid Rees    tokens = list(tokens) + [Symbol(END_OF_INPUT)]
570*99e0aae7SDavid Rees
571*99e0aae7SDavid Rees    # Each element of stack is a parse state and a (possibly partial) parse
572*99e0aae7SDavid Rees    # tree.  The state at the top of the stack encodes which productions are
573*99e0aae7SDavid Rees    # "active" (that is, which ones the parser has seen partial input which
574*99e0aae7SDavid Rees    # matches some prefix of the production, in a place where that production
575*99e0aae7SDavid Rees    # might be valid), and, for each active production, how much of the
576*99e0aae7SDavid Rees    # production has been completed.
577*99e0aae7SDavid Rees    stack = [(0, None)]
578*99e0aae7SDavid Rees
579*99e0aae7SDavid Rees    def state():
580*99e0aae7SDavid Rees      return stack[-1][0]
581*99e0aae7SDavid Rees
582*99e0aae7SDavid Rees    cursor = 0
583*99e0aae7SDavid Rees
584*99e0aae7SDavid Rees    # On each iteration, look at the next symbol and the current state, and
585*99e0aae7SDavid Rees    # perform the corresponding action.
586*99e0aae7SDavid Rees    while True:
587*99e0aae7SDavid Rees      if (state(), tokens[cursor].symbol) not in self.action:
588*99e0aae7SDavid Rees        # Most state/symbol entries would be Errors, so rather than exhaustively
589*99e0aae7SDavid Rees        # adding error entries, we just check here.
590*99e0aae7SDavid Rees        if state() in self.default_errors:
591*99e0aae7SDavid Rees          next_action = Error(self.default_errors[state()])
592*99e0aae7SDavid Rees        else:
593*99e0aae7SDavid Rees          next_action = Error(None)
594*99e0aae7SDavid Rees      else:
595*99e0aae7SDavid Rees        next_action = self.action[state(), tokens[cursor].symbol]
596*99e0aae7SDavid Rees
597*99e0aae7SDavid Rees      if isinstance(next_action, Shift):
598*99e0aae7SDavid Rees        # Shift means that there are no "complete" productions on the stack,
599*99e0aae7SDavid Rees        # and so the current token should be shifted onto the stack, with a new
600*99e0aae7SDavid Rees        # state indicating the new set of "active" productions.
601*99e0aae7SDavid Rees        stack.append((next_action.state, tokens[cursor]))
602*99e0aae7SDavid Rees        cursor += 1
603*99e0aae7SDavid Rees      elif isinstance(next_action, Accept):
604*99e0aae7SDavid Rees        # Accept means that parsing is over, successfully.
605*99e0aae7SDavid Rees        assert len(stack) == 2, "Accepted incompletely-reduced input."
606*99e0aae7SDavid Rees        assert tokens[cursor].symbol == END_OF_INPUT, ("Accepted parse before "
607*99e0aae7SDavid Rees                                                       "end of input.")
608*99e0aae7SDavid Rees        return ParseResult(stack[-1][1], None)
609*99e0aae7SDavid Rees      elif isinstance(next_action, Reduce):
610*99e0aae7SDavid Rees        # Reduce means that there is a complete production on the stack, and
611*99e0aae7SDavid Rees        # that the next symbol implies that the completed production is the
612*99e0aae7SDavid Rees        # correct production.
613*99e0aae7SDavid Rees        #
614*99e0aae7SDavid Rees        # Per ALSU, we would simply pop an element off the state stack for each
615*99e0aae7SDavid Rees        # symbol on the rhs of the production, and then push a new state by
616*99e0aae7SDavid Rees        # looking up the (post-pop) current state and the lhs of the production
617*99e0aae7SDavid Rees        # in GOTO.  The GOTO table, in some sense, is equivalent to shift
618*99e0aae7SDavid Rees        # actions for nonterminal symbols.
619*99e0aae7SDavid Rees        #
620*99e0aae7SDavid Rees        # Here, we attach a new partial parse tree, with the production lhs as
621*99e0aae7SDavid Rees        # the "name" of the tree, and the popped trees as the "children" of the
622*99e0aae7SDavid Rees        # new tree.
623*99e0aae7SDavid Rees        children = [
624*99e0aae7SDavid Rees            item[1] for item in stack[len(stack) - len(next_action.rule.rhs):]
625*99e0aae7SDavid Rees        ]
626*99e0aae7SDavid Rees        # Attach source_location, if known.  The source location will not be
627*99e0aae7SDavid Rees        # known if the reduction consumes no symbols (empty rhs) or if the
628*99e0aae7SDavid Rees        # client did not specify source_locations for tokens.
629*99e0aae7SDavid Rees        #
630*99e0aae7SDavid Rees        # It is necessary to loop in order to handle cases like:
631*99e0aae7SDavid Rees        #
632*99e0aae7SDavid Rees        # C -> c D
633*99e0aae7SDavid Rees        # D ->
634*99e0aae7SDavid Rees        #
635*99e0aae7SDavid Rees        # The D child of the C reduction will not have a source location
636*99e0aae7SDavid Rees        # (because it is not produced from any source), so it is necessary to
637*99e0aae7SDavid Rees        # scan backwards through C's children to find the end position.  The
638*99e0aae7SDavid Rees        # opposite is required in the case where initial children have no
639*99e0aae7SDavid Rees        # source.
640*99e0aae7SDavid Rees        #
641*99e0aae7SDavid Rees        # These loops implicitly handle the case where the reduction has no
642*99e0aae7SDavid Rees        # children, setting the source_location to None in that case.
643*99e0aae7SDavid Rees        start_position = None
644*99e0aae7SDavid Rees        end_position = None
645*99e0aae7SDavid Rees        for child in children:
646*99e0aae7SDavid Rees          if hasattr(child,
647*99e0aae7SDavid Rees                     "source_location") and child.source_location is not None:
648*99e0aae7SDavid Rees            start_position = child.source_location.start
649*99e0aae7SDavid Rees            break
650*99e0aae7SDavid Rees        for child in reversed(children):
651*99e0aae7SDavid Rees          if hasattr(child,
652*99e0aae7SDavid Rees                     "source_location") and child.source_location is not None:
653*99e0aae7SDavid Rees            end_position = child.source_location.end
654*99e0aae7SDavid Rees            break
655*99e0aae7SDavid Rees        if start_position is None:
656*99e0aae7SDavid Rees          source_location = None
657*99e0aae7SDavid Rees        else:
658*99e0aae7SDavid Rees          source_location = parser_types.make_location(start_position,
659*99e0aae7SDavid Rees                                                       end_position)
660*99e0aae7SDavid Rees        reduction = Reduction(next_action.rule.lhs, children, next_action.rule,
661*99e0aae7SDavid Rees                              source_location)
662*99e0aae7SDavid Rees        del stack[len(stack) - len(next_action.rule.rhs):]
663*99e0aae7SDavid Rees        stack.append((self.goto[state(), next_action.rule.lhs], reduction))
664*99e0aae7SDavid Rees      elif isinstance(next_action, Error):
665*99e0aae7SDavid Rees        # Error means that the parse is impossible.  For typical grammars and
666*99e0aae7SDavid Rees        # texts, this usually happens within a few tokens after the mistake in
667*99e0aae7SDavid Rees        # the input stream, which is convenient (though imperfect) for error
668*99e0aae7SDavid Rees        # reporting.
669*99e0aae7SDavid Rees        return ParseResult(None,
670*99e0aae7SDavid Rees                           ParseError(next_action.code, cursor, tokens[cursor],
671*99e0aae7SDavid Rees                                      state(), self.expected[state()]))
672*99e0aae7SDavid Rees      else:
673*99e0aae7SDavid Rees        assert False, "Shouldn't be here."
674*99e0aae7SDavid Rees
675*99e0aae7SDavid Rees  def mark_error(self, tokens, error_token, error_code):
676*99e0aae7SDavid Rees    """Marks an error state with the given error code.
677*99e0aae7SDavid Rees
678*99e0aae7SDavid Rees    mark_error implements the equivalent of the "Merr" system presented in
679*99e0aae7SDavid Rees    "Generating LR Syntax error Messages from Examples" (Jeffery, 2003).
680*99e0aae7SDavid Rees    This system has limitations, but has the primary advantage that error
681*99e0aae7SDavid Rees    messages can be specified by giving an example of the error and the
682*99e0aae7SDavid Rees    message itself.
683*99e0aae7SDavid Rees
684*99e0aae7SDavid Rees    Arguments:
685*99e0aae7SDavid Rees      tokens: a list of tokens to parse.
686*99e0aae7SDavid Rees      error_token: the token where the parse should fail, or None if the parse
687*99e0aae7SDavid Rees        should fail at the implicit end-of-input token.
688*99e0aae7SDavid Rees
689*99e0aae7SDavid Rees        If the error_token is the special ANY_TOKEN, then the error will be
690*99e0aae7SDavid Rees        recorded as the default error for the error state.
691*99e0aae7SDavid Rees      error_code: a value to record for the error state reached by parsing
692*99e0aae7SDavid Rees        tokens.
693*99e0aae7SDavid Rees
694*99e0aae7SDavid Rees    Returns:
695*99e0aae7SDavid Rees      None if error_code was successfully recorded, or an error message if there
696*99e0aae7SDavid Rees      was a problem.
697*99e0aae7SDavid Rees    """
698*99e0aae7SDavid Rees    result = self._parse(tokens)
699*99e0aae7SDavid Rees
700*99e0aae7SDavid Rees    # There is no error state to mark on a successful parse.
701*99e0aae7SDavid Rees    if not result.error:
702*99e0aae7SDavid Rees      return "Input successfully parsed."
703*99e0aae7SDavid Rees
704*99e0aae7SDavid Rees    # Check if the error occurred at the specified token; if not, then this was
705*99e0aae7SDavid Rees    # not the expected error.
706*99e0aae7SDavid Rees    if error_token is None:
707*99e0aae7SDavid Rees      error_symbol = END_OF_INPUT
708*99e0aae7SDavid Rees      if result.error.token.symbol != END_OF_INPUT:
709*99e0aae7SDavid Rees        return "error occurred on {} token, not end of input.".format(
710*99e0aae7SDavid Rees            result.error.token.symbol)
711*99e0aae7SDavid Rees    else:
712*99e0aae7SDavid Rees      error_symbol = error_token.symbol
713*99e0aae7SDavid Rees      if result.error.token != error_token:
714*99e0aae7SDavid Rees        return "error occurred on {} token, not {} token.".format(
715*99e0aae7SDavid Rees            result.error.token.symbol, error_token.symbol)
716*99e0aae7SDavid Rees
717*99e0aae7SDavid Rees    # If the expected error was found, attempt to mark it.  It is acceptable if
718*99e0aae7SDavid Rees    # the given error_code is already set as the error code for the given parse,
719*99e0aae7SDavid Rees    # but not if a different code is set.
720*99e0aae7SDavid Rees    if result.error.token == ANY_TOKEN:
721*99e0aae7SDavid Rees      # For ANY_TOKEN, mark it as a default error.
722*99e0aae7SDavid Rees      if result.error.state in self.default_errors:
723*99e0aae7SDavid Rees        if self.default_errors[result.error.state] == error_code:
724*99e0aae7SDavid Rees          return None
725*99e0aae7SDavid Rees        else:
726*99e0aae7SDavid Rees          return ("Attempted to overwrite existing default error code {!r} "
727*99e0aae7SDavid Rees                  "with new error code {!r} for state {}".format(
728*99e0aae7SDavid Rees                      self.default_errors[result.error.state], error_code,
729*99e0aae7SDavid Rees                      result.error.state))
730*99e0aae7SDavid Rees      else:
731*99e0aae7SDavid Rees        self.default_errors[result.error.state] = error_code
732*99e0aae7SDavid Rees        return None
733*99e0aae7SDavid Rees    else:
734*99e0aae7SDavid Rees      if (result.error.state, error_symbol) in self.action:
735*99e0aae7SDavid Rees        existing_error = self.action[result.error.state, error_symbol]
736*99e0aae7SDavid Rees        assert isinstance(existing_error, Error), "Bug"
737*99e0aae7SDavid Rees        if existing_error.code == error_code:
738*99e0aae7SDavid Rees          return None
739*99e0aae7SDavid Rees        else:
740*99e0aae7SDavid Rees          return ("Attempted to overwrite existing error code {!r} with new "
741*99e0aae7SDavid Rees                  "error code {!r} for state {}, terminal {}".format(
742*99e0aae7SDavid Rees                      existing_error.code, error_code, result.error.state,
743*99e0aae7SDavid Rees                      error_symbol))
744*99e0aae7SDavid Rees      else:
745*99e0aae7SDavid Rees        self.action[result.error.state, error_symbol] = Error(error_code)
746*99e0aae7SDavid Rees        return None
747*99e0aae7SDavid Rees    assert False, "All other paths should lead to return."
748*99e0aae7SDavid Rees
749*99e0aae7SDavid Rees  def parse(self, tokens):
750*99e0aae7SDavid Rees    """Parses a list of tokens.
751*99e0aae7SDavid Rees
752*99e0aae7SDavid Rees    Arguments:
753*99e0aae7SDavid Rees      tokens: a list of tokens to parse.
754*99e0aae7SDavid Rees
755*99e0aae7SDavid Rees    Returns:
756*99e0aae7SDavid Rees      A ParseResult.
757*99e0aae7SDavid Rees    """
758*99e0aae7SDavid Rees    result = self._parse(tokens)
759*99e0aae7SDavid Rees    return result
760