xref: /aosp_15_r20/external/emboss/compiler/front_end/lr1.py (revision 99e0aae7469b87d12f0ad23e61142c2d74c1ef70)
1*99e0aae7SDavid Rees# Copyright 2019 Google LLC
2*99e0aae7SDavid Rees#
3*99e0aae7SDavid Rees# Licensed under the Apache License, Version 2.0 (the "License");
4*99e0aae7SDavid Rees# you may not use this file except in compliance with the License.
5*99e0aae7SDavid Rees# You may obtain a copy of the License at
6*99e0aae7SDavid Rees#
7*99e0aae7SDavid Rees#     https://www.apache.org/licenses/LICENSE-2.0
8*99e0aae7SDavid Rees#
9*99e0aae7SDavid Rees# Unless required by applicable law or agreed to in writing, software
10*99e0aae7SDavid Rees# distributed under the License is distributed on an "AS IS" BASIS,
11*99e0aae7SDavid Rees# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*99e0aae7SDavid Rees# See the License for the specific language governing permissions and
13*99e0aae7SDavid Rees# limitations under the License.
14*99e0aae7SDavid Rees
15*99e0aae7SDavid Rees"""LR(1) parser generator.
16*99e0aae7SDavid Rees
17*99e0aae7SDavid ReesThe primary class in this module, Grammar, takes a list of context-free grammar
18*99e0aae7SDavid Reesproductions, and produces the corresponding LR(1) shift-reduce parser.  This is
19*99e0aae7SDavid Reesan implementation of the algorithm on pages 221 and 261-265 of "Compilers:
20*99e0aae7SDavid ReesPrinciples, Techniques, & Tools" (Second Edition) by Aho, Lam, Sethi, and
21*99e0aae7SDavid ReesUllman, also known as "The Dragon Book," hereafter referred to as "ALSU."
22*99e0aae7SDavid Rees
23*99e0aae7SDavid ReesThis module only implements the LR(1) algorithms; unlike tools such as yacc, it
24*99e0aae7SDavid Reesdoes not implement the various bits of glue necessary to actually use a parser.
25*99e0aae7SDavid ReesClients are expected to provide their own tokenizers and handle turning a raw
26*99e0aae7SDavid Reesparse tree into an intermediate representation on their own.
27*99e0aae7SDavid Rees"""
28*99e0aae7SDavid Rees
29*99e0aae7SDavid Reesimport collections
30*99e0aae7SDavid Rees
31*99e0aae7SDavid Reesfrom compiler.util import parser_types
32*99e0aae7SDavid Rees
33*99e0aae7SDavid Rees
class Item(collections.namedtuple("Item", ["production", "dot", "terminal",
                                           "next_symbol"])):
  """An LR(1) Item: a production, a cursor location, and a lookahead terminal.

  An Item represents a partially-parsed production together with a lookahead
  symbol; the position of the dot marks how much of the production has been
  recognized so far.  Items are mostly an internal implementation detail, but
  they can be useful elsewhere, particularly for debugging.

  Attributes:
    production: The Production this Item covers.
    dot: The index of the "dot" in production's rhs.
    terminal: The terminal lookahead symbol that follows the production in the
        input stream.
    next_symbol: The rhs symbol immediately after the dot, or None when the
        dot sits at the end of the rhs.
  """

  def __str__(self):
    """Formats this Item in ALSU (Dragon Book) notation."""
    dotted_rhs = (self.production.rhs[:self.dot] + (".",) +
                  self.production.rhs[self.dot:])
    rhs_text = " ".join(str(symbol) for symbol in dotted_rhs)
    return (str(self.production.lhs) + " -> " + rhs_text + ", " +
            str(self.terminal))

  @staticmethod
  def parse(text):
    """Parses an Item in ALSU notation.

    Parses an Item from notation like:

       symbol -> foo . bar baz, qux

    where "symbol -> foo bar baz" is taken as the production, the position of
    the "." becomes "dot" (here 1), and the symbol after "," becomes the
    "terminal".  The following are also valid items:

       sym -> ., foo
       sym -> . foo bar, baz
       sym -> foo bar ., baz

    Symbols on the right-hand side of the production should be separated by
    whitespace.

    Arguments:
      text: The text to parse into an Item.

    Returns:
      An Item.
    """
    production_text, terminal_text = text.split(",")
    terminal = terminal_text.strip()
    if terminal == "$":
      terminal = END_OF_INPUT
    lhs_text, rhs_text = production_text.split("->")
    lhs = lhs_text.strip()
    if lhs == "S'":
      lhs = START_PRIME
    before_dot, after_dot = rhs_text.split(".")
    handle = before_dot.split()
    tail = after_dot.split()
    # The dot index equals the number of symbols already consumed.
    return make_item(parser_types.Production(lhs, tuple(handle) + tuple(tail)),
                     len(handle), terminal)
94*99e0aae7SDavid Rees
95*99e0aae7SDavid Rees
def make_item(production, dot, symbol):
  """Builds an Item, precomputing the next_symbol field.

  Arguments:
    production: The Production for the new Item.
    dot: The index of the dot within production's rhs.
    symbol: The lookahead terminal for the new Item.

  Returns:
    An Item whose next_symbol is the rhs element at the dot, or None when the
    dot is at (or past) the end of the rhs.
  """
  if dot < len(production.rhs):
    next_symbol = production.rhs[dot]
  else:
    next_symbol = None
  return Item(production, dot, symbol, next_symbol)
99*99e0aae7SDavid Rees
100*99e0aae7SDavid Rees
class Conflict(
    collections.namedtuple("Conflict", ["state", "symbol", "actions"])
):
  """Conflict represents a parse conflict.

  Attributes:
    state: The parser state in which the conflict occurs.
    symbol: The lookahead symbol for which multiple actions apply.
    actions: The set of conflicting actions.
  """

  def __str__(self):
    action_text = " vs ".join(str(action) for action in self.actions)
    return "Conflict for {} in state {}: {}".format(
        self.symbol, self.state, action_text)
109*99e0aae7SDavid Rees
110*99e0aae7SDavid Rees
# The possible entries in the parser's ACTION table (see Grammar.parser()):
#
# Shift: consume the lookahead token and move to state `state`; `items` is the
# LR(1) item set for that state, carried along for debugging.
Shift = collections.namedtuple("Shift", ["state", "items"])
# Reduce: replace the symbols of `rule`'s rhs on the stack with its lhs.
Reduce = collections.namedtuple("Reduce", ["rule"])
# Accept: the input has been fully and successfully parsed.
Accept = collections.namedtuple("Accept", [])
# Error: the parse failed; `code` identifies the specific error.
Error = collections.namedtuple("Error", ["code"])

# Symbol wraps a single grammar symbol.  NOTE(review): its use is not visible
# in this portion of the file — presumably a marker used during parsing;
# confirm against the Parser implementation.
Symbol = collections.namedtuple("Symbol", ["symbol"])

# START_PRIME is the implicit 'real' root symbol for the grammar.
START_PRIME = "S'"

# END_OF_INPUT is the implicit symbol at the end of input.
END_OF_INPUT = "$"

# ANY_TOKEN is used by mark_error as a "wildcard" token that should be replaced
# by every other token.  The fresh object() sentinel guarantees its symbol
# cannot collide with any real token symbol.
ANY_TOKEN = parser_types.Token(object(), "*",
                               parser_types.parse_location("0:0-0:0"))
128*99e0aae7SDavid Rees
129*99e0aae7SDavid Rees
class Reduction(collections.namedtuple(
    "Reduction", ["symbol", "children", "production", "source_location"])):
  """A Reduction is a non-leaf node in a parse tree.

  Attributes:
    symbol: The name of this element in the parse.
    children: The child elements of this parse.
    production: The grammar production to which this reduction corresponds.
    source_location: If known, the range in the source text corresponding to
      the tokens from which this reduction was parsed.  May be 'None' if this
      reduction was produced from no symbols, or if the tokens fed to `parse`
      did not include source_location.
  """
145*99e0aae7SDavid Rees
146*99e0aae7SDavid Rees
class Grammar(object):
  """Grammar is an LR(1) context-free grammar.

  Attributes:
    start: The start symbol for the grammar.
    productions: A list of productions in the grammar, including the S' -> start
      production.
    symbols: A set of all symbols in the grammar, including $ and S'.
    nonterminals: A set of all nonterminal symbols in the grammar, including S'.
    terminals: A set of all terminal symbols in the grammar, including $.
  """

  def __init__(self, start_symbol, productions):
    """Constructs a Grammar object.

    Arguments:
      start_symbol: The start symbol for the grammar.
      productions: A list of productions (not including the "S' -> start_symbol"
          production).
    """
    object.__init__(self)
    self.start = start_symbol
    self._seed_production = parser_types.Production(START_PRIME, (self.start,))
    self.productions = productions + [self._seed_production]

    self._single_level_closure_of_item_cache = {}
    self._closure_of_item_cache = {}
    # Order matters here: _compute_symbols populates self.terminals and
    # self.nonterminals, which _compute_seed_firsts and _populate_item_cache
    # both iterate over.
    self._compute_symbols()
    self._compute_seed_firsts()
    self._set_productions_by_lhs()
    self._populate_item_cache()

  def _set_productions_by_lhs(self):
    """Builds a map from each lhs symbol to the list of its productions."""
    # Prepopulating _productions_by_lhs speeds up _closure_of_item by about 30%,
    # which is significant on medium-to-large grammars.
    self._productions_by_lhs = {}
    for production in self.productions:
      self._productions_by_lhs.setdefault(production.lhs, list()).append(
          production)

  def _populate_item_cache(self):
    """Precomputes every possible Item, keyed by (production, dot, terminal)."""
    # There are a relatively small number of possible Items for a grammar, and
    # the algorithm needs to get Items from their constituent components very
    # frequently.  As it turns out, pre-caching all possible Items results in a
    # ~35% overall speedup to Grammar.parser().
    self._item_cache = {}
    for symbol in self.terminals:
      for production in self.productions:
        for dot in range(len(production.rhs) + 1):
          self._item_cache[production, dot, symbol] = make_item(
              production, dot, symbol)

  def _compute_symbols(self):
    """Finds all grammar symbols, and sorts them into terminal and non-terminal.

    Nonterminal symbols are those which appear on the left side of any
    production.  Terminal symbols are those which do not.

    _compute_symbols is used during __init__.
    """
    # END_OF_INPUT never appears as an lhs, so it always lands in terminals;
    # START_PRIME is the lhs of the seed production, so it is a nonterminal.
    self.symbols = {END_OF_INPUT}
    self.nonterminals = set()
    for production in self.productions:
      self.symbols.add(production.lhs)
      self.nonterminals.add(production.lhs)
      for symbol in production.rhs:
        self.symbols.add(symbol)
    self.terminals = self.symbols - self.nonterminals

  def _compute_seed_firsts(self):
    """Computes FIRST (ALSU p221) for all terminal and nonterminal symbols.

    The algorithm for computing FIRST is an iterative one that terminates when
    it reaches a fixed point (that is, when further iterations stop changing
    state).  _compute_seed_firsts computes the fixed point for all single-symbol
    strings, by repeatedly calling _first and updating the internal _firsts
    table with the results.

    Once _compute_seed_firsts has completed, _first will return correct results
    for both single- and multi-symbol strings.

    _compute_seed_firsts is used during __init__.
    """
    self.firsts = {}
    # FIRST for a terminal symbol is always just that terminal symbol.
    for terminal in self.terminals:
      self.firsts[terminal] = set([terminal])
    for nonterminal in self.nonterminals:
      self.firsts[nonterminal] = set()
    while True:
      # The first iteration picks up all the productions that start with
      # terminal symbols.  The second iteration picks up productions that start
      # with nonterminals that the first iteration picked up.  The third
      # iteration picks up nonterminals that the first and second picked up, and
      # so on.
      #
      # This is guaranteed to end, in the worst case, when every terminal
      # symbol and epsilon has been added to the _firsts set for every
      # nonterminal symbol.  This would be slow, but requires a pathological
      # grammar; useful grammars should complete in only a few iterations.
      firsts_to_add = {}
      for production in self.productions:
        for first in self._first(production.rhs):
          if first not in self.firsts[production.lhs]:
            if production.lhs not in firsts_to_add:
              firsts_to_add[production.lhs] = set()
            firsts_to_add[production.lhs].add(first)
      if not firsts_to_add:
        break
      for symbol in firsts_to_add:
        self.firsts[symbol].update(firsts_to_add[symbol])

  def _first(self, symbols):
    """The FIRST function from ALSU p221.

    _first takes a string of symbols (both terminals and nonterminals) and
    returns the set of terminal symbols which could be the first terminal symbol
    of a string produced by the given list of symbols.

    _first will not give fully-correct results until _compute_seed_firsts
    finishes, but is called by _compute_seed_firsts, and must provide partial
    results during that method's execution.

    Args:
      symbols: A list of symbols.

    Returns:
      A set of terminals which could be the first terminal in "symbols."
    """
    result = set()
    all_contain_epsilon = True
    for symbol in symbols:
      for first in self.firsts[symbol]:
        # Skip epsilon (None) here; it is only added to the result below, when
        # *every* symbol in the string can produce epsilon.
        if first:
          result.add(first)
      if None not in self.firsts[symbol]:
        all_contain_epsilon = False
        break
    if all_contain_epsilon:
      # "None" seems like a Pythonic way of representing epsilon (no symbol).
      result.add(None)
    return result

  def _closure_of_item(self, root_item):
    """Modified implementation of CLOSURE from ALSU p261.

    _closure_of_item performs the CLOSURE function with a single seed item, with
    memoization.  In the algorithm as presented in ALSU, CLOSURE is called with
    a different set of items every time, which is unhelpful for memoization.
    Instead, we let _parallel_goto merge the sets returned by _closure_of_item,
    which results in a ~40% speedup.

    CLOSURE, roughly, computes the set of LR(1) Items which might be active when
    a "seed" set of Items is active.

    Technically, it is the epsilon-closure of the NFA states represented by
    "items," where an epsilon transition (a transition that does not consume any
    symbols) occurs from a->Z.bY,q to b->.X,p when p is in FIRST(Yq).  (a and b
    are nonterminals, X, Y, and Z are arbitrary strings of symbols, and p and q
    are terminals.)  That is, it is the set of all NFA states which can be
    reached from "items" without consuming any input.  This set corresponds to a
    single DFA state.

    Args:
      root_item: The initial LR(1) Item.

    Returns:
      A set of LR(1) items which may be active at the time when the provided
      item is active.
    """
    if root_item in self._closure_of_item_cache:
      return self._closure_of_item_cache[root_item]
    item_set = set([root_item])
    item_list = [root_item]
    i = 0
    # Each newly-added Item may trigger the addition of further Items, so
    # iterate until no new Items are added.  In the worst case, a new Item will
    # be added for each production.
    #
    # This algorithm is really looking for "next" nonterminals in the existing
    # items, and adding new items corresponding to their productions.
    while i < len(item_list):
      item = item_list[i]
      i += 1
      # A complete item (dot at the end) cannot pull in any new items.
      if not item.next_symbol:
        continue
      # If _closure_of_item_cache contains the full closure of item, then we can
      # add its full closure to the result set, and skip checking any of its
      # items: any item that would be added by any item in the cached result
      # will already be in the _closure_of_item_cache entry.
      if item in self._closure_of_item_cache:
        item_set |= self._closure_of_item_cache[item]
        continue
      # Even if we don't have the full closure of item, we may have the
      # immediate closure of item.  It turns out that memoizing just this step
      # speeds up this function by about 50%, even after the
      # _closure_of_item_cache check.
      if item not in self._single_level_closure_of_item_cache:
        new_items = set()
        for production in self._productions_by_lhs.get(item.next_symbol, []):
          for terminal in self._first(item.production.rhs[item.dot + 1:] +
                                      (item.terminal,)):
            new_items.add(self._item_cache[production, 0, terminal])
        self._single_level_closure_of_item_cache[item] = new_items
      for new_item in self._single_level_closure_of_item_cache[item]:
        if new_item not in item_set:
          item_set.add(new_item)
          item_list.append(new_item)
    self._closure_of_item_cache[root_item] = item_set
    # Typically, _closure_of_item() will be called on items whose closures
    # bring in the greatest number of additional items, then on items which
    # close over fewer and fewer other items.  Since items are not added to
    # _closure_of_item_cache unless _closure_of_item() is called directly on
    # them, this means that it is unlikely that items brought in will (without
    # intervention) have entries in _closure_of_item_cache, which slows down the
    # computation of the larger closures.
    #
    # Although it is not guaranteed, items added to item_list last will tend to
    # close over fewer items, and therefore be easier to compute.  By forcibly
    # re-calculating closures from last to first, and adding the results to
    # _closure_of_item_cache at each step, we get a modest performance
    # improvement: roughly 50% less time spent in _closure_of_item, which
    # translates to about 5% less time in parser().
    for item in item_list[::-1]:
      self._closure_of_item(item)
    return item_set

  def _parallel_goto(self, items):
    """The GOTO function from ALSU p261, executed on all symbols.

    _parallel_goto takes a set of Items, and returns a dict from every symbol in
    self.symbols to the set of Items that would be active after a shift
    operation (if symbol is a terminal) or after a reduction operation (if
    symbol is a nonterminal).

    _parallel_goto is used in lieu of the single-symbol GOTO from ALSU because
    it eliminates the outer loop over self.terminals, and thereby reduces the
    number of next_symbol calls by a factor of len(self.terminals).

    Args:
      items: The set of items representing the initial DFA state.

    Returns:
      A dict from symbols to sets of items representing the new DFA states.
    """
    results = collections.defaultdict(set)
    for item in items:
      next_symbol = item.next_symbol
      if next_symbol is None:
        continue
      # Advance the dot over next_symbol; the item cache avoids re-allocating.
      item = self._item_cache[item.production, item.dot + 1, item.terminal]
      # Inlining the cache check results in a ~25% speedup in this function, and
      # about 10% overall speedup to parser().
      if item in self._closure_of_item_cache:
        closure = self._closure_of_item_cache[item]
      else:
        closure = self._closure_of_item(item)
      # _closure will add newly-started Items (Items with dot=0) to the result
      # set.  After this operation, the result set will correspond to the new
      # state.
      results[next_symbol].update(closure)
    return results

  def _items(self):
    """The items function from ALSU p261.

    _items computes the set of sets of LR(1) items for a shift-reduce parser
    that matches the grammar.  Each set of LR(1) items corresponds to a single
    DFA state.

    Returns:
      A tuple.

      The first element of the tuple is a list of sets of LR(1) items (each set
      corresponding to a DFA state).

      The second element of the tuple is a dictionary from (int, symbol) pairs
      to ints, where all the ints are indexes into the list of sets of LR(1)
      items.  This dictionary is based on the results of the _Goto function,
      where item_sets[dict[i, sym]] == self._Goto(item_sets[i], sym).
    """
    # The list of states is seeded with the marker S' production.
    item_list = [
        frozenset(self._closure_of_item(
            self._item_cache[self._seed_production, 0, END_OF_INPUT]))
    ]
    items = {item_list[0]: 0}
    goto_table = {}
    i = 0
    # For each state, figure out what the new state is when each symbol is
    # added to the top of the parsing stack (see the comments in
    # parser._parse).  See _Goto for an explanation of how that is actually
    # computed.
    while i < len(item_list):
      item_set = item_list[i]
      gotos = self._parallel_goto(item_set)
      for symbol, goto in gotos.items():
        # frozenset so the item set can be used as a dict key for dedup.
        goto = frozenset(goto)
        if goto not in items:
          items[goto] = len(item_list)
          item_list.append(goto)
        goto_table[i, symbol] = items[goto]
      i += 1
    return item_list, goto_table

  def parser(self):
    """parser returns an LR(1) parser for the Grammar.

    This implements the Canonical LR(1) ("LR(1)") parser algorithm ("Algorithm
    4.56", ALSU p265), rather than the more common Lookahead LR(1) ("LALR(1)")
    algorithm.  LALR(1) produces smaller tables, but is more complex and does
    not cover all LR(1) grammars.  When the LR(1) and LALR(1) algorithms were
    invented, table sizes were an important consideration; now, the difference
    between a few hundred and a few thousand entries is unlikely to matter.

    At this time, Grammar does not handle ambiguous grammars, which are commonly
    used to handle precedence, associativity, and the "dangling else" problem.
    Formally, these can always be handled by an unambiguous grammar, though
    doing so can be cumbersome, particularly for expression languages with many
    levels of precedence.  ALSU section 4.8 (pp278-287) contains some techniques
    for handling these kinds of ambiguity.

    Returns:
      A Parser.
    """
    item_sets, goto = self._items()
    action = {}
    conflicts = set()
    end_item = self._item_cache[self._seed_production, 1, END_OF_INPUT]
    for i in range(len(item_sets)):
      for item in item_sets[i]:
        new_action = None
        # A complete item (other than the seed) yields a Reduce on its
        # lookahead terminal.
        if (item.next_symbol is None and
            item.production != self._seed_production):
          terminal = item.terminal
          new_action = Reduce(item.production)
        # An item whose next symbol is a terminal yields a Shift to the GOTO
        # state for that terminal.
        elif item.next_symbol in self.terminals:
          terminal = item.next_symbol
          assert goto[i, terminal] is not None
          new_action = Shift(goto[i, terminal], item_sets[goto[i, terminal]])
        if new_action:
          # Two distinct actions for the same (state, terminal) is a
          # shift/reduce or reduce/reduce conflict: record it, keeping the
          # newest action in the table.
          if (i, terminal) in action and action[i, terminal] != new_action:
            conflicts.add(
                Conflict(i, terminal,
                         frozenset([action[i, terminal], new_action])))
          action[i, terminal] = new_action
        # The completed seed production yields Accept on end-of-input.
        if item == end_item:
          new_action = Accept()
          assert (i, END_OF_INPUT
                 ) not in action or action[i, END_OF_INPUT] == new_action
          action[i, END_OF_INPUT] = new_action
    # The GOTO table handed to the Parser only needs nonterminal transitions;
    # terminal transitions are already encoded in the Shift actions.
    trimmed_goto = {}
    for k in goto:
      if k[1] in self.nonterminals:
        trimmed_goto[k] = goto[k]
    # expected[state] is the set of terminals with any action in that state,
    # for use in error messages.
    expected = {}
    for state, terminal in action:
      if state not in expected:
        expected[state] = set()
      expected[state].add(terminal)
    return Parser(item_sets, trimmed_goto, action, expected, conflicts,
                  self.terminals, self.nonterminals, self.productions)
508*99e0aae7SDavid Rees
509*99e0aae7SDavid Rees
510*99e0aae7SDavid ReesParseError = collections.namedtuple("ParseError", ["code", "index", "token",
511*99e0aae7SDavid Rees                                                   "state", "expected_tokens"])
512*99e0aae7SDavid ReesParseResult = collections.namedtuple("ParseResult", ["parse_tree", "error"])
513*99e0aae7SDavid Rees
514*99e0aae7SDavid Rees
515*99e0aae7SDavid Reesclass Parser(object):
516*99e0aae7SDavid Rees  """Parser is a shift-reduce LR(1) parser.
517*99e0aae7SDavid Rees
518*99e0aae7SDavid Rees  Generally, clients will want to get a Parser from a Grammar, rather than
519*99e0aae7SDavid Rees  directly instantiating one.
520*99e0aae7SDavid Rees
521*99e0aae7SDavid Rees  Parser exposes the raw tables needed to feed into a Shift-Reduce parser,
522*99e0aae7SDavid Rees  but can also be used directly for parsing.
523*99e0aae7SDavid Rees
524*99e0aae7SDavid Rees  Attributes:
525*99e0aae7SDavid Rees    item_sets: A list of item sets which correspond to the state numbers in
526*99e0aae7SDavid Rees      the action and goto tables.  This is not necessary for parsing, but is
527*99e0aae7SDavid Rees      useful for debugging parsers.
528*99e0aae7SDavid Rees    goto: The GOTO table for this parser.
529*99e0aae7SDavid Rees    action: The ACTION table for this parser.
530*99e0aae7SDavid Rees    expected: A table of terminal symbols that are expected (that is, that
531*99e0aae7SDavid Rees      have a non-Error action) for each state.  This can be used to provide
532*99e0aae7SDavid Rees      more helpful error messages for parse errors.
533*99e0aae7SDavid Rees    conflicts: A set of unresolved conflicts found during table generation.
534*99e0aae7SDavid Rees    terminals: A set of terminal symbols in the grammar.
535*99e0aae7SDavid Rees    nonterminals: A set of nonterminal symbols in the grammar.
536*99e0aae7SDavid Rees    productions: A list of productions in the grammar.
537*99e0aae7SDavid Rees    default_errors: A dict of states to default error codes to use when
538*99e0aae7SDavid Rees      encountering an error in that state, when a more-specific Error for the
539*99e0aae7SDavid Rees      state/terminal pair has not been set.
540*99e0aae7SDavid Rees  """
541*99e0aae7SDavid Rees
542*99e0aae7SDavid Rees  def __init__(self, item_sets, goto, action, expected, conflicts, terminals,
543*99e0aae7SDavid Rees               nonterminals, productions):
544*99e0aae7SDavid Rees    super(Parser, self).__init__()
545*99e0aae7SDavid Rees    self.item_sets = item_sets
546*99e0aae7SDavid Rees    self.goto = goto
547*99e0aae7SDavid Rees    self.action = action
548*99e0aae7SDavid Rees    self.expected = expected
549*99e0aae7SDavid Rees    self.conflicts = conflicts
550*99e0aae7SDavid Rees    self.terminals = terminals
551*99e0aae7SDavid Rees    self.nonterminals = nonterminals
552*99e0aae7SDavid Rees    self.productions = productions
553*99e0aae7SDavid Rees    self.default_errors = {}
554*99e0aae7SDavid Rees
555*99e0aae7SDavid Rees  def _parse(self, tokens):
556*99e0aae7SDavid Rees    """_parse implements Shift-Reduce parsing algorithm.
557*99e0aae7SDavid Rees
558*99e0aae7SDavid Rees    _parse implements the standard shift-reduce algorithm outlined on ASLU
559*99e0aae7SDavid Rees    pp236-237.
560*99e0aae7SDavid Rees
561*99e0aae7SDavid Rees    Arguments:
562*99e0aae7SDavid Rees      tokens: the list of token objects to parse.
563*99e0aae7SDavid Rees
564*99e0aae7SDavid Rees    Returns:
565*99e0aae7SDavid Rees      A ParseResult.
566*99e0aae7SDavid Rees    """
567*99e0aae7SDavid Rees    # The END_OF_INPUT token is explicitly added to avoid explicit "cursor <
568*99e0aae7SDavid Rees    # len(tokens)" checks.
569*99e0aae7SDavid Rees    tokens = list(tokens) + [Symbol(END_OF_INPUT)]
570*99e0aae7SDavid Rees
571*99e0aae7SDavid Rees    # Each element of stack is a parse state and a (possibly partial) parse
572*99e0aae7SDavid Rees    # tree.  The state at the top of the stack encodes which productions are
573*99e0aae7SDavid Rees    # "active" (that is, which ones the parser has seen partial input which
574*99e0aae7SDavid Rees    # matches some prefix of the production, in a place where that production
575*99e0aae7SDavid Rees    # might be valid), and, for each active production, how much of the
576*99e0aae7SDavid Rees    # production has been completed.
577*99e0aae7SDavid Rees    stack = [(0, None)]
578*99e0aae7SDavid Rees
579*99e0aae7SDavid Rees    def state():
580*99e0aae7SDavid Rees      return stack[-1][0]
581*99e0aae7SDavid Rees
582*99e0aae7SDavid Rees    cursor = 0
583*99e0aae7SDavid Rees
584*99e0aae7SDavid Rees    # On each iteration, look at the next symbol and the current state, and
585*99e0aae7SDavid Rees    # perform the corresponding action.
586*99e0aae7SDavid Rees    while True:
587*99e0aae7SDavid Rees      if (state(), tokens[cursor].symbol) not in self.action:
588*99e0aae7SDavid Rees        # Most state/symbol entries would be Errors, so rather than exhaustively
589*99e0aae7SDavid Rees        # adding error entries, we just check here.
590*99e0aae7SDavid Rees        if state() in self.default_errors:
591*99e0aae7SDavid Rees          next_action = Error(self.default_errors[state()])
592*99e0aae7SDavid Rees        else:
593*99e0aae7SDavid Rees          next_action = Error(None)
594*99e0aae7SDavid Rees      else:
595*99e0aae7SDavid Rees        next_action = self.action[state(), tokens[cursor].symbol]
596*99e0aae7SDavid Rees
597*99e0aae7SDavid Rees      if isinstance(next_action, Shift):
598*99e0aae7SDavid Rees        # Shift means that there are no "complete" productions on the stack,
599*99e0aae7SDavid Rees        # and so the current token should be shifted onto the stack, with a new
600*99e0aae7SDavid Rees        # state indicating the new set of "active" productions.
601*99e0aae7SDavid Rees        stack.append((next_action.state, tokens[cursor]))
602*99e0aae7SDavid Rees        cursor += 1
603*99e0aae7SDavid Rees      elif isinstance(next_action, Accept):
604*99e0aae7SDavid Rees        # Accept means that parsing is over, successfully.
605*99e0aae7SDavid Rees        assert len(stack) == 2, "Accepted incompletely-reduced input."
606*99e0aae7SDavid Rees        assert tokens[cursor].symbol == END_OF_INPUT, ("Accepted parse before "
607*99e0aae7SDavid Rees                                                       "end of input.")
608*99e0aae7SDavid Rees        return ParseResult(stack[-1][1], None)
609*99e0aae7SDavid Rees      elif isinstance(next_action, Reduce):
610*99e0aae7SDavid Rees        # Reduce means that there is a complete production on the stack, and
611*99e0aae7SDavid Rees        # that the next symbol implies that the completed production is the
612*99e0aae7SDavid Rees        # correct production.
613*99e0aae7SDavid Rees        #
614*99e0aae7SDavid Rees        # Per ALSU, we would simply pop an element off the state stack for each
615*99e0aae7SDavid Rees        # symbol on the rhs of the production, and then push a new state by
616*99e0aae7SDavid Rees        # looking up the (post-pop) current state and the lhs of the production
617*99e0aae7SDavid Rees        # in GOTO.  The GOTO table, in some sense, is equivalent to shift
618*99e0aae7SDavid Rees        # actions for nonterminal symbols.
619*99e0aae7SDavid Rees        #
620*99e0aae7SDavid Rees        # Here, we attach a new partial parse tree, with the production lhs as
621*99e0aae7SDavid Rees        # the "name" of the tree, and the popped trees as the "children" of the
622*99e0aae7SDavid Rees        # new tree.
623*99e0aae7SDavid Rees        children = [
624*99e0aae7SDavid Rees            item[1] for item in stack[len(stack) - len(next_action.rule.rhs):]
625*99e0aae7SDavid Rees        ]
626*99e0aae7SDavid Rees        # Attach source_location, if known.  The source location will not be
627*99e0aae7SDavid Rees        # known if the reduction consumes no symbols (empty rhs) or if the
628*99e0aae7SDavid Rees        # client did not specify source_locations for tokens.
629*99e0aae7SDavid Rees        #
630*99e0aae7SDavid Rees        # It is necessary to loop in order to handle cases like:
631*99e0aae7SDavid Rees        #
632*99e0aae7SDavid Rees        # C -> c D
633*99e0aae7SDavid Rees        # D ->
634*99e0aae7SDavid Rees        #
635*99e0aae7SDavid Rees        # The D child of the C reduction will not have a source location
636*99e0aae7SDavid Rees        # (because it is not produced from any source), so it is necessary to
637*99e0aae7SDavid Rees        # scan backwards through C's children to find the end position.  The
638*99e0aae7SDavid Rees        # opposite is required in the case where initial children have no
639*99e0aae7SDavid Rees        # source.
640*99e0aae7SDavid Rees        #
641*99e0aae7SDavid Rees        # These loops implicitly handle the case where the reduction has no
642*99e0aae7SDavid Rees        # children, setting the source_location to None in that case.
643*99e0aae7SDavid Rees        start_position = None
644*99e0aae7SDavid Rees        end_position = None
645*99e0aae7SDavid Rees        for child in children:
646*99e0aae7SDavid Rees          if hasattr(child,
647*99e0aae7SDavid Rees                     "source_location") and child.source_location is not None:
648*99e0aae7SDavid Rees            start_position = child.source_location.start
649*99e0aae7SDavid Rees            break
650*99e0aae7SDavid Rees        for child in reversed(children):
651*99e0aae7SDavid Rees          if hasattr(child,
652*99e0aae7SDavid Rees                     "source_location") and child.source_location is not None:
653*99e0aae7SDavid Rees            end_position = child.source_location.end
654*99e0aae7SDavid Rees            break
655*99e0aae7SDavid Rees        if start_position is None:
656*99e0aae7SDavid Rees          source_location = None
657*99e0aae7SDavid Rees        else:
658*99e0aae7SDavid Rees          source_location = parser_types.make_location(start_position,
659*99e0aae7SDavid Rees                                                       end_position)
660*99e0aae7SDavid Rees        reduction = Reduction(next_action.rule.lhs, children, next_action.rule,
661*99e0aae7SDavid Rees                              source_location)
662*99e0aae7SDavid Rees        del stack[len(stack) - len(next_action.rule.rhs):]
663*99e0aae7SDavid Rees        stack.append((self.goto[state(), next_action.rule.lhs], reduction))
664*99e0aae7SDavid Rees      elif isinstance(next_action, Error):
665*99e0aae7SDavid Rees        # Error means that the parse is impossible.  For typical grammars and
666*99e0aae7SDavid Rees        # texts, this usually happens within a few tokens after the mistake in
667*99e0aae7SDavid Rees        # the input stream, which is convenient (though imperfect) for error
668*99e0aae7SDavid Rees        # reporting.
669*99e0aae7SDavid Rees        return ParseResult(None,
670*99e0aae7SDavid Rees                           ParseError(next_action.code, cursor, tokens[cursor],
671*99e0aae7SDavid Rees                                      state(), self.expected[state()]))
672*99e0aae7SDavid Rees      else:
673*99e0aae7SDavid Rees        assert False, "Shouldn't be here."
674*99e0aae7SDavid Rees
675*99e0aae7SDavid Rees  def mark_error(self, tokens, error_token, error_code):
676*99e0aae7SDavid Rees    """Marks an error state with the given error code.
677*99e0aae7SDavid Rees
678*99e0aae7SDavid Rees    mark_error implements the equivalent of the "Merr" system presented in
679*99e0aae7SDavid Rees    "Generating LR Syntax error Messages from Examples" (Jeffery, 2003).
680*99e0aae7SDavid Rees    This system has limitations, but has the primary advantage that error
681*99e0aae7SDavid Rees    messages can be specified by giving an example of the error and the
682*99e0aae7SDavid Rees    message itself.
683*99e0aae7SDavid Rees
684*99e0aae7SDavid Rees    Arguments:
685*99e0aae7SDavid Rees      tokens: a list of tokens to parse.
686*99e0aae7SDavid Rees      error_token: the token where the parse should fail, or None if the parse
687*99e0aae7SDavid Rees        should fail at the implicit end-of-input token.
688*99e0aae7SDavid Rees
689*99e0aae7SDavid Rees        If the error_token is the special ANY_TOKEN, then the error will be
690*99e0aae7SDavid Rees        recorded as the default error for the error state.
691*99e0aae7SDavid Rees      error_code: a value to record for the error state reached by parsing
692*99e0aae7SDavid Rees        tokens.
693*99e0aae7SDavid Rees
694*99e0aae7SDavid Rees    Returns:
695*99e0aae7SDavid Rees      None if error_code was successfully recorded, or an error message if there
696*99e0aae7SDavid Rees      was a problem.
697*99e0aae7SDavid Rees    """
698*99e0aae7SDavid Rees    result = self._parse(tokens)
699*99e0aae7SDavid Rees
700*99e0aae7SDavid Rees    # There is no error state to mark on a successful parse.
701*99e0aae7SDavid Rees    if not result.error:
702*99e0aae7SDavid Rees      return "Input successfully parsed."
703*99e0aae7SDavid Rees
704*99e0aae7SDavid Rees    # Check if the error occurred at the specified token; if not, then this was
705*99e0aae7SDavid Rees    # not the expected error.
706*99e0aae7SDavid Rees    if error_token is None:
707*99e0aae7SDavid Rees      error_symbol = END_OF_INPUT
708*99e0aae7SDavid Rees      if result.error.token.symbol != END_OF_INPUT:
709*99e0aae7SDavid Rees        return "error occurred on {} token, not end of input.".format(
710*99e0aae7SDavid Rees            result.error.token.symbol)
711*99e0aae7SDavid Rees    else:
712*99e0aae7SDavid Rees      error_symbol = error_token.symbol
713*99e0aae7SDavid Rees      if result.error.token != error_token:
714*99e0aae7SDavid Rees        return "error occurred on {} token, not {} token.".format(
715*99e0aae7SDavid Rees            result.error.token.symbol, error_token.symbol)
716*99e0aae7SDavid Rees
717*99e0aae7SDavid Rees    # If the expected error was found, attempt to mark it.  It is acceptable if
718*99e0aae7SDavid Rees    # the given error_code is already set as the error code for the given parse,
719*99e0aae7SDavid Rees    # but not if a different code is set.
720*99e0aae7SDavid Rees    if result.error.token == ANY_TOKEN:
721*99e0aae7SDavid Rees      # For ANY_TOKEN, mark it as a default error.
722*99e0aae7SDavid Rees      if result.error.state in self.default_errors:
723*99e0aae7SDavid Rees        if self.default_errors[result.error.state] == error_code:
724*99e0aae7SDavid Rees          return None
725*99e0aae7SDavid Rees        else:
726*99e0aae7SDavid Rees          return ("Attempted to overwrite existing default error code {!r} "
727*99e0aae7SDavid Rees                  "with new error code {!r} for state {}".format(
728*99e0aae7SDavid Rees                      self.default_errors[result.error.state], error_code,
729*99e0aae7SDavid Rees                      result.error.state))
730*99e0aae7SDavid Rees      else:
731*99e0aae7SDavid Rees        self.default_errors[result.error.state] = error_code
732*99e0aae7SDavid Rees        return None
733*99e0aae7SDavid Rees    else:
734*99e0aae7SDavid Rees      if (result.error.state, error_symbol) in self.action:
735*99e0aae7SDavid Rees        existing_error = self.action[result.error.state, error_symbol]
736*99e0aae7SDavid Rees        assert isinstance(existing_error, Error), "Bug"
737*99e0aae7SDavid Rees        if existing_error.code == error_code:
738*99e0aae7SDavid Rees          return None
739*99e0aae7SDavid Rees        else:
740*99e0aae7SDavid Rees          return ("Attempted to overwrite existing error code {!r} with new "
741*99e0aae7SDavid Rees                  "error code {!r} for state {}, terminal {}".format(
742*99e0aae7SDavid Rees                      existing_error.code, error_code, result.error.state,
743*99e0aae7SDavid Rees                      error_symbol))
744*99e0aae7SDavid Rees      else:
745*99e0aae7SDavid Rees        self.action[result.error.state, error_symbol] = Error(error_code)
746*99e0aae7SDavid Rees        return None
747*99e0aae7SDavid Rees    assert False, "All other paths should lead to return."
748*99e0aae7SDavid Rees
749*99e0aae7SDavid Rees  def parse(self, tokens):
750*99e0aae7SDavid Rees    """Parses a list of tokens.
751*99e0aae7SDavid Rees
752*99e0aae7SDavid Rees    Arguments:
753*99e0aae7SDavid Rees      tokens: a list of tokens to parse.
754*99e0aae7SDavid Rees
755*99e0aae7SDavid Rees    Returns:
756*99e0aae7SDavid Rees      A ParseResult.
757*99e0aae7SDavid Rees    """
758*99e0aae7SDavid Rees    result = self._parse(tokens)
759*99e0aae7SDavid Rees    return result
760