# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""LR(1) parser generator.

The primary class in this module, Grammar, takes a list of context-free grammar
productions, and produces the corresponding LR(1) shift-reduce parser.  This is
an implementation of the algorithm on pages 221 and 261-265 of "Compilers:
Principles, Techniques, & Tools" (Second Edition) by Aho, Lam, Sethi, and
Ullman, also known as "The Dragon Book," hereafter referred to as "ALSU."

This module only implements the LR(1) algorithms; unlike tools such as yacc, it
does not implement the various bits of glue necessary to actually use a parser.
Clients are expected to provide their own tokenizers and handle turning a raw
parse tree into an intermediate representation on their own.
"""

import collections

from compiler.util import parser_types


class Item(collections.namedtuple("Item", ["production", "dot", "terminal",
                                           "next_symbol"])):
  """An Item is an LR(1) Item: a production, a cursor location, and a terminal.

  An Item represents a partially-parsed production, and a lookahead symbol.  The
  position of the dot indicates what portion of the production has been parsed.
  Generally, Items are an internal implementation detail, but they can be useful
  elsewhere, particularly for debugging.

  Attributes:
    production: The Production this Item covers.
    dot: The index of the "dot" in production's rhs.
    terminal: The terminal lookahead symbol that follows the production in the
        input stream.
    next_symbol: The symbol in production.rhs immediately after the dot, or
        None when the dot is at the end of the rhs.  This is derivable from
        production and dot; it is precomputed by make_item because it is read
        very frequently by the parser-generator loops.
  """

  def __str__(self):
    """__str__ generates ALSU notation."""
    return (str(self.production.lhs) + " -> " + " ".join(
        [str(r) for r in self.production.rhs[0:self.dot] + (".",) +
         self.production.rhs[self.dot:]]) + ", " + str(self.terminal))

  @staticmethod
  def parse(text):
    """Parses an Item in ALSU notation.

    Parses an Item from notation like:

       symbol -> foo . bar baz, qux

    where "symbol -> foo bar baz" will be taken as the production, the position
    of the "." is taken as "dot" (in this case 1), and the symbol after "," is
    taken as the "terminal".  The following are also valid items:

       sym -> ., foo
       sym -> . foo bar, baz
       sym -> foo bar ., baz

    Symbols on the right-hand side of the production should be separated by
    whitespace.

    Arguments:
      text: The text to parse into an Item.

    Returns:
      An Item.
    """
    production, terminal = text.split(",")
    terminal = terminal.strip()
    # "$" and "S'" are the textual spellings of the END_OF_INPUT and
    # START_PRIME sentinels, respectively.
    if terminal == "$":
      terminal = END_OF_INPUT
    lhs, rhs = production.split("->")
    lhs = lhs.strip()
    if lhs == "S'":
      lhs = START_PRIME
    before_dot, after_dot = rhs.split(".")
    handle = before_dot.split()
    tail = after_dot.split()
    return make_item(parser_types.Production(lhs, tuple(handle + tail)),
                     len(handle), terminal)


def make_item(production, dot, symbol):
  # Constructs an Item, precomputing the next_symbol field (the rhs symbol
  # just after the dot, or None at the end of the rhs) so that hot loops such
  # as _closure_of_item and _parallel_goto do not have to re-derive it.
  return Item(production, dot, symbol,
              None if dot >= len(production.rhs) else production.rhs[dot])


class Conflict(
    collections.namedtuple("Conflict", ["state", "symbol", "actions"])
):
  """Conflict represents a parse conflict.

  A Conflict records that, in the given parser state, the given symbol maps to
  more than one distinct action (e.g. a shift/reduce or reduce/reduce
  conflict); actions is the frozenset of the conflicting actions.
  """

  def __str__(self):
    return "Conflict for {} in state {}: ".format(
        self.symbol, self.state) + " vs ".join([str(a) for a in self.actions])


# The entries of the ACTION table built by Grammar.parser() (ALSU p265):
#
#   Shift: consume the current token and move to `state`; `items` is the LR(1)
#       item set for that state, carried along for debugging.
#   Reduce: reduce the top of the stack using production `rule`.
#   Accept: the input has been parsed successfully.
#   Error: reject, with an application-defined error `code` (see
#       Parser.default_errors).
Shift = collections.namedtuple("Shift", ["state", "items"])
Reduce = collections.namedtuple("Reduce", ["rule"])
Accept = collections.namedtuple("Accept", [])
Error = collections.namedtuple("Error", ["code"])

# Symbol wraps a bare grammar symbol so it can stand in for a token in
# Parser._parse, which reads the `.symbol` attribute of each input element; it
# is used to append the END_OF_INPUT marker to the token stream.
Symbol = collections.namedtuple("Symbol", ["symbol"])

# START_PRIME is the implicit 'real' root symbol for the grammar.
START_PRIME = "S'"

# END_OF_INPUT is the implicit symbol at the end of input.
END_OF_INPUT = "$"

# ANY_TOKEN is used by mark_error as a "wildcard" token that should be replaced
# by every other token.
ANY_TOKEN = parser_types.Token(object(), "*",
                               parser_types.parse_location("0:0-0:0"))


class Reduction(collections.namedtuple("Reduction",
                                       ["symbol", "children", "production",
                                        "source_location"])):
  """A Reduction is a non-leaf node in a parse tree.

  Attributes:
    symbol: The name of this element in the parse.
    children: The child elements of this parse.
    production: The grammar production to which this reduction corresponds.
    source_location: If known, the range in the source text corresponding to the
      tokens from which this reduction was parsed.  May be 'None' if this
      reduction was produced from no symbols, or if the tokens fed to `parse`
      did not include source_location.
  """
  pass


class Grammar(object):
  """Grammar is an LR(1) context-free grammar.

  Attributes:
    start: The start symbol for the grammar.
    productions: A list of productions in the grammar, including the S' -> start
      production.
    symbols: A set of all symbols in the grammar, including $ and S'.
    nonterminals: A set of all nonterminal symbols in the grammar, including S'.
    terminals: A set of all terminal symbols in the grammar, including $.
  """

  def __init__(self, start_symbol, productions):
    """Constructs a Grammar object.

    Arguments:
      start_symbol: The start symbol for the grammar.
      productions: A list of productions (not including the "S' -> start_symbol"
          production).
    """
    object.__init__(self)
    self.start = start_symbol
    # The S' -> start production roots the grammar; its completion is what
    # signals Accept in parser().
    self._seed_production = parser_types.Production(START_PRIME, (self.start,))
    self.productions = productions + [self._seed_production]

    self._single_level_closure_of_item_cache = {}
    self._closure_of_item_cache = {}
    # Note: _compute_seed_firsts and _populate_item_cache depend on the
    # terminal/nonterminal split computed by _compute_symbols.
    self._compute_symbols()
    self._compute_seed_firsts()
    self._set_productions_by_lhs()
    self._populate_item_cache()

  def _set_productions_by_lhs(self):
    # Prepopulating _productions_by_lhs speeds up _closure_of_item by about 30%,
    # which is significant on medium-to-large grammars.
    self._productions_by_lhs = {}
    for production in self.productions:
      self._productions_by_lhs.setdefault(production.lhs, list()).append(
          production)

  def _populate_item_cache(self):
    # There are a relatively small number of possible Items for a grammar, and
    # the algorithm needs to get Items from their constituent components very
    # frequently.  As it turns out, pre-caching all possible Items results in a
    # ~35% overall speedup to Grammar.parser().
    self._item_cache = {}
    for symbol in self.terminals:
      for production in self.productions:
        for dot in range(len(production.rhs) + 1):
          self._item_cache[production, dot, symbol] = make_item(
              production, dot, symbol)

  def _compute_symbols(self):
    """Finds all grammar symbols, and sorts them into terminal and non-terminal.

    Nonterminal symbols are those which appear on the left side of any
    production.  Terminal symbols are those which do not.

    _compute_symbols is used during __init__.
    """
    self.symbols = {END_OF_INPUT}
    self.nonterminals = set()
    for production in self.productions:
      self.symbols.add(production.lhs)
      self.nonterminals.add(production.lhs)
      for symbol in production.rhs:
        self.symbols.add(symbol)
    # A terminal is, by definition, any symbol that never appears as an lhs.
    self.terminals = self.symbols - self.nonterminals

  def _compute_seed_firsts(self):
    """Computes FIRST (ALSU p221) for all terminal and nonterminal symbols.

    The algorithm for computing FIRST is an iterative one that terminates when
    it reaches a fixed point (that is, when further iterations stop changing
    state).  _compute_seed_firsts computes the fixed point for all single-symbol
    strings, by repeatedly calling _first and updating the internal _firsts
    table with the results.

    Once _compute_seed_firsts has completed, _first will return correct results
    for both single- and multi-symbol strings.

    _compute_seed_firsts is used during __init__.
    """
    self.firsts = {}
    # FIRST for a terminal symbol is always just that terminal symbol.
    for terminal in self.terminals:
      self.firsts[terminal] = set([terminal])
    for nonterminal in self.nonterminals:
      self.firsts[nonterminal] = set()
    while True:
      # The first iteration picks up all the productions that start with
      # terminal symbols.  The second iteration picks up productions that start
      # with nonterminals that the first iteration picked up.  The third
      # iteration picks up nonterminals that the first and second picked up, and
      # so on.
      #
      # This is guaranteed to end, in the worst case, when every terminal
      # symbol and epsilon has been added to the _firsts set for every
      # nonterminal symbol.  This would be slow, but requires a pathological
      # grammar; useful grammars should complete in only a few iterations.
      firsts_to_add = {}
      for production in self.productions:
        for first in self._first(production.rhs):
          if first not in self.firsts[production.lhs]:
            if production.lhs not in firsts_to_add:
              firsts_to_add[production.lhs] = set()
            firsts_to_add[production.lhs].add(first)
      if not firsts_to_add:
        # Fixed point reached: no iteration added anything new.
        break
      for symbol in firsts_to_add:
        self.firsts[symbol].update(firsts_to_add[symbol])

  def _first(self, symbols):
    """The FIRST function from ALSU p221.

    _first takes a string of symbols (both terminals and nonterminals) and
    returns the set of terminal symbols which could be the first terminal symbol
    of a string produced by the given list of symbols.

    _first will not give fully-correct results until _compute_seed_firsts
    finishes, but is called by _compute_seed_firsts, and must provide partial
    results during that method's execution.

    Args:
      symbols: A list of symbols.

    Returns:
      A set of terminals which could be the first terminal in "symbols."  The
      set contains None (epsilon) if every symbol in "symbols" can derive the
      empty string.
    """
    result = set()
    all_contain_epsilon = True
    for symbol in symbols:
      for first in self.firsts[symbol]:
        if first:
          result.add(first)
      if None not in self.firsts[symbol]:
        # This symbol cannot derive epsilon, so no later symbol can contribute
        # to FIRST(symbols).
        all_contain_epsilon = False
        break
    if all_contain_epsilon:
      # "None" seems like a Pythonic way of representing epsilon (no symbol).
      result.add(None)
    return result

  def _closure_of_item(self, root_item):
    """Modified implementation of CLOSURE from ALSU p261.

    _closure_of_item performs the CLOSURE function with a single seed item, with
    memoization.  In the algorithm as presented in ALSU, CLOSURE is called with
    a different set of items every time, which is unhelpful for memoization.
    Instead, we let _parallel_goto merge the sets returned by _closure_of_item,
    which results in a ~40% speedup.

    CLOSURE, roughly, computes the set of LR(1) Items which might be active when
    a "seed" set of Items is active.

    Technically, it is the epsilon-closure of the NFA states represented by
    "items," where an epsilon transition (a transition that does not consume any
    symbols) occurs from a->Z.bY,q to b->.X,p when p is in FIRST(Yq).  (a and b
    are nonterminals, X, Y, and Z are arbitrary strings of symbols, and p and q
    are terminals.)  That is, it is the set of all NFA states which can be
    reached from "items" without consuming any input.  This set corresponds to a
    single DFA state.

    Args:
      root_item: The initial LR(1) Item.

    Returns:
      A set of LR(1) items which may be active at the time when the provided
      item is active.
    """
    if root_item in self._closure_of_item_cache:
      return self._closure_of_item_cache[root_item]
    item_set = set([root_item])
    item_list = [root_item]
    i = 0
    # Each newly-added Item may trigger the addition of further Items, so
    # iterate until no new Items are added.  In the worst case, a new Item will
    # be added for each production.
    #
    # This algorithm is really looking for "next" nonterminals in the existing
    # items, and adding new items corresponding to their productions.
    while i < len(item_list):
      item = item_list[i]
      i += 1
      if not item.next_symbol:
        # The dot is at the end of the rhs: this item closes over nothing.
        continue
      # If _closure_of_item_cache contains the full closure of item, then we can
      # add its full closure to the result set, and skip checking any of its
      # items: any item that would be added by any item in the cached result
      # will already be in the _closure_of_item_cache entry.
      if item in self._closure_of_item_cache:
        item_set |= self._closure_of_item_cache[item]
        continue
      # Even if we don't have the full closure of item, we may have the
      # immediate closure of item.  It turns out that memoizing just this step
      # speeds up this function by about 50%, even after the
      # _closure_of_item_cache check.
      if item not in self._single_level_closure_of_item_cache:
        new_items = set()
        for production in self._productions_by_lhs.get(item.next_symbol, []):
          for terminal in self._first(item.production.rhs[item.dot + 1:] +
                                      (item.terminal,)):
            new_items.add(self._item_cache[production, 0, terminal])
        self._single_level_closure_of_item_cache[item] = new_items
      for new_item in self._single_level_closure_of_item_cache[item]:
        if new_item not in item_set:
          item_set.add(new_item)
          item_list.append(new_item)
    self._closure_of_item_cache[root_item] = item_set
    # Typically, _closure_of_item() will be called on items whose closures
    # bring in the greatest number of additional items, then on items which
    # close over fewer and fewer other items.  Since items are not added to
    # _closure_of_item_cache unless _closure_of_item() is called directly on
    # them, this means that it is unlikely that items brought in will (without
    # intervention) have entries in _closure_of_item_cache, which slows down the
    # computation of the larger closures.
    #
    # Although it is not guaranteed, items added to item_list last will tend to
    # close over fewer items, and therefore be easier to compute.  By forcibly
    # re-calculating closures from last to first, and adding the results to
    # _closure_of_item_cache at each step, we get a modest performance
    # improvement: roughly 50% less time spent in _closure_of_item, which
    # translates to about 5% less time in parser().
    for item in item_list[::-1]:
      self._closure_of_item(item)
    return item_set

  def _parallel_goto(self, items):
    """The GOTO function from ALSU p261, executed on all symbols.

    _parallel_goto takes a set of Items, and returns a dict from every symbol in
    self.symbols to the set of Items that would be active after a shift
    operation (if symbol is a terminal) or after a reduction operation (if
    symbol is a nonterminal).

    _parallel_goto is used in lieu of the single-symbol GOTO from ALSU because
    it eliminates the outer loop over self.terminals, and thereby reduces the
    number of next_symbol calls by a factor of len(self.terminals).

    Args:
      items: The set of items representing the initial DFA state.

    Returns:
      A dict from symbols to sets of items representing the new DFA states.
    """
    results = collections.defaultdict(set)
    for item in items:
      next_symbol = item.next_symbol
      if next_symbol is None:
        continue
      # Advance the dot past next_symbol; the advanced item seeds the
      # destination state for that symbol.
      item = self._item_cache[item.production, item.dot + 1, item.terminal]
      # Inlining the cache check results in a ~25% speedup in this function, and
      # about 10% overall speedup to parser().
      if item in self._closure_of_item_cache:
        closure = self._closure_of_item_cache[item]
      else:
        closure = self._closure_of_item(item)
      # _closure will add newly-started Items (Items with dot=0) to the result
      # set.  After this operation, the result set will correspond to the new
      # state.
      results[next_symbol].update(closure)
    return results

  def _items(self):
    """The items function from ALSU p261.

    _items computes the set of sets of LR(1) items for a shift-reduce parser
    that matches the grammar.  Each set of LR(1) items corresponds to a single
    DFA state.

    Returns:
      A tuple.

      The first element of the tuple is a list of sets of LR(1) items (each set
      corresponding to a DFA state).

      The second element of the tuple is a dictionary from (int, symbol) pairs
      to ints, where all the ints are indexes into the list of sets of LR(1)
      items.  This dictionary is based on the results of the _Goto function,
      where item_sets[dict[i, sym]] == self._Goto(item_sets[i], sym).
    """
    # The list of states is seeded with the marker S' production.
    item_list = [
        frozenset(self._closure_of_item(
            self._item_cache[self._seed_production, 0, END_OF_INPUT]))
    ]
    # items maps each discovered state (a frozenset of Items) to its index in
    # item_list, for deduplication.
    items = {item_list[0]: 0}
    goto_table = {}
    i = 0
    # For each state, figure out what the new state when each symbol is added to
    # the top of the parsing stack (see the comments in parser._parse).  See
    # _Goto for an explanation of how that is actually computed.
    while i < len(item_list):
      item_set = item_list[i]
      gotos = self._parallel_goto(item_set)
      for symbol, goto in gotos.items():
        goto = frozenset(goto)
        if goto not in items:
          items[goto] = len(item_list)
          item_list.append(goto)
        goto_table[i, symbol] = items[goto]
      i += 1
    return item_list, goto_table

  def parser(self):
    """parser returns an LR(1) parser for the Grammar.

    This implements the Canonical LR(1) ("LR(1)") parser algorithm ("Algorithm
    4.56", ALSU p265), rather than the more common Lookahead LR(1) ("LALR(1)")
    algorithm.  LALR(1) produces smaller tables, but is more complex and does
    not cover all LR(1) grammars.  When the LR(1) and LALR(1) algorithms were
    invented, table sizes were an important consideration; now, the difference
    between a few hundred and a few thousand entries is unlikely to matter.

    At this time, Grammar does not handle ambiguous grammars, which are commonly
    used to handle precedence, associativity, and the "dangling else" problem.
    Formally, these can always be handled by an unambiguous grammar, though
    doing so can be cumbersome, particularly for expression languages with many
    levels of precedence.  ALSU section 4.8 (pp278-287) contains some techniques
    for handling these kinds of ambiguity.

    Returns:
      A Parser.
    """
    item_sets, goto = self._items()
    action = {}
    conflicts = set()
    # end_item ("S' -> start ., $") marks the Accept state.
    end_item = self._item_cache[self._seed_production, 1, END_OF_INPUT]
    for i in range(len(item_sets)):
      for item in item_sets[i]:
        new_action = None
        if (item.next_symbol is None and
            item.production != self._seed_production):
          # Dot at the end of a (non-seed) production: reduce on the lookahead.
          terminal = item.terminal
          new_action = Reduce(item.production)
        elif item.next_symbol in self.terminals:
          # Dot before a terminal: shift it and move to the goto state.
          terminal = item.next_symbol
          assert goto[i, terminal] is not None
          new_action = Shift(goto[i, terminal], item_sets[goto[i, terminal]])
        if new_action:
          # Two distinct actions for the same (state, terminal) is a
          # shift/reduce or reduce/reduce conflict; record it rather than
          # silently overwriting.
          if (i, terminal) in action and action[i, terminal] != new_action:
            conflicts.add(
                Conflict(i, terminal,
                         frozenset([action[i, terminal], new_action])))
          action[i, terminal] = new_action
        if item == end_item:
          new_action = Accept()
          assert (i, END_OF_INPUT
                 ) not in action or action[i, END_OF_INPUT] == new_action
          action[i, END_OF_INPUT] = new_action
    # The GOTO table handed to Parser only needs nonterminal transitions;
    # terminal transitions were folded into Shift actions above.
    trimmed_goto = {}
    for k in goto:
      if k[1] in self.nonterminals:
        trimmed_goto[k] = goto[k]
    # expected[state] is the set of terminals with a non-Error action, used
    # for error messages.
    expected = {}
    for state, terminal in action:
      if state not in expected:
        expected[state] = set()
      expected[state].add(terminal)
    return Parser(item_sets, trimmed_goto, action, expected, conflicts,
                  self.terminals, self.nonterminals, self.productions)


# ParseError records a single parse failure: the registered error code (if
# any), the index of the offending token in the input, the token itself, the
# parser state at the time of the error, and the set of terminal symbols that
# would have been acceptable in that state.
ParseError = collections.namedtuple("ParseError", ["code", "index", "token",
                                                   "state", "expected_tokens"])
# ParseResult pairs a (possibly None) parse tree with a (possibly None) error.
ParseResult = collections.namedtuple("ParseResult", ["parse_tree", "error"])


class Parser(object):
  """Parser is a shift-reduce LR(1) parser.

  Generally, clients will want to get a Parser from a Grammar, rather than
  directly instantiating one.

  Parser exposes the raw tables needed to feed into a Shift-Reduce parser,
  but can also be used directly for parsing.

  Attributes:
    item_sets: A list of item sets which correspond to the state numbers in
      the action and goto tables.  This is not necessary for parsing, but is
      useful for debugging parsers.
    goto: The GOTO table for this parser.
    action: The ACTION table for this parser.
    expected: A table of terminal symbols that are expected (that is, that
      have a non-Error action) for each state.  This can be used to provide
      more helpful error messages for parse errors.
    conflicts: A set of unresolved conflicts found during table generation.
    terminals: A set of terminal symbols in the grammar.
    nonterminals: A set of nonterminal symbols in the grammar.
    productions: A list of productions in the grammar.
    default_errors: A dict of states to default error codes to use when
      encountering an error in that state, when a more-specific Error for the
      state/terminal pair has not been set.
  """

  def __init__(self, item_sets, goto, action, expected, conflicts, terminals,
               nonterminals, productions):
    super(Parser, self).__init__()
    self.item_sets = item_sets
    self.goto = goto
    self.action = action
    self.expected = expected
    self.conflicts = conflicts
    self.terminals = terminals
    self.nonterminals = nonterminals
    self.productions = productions
    self.default_errors = {}

  def _parse(self, tokens):
    """_parse implements Shift-Reduce parsing algorithm.

    _parse implements the standard shift-reduce algorithm outlined on ALSU
    pp236-237.

    Arguments:
      tokens: the list of token objects to parse.

    Returns:
      A ParseResult.
    """
    # The END_OF_INPUT token is explicitly added to avoid explicit "cursor <
    # len(tokens)" checks.
    tokens = list(tokens) + [Symbol(END_OF_INPUT)]

    # Each element of stack is a parse state and a (possibly partial) parse
    # tree.  The state at the top of the stack encodes which productions are
    # "active" (that is, which ones the parser has seen partial input which
    # matches some prefix of the production, in a place where that production
    # might be valid), and, for each active production, how much of the
    # production has been completed.
    stack = [(0, None)]

    def state():
      return stack[-1][0]

    cursor = 0

    # On each iteration, look at the next symbol and the current state, and
    # perform the corresponding action.
    while True:
      if (state(), tokens[cursor].symbol) not in self.action:
        # Most state/symbol entries would be Errors, so rather than exhaustively
        # adding error entries, we just check here.
        if state() in self.default_errors:
          next_action = Error(self.default_errors[state()])
        else:
          next_action = Error(None)
      else:
        next_action = self.action[state(), tokens[cursor].symbol]

      if isinstance(next_action, Shift):
        # Shift means that there are no "complete" productions on the stack,
        # and so the current token should be shifted onto the stack, with a new
        # state indicating the new set of "active" productions.
        stack.append((next_action.state, tokens[cursor]))
        cursor += 1
      elif isinstance(next_action, Accept):
        # Accept means that parsing is over, successfully.
605*99e0aae7SDavid Rees assert len(stack) == 2, "Accepted incompletely-reduced input." 606*99e0aae7SDavid Rees assert tokens[cursor].symbol == END_OF_INPUT, ("Accepted parse before " 607*99e0aae7SDavid Rees "end of input.") 608*99e0aae7SDavid Rees return ParseResult(stack[-1][1], None) 609*99e0aae7SDavid Rees elif isinstance(next_action, Reduce): 610*99e0aae7SDavid Rees # Reduce means that there is a complete production on the stack, and 611*99e0aae7SDavid Rees # that the next symbol implies that the completed production is the 612*99e0aae7SDavid Rees # correct production. 613*99e0aae7SDavid Rees # 614*99e0aae7SDavid Rees # Per ALSU, we would simply pop an element off the state stack for each 615*99e0aae7SDavid Rees # symbol on the rhs of the production, and then push a new state by 616*99e0aae7SDavid Rees # looking up the (post-pop) current state and the lhs of the production 617*99e0aae7SDavid Rees # in GOTO. The GOTO table, in some sense, is equivalent to shift 618*99e0aae7SDavid Rees # actions for nonterminal symbols. 619*99e0aae7SDavid Rees # 620*99e0aae7SDavid Rees # Here, we attach a new partial parse tree, with the production lhs as 621*99e0aae7SDavid Rees # the "name" of the tree, and the popped trees as the "children" of the 622*99e0aae7SDavid Rees # new tree. 623*99e0aae7SDavid Rees children = [ 624*99e0aae7SDavid Rees item[1] for item in stack[len(stack) - len(next_action.rule.rhs):] 625*99e0aae7SDavid Rees ] 626*99e0aae7SDavid Rees # Attach source_location, if known. The source location will not be 627*99e0aae7SDavid Rees # known if the reduction consumes no symbols (empty rhs) or if the 628*99e0aae7SDavid Rees # client did not specify source_locations for tokens. 
629*99e0aae7SDavid Rees # 630*99e0aae7SDavid Rees # It is necessary to loop in order to handle cases like: 631*99e0aae7SDavid Rees # 632*99e0aae7SDavid Rees # C -> c D 633*99e0aae7SDavid Rees # D -> 634*99e0aae7SDavid Rees # 635*99e0aae7SDavid Rees # The D child of the C reduction will not have a source location 636*99e0aae7SDavid Rees # (because it is not produced from any source), so it is necessary to 637*99e0aae7SDavid Rees # scan backwards through C's children to find the end position. The 638*99e0aae7SDavid Rees # opposite is required in the case where initial children have no 639*99e0aae7SDavid Rees # source. 640*99e0aae7SDavid Rees # 641*99e0aae7SDavid Rees # These loops implicitly handle the case where the reduction has no 642*99e0aae7SDavid Rees # children, setting the source_location to None in that case. 643*99e0aae7SDavid Rees start_position = None 644*99e0aae7SDavid Rees end_position = None 645*99e0aae7SDavid Rees for child in children: 646*99e0aae7SDavid Rees if hasattr(child, 647*99e0aae7SDavid Rees "source_location") and child.source_location is not None: 648*99e0aae7SDavid Rees start_position = child.source_location.start 649*99e0aae7SDavid Rees break 650*99e0aae7SDavid Rees for child in reversed(children): 651*99e0aae7SDavid Rees if hasattr(child, 652*99e0aae7SDavid Rees "source_location") and child.source_location is not None: 653*99e0aae7SDavid Rees end_position = child.source_location.end 654*99e0aae7SDavid Rees break 655*99e0aae7SDavid Rees if start_position is None: 656*99e0aae7SDavid Rees source_location = None 657*99e0aae7SDavid Rees else: 658*99e0aae7SDavid Rees source_location = parser_types.make_location(start_position, 659*99e0aae7SDavid Rees end_position) 660*99e0aae7SDavid Rees reduction = Reduction(next_action.rule.lhs, children, next_action.rule, 661*99e0aae7SDavid Rees source_location) 662*99e0aae7SDavid Rees del stack[len(stack) - len(next_action.rule.rhs):] 663*99e0aae7SDavid Rees stack.append((self.goto[state(), 
next_action.rule.lhs], reduction)) 664*99e0aae7SDavid Rees elif isinstance(next_action, Error): 665*99e0aae7SDavid Rees # Error means that the parse is impossible. For typical grammars and 666*99e0aae7SDavid Rees # texts, this usually happens within a few tokens after the mistake in 667*99e0aae7SDavid Rees # the input stream, which is convenient (though imperfect) for error 668*99e0aae7SDavid Rees # reporting. 669*99e0aae7SDavid Rees return ParseResult(None, 670*99e0aae7SDavid Rees ParseError(next_action.code, cursor, tokens[cursor], 671*99e0aae7SDavid Rees state(), self.expected[state()])) 672*99e0aae7SDavid Rees else: 673*99e0aae7SDavid Rees assert False, "Shouldn't be here." 674*99e0aae7SDavid Rees 675*99e0aae7SDavid Rees def mark_error(self, tokens, error_token, error_code): 676*99e0aae7SDavid Rees """Marks an error state with the given error code. 677*99e0aae7SDavid Rees 678*99e0aae7SDavid Rees mark_error implements the equivalent of the "Merr" system presented in 679*99e0aae7SDavid Rees "Generating LR Syntax error Messages from Examples" (Jeffery, 2003). 680*99e0aae7SDavid Rees This system has limitations, but has the primary advantage that error 681*99e0aae7SDavid Rees messages can be specified by giving an example of the error and the 682*99e0aae7SDavid Rees message itself. 683*99e0aae7SDavid Rees 684*99e0aae7SDavid Rees Arguments: 685*99e0aae7SDavid Rees tokens: a list of tokens to parse. 686*99e0aae7SDavid Rees error_token: the token where the parse should fail, or None if the parse 687*99e0aae7SDavid Rees should fail at the implicit end-of-input token. 688*99e0aae7SDavid Rees 689*99e0aae7SDavid Rees If the error_token is the special ANY_TOKEN, then the error will be 690*99e0aae7SDavid Rees recorded as the default error for the error state. 691*99e0aae7SDavid Rees error_code: a value to record for the error state reached by parsing 692*99e0aae7SDavid Rees tokens. 
693*99e0aae7SDavid Rees 694*99e0aae7SDavid Rees Returns: 695*99e0aae7SDavid Rees None if error_code was successfully recorded, or an error message if there 696*99e0aae7SDavid Rees was a problem. 697*99e0aae7SDavid Rees """ 698*99e0aae7SDavid Rees result = self._parse(tokens) 699*99e0aae7SDavid Rees 700*99e0aae7SDavid Rees # There is no error state to mark on a successful parse. 701*99e0aae7SDavid Rees if not result.error: 702*99e0aae7SDavid Rees return "Input successfully parsed." 703*99e0aae7SDavid Rees 704*99e0aae7SDavid Rees # Check if the error occurred at the specified token; if not, then this was 705*99e0aae7SDavid Rees # not the expected error. 706*99e0aae7SDavid Rees if error_token is None: 707*99e0aae7SDavid Rees error_symbol = END_OF_INPUT 708*99e0aae7SDavid Rees if result.error.token.symbol != END_OF_INPUT: 709*99e0aae7SDavid Rees return "error occurred on {} token, not end of input.".format( 710*99e0aae7SDavid Rees result.error.token.symbol) 711*99e0aae7SDavid Rees else: 712*99e0aae7SDavid Rees error_symbol = error_token.symbol 713*99e0aae7SDavid Rees if result.error.token != error_token: 714*99e0aae7SDavid Rees return "error occurred on {} token, not {} token.".format( 715*99e0aae7SDavid Rees result.error.token.symbol, error_token.symbol) 716*99e0aae7SDavid Rees 717*99e0aae7SDavid Rees # If the expected error was found, attempt to mark it. It is acceptable if 718*99e0aae7SDavid Rees # the given error_code is already set as the error code for the given parse, 719*99e0aae7SDavid Rees # but not if a different code is set. 720*99e0aae7SDavid Rees if result.error.token == ANY_TOKEN: 721*99e0aae7SDavid Rees # For ANY_TOKEN, mark it as a default error. 
722*99e0aae7SDavid Rees if result.error.state in self.default_errors: 723*99e0aae7SDavid Rees if self.default_errors[result.error.state] == error_code: 724*99e0aae7SDavid Rees return None 725*99e0aae7SDavid Rees else: 726*99e0aae7SDavid Rees return ("Attempted to overwrite existing default error code {!r} " 727*99e0aae7SDavid Rees "with new error code {!r} for state {}".format( 728*99e0aae7SDavid Rees self.default_errors[result.error.state], error_code, 729*99e0aae7SDavid Rees result.error.state)) 730*99e0aae7SDavid Rees else: 731*99e0aae7SDavid Rees self.default_errors[result.error.state] = error_code 732*99e0aae7SDavid Rees return None 733*99e0aae7SDavid Rees else: 734*99e0aae7SDavid Rees if (result.error.state, error_symbol) in self.action: 735*99e0aae7SDavid Rees existing_error = self.action[result.error.state, error_symbol] 736*99e0aae7SDavid Rees assert isinstance(existing_error, Error), "Bug" 737*99e0aae7SDavid Rees if existing_error.code == error_code: 738*99e0aae7SDavid Rees return None 739*99e0aae7SDavid Rees else: 740*99e0aae7SDavid Rees return ("Attempted to overwrite existing error code {!r} with new " 741*99e0aae7SDavid Rees "error code {!r} for state {}, terminal {}".format( 742*99e0aae7SDavid Rees existing_error.code, error_code, result.error.state, 743*99e0aae7SDavid Rees error_symbol)) 744*99e0aae7SDavid Rees else: 745*99e0aae7SDavid Rees self.action[result.error.state, error_symbol] = Error(error_code) 746*99e0aae7SDavid Rees return None 747*99e0aae7SDavid Rees assert False, "All other paths should lead to return." 748*99e0aae7SDavid Rees 749*99e0aae7SDavid Rees def parse(self, tokens): 750*99e0aae7SDavid Rees """Parses a list of tokens. 751*99e0aae7SDavid Rees 752*99e0aae7SDavid Rees Arguments: 753*99e0aae7SDavid Rees tokens: a list of tokens to parse. 754*99e0aae7SDavid Rees 755*99e0aae7SDavid Rees Returns: 756*99e0aae7SDavid Rees A ParseResult. 
757*99e0aae7SDavid Rees """ 758*99e0aae7SDavid Rees result = self._parse(tokens) 759*99e0aae7SDavid Rees return result 760