# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization for the Emboss definition language.

This module exports the tokenize function and various errors.

In addition, two lists are exported for the use of generate_grammar_md.py:

LITERAL_TOKEN_PATTERNS: A list of literal strings which are matched against
  input.
REGEX_TOKEN_PATTERNS: A list of regexes used for tokenization.
  REGEX_TOKEN_PATTERNS[n].regex is a compiled regular expression object
  (REGEX_TOKEN_PATTERNS[n].regex.pattern contains the text of the pattern),
  and REGEX_TOKEN_PATTERNS[n].symbol is the name of the symbol assigned to
  tokens which match the pattern.
"""

import collections
import re

from compiler.util import error
from compiler.util import parser_types


def tokenize(text, file_name):
  # TODO(bolms): suppress end-of-line, indent, and dedent tokens between
  # matched delimiters ([], (), and {}).
  """Tokenizes its argument.

  Arguments:
    text: The raw text of a .emb file.
    file_name: The name of the file to use in errors.

  Returns:
    A tuple of:
      a list of parser_types.Tokens, or None if there were errors, and
      a possibly-empty list of errors.
  """
  tokens = []
  indent_stack = [""]
  line_number = 0
  for line in text.splitlines():
    line_number += 1

    # _tokenize_line splits the actual text into tokens.
    line_tokens, errors = _tokenize_line(line, line_number, file_name)
    if errors:
      return None, errors

    # Lines with only whitespace and comments are not used for Indent/Dedent
    # calculation: their tokens and an end-of-line token are emitted, but the
    # indentation stack is left untouched.
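    # For example (illustrative): given the line "  # note", _tokenize_line
    # returns only a Comment token, so the loop below falls through to its
    # else clause, emits the Comment and end-of-line tokens, and skips the
    # indentation handling further down.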
    for token in line_tokens:
      if token.symbol != "Comment":
        break
    else:
      tokens.extend(line_tokens)
      tokens.append(parser_types.Token(
          '"\\n"', "\n", parser_types.make_location(
              (line_number, len(line) + 1), (line_number, len(line) + 1))))
      continue

    # Leading whitespace is whatever .lstrip() removes.
    leading_whitespace = line[0:len(line) - len(line.lstrip())]
    if leading_whitespace == indent_stack[-1]:
      # If the current leading whitespace is equal to the last leading
      # whitespace, do not emit an Indent or Dedent token.
      pass
    elif leading_whitespace.startswith(indent_stack[-1]):
      # If the current leading whitespace extends the last leading whitespace,
      # emit an Indent token whose text is the new portion of the whitespace.
      tokens.append(
          parser_types.Token(
              "Indent", leading_whitespace[len(indent_stack[-1]):],
              parser_types.make_location(
                  (line_number, len(indent_stack[-1]) + 1),
                  (line_number, len(leading_whitespace) + 1))))
      indent_stack.append(leading_whitespace)
    else:
      # Otherwise, search for the unclosed indentation level that matches the
      # current indentation level, emitting a Dedent token for each
      # newly-closed level along the way.
      for i in range(len(indent_stack) - 1, -1, -1):
        if leading_whitespace == indent_stack[i]:
          break
        tokens.append(
            parser_types.Token("Dedent", "", parser_types.make_location(
                (line_number, len(leading_whitespace) + 1),
                (line_number, len(leading_whitespace) + 1))))
        del indent_stack[i]
      else:
        # No matching indentation level was found.
        return None, [[error.error(
            file_name, parser_types.make_location(
                (line_number, 1), (line_number, len(leading_whitespace) + 1)),
            "Bad indentation")]]

    tokens.extend(line_tokens)

    # Append an end-of-line token (for non-whitespace lines).
    tokens.append(parser_types.Token(
        '"\\n"', "\n", parser_types.make_location(
            (line_number, len(line) + 1), (line_number, len(line) + 1))))
  # Close any indentation levels still open at end-of-input.
  for i in range(len(indent_stack) - 1):
    tokens.append(parser_types.Token("Dedent", "", parser_types.make_location(
        (line_number + 1, 1), (line_number + 1, 1))))
  return tokens, []
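
# Illustrative token stream (a sketch, with source locations elided): for the
# input
#
#   struct Foo:
#     x: UInt8
#
# tokenize produces roughly
#
#   "struct" CamelWord ":" "\n" Indent SnakeWord ":" CamelWord "\n" Dedent
#
# where the final Dedent closes the indentation level at end-of-input.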

# Token patterns used by _tokenize_line.
LITERAL_TOKEN_PATTERNS = (
    "[ ] ( ) : = + - * . ? == != && || < > <= >= , "
    "$static_size_in_bits $is_statically_sized "
    "$max $present $upper_bound $lower_bound $next "
    "$size_in_bits $size_in_bytes "
    "$max_size_in_bits $max_size_in_bytes $min_size_in_bits $min_size_in_bytes "
    "$default struct bits enum external import as if let").split()
_T = collections.namedtuple("T", ["regex", "symbol"])
REGEX_TOKEN_PATTERNS = [
    # Words starting with variations of "emboss reserved" are reserved for
    # internal use by the Emboss compiler.
    _T(re.compile(r"EmbossReserved[A-Za-z0-9]*"), "BadWord"),
    _T(re.compile(r"emboss_reserved[_a-z0-9]*"), "BadWord"),
    _T(re.compile(r"EMBOSS_RESERVED[_A-Z0-9]*"), "BadWord"),
    _T(re.compile(r'"(?:[^"\n\\]|\\[n\\"])*"'), "String"),
    _T(re.compile("[0-9]+"), "Number"),
    _T(re.compile("[0-9]{1,3}(?:_[0-9]{3})*"), "Number"),
    _T(re.compile("0x[0-9a-fA-F]+"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,4}(?:_[0-9a-fA-F]{4})*"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,8}(?:_[0-9a-fA-F]{8})*"), "Number"),
    _T(re.compile("0b[01]+"), "Number"),
    _T(re.compile("0b_?[01]{1,4}(?:_[01]{4})*"), "Number"),
    _T(re.compile("0b_?[01]{1,8}(?:_[01]{8})*"), "Number"),
    _T(re.compile("true|false"), "BooleanConstant"),
    _T(re.compile("[a-z][a-z_0-9]*"), "SnakeWord"),
    # Single-letter ShoutyWords (like "A") and single-letter-followed-by-number
    # ShoutyWords ("A100") are disallowed due to ambiguity with CamelWords.  A
    # ShoutyWord must start with an upper case letter and contain at least one
    # more upper case letter or '_'.
    _T(re.compile("[A-Z][A-Z_0-9]*[A-Z_][A-Z_0-9]*"), "ShoutyWord"),
    # A CamelWord starts with A-Z and contains at least one a-z, and no '_'.
    _T(re.compile("[A-Z][a-zA-Z0-9]*[a-z][a-zA-Z0-9]*"), "CamelWord"),
    _T(re.compile("-- .*"), "Documentation"),
    _T(re.compile("--$"), "Documentation"),
    _T(re.compile("--.*"), "BadDocumentation"),
    _T(re.compile(r"\s+"), None),
    _T(re.compile("#.*"), "Comment"),
    # BadWord and BadNumber are catch-alls for words and numbers, so that
    # something like "abcDef" doesn't tokenize to [SnakeWord, CamelWord].
    #
    # This is preferable to returning an error because the BadWord and
    # BadNumber token types can be used in example-based errors.
    _T(re.compile("[0-9][bxBX]?[0-9a-fA-F_]*"), "BadNumber"),
    _T(re.compile("[a-zA-Z_$0-9]+"), "BadWord"),
]
del _T
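
# A few illustrative matches (not an exhaustive list): "0x1234_5678" and
# "1_000_000" match the grouped Number patterns; "0x1234_567" matches only
# the catch-all BadNumber pattern, because its digit groups are unevenly
# sized; and "abcDef" matches only BadWord, since it is neither a valid
# SnakeWord nor a valid CamelWord.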


def _tokenize_line(line, line_number, file_name):
  """Tokenizes a single line of input.

  Arguments:
    line: The line of text to tokenize.
    line_number: The line number (used when constructing token objects).
    file_name: The name of a file to use in errors.

  Returns:
    A tuple of:
      a list of token objects, or None if there were errors, and
      a possibly-empty list of errors.
  """
  tokens = []
  offset = 0
  while offset < len(line):
    best_candidate = ""
    best_candidate_symbol = None
    # Find the longest match.  Ties go to the first match.  This way, keywords
    # ("struct") are matched as themselves, but words that only happen to
    # start with keywords ("structure") are matched as words.
    #
    # There is never a reason to try to match a literal after a regex that
    # could also match that literal, so check literals first.
    for literal in LITERAL_TOKEN_PATTERNS:
      if line[offset:].startswith(literal) and len(literal) > len(
          best_candidate):
        best_candidate = literal
        # For Emboss, the name of a literal token is just the literal in
        # quotes, so that the grammar can read a little more naturally, e.g.:
        #
        #     expression -> expression "+" expression
        #
        # instead of
        #
        #     expression -> expression Plus expression
        best_candidate_symbol = '"' + literal + '"'
    for pattern in REGEX_TOKEN_PATTERNS:
      match_result = pattern.regex.match(line[offset:])
      if match_result and len(match_result.group(0)) > len(best_candidate):
        best_candidate = match_result.group(0)
        best_candidate_symbol = pattern.symbol
    if not best_candidate:
      return None, [[error.error(
          file_name, parser_types.make_location(
              (line_number, offset + 1), (line_number, offset + 2)),
          "Unrecognized token")]]
    if best_candidate_symbol:
      tokens.append(parser_types.Token(
          best_candidate_symbol, best_candidate, parser_types.make_location(
              (line_number, offset + 1),
              (line_number, offset + len(best_candidate) + 1))))
    offset += len(best_candidate)
  return tokens, []
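

# A minimal smoke-test sketch (illustrative only; "demo.emb" is a
# hypothetical file name used solely in error messages, and the snippet below
# is not drawn from any real .emb file).  Running this module directly prints
# the space-separated symbols of the tokens produced for a tiny input:
if __name__ == "__main__":
  demo_text = "struct Foo:\n  # A comment line.\n  x_field: UInt8\n"
  demo_tokens, demo_errors = tokenize(demo_text, "demo.emb")
  if demo_errors:
    print("errors:", demo_errors)
  else:
    print(" ".join(token.symbol for token in demo_tokens))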