# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization for the Emboss definition language.

This module exports the tokenize function and various errors.

In addition, a couple of lists are exported for the use of
generate_grammar_md.py:

LITERAL_TOKEN_PATTERNS: A list of literal strings which are matched against
  input.
REGEX_TOKEN_PATTERNS: A list of regexes used for tokenization.
  REGEX_TOKEN_PATTERNS[n].regex is a compiled regular expression
  (REGEX_TOKEN_PATTERNS[n].regex.pattern contains the text of the pattern), and
  REGEX_TOKEN_PATTERNS[n].symbol is the name of the symbol assigned to tokens
  which match the pattern.
"""

import collections
import re

from compiler.util import error
from compiler.util import parser_types


def tokenize(text, file_name):
  """Tokenizes its argument.

  Arguments:
    text: The raw text of a .emb file.
    file_name: The name of the file to use in errors.

  Returns:
    A tuple of:
      A list of parser_types.Token objects, or None if there were errors.
      A possibly-empty list of errors.
  """
  # TODO(bolms): suppress end-of-line, indent, and dedent tokens between
  # matched delimiters ([], (), and {}).
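  # Illustrative example (added; not in the original source): for the input
  # "struct Foo:\n", tokenize returns ([t1, t2, t3, t4], []), where the four
  # tokens have the symbols '"struct"', 'CamelWord', '":"', and '"\n"'.  No
  # Indent/Dedent tokens appear because no indented block is opened.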
  tokens = []
  indent_stack = [""]
  line_number = 0
  for line in text.splitlines():
    line_number += 1

    # _tokenize_line splits the actual text into tokens.
    line_tokens, errors = _tokenize_line(line, line_number, file_name)
    if errors:
      return None, errors

    # Lines with only whitespace and comments are not used for Indent/Dedent
    # calculation, though they still produce end-of-line tokens.
    for token in line_tokens:
      if token.symbol != "Comment":
        break
    else:
      tokens.extend(line_tokens)
      tokens.append(parser_types.Token(
          '"\\n"', "\n", parser_types.make_location(
              (line_number, len(line) + 1), (line_number, len(line) + 1))))
      continue

    # Leading whitespace is whatever .lstrip() removes.
    leading_whitespace = line[0:len(line) - len(line.lstrip())]
    if leading_whitespace == indent_stack[-1]:
      # If the current leading whitespace is equal to the last leading
      # whitespace, do not emit an Indent or Dedent token.
      pass
    elif leading_whitespace.startswith(indent_stack[-1]):
      # If the current leading whitespace is longer than the last leading
      # whitespace, emit an Indent token.  For the token text, take the new
      # part of the whitespace.
      tokens.append(
          parser_types.Token(
              "Indent", leading_whitespace[len(indent_stack[-1]):],
              parser_types.make_location(
                  (line_number, len(indent_stack[-1]) + 1),
                  (line_number, len(leading_whitespace) + 1))))
      indent_stack.append(leading_whitespace)
    else:
      # Otherwise, search for the unclosed indentation level that matches
      # the current indentation level.  Emit a Dedent token for each
      # newly-closed indentation level.
      for i in range(len(indent_stack) - 1, -1, -1):
        if leading_whitespace == indent_stack[i]:
          break
        tokens.append(
            parser_types.Token("Dedent", "", parser_types.make_location(
                (line_number, len(leading_whitespace) + 1),
                (line_number, len(leading_whitespace) + 1))))
        del indent_stack[i]
      else:
        return None, [[error.error(
            file_name, parser_types.make_location(
                (line_number, 1), (line_number, len(leading_whitespace) + 1)),
            "Bad indentation")]]
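    # Illustrative note (added; not in the original source): dedents must
    # return to an exact previously-open level.  With indent_stack
    # ["", "    "], a line indented by two spaces matches no entry, so the
    # loop above exhausts the stack and reports "Bad indentation".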

    tokens.extend(line_tokens)

    # Append an end-of-line token (for non-whitespace lines).
    tokens.append(parser_types.Token(
        '"\\n"', "\n", parser_types.make_location(
            (line_number, len(line) + 1), (line_number, len(line) + 1))))
  for _ in range(len(indent_stack) - 1):
    tokens.append(parser_types.Token("Dedent", "", parser_types.make_location(
        (line_number + 1, 1), (line_number + 1, 1))))
  return tokens, []
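
# Illustrative sketch (added; not in the original source): for the two-line
# input
#
#     if true:
#       let x = 1
#
# tokenize() emits, in order, tokens with the symbols '"if"',
# 'BooleanConstant', '":"', '"\n"', 'Indent', '"let"', 'SnakeWord', '"="',
# 'Number', '"\n"', and a final 'Dedent' for the block left open at
# end-of-input.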

# Token patterns used by _tokenize_line.
LITERAL_TOKEN_PATTERNS = (
    "[ ] ( ) : = + - * . ? == != && || < > <= >= , "
    "$static_size_in_bits $is_statically_sized "
    "$max $present $upper_bound $lower_bound $next "
    "$size_in_bits $size_in_bytes "
    "$max_size_in_bits $max_size_in_bytes $min_size_in_bits $min_size_in_bytes "
    "$default struct bits enum external import as if let").split()
_T = collections.namedtuple("T", ["regex", "symbol"])
REGEX_TOKEN_PATTERNS = [
    # Words starting with variations of "emboss reserved" are reserved for
    # internal use by the Emboss compiler.
    _T(re.compile(r"EmbossReserved[A-Za-z0-9]*"), "BadWord"),
    _T(re.compile(r"emboss_reserved[_a-z0-9]*"), "BadWord"),
    _T(re.compile(r"EMBOSS_RESERVED[_A-Z0-9]*"), "BadWord"),
    _T(re.compile(r'"(?:[^"\n\\]|\\[n\\"])*"'), "String"),
    _T(re.compile("[0-9]+"), "Number"),
    _T(re.compile("[0-9]{1,3}(?:_[0-9]{3})*"), "Number"),
    _T(re.compile("0x[0-9a-fA-F]+"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,4}(?:_[0-9a-fA-F]{4})*"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,8}(?:_[0-9a-fA-F]{8})*"), "Number"),
    _T(re.compile("0b[01]+"), "Number"),
    _T(re.compile("0b_?[01]{1,4}(?:_[01]{4})*"), "Number"),
    _T(re.compile("0b_?[01]{1,8}(?:_[01]{8})*"), "Number"),
    _T(re.compile("true|false"), "BooleanConstant"),
    _T(re.compile("[a-z][a-z_0-9]*"), "SnakeWord"),
    # Single-letter ShoutyWords (like "A") and single-letter-followed-by-number
    # ShoutyWords ("A100") are disallowed due to ambiguity with CamelWords.  A
    # ShoutyWord must start with an upper case letter and contain at least one
    # more upper case letter or '_'.
    _T(re.compile("[A-Z][A-Z_0-9]*[A-Z_][A-Z_0-9]*"), "ShoutyWord"),
    # A CamelWord starts with A-Z and contains at least one a-z, and no _.
    _T(re.compile("[A-Z][a-zA-Z0-9]*[a-z][a-zA-Z0-9]*"), "CamelWord"),
    _T(re.compile("-- .*"), "Documentation"),
    _T(re.compile("--$"), "Documentation"),
    _T(re.compile("--.*"), "BadDocumentation"),
    _T(re.compile(r"\s+"), None),
    _T(re.compile("#.*"), "Comment"),
    # BadWord and BadNumber are catch-alls for words and numbers, so that
    # something like "abcDef" doesn't tokenize to [SnakeWord, CamelWord].
    #
    # This is preferable to returning an error because the BadWord and
    # BadNumber token types can be used in example-based errors.
    _T(re.compile("[0-9][bxBX]?[0-9a-fA-F_]*"), "BadNumber"),
    _T(re.compile("[a-zA-Z_$0-9]+"), "BadWord"),
]
del _T
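
# Illustrative example (added; not in the original source): because
# _tokenize_line keeps the longest match, "0x1234_5678" matches the
# underscore-grouped hex pattern above in full (11 characters), while the
# plain "0x[0-9a-fA-F]+" pattern stops at "0x1234", so the grouped form is a
# Number.  A misgrouped "0x1234_567" is matched in full only by the BadNumber
# catch-all, so it tokenizes as BadNumber.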


def _tokenize_line(line, line_number, file_name):
  """Tokenizes a single line of input.

  Arguments:
    line: The line of text to tokenize.
    line_number: The line number (used when constructing token objects).
    file_name: The name of the file to use in errors.

  Returns:
    A tuple of:
      A list of token objects, or None if there were errors.
      A possibly-empty list of errors.
  """
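  # Illustrative example (added; not in the original source): for the line
  # "let x = 10  # size", this returns tokens with the symbols '"let"',
  # 'SnakeWord', '"="', 'Number', and 'Comment'; the runs of whitespace
  # match the \s+ pattern, whose symbol is None, so they produce no tokens.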
  tokens = []
  offset = 0
  while offset < len(line):
    best_candidate = ""
    best_candidate_symbol = None
    # Find the longest match.  Ties go to the first match.  This way, keywords
    # ("struct") are matched as themselves, but words that only happen to start
    # with keywords ("structure") are matched as words.
    #
    # There is never a reason to try to match a literal after a regex that
    # could also match that literal, so check literals first.
    for literal in LITERAL_TOKEN_PATTERNS:
      if (line[offset:].startswith(literal) and
          len(literal) > len(best_candidate)):
        best_candidate = literal
        # For Emboss, the name of a literal token is just the literal in quotes,
        # so that the grammar can read a little more naturally, e.g.:
        #
        #     expression -> expression "+" expression
        #
        # instead of
        #
        #     expression -> expression Plus expression
        best_candidate_symbol = '"' + literal + '"'
    for pattern in REGEX_TOKEN_PATTERNS:
      match_result = pattern.regex.match(line[offset:])
      if match_result and len(match_result.group(0)) > len(best_candidate):
        best_candidate = match_result.group(0)
        best_candidate_symbol = pattern.symbol
    if not best_candidate:
      return None, [[error.error(
          file_name, parser_types.make_location(
              (line_number, offset + 1), (line_number, offset + 2)),
          "Unrecognized token")]]
    if best_candidate_symbol:
      tokens.append(parser_types.Token(
          best_candidate_symbol, best_candidate, parser_types.make_location(
              (line_number, offset + 1),
              (line_number, offset + len(best_candidate) + 1))))
    offset += len(best_candidate)
  return tokens, []
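

# A minimal manual smoke test (an added sketch, not part of the original
# module); it assumes this file is executed in an environment where the
# compiler.util package is importable, e.g.:
#
#     python -m compiler.front_end.tokenizer < some_file.emb
if __name__ == "__main__":
  import sys

  emb_tokens, emb_errors = tokenize(sys.stdin.read(), "<stdin>")
  if emb_errors:
    # Errors are printed as raw structures; this is only a debugging aid.
    print(emb_errors)
  else:
    for emb_token in emb_tokens:
      print(emb_token)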