xref: /aosp_15_r20/external/protobuf/python/google/protobuf/text_format.py (revision 1b3f573f81763fcece89efc2b6a5209149e44ab8)
1# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc.  All rights reserved.
3# https://developers.google.com/protocol-buffers/
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9#     * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#     * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15#     * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""Contains routines for printing protocol messages in text format.
32
33Simple usage example::
34
35  # Create a proto object and serialize it to a text proto string.
36  message = my_proto_pb2.MyMessage(foo='bar')
37  text_proto = text_format.MessageToString(message)
38
39  # Parse a text proto string.
40  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
41"""
42
43__author__ = '[email protected] (Kenton Varda)'
44
45# TODO(b/129989314) Import thread contention leads to test failures.
46import encodings.raw_unicode_escape  # pylint: disable=unused-import
47import encodings.unicode_escape  # pylint: disable=unused-import
48import io
49import math
50import re
51
52from google.protobuf.internal import decoder
53from google.protobuf.internal import type_checkers
54from google.protobuf import descriptor
55from google.protobuf import text_encoding
56from google.protobuf import unknown_fields
57
58# pylint: disable=g-import-not-at-top
59__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
60           'PrintFieldValue', 'Merge', 'MessageToBytes']
61
62_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
63                     type_checkers.Int32ValueChecker(),
64                     type_checkers.Uint64ValueChecker(),
65                     type_checkers.Int64ValueChecker())
66_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
67_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
68_QUOTES = frozenset(("'", '"'))
69_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
70
71
72class Error(Exception):
73  """Top-level module error for text_format."""
74
75
76class ParseError(Error):
77  """Thrown in case of text parsing or tokenizing error."""
78
79  def __init__(self, message=None, line=None, column=None):
80    if message is not None and line is not None:
81      loc = str(line)
82      if column is not None:
83        loc += ':{0}'.format(column)
84      message = '{0} : {1}'.format(loc, message)
85    if message is not None:
86      super(ParseError, self).__init__(message)
87    else:
88      super(ParseError, self).__init__()
89    self._line = line
90    self._column = column
91
92  def GetLine(self):
93    return self._line
94
95  def GetColumn(self):
96    return self._column
97
98
99class TextWriter(object):
100
101  def __init__(self, as_utf8):
102    self._writer = io.StringIO()
103
104  def write(self, val):
105    return self._writer.write(val)
106
107  def close(self):
108    return self._writer.close()
109
110  def getvalue(self):
111    return self._writer.getvalue()
112
113
114def MessageToString(
115    message,
116    as_utf8=False,
117    as_one_line=False,
118    use_short_repeated_primitives=False,
119    pointy_brackets=False,
120    use_index_order=False,
121    float_format=None,
122    double_format=None,
123    use_field_number=False,
124    descriptor_pool=None,
125    indent=0,
126    message_formatter=None,
127    print_unknown_fields=False,
128    force_colon=False):
129  # type: (...) -> str
130  """Convert protobuf message to text format.
131
132  Double values can be formatted compactly with 15 digits of
133  precision (which is the most that IEEE 754 "double" can guarantee)
134  using double_format='.15g'. To ensure that converting to text and back to a
135  proto will result in an identical value, double_format='.17g' should be used.
136
137  Args:
138    message: The protocol buffers message.
139    as_utf8: Return unescaped Unicode for non-ASCII characters.
140    as_one_line: Don't introduce newlines between fields.
141    use_short_repeated_primitives: Use short repeated format for primitives.
142    pointy_brackets: If True, use angle brackets instead of curly braces for
143      nesting.
144    use_index_order: If True, fields of a proto message will be printed using
145      the order defined in source code instead of the field number, extensions
146      will be printed at the end of the message and their relative order is
147      determined by the extension number. By default, use the field number
148      order.
149    float_format (str): If set, use this to specify float field formatting
150      (per the "Format Specification Mini-Language"); otherwise, the shortest
151      float that has the same value on the wire will be printed. This also
152      affects double fields if double_format is not set but float_format is.
153    double_format (str): If set, use this to specify double field formatting
154      (per the "Format Specification Mini-Language"); if it is not set but
155      float_format is set, use float_format. Otherwise, use ``str()``.
156    use_field_number: If True, print field numbers instead of names.
157    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
158    indent (int): The initial indent level, in terms of spaces, for pretty
159      print.
160    message_formatter (function(message, indent, as_one_line) -> unicode|None):
161      Custom formatter for selected sub-messages (usually based on message
162      type). Use to pretty print parts of the protobuf for easier diffing.
163    print_unknown_fields: If True, unknown fields will be printed.
164    force_colon: If set, a colon will be added after the field name even if the
165      field is a proto message.
166
167  Returns:
168    str: A string of the text formatted protocol buffer message.
169  """
170  out = TextWriter(as_utf8)
171  printer = _Printer(
172      out,
173      indent,
174      as_utf8,
175      as_one_line,
176      use_short_repeated_primitives,
177      pointy_brackets,
178      use_index_order,
179      float_format,
180      double_format,
181      use_field_number,
182      descriptor_pool,
183      message_formatter,
184      print_unknown_fields=print_unknown_fields,
185      force_colon=force_colon)
186  printer.PrintMessage(message)
187  result = out.getvalue()
188  out.close()
189  if as_one_line:
190    return result.rstrip()
191  return result
192
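# A minimal usage sketch for the options above, assuming a hypothetical
# my_test_pb2 module whose MyMessage type has a string field `foo` (number 1)
# and an int32 field `number` (number 2):
#
#   from google.protobuf import text_format
#   msg = my_test_pb2.MyMessage(foo='bar', number=3)
#   text_format.MessageToString(msg)
#   # -> 'foo: "bar"\nnumber: 3\n'
#   text_format.MessageToString(msg, as_one_line=True)
#   # -> 'foo: "bar" number: 3'
#   text_format.MessageToString(msg, indent=2)
#   # -> '  foo: "bar"\n  number: 3\n'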
193
194def MessageToBytes(message, **kwargs):
195  # type: (...) -> bytes
196  """Convert protobuf message to encoded text format.  See MessageToString."""
197  text = MessageToString(message, **kwargs)
198  if isinstance(text, bytes):
199    return text
200  codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
201  return text.encode(codec)
202
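# A small sketch of the byte encoding, reusing the hypothetical MyMessage from
# the comment above:
#
#   text_format.MessageToBytes(my_test_pb2.MyMessage(foo='bar'))
#   # -> b'foo: "bar"\n'  (ASCII; non-ASCII characters are escaped)
#   text_format.MessageToBytes(my_test_pb2.MyMessage(foo='ü'), as_utf8=True)
#   # -> 'foo: "ü"\n'.encode('utf-8')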
203
204def _IsMapEntry(field):
205  return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
206          field.message_type.has_options and
207          field.message_type.GetOptions().map_entry)
208
209
210def PrintMessage(message,
211                 out,
212                 indent=0,
213                 as_utf8=False,
214                 as_one_line=False,
215                 use_short_repeated_primitives=False,
216                 pointy_brackets=False,
217                 use_index_order=False,
218                 float_format=None,
219                 double_format=None,
220                 use_field_number=False,
221                 descriptor_pool=None,
222                 message_formatter=None,
223                 print_unknown_fields=False,
224                 force_colon=False):
225  """Convert the message to text format and write it to the out stream.
226
227  Args:
228    message: The Message object to convert to text format.
229    out: A file handle to write the message to.
230    indent: The initial indent level for pretty print.
231    as_utf8: Return unescaped Unicode for non-ASCII characters.
232    as_one_line: Don't introduce newlines between fields.
233    use_short_repeated_primitives: Use short repeated format for primitives.
234    pointy_brackets: If True, use angle brackets instead of curly braces for
235      nesting.
236    use_index_order: If True, print fields of a proto message using the order
237      defined in source code instead of the field number. By default, use the
238      field number order.
239    float_format: If set, use this to specify float field formatting
240      (per the "Format Specification Mini-Language"); otherwise, the shortest
241      float that has the same value on the wire will be printed. This also
242      affects double fields if double_format is not set but float_format is.
243    double_format: If set, use this to specify double field formatting
244      (per the "Format Specification Mini-Language"); if it is not set but
245      float_format is set, use float_format. Otherwise, str() is used.
246    use_field_number: If True, print field numbers instead of names.
247    descriptor_pool: A DescriptorPool used to resolve Any types.
248    message_formatter: A function(message, indent, as_one_line): unicode|None
249      to custom format selected sub-messages (usually based on message type).
250      Use to pretty print parts of the protobuf for easier diffing.
251    print_unknown_fields: If True, unknown fields will be printed.
252    force_colon: If set, a colon will be added after the field name even if
253      the field is a proto message.
254  """
255  printer = _Printer(
256      out=out, indent=indent, as_utf8=as_utf8,
257      as_one_line=as_one_line,
258      use_short_repeated_primitives=use_short_repeated_primitives,
259      pointy_brackets=pointy_brackets,
260      use_index_order=use_index_order,
261      float_format=float_format,
262      double_format=double_format,
263      use_field_number=use_field_number,
264      descriptor_pool=descriptor_pool,
265      message_formatter=message_formatter,
266      print_unknown_fields=print_unknown_fields,
267      force_colon=force_colon)
268  printer.PrintMessage(message)
269
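# For example, to render a message into an in-memory text buffer (again using
# the hypothetical MyMessage from above):
#
#   import io
#   buf = io.StringIO()
#   text_format.PrintMessage(my_test_pb2.MyMessage(foo='bar'), buf, indent=2)
#   buf.getvalue()
#   # -> '  foo: "bar"\n'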
270
271def PrintField(field,
272               value,
273               out,
274               indent=0,
275               as_utf8=False,
276               as_one_line=False,
277               use_short_repeated_primitives=False,
278               pointy_brackets=False,
279               use_index_order=False,
280               float_format=None,
281               double_format=None,
282               message_formatter=None,
283               print_unknown_fields=False,
284               force_colon=False):
285  """Print a single field name/value pair."""
286  printer = _Printer(out, indent, as_utf8, as_one_line,
287                     use_short_repeated_primitives, pointy_brackets,
288                     use_index_order, float_format, double_format,
289                     message_formatter=message_formatter,
290                     print_unknown_fields=print_unknown_fields,
291                     force_colon=force_colon)
292  printer.PrintField(field, value)
293
294
295def PrintFieldValue(field,
296                    value,
297                    out,
298                    indent=0,
299                    as_utf8=False,
300                    as_one_line=False,
301                    use_short_repeated_primitives=False,
302                    pointy_brackets=False,
303                    use_index_order=False,
304                    float_format=None,
305                    double_format=None,
306                    message_formatter=None,
307                    print_unknown_fields=False,
308                    force_colon=False):
309  """Print a single field value (not including name)."""
310  printer = _Printer(out, indent, as_utf8, as_one_line,
311                     use_short_repeated_primitives, pointy_brackets,
312                     use_index_order, float_format, double_format,
313                     message_formatter=message_formatter,
314                     print_unknown_fields=print_unknown_fields,
315                     force_colon=force_colon)
316  printer.PrintFieldValue(field, value)
317
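# A short sketch of printing one field at a time with the helpers above, still
# assuming the hypothetical MyMessage:
#
#   msg = my_test_pb2.MyMessage(foo='bar')
#   field = msg.DESCRIPTOR.fields_by_name['foo']
#   buf = io.StringIO()
#   text_format.PrintField(field, msg.foo, buf)       # writes 'foo: "bar"\n'
#   text_format.PrintFieldValue(field, msg.foo, buf)  # writes '"bar"'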
318
319def _BuildMessageFromTypeName(type_name, descriptor_pool):
320  """Returns a protobuf message instance.
321
322  Args:
323    type_name: Fully-qualified protobuf message type name string.
324    descriptor_pool: DescriptorPool instance.
325
326  Returns:
327    A Message instance of type matching type_name, or None if a Descriptor
328    matching type_name wasn't found.
329  """
330  # pylint: disable=g-import-not-at-top
331  if descriptor_pool is None:
332    from google.protobuf import descriptor_pool as pool_mod
333    descriptor_pool = pool_mod.Default()
334  from google.protobuf import symbol_database
335  database = symbol_database.Default()
336  try:
337    message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
338  except KeyError:
339    return None
340  message_type = database.GetPrototype(message_descriptor)
341  return message_type()
342
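# Sketch: once a generated _pb2 module (e.g. timestamp_pb2) has been imported,
# its types are registered in the default pool and can be instantiated by name:
#
#   _BuildMessageFromTypeName('google.protobuf.Timestamp', None)
#   # -> a new Timestamp instance, or None if the name is not in the pool.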
343
344# These values must match WireType enum in google/protobuf/wire_format.h.
345WIRETYPE_LENGTH_DELIMITED = 2
346WIRETYPE_START_GROUP = 3
347
348
349class _Printer(object):
350  """Text format printer for protocol message."""
351
352  def __init__(
353      self,
354      out,
355      indent=0,
356      as_utf8=False,
357      as_one_line=False,
358      use_short_repeated_primitives=False,
359      pointy_brackets=False,
360      use_index_order=False,
361      float_format=None,
362      double_format=None,
363      use_field_number=False,
364      descriptor_pool=None,
365      message_formatter=None,
366      print_unknown_fields=False,
367      force_colon=False):
368    """Initialize the Printer.
369
370    Double values can be formatted compactly with 15 digits of precision
371    (which is the most that IEEE 754 "double" can guarantee) using
372    double_format='.15g'. To ensure that converting to text and back to a proto
373    will result in an identical value, double_format='.17g' should be used.
374
375    Args:
376      out: To record the text format result.
377      indent: The initial indent level for pretty print.
378      as_utf8: Return unescaped Unicode for non-ASCII characters.
379      as_one_line: Don't introduce newlines between fields.
380      use_short_repeated_primitives: Use short repeated format for primitives.
381      pointy_brackets: If True, use angle brackets instead of curly braces for
382        nesting.
383      use_index_order: If True, print fields of a proto message using the order
384        defined in source code instead of the field number. By default, use the
385        field number order.
386      float_format: If set, use this to specify float field formatting
387        (per the "Format Specification Mini-Language"); otherwise, the shortest
388        float that has the same value on the wire will be printed. This also
389        affects double fields if double_format is not set but float_format is.
390      double_format: If set, use this to specify double field formatting
391        (per the "Format Specification Mini-Language"); if it is not set but
392        float_format is set, use float_format. Otherwise, str() is used.
393      use_field_number: If True, print field numbers instead of names.
394      descriptor_pool: A DescriptorPool used to resolve Any types.
395      message_formatter: A function(message, indent, as_one_line): unicode|None
396        to custom format selected sub-messages (usually based on message type).
397        Use to pretty print parts of the protobuf for easier diffing.
398      print_unknown_fields: If True, unknown fields will be printed.
399      force_colon: If set, a colon will be added after the field name even if
400        the field is a proto message.
401    """
402    self.out = out
403    self.indent = indent
404    self.as_utf8 = as_utf8
405    self.as_one_line = as_one_line
406    self.use_short_repeated_primitives = use_short_repeated_primitives
407    self.pointy_brackets = pointy_brackets
408    self.use_index_order = use_index_order
409    self.float_format = float_format
410    if double_format is not None:
411      self.double_format = double_format
412    else:
413      self.double_format = float_format
414    self.use_field_number = use_field_number
415    self.descriptor_pool = descriptor_pool
416    self.message_formatter = message_formatter
417    self.print_unknown_fields = print_unknown_fields
418    self.force_colon = force_colon
419
420  def _TryPrintAsAnyMessage(self, message):
421    """Prints the expanded form if message is a google.protobuf.Any."""
422    if '/' not in message.type_url:
423      return False
424    packed_message = _BuildMessageFromTypeName(message.TypeName(),
425                                               self.descriptor_pool)
426    if packed_message:
427      packed_message.MergeFromString(message.value)
428      colon = ':' if self.force_colon else ''
429      self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
430      self._PrintMessageFieldValue(packed_message)
431      self.out.write(' ' if self.as_one_line else '\n')
432      return True
433    else:
434      return False
435
436  def _TryCustomFormatMessage(self, message):
437    formatted = self.message_formatter(message, self.indent, self.as_one_line)
438    if formatted is None:
439      return False
440
441    out = self.out
442    out.write(' ' * self.indent)
443    out.write(formatted)
444    out.write(' ' if self.as_one_line else '\n')
445    return True
446
447  def PrintMessage(self, message):
448    """Convert protobuf message to text format.
449
450    Args:
451      message: The protocol buffers message.
452    """
453    if self.message_formatter and self._TryCustomFormatMessage(message):
454      return
455    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
456        self._TryPrintAsAnyMessage(message)):
457      return
458    fields = message.ListFields()
459    if self.use_index_order:
460      fields.sort(
461          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
462    for field, value in fields:
463      if _IsMapEntry(field):
464        for key in sorted(value):
465          # This is slow for maps with submessage entries because it copies the
466          # entire tree.  Unfortunately this would take significant refactoring
467          # of this file to work around.
468          #
469          # TODO(haberman): refactor and optimize if this becomes an issue.
470          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
471          self.PrintField(field, entry_submsg)
472      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
473        if (self.use_short_repeated_primitives
474            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
475            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
476          self._PrintShortRepeatedPrimitivesValue(field, value)
477        else:
478          for element in value:
479            self.PrintField(field, element)
480      else:
481        self.PrintField(field, value)
482
483    if self.print_unknown_fields:
484      self._PrintUnknownFields(unknown_fields.UnknownFieldSet(message))
485
486  def _PrintUnknownFields(self, unknown_field_set):
487    """Print unknown fields."""
488    out = self.out
489    for field in unknown_field_set:
490      out.write(' ' * self.indent)
491      out.write(str(field.field_number))
492      if field.wire_type == WIRETYPE_START_GROUP:
493        if self.as_one_line:
494          out.write(' { ')
495        else:
496          out.write(' {\n')
497          self.indent += 2
498
499        self._PrintUnknownFields(field.data)
500
501        if self.as_one_line:
502          out.write('} ')
503        else:
504          self.indent -= 2
505          out.write(' ' * self.indent + '}\n')
506      elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
507        try:
508          # If this field is parseable as a Message, it is probably
509          # an embedded message.
510          # pylint: disable=protected-access
511          (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
512              memoryview(field.data), 0, len(field.data))
513        except Exception:    # pylint: disable=broad-except
514          pos = 0
515
516        if pos == len(field.data):
517          if self.as_one_line:
518            out.write(' { ')
519          else:
520            out.write(' {\n')
521            self.indent += 2
522
523          self._PrintUnknownFields(embedded_unknown_message)
524
525          if self.as_one_line:
526            out.write('} ')
527          else:
528            self.indent -= 2
529            out.write(' ' * self.indent + '}\n')
530        else:
531          # A string or bytes field. self.as_utf8 may not work.
532          out.write(': \"')
533          out.write(text_encoding.CEscape(field.data, False))
534          out.write('\" ' if self.as_one_line else '\"\n')
535      else:
536        # varint, fixed32, fixed64
537        out.write(': ')
538        out.write(str(field.data))
539        out.write(' ' if self.as_one_line else '\n')
540
541  def _PrintFieldName(self, field):
542    """Print field name."""
543    out = self.out
544    out.write(' ' * self.indent)
545    if self.use_field_number:
546      out.write(str(field.number))
547    else:
548      if field.is_extension:
549        out.write('[')
550        if (field.containing_type.GetOptions().message_set_wire_format and
551            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
552            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
553          out.write(field.message_type.full_name)
554        else:
555          out.write(field.full_name)
556        out.write(']')
557      elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
558        # For groups, use the capitalized name.
559        out.write(field.message_type.name)
560      else:
561        out.write(field.name)
562
563    if (self.force_colon or
564        field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
565      # The colon is optional in this case, but our cross-language golden files
566      # don't include it. Here, the colon is only included if force_colon is
567      # set to True.
568      out.write(':')
569
570  def PrintField(self, field, value):
571    """Print a single field name/value pair."""
572    self._PrintFieldName(field)
573    self.out.write(' ')
574    self.PrintFieldValue(field, value)
575    self.out.write(' ' if self.as_one_line else '\n')
576
577  def _PrintShortRepeatedPrimitivesValue(self, field, value):
578    """Prints short repeated primitives value."""
579    # Note: this is called only when value has at least one element.
580    self._PrintFieldName(field)
581    self.out.write(' [')
582    for i in range(len(value) - 1):
583      self.PrintFieldValue(field, value[i])
584      self.out.write(', ')
585    self.PrintFieldValue(field, value[-1])
586    self.out.write(']')
587    self.out.write(' ' if self.as_one_line else '\n')
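
  # For example, with use_short_repeated_primitives=True a repeated int32
  # field `values` containing 1, 2, 3 is rendered as
  #   values: [1, 2, 3]
  # instead of three separate `values: N` lines (`values` is a hypothetical
  # field name).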
588
589  def _PrintMessageFieldValue(self, value):
590    if self.pointy_brackets:
591      openb = '<'
592      closeb = '>'
593    else:
594      openb = '{'
595      closeb = '}'
596
597    if self.as_one_line:
598      self.out.write('%s ' % openb)
599      self.PrintMessage(value)
600      self.out.write(closeb)
601    else:
602      self.out.write('%s\n' % openb)
603      self.indent += 2
604      self.PrintMessage(value)
605      self.indent -= 2
606      self.out.write(' ' * self.indent + closeb)
607
608  def PrintFieldValue(self, field, value):
609    """Print a single field value (not including name).
610
611    For repeated fields, the value should be a single element.
612
613    Args:
614      field: The descriptor of the field to be printed.
615      value: The value of the field.
616    """
617    out = self.out
618    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
619      self._PrintMessageFieldValue(value)
620    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
621      enum_value = field.enum_type.values_by_number.get(value, None)
622      if enum_value is not None:
623        out.write(enum_value.name)
624      else:
625        out.write(str(value))
626    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
627      out.write('\"')
628      if isinstance(value, str) and not self.as_utf8:
629        out_value = value.encode('utf-8')
630      else:
631        out_value = value
632      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
633        # We always need to escape all binary data in TYPE_BYTES fields.
634        out_as_utf8 = False
635      else:
636        out_as_utf8 = self.as_utf8
637      out.write(text_encoding.CEscape(out_value, out_as_utf8))
638      out.write('\"')
639    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
640      if value:
641        out.write('true')
642      else:
643        out.write('false')
644    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
645      if self.float_format is not None:
646        out.write('{1:{0}}'.format(self.float_format, value))
647      else:
648        if math.isnan(value):
649          out.write(str(value))
650        else:
651          out.write(str(type_checkers.ToShortestFloat(value)))
652    elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
653          self.double_format is not None):
654      out.write('{1:{0}}'.format(self.double_format, value))
655    else:
656      out.write(str(value))
657
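# A brief sketch of how the float/double options affect scalar output, using a
# hypothetical message with a float field `ratio` set to 1.25:
#
#   text_format.MessageToString(msg, float_format='.3f')
#   # -> 'ratio: 1.250\n'
#   # double_format='.17g' round-trips double values exactly; '.15g' is more
#   # compact but may lose the last bits of precision.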
658
659def Parse(text,
660          message,
661          allow_unknown_extension=False,
662          allow_field_number=False,
663          descriptor_pool=None,
664          allow_unknown_field=False):
665  """Parses a text representation of a protocol message into a message.
666
667  NOTE: for historical reasons this function does not clear the input
668  message. This is different from what the binary msg.ParseFrom(...) does.
669  If text contains a field already set in message, the value is appended if the
670  field is repeated. Otherwise, an error is raised.
671
672  Example::
673
674    a = MyProto()
675    a.repeated_field.append('test')
676    b = MyProto()
677
678    # Repeated fields are combined
679    text_format.Parse(repr(a), b)
680    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]
681
682    # Non-repeated fields cannot be overwritten
683    a.singular_field = 1
684    b.singular_field = 2
685    text_format.Parse(repr(a), b) # ParseError
686
687    # Binary version:
688    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"
689
690  Caller is responsible for clearing the message as needed.
691
692  Args:
693    text (str): Message text representation.
694    message (Message): A protocol buffer message to merge into.
695    allow_unknown_extension: if True, skip over missing extensions and keep
696      parsing.
697    allow_field_number: if True, both field number and field name are allowed.
698    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
699    allow_unknown_field: if True, skip over unknown fields and keep
700      parsing. Avoid using this option if possible. It may hide some
701      errors (e.g. a spelling error in a field name).
702
703  Returns:
704    Message: The same message passed as argument.
705
706  Raises:
707    ParseError: On text parsing problems.
708  """
709  return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'),
710                    message,
711                    allow_unknown_extension,
712                    allow_field_number,
713                    descriptor_pool=descriptor_pool,
714                    allow_unknown_field=allow_unknown_field)
715
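# Sketch of allow_field_number, assuming field number 1 of the hypothetical
# MyMessage is the string field `foo`:
#
#   text_format.Parse('1: "bar"', my_test_pb2.MyMessage(),
#                     allow_field_number=True)
#   # -> MyMessage(foo='bar')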
716
717def Merge(text,
718          message,
719          allow_unknown_extension=False,
720          allow_field_number=False,
721          descriptor_pool=None,
722          allow_unknown_field=False):
723  """Parses a text representation of a protocol message into a message.
724
725  Like Parse(), but allows repeated values for a non-repeated field, and uses
726  the last one. This means any non-repeated, top-level fields specified in text
727  replace those in the message.
728
729  Args:
730    text (str): Message text representation.
731    message (Message): A protocol buffer message to merge into.
732    allow_unknown_extension: if True, skip over missing extensions and keep
733      parsing.
734    allow_field_number: if True, both field number and field name are allowed.
735    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
736    allow_unknown_field: if True, skip over unknown fields and keep
737      parsing. Avoid using this option if possible. It may hide some
738      errors (e.g. a spelling error in a field name).
739
740  Returns:
741    Message: The same message passed as argument.
742
743  Raises:
744    ParseError: On text parsing problems.
745  """
746  return MergeLines(
747      text.split(b'\n' if isinstance(text, bytes) else u'\n'),
748      message,
749      allow_unknown_extension,
750      allow_field_number,
751      descriptor_pool=descriptor_pool,
752      allow_unknown_field=allow_unknown_field)
753
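# Sketch of the difference from Parse(), using the hypothetical MyMessage:
#
#   text_format.Merge('foo: "a" foo: "b"', my_test_pb2.MyMessage())
#   # -> MyMessage(foo='b')  (last value wins)
#   text_format.Parse('foo: "a" foo: "b"', my_test_pb2.MyMessage())
#   # -> raises ParseError (duplicate value for a singular field)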
754
755def ParseLines(lines,
756               message,
757               allow_unknown_extension=False,
758               allow_field_number=False,
759               descriptor_pool=None,
760               allow_unknown_field=False):
761  """Parses a text representation of a protocol message into a message.
762
763  See Parse() for caveats.
764
765  Args:
766    lines: An iterable of lines of a message's text representation.
767    message: A protocol buffer message to merge into.
768    allow_unknown_extension: if True, skip over missing extensions and keep
769      parsing.
770    allow_field_number: if True, both field number and field name are allowed.
771    descriptor_pool: A DescriptorPool used to resolve Any types.
772    allow_unknown_field: if True, skip over unknown fields and keep
773      parsing. Avoid using this option if possible. It may hide some
774      errors (e.g. a spelling error in a field name).
775
776  Returns:
777    The same message passed as argument.
778
779  Raises:
780    ParseError: On text parsing problems.
781  """
782  parser = _Parser(allow_unknown_extension,
783                   allow_field_number,
784                   descriptor_pool=descriptor_pool,
785                   allow_unknown_field=allow_unknown_field)
786  return parser.ParseLines(lines, message)
787
788
789def MergeLines(lines,
790               message,
791               allow_unknown_extension=False,
792               allow_field_number=False,
793               descriptor_pool=None,
794               allow_unknown_field=False):
795  """Parses a text representation of a protocol message into a message.
796
797  See Merge() for more details.
798
799  Args:
800    lines: An iterable of lines of a message's text representation.
801    message: A protocol buffer message to merge into.
802    allow_unknown_extension: if True, skip over missing extensions and keep
803      parsing.
804    allow_field_number: if True, both field number and field name are allowed.
805    descriptor_pool: A DescriptorPool used to resolve Any types.
806    allow_unknown_field: if True, skip over unknown fields and keep
807      parsing. Avoid using this option if possible. It may hide some
808      errors (e.g. a spelling error in a field name).
809
810  Returns:
811    The same message passed as argument.
812
813  Raises:
814    ParseError: On text parsing problems.
815  """
816  parser = _Parser(allow_unknown_extension,
817                   allow_field_number,
818                   descriptor_pool=descriptor_pool,
819                   allow_unknown_field=allow_unknown_field)
820  return parser.MergeLines(lines, message)
821
822
823class _Parser(object):
824  """Text format parser for protocol message."""
825
826  def __init__(self,
827               allow_unknown_extension=False,
828               allow_field_number=False,
829               descriptor_pool=None,
830               allow_unknown_field=False):
831    self.allow_unknown_extension = allow_unknown_extension
832    self.allow_field_number = allow_field_number
833    self.descriptor_pool = descriptor_pool
834    self.allow_unknown_field = allow_unknown_field
835
836  def ParseLines(self, lines, message):
837    """Parses a text representation of a protocol message into a message."""
838    self._allow_multiple_scalars = False
839    self._ParseOrMerge(lines, message)
840    return message
841
842  def MergeLines(self, lines, message):
843    """Merges a text representation of a protocol message into a message."""
844    self._allow_multiple_scalars = True
845    self._ParseOrMerge(lines, message)
846    return message
847
848  def _ParseOrMerge(self, lines, message):
849    """Converts a text representation of a protocol message into a message.
850
851    Args:
852      lines: Lines of a message's text representation.
853      message: A protocol buffer message to merge into.
854
855    Raises:
856      ParseError: On text parsing problems.
857    """
858    # Tokenize expects native str lines.
859    str_lines = (
860        line if isinstance(line, str) else line.decode('utf-8')
861        for line in lines)
862    tokenizer = Tokenizer(str_lines)
863    while not tokenizer.AtEnd():
864      self._MergeField(tokenizer, message)
865
866  def _MergeField(self, tokenizer, message):
867    """Merges a single protocol message field into a message.
868
869    Args:
870      tokenizer: A tokenizer to parse the field name and values.
871      message: A protocol message to record the data.
872
873    Raises:
874      ParseError: In case of text parsing problems.
875    """
876    message_descriptor = message.DESCRIPTOR
877    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
878        tokenizer.TryConsume('[')):
879      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
880      tokenizer.Consume(']')
881      tokenizer.TryConsume(':')
882      if tokenizer.TryConsume('<'):
883        expanded_any_end_token = '>'
884      else:
885        tokenizer.Consume('{')
886        expanded_any_end_token = '}'
887      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
888                                                           self.descriptor_pool)
889      # Direct comparison with None is used instead of implicit bool conversion
890      # to avoid false positives with falsy initial values, e.g. for
891      # google.protobuf.ListValue.
892      if expanded_any_sub_message is None:
893        raise ParseError('Type %s not found in descriptor pool' %
894                         packed_type_name)
895      while not tokenizer.TryConsume(expanded_any_end_token):
896        if tokenizer.AtEnd():
897          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
898                                                  (expanded_any_end_token,))
899        self._MergeField(tokenizer, expanded_any_sub_message)
900      deterministic = False
901
902      message.Pack(expanded_any_sub_message,
903                   type_url_prefix=type_url_prefix,
904                   deterministic=deterministic)
905      return
906
907    if tokenizer.TryConsume('['):
908      name = [tokenizer.ConsumeIdentifier()]
909      while tokenizer.TryConsume('.'):
910        name.append(tokenizer.ConsumeIdentifier())
911      name = '.'.join(name)
912
913      if not message_descriptor.is_extendable:
914        raise tokenizer.ParseErrorPreviousToken(
915            'Message type "%s" does not have extensions.' %
916            message_descriptor.full_name)
917      # pylint: disable=protected-access
918      field = message.Extensions._FindExtensionByName(name)
919      # pylint: enable=protected-access
920
921
922      if not field:
923        if self.allow_unknown_extension:
924          field = None
925        else:
926          raise tokenizer.ParseErrorPreviousToken(
927              'Extension "%s" not registered. '
928              'Did you import the _pb2 module which defines it? '
929              'If you are trying to place the extension in the MessageSet '
930              'field of another message that is in an Any or MessageSet field, '
931              'that message\'s _pb2 module must be imported as well' % name)
932      elif message_descriptor != field.containing_type:
933        raise tokenizer.ParseErrorPreviousToken(
934            'Extension "%s" does not extend message type "%s".' %
935            (name, message_descriptor.full_name))
936
937      tokenizer.Consume(']')
938
939    else:
940      name = tokenizer.ConsumeIdentifierOrNumber()
941      if self.allow_field_number and name.isdigit():
942        number = ParseInteger(name, True, True)
943        field = message_descriptor.fields_by_number.get(number, None)
944        if not field and message_descriptor.is_extendable:
945          field = message.Extensions._FindExtensionByNumber(number)
946      else:
947        field = message_descriptor.fields_by_name.get(name, None)
948
949        # Group names are expected to be capitalized as they appear in the
950        # .proto file, which actually matches their type names, not their field
951        # names.
952        if not field:
953          field = message_descriptor.fields_by_name.get(name.lower(), None)
954          if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
955            field = None
956
957        if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
958            field.message_type.name != name):
959          field = None
960
961      if not field and not self.allow_unknown_field:
962        raise tokenizer.ParseErrorPreviousToken(
963            'Message type "%s" has no field named "%s".' %
964            (message_descriptor.full_name, name))
965
966    if field:
967      if not self._allow_multiple_scalars and field.containing_oneof:
968        # Check if there's a different field set in this oneof.
969        # Note that we ignore the case if the same field was set before, and we
970        # apply _allow_multiple_scalars to non-scalar fields as well.
971        which_oneof = message.WhichOneof(field.containing_oneof.name)
972        if which_oneof is not None and which_oneof != field.name:
973          raise tokenizer.ParseErrorPreviousToken(
974              'Field "%s" is specified along with field "%s", another member '
975              'of oneof "%s" for message type "%s".' %
976              (field.name, which_oneof, field.containing_oneof.name,
977               message_descriptor.full_name))
978
979      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
980        tokenizer.TryConsume(':')
981        merger = self._MergeMessageField
982      else:
983        tokenizer.Consume(':')
984        merger = self._MergeScalarField
985
986      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
987          tokenizer.TryConsume('[')):
988        # Short repeated format, e.g. "foo: [1, 2, 3]"
989        if not tokenizer.TryConsume(']'):
990          while True:
991            merger(tokenizer, message, field)
992            if tokenizer.TryConsume(']'):
993              break
994            tokenizer.Consume(',')
995
996      else:
997        merger(tokenizer, message, field)
998
999    else:  # Proto field is unknown.
1000      assert (self.allow_unknown_extension or self.allow_unknown_field)
1001      _SkipFieldContents(tokenizer)
1002
1003    # For historical reasons, fields may optionally be separated by commas or
1004    # semicolons.
1005    if not tokenizer.TryConsume(','):
1006      tokenizer.TryConsume(';')
1007
1008
1009  def _ConsumeAnyTypeUrl(self, tokenizer):
1010    """Consumes a google.protobuf.Any type URL and returns the type name."""
1011    # Consume "type.googleapis.com/".
1012    prefix = [tokenizer.ConsumeIdentifier()]
1013    tokenizer.Consume('.')
1014    prefix.append(tokenizer.ConsumeIdentifier())
1015    tokenizer.Consume('.')
1016    prefix.append(tokenizer.ConsumeIdentifier())
1017    tokenizer.Consume('/')
1018    # Consume the fully-qualified type name.
1019    name = [tokenizer.ConsumeIdentifier()]
1020    while tokenizer.TryConsume('.'):
1021      name.append(tokenizer.ConsumeIdentifier())
1022    return '.'.join(prefix), '.'.join(name)
1023
1024  def _MergeMessageField(self, tokenizer, message, field):
1025    """Merges a single message field into a message.
1026
1027    Args:
1028      tokenizer: A tokenizer to parse the field value.
1029      message: The message of which field is a member.
1030      field: The descriptor of the field to be merged.
1031
1032    Raises:
1033      ParseError: In case of text parsing problems.
1034    """
1035    is_map_entry = _IsMapEntry(field)
1036
1037    if tokenizer.TryConsume('<'):
1038      end_token = '>'
1039    else:
1040      tokenizer.Consume('{')
1041      end_token = '}'
1042
1043    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
1044      if field.is_extension:
1045        sub_message = message.Extensions[field].add()
1046      elif is_map_entry:
1047        sub_message = getattr(message, field.name).GetEntryClass()()
1048      else:
1049        sub_message = getattr(message, field.name).add()
1050    else:
1051      if field.is_extension:
1052        if (not self._allow_multiple_scalars and
1053            message.HasExtension(field)):
1054          raise tokenizer.ParseErrorPreviousToken(
1055              'Message type "%s" should not have multiple "%s" extensions.' %
1056              (message.DESCRIPTOR.full_name, field.full_name))
1057        sub_message = message.Extensions[field]
1058      else:
1059        # Also apply _allow_multiple_scalars to message field.
1060        # TODO(jieluo): Change to _allow_singular_overwrites.
1061        if (not self._allow_multiple_scalars and
1062            message.HasField(field.name)):
1063          raise tokenizer.ParseErrorPreviousToken(
1064              'Message type "%s" should not have multiple "%s" fields.' %
1065              (message.DESCRIPTOR.full_name, field.name))
1066        sub_message = getattr(message, field.name)
1067      sub_message.SetInParent()
1068
1069    while not tokenizer.TryConsume(end_token):
1070      if tokenizer.AtEnd():
1071        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
1072      self._MergeField(tokenizer, sub_message)
1073
1074    if is_map_entry:
1075      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
1076      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
1077        value = getattr(message, field.name)[sub_message.key]
1078        value.CopyFrom(sub_message.value)
1079      else:
1080        getattr(message, field.name)[sub_message.key] = sub_message.value
1081
1082  @staticmethod
1083  def _IsProto3Syntax(message):
1084    message_descriptor = message.DESCRIPTOR
1085    return (hasattr(message_descriptor, 'syntax') and
1086            message_descriptor.syntax == 'proto3')
1087
1088  def _MergeScalarField(self, tokenizer, message, field):
1089    """Merges a single scalar field into a message.
1090
1091    Args:
1092      tokenizer: A tokenizer to parse the field value.
1093      message: A protocol message to record the data.
1094      field: The descriptor of the field to be merged.
1095
1096    Raises:
1097      ParseError: In case of text parsing problems.
1098      RuntimeError: On runtime errors.
1099    """
1100    _ = self.allow_unknown_extension
1101    value = None
1102
1103    if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
1104                      descriptor.FieldDescriptor.TYPE_SINT32,
1105                      descriptor.FieldDescriptor.TYPE_SFIXED32):
1106      value = _ConsumeInt32(tokenizer)
1107    elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
1108                        descriptor.FieldDescriptor.TYPE_SINT64,
1109                        descriptor.FieldDescriptor.TYPE_SFIXED64):
1110      value = _ConsumeInt64(tokenizer)
1111    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
1112                        descriptor.FieldDescriptor.TYPE_FIXED32):
1113      value = _ConsumeUint32(tokenizer)
1114    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
1115                        descriptor.FieldDescriptor.TYPE_FIXED64):
1116      value = _ConsumeUint64(tokenizer)
1117    elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
1118                        descriptor.FieldDescriptor.TYPE_DOUBLE):
1119      value = tokenizer.ConsumeFloat()
1120    elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
1121      value = tokenizer.ConsumeBool()
1122    elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
1123      value = tokenizer.ConsumeString()
1124    elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
1125      value = tokenizer.ConsumeByteString()
1126    elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
1127      value = tokenizer.ConsumeEnum(field)
1128    else:
1129      raise RuntimeError('Unknown field type %d' % field.type)
1130
1131    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
1132      if field.is_extension:
1133        message.Extensions[field].append(value)
1134      else:
1135        getattr(message, field.name).append(value)
1136    else:
1137      if field.is_extension:
1138        if (not self._allow_multiple_scalars and
1139            not self._IsProto3Syntax(message) and
1140            message.HasExtension(field)):
1141          raise tokenizer.ParseErrorPreviousToken(
1142              'Message type "%s" should not have multiple "%s" extensions.' %
1143              (message.DESCRIPTOR.full_name, field.full_name))
1144        else:
1145          message.Extensions[field] = value
1146      else:
1147        duplicate_error = False
1148        if not self._allow_multiple_scalars:
1149          if self._IsProto3Syntax(message):
1150            # Proto3 doesn't represent presence, so we make a best-effort check
1151            # for multiple scalars by comparing against default values.
1152            duplicate_error = bool(getattr(message, field.name))
1153          else:
1154            duplicate_error = message.HasField(field.name)
1155
1156        if duplicate_error:
1157          raise tokenizer.ParseErrorPreviousToken(
1158              'Message type "%s" should not have multiple "%s" fields.' %
1159              (message.DESCRIPTOR.full_name, field.name))
1160        else:
1161          setattr(message, field.name, value)
1162
1163
1164def _SkipFieldContents(tokenizer):
1165  """Skips over contents (value or message) of a field.
1166
1167  Args:
1168    tokenizer: A tokenizer to parse the field name and values.
1169  """
1170  # Try to guess the type of this field.
1171  # If this field is not a message, there should be a ":" between the
1172  # field name and the field value and also the field value should not
1173  # start with "{" or "<" which indicates the beginning of a message body.
1174  # If there is no ":" or there is a "{" or "<" after ":", this field has
1175  # to be a message or the input is ill-formed.
1176  if tokenizer.TryConsume(
1177      ':') and not tokenizer.LookingAt('{') and not tokenizer.LookingAt('<'):
1178    if tokenizer.LookingAt('['):
1179      _SkipRepeatedFieldValue(tokenizer)
1180    else:
1181      _SkipFieldValue(tokenizer)
1182  else:
1183    _SkipFieldMessage(tokenizer)
1184
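# Examples of the field forms that _SkipFieldContents and the helpers below
# distinguish (field names are illustrative):
#   unknown_scalar: 42              (':' followed by a single value)
#   unknown_list: [1, 2, 3]         (':' followed by a short repeated value)
#   unknown_msg { nested: true }    (message body; the ':' is optional)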
1185
1186def _SkipField(tokenizer):
1187  """Skips over a complete field (name and value/message).
1188
1189  Args:
1190    tokenizer: A tokenizer to parse the field name and values.
1191  """
1192  if tokenizer.TryConsume('['):
1193    # Consume extension name.
1194    tokenizer.ConsumeIdentifier()
1195    while tokenizer.TryConsume('.'):
1196      tokenizer.ConsumeIdentifier()
1197    tokenizer.Consume(']')
1198  else:
1199    tokenizer.ConsumeIdentifierOrNumber()
1200
1201  _SkipFieldContents(tokenizer)
1202
1203  # For historical reasons, fields may optionally be separated by commas or
1204  # semicolons.
1205  if not tokenizer.TryConsume(','):
1206    tokenizer.TryConsume(';')
1207
1208
1209def _SkipFieldMessage(tokenizer):
1210  """Skips over a field message.
1211
1212  Args:
1213    tokenizer: A tokenizer to parse the field name and values.
1214  """
1215
1216  if tokenizer.TryConsume('<'):
1217    delimiter = '>'
1218  else:
1219    tokenizer.Consume('{')
1220    delimiter = '}'
1221
1222  while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
1223    _SkipField(tokenizer)
1224
1225  tokenizer.Consume(delimiter)
1226
1227
1228def _SkipFieldValue(tokenizer):
1229  """Skips over a field value.
1230
1231  Args:
1232    tokenizer: A tokenizer to parse the field name and values.
1233
1234  Raises:
1235    ParseError: In case an invalid field value is found.
1236  """
1237  # String/bytes tokens can come in multiple adjacent string literals.
1238  # If we can consume one, consume as many as we can.
1239  if tokenizer.TryConsumeByteString():
1240    while tokenizer.TryConsumeByteString():
1241      pass
1242    return
1243
1244  if (not tokenizer.TryConsumeIdentifier() and
1245      not _TryConsumeInt64(tokenizer) and not _TryConsumeUint64(tokenizer) and
1246      not tokenizer.TryConsumeFloat()):
1247    raise ParseError('Invalid field value: ' + tokenizer.token)
1248
1249
1250def _SkipRepeatedFieldValue(tokenizer):
1251  """Skips over a repeated field value.
1252
1253  Args:
1254    tokenizer: A tokenizer to parse the field value.
1255  """
1256  tokenizer.Consume('[')
1257  if not tokenizer.LookingAt(']'):
1258    _SkipFieldValue(tokenizer)
1259    while tokenizer.TryConsume(','):
1260      _SkipFieldValue(tokenizer)
1261  tokenizer.Consume(']')
1262
1263
1264class Tokenizer(object):
1265  """Protocol buffer text representation tokenizer.
1266
1267  This class handles the lower level string parsing by splitting it into
1268  meaningful tokens.
1269
1270  It was directly ported from the Java protocol buffer API.
1271  """
1272
1273  _WHITESPACE = re.compile(r'\s+')
1274  _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
1275  _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
1276  _TOKEN = re.compile('|'.join([
1277      r'[a-zA-Z_][0-9a-zA-Z_+-]*',  # an identifier
1278      r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*',  # a number
1279  ] + [  # quoted str for each quote mark
1280      # Avoid backtracking! https://stackoverflow.com/a/844267
1281      r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark)
1282      for mark in _QUOTES
1283  ]))
1284
1285  _IDENTIFIER = re.compile(r'[^\d\W]\w*')
1286  _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')
1287
1288  def __init__(self, lines, skip_comments=True):
1289    self._position = 0
1290    self._line = -1
1291    self._column = 0
1292    self._token_start = None
1293    self.token = ''
1294    self._lines = iter(lines)
1295    self._current_line = ''
1296    self._previous_line = 0
1297    self._previous_column = 0
1298    self._more_lines = True
1299    self._skip_comments = skip_comments
1300    self._whitespace_pattern = (self._WHITESPACE_OR_COMMENT if skip_comments
1301                                else self._WHITESPACE)
1302    self._SkipWhitespace()
1303    self.NextToken()
1304
1305  def LookingAt(self, token):
1306    return self.token == token
1307
1308  def AtEnd(self):
1309    """Checks whether the end of the text was reached.
1310
1311    Returns:
1312      True iff the end was reached.
1313    """
1314    return not self.token
1315
1316  def _PopLine(self):
1317    while len(self._current_line) <= self._column:
1318      try:
1319        self._current_line = next(self._lines)
1320      except StopIteration:
1321        self._current_line = ''
1322        self._more_lines = False
1323        return
1324      else:
1325        self._line += 1
1326        self._column = 0
1327
1328  def _SkipWhitespace(self):
1329    while True:
1330      self._PopLine()
1331      match = self._whitespace_pattern.match(self._current_line, self._column)
1332      if not match:
1333        break
1334      length = len(match.group(0))
1335      self._column += length
1336
1337  def TryConsume(self, token):
1338    """Tries to consume a given piece of text.
1339
1340    Args:
1341      token: Text to consume.
1342
1343    Returns:
1344      True iff the text was consumed.
1345    """
1346    if self.token == token:
1347      self.NextToken()
1348      return True
1349    return False
1350
1351  def Consume(self, token):
1352    """Consumes a piece of text.
1353
1354    Args:
1355      token: Text to consume.
1356
1357    Raises:
1358      ParseError: If the text couldn't be consumed.
1359    """
1360    if not self.TryConsume(token):
1361      raise self.ParseError('Expected "%s".' % token)
1362
1363  def ConsumeComment(self):
1364    result = self.token
1365    if not self._COMMENT.match(result):
1366      raise self.ParseError('Expected comment.')
1367    self.NextToken()
1368    return result
1369
1370  def ConsumeCommentOrTrailingComment(self):
1371    """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""
1372
1373    # Tokenizer initializes _previous_line and _previous_column to 0. As the
1374    # tokenizer starts, it looks like there is a previous token on the line.
1375    just_started = self._line == 0 and self._column == 0
1376
1377    before_parsing = self._previous_line
1378    comment = self.ConsumeComment()
1379
1380    # A trailing comment is a comment on the same line as the previous token.
1381    trailing = (self._previous_line == before_parsing
1382                and not just_started)
1383
1384    return trailing, comment
1385
1386  def TryConsumeIdentifier(self):
1387    try:
1388      self.ConsumeIdentifier()
1389      return True
1390    except ParseError:
1391      return False
1392
1393  def ConsumeIdentifier(self):
1394    """Consumes a protocol message field identifier.
1395
1396    Returns:
1397      Identifier string.
1398
1399    Raises:
1400      ParseError: If an identifier couldn't be consumed.
1401    """
1402    result = self.token
1403    if not self._IDENTIFIER.match(result):
1404      raise self.ParseError('Expected identifier.')
1405    self.NextToken()
1406    return result
1407
1408  def TryConsumeIdentifierOrNumber(self):
1409    try:
1410      self.ConsumeIdentifierOrNumber()
1411      return True
1412    except ParseError:
1413      return False
1414
1415  def ConsumeIdentifierOrNumber(self):
1416    """Consumes a protocol message field identifier or number.
1417
1418    Returns:
1419      Identifier string.
1420
1421    Raises:
1422      ParseError: If an identifier couldn't be consumed.
1423    """
1424    result = self.token
1425    if not self._IDENTIFIER_OR_NUMBER.match(result):
1426      raise self.ParseError('Expected identifier or number, got %s.' % result)
1427    self.NextToken()
1428    return result
1429
1430  def TryConsumeInteger(self):
1431    try:
1432      self.ConsumeInteger()
1433      return True
1434    except ParseError:
1435      return False
1436
1437  def ConsumeInteger(self):
1438    """Consumes an integer number.
1439
1440    Returns:
1441      The integer parsed.
1442
1443    Raises:
1444      ParseError: If an integer couldn't be consumed.
1445    """
1446    try:
1447      result = _ParseAbstractInteger(self.token)
1448    except ValueError as e:
1449      raise self.ParseError(str(e))
1450    self.NextToken()
1451    return result
1452
1453  def TryConsumeFloat(self):
1454    try:
1455      self.ConsumeFloat()
1456      return True
1457    except ParseError:
1458      return False
1459
1460  def ConsumeFloat(self):
1461    """Consumes an floating point number.
1462
1463    Returns:
1464      The number parsed.
1465
1466    Raises:
1467      ParseError: If a floating point number couldn't be consumed.
1468    """
1469    try:
1470      result = ParseFloat(self.token)
1471    except ValueError as e:
1472      raise self.ParseError(str(e))
1473    self.NextToken()
1474    return result
1475
1476  def ConsumeBool(self):
1477    """Consumes a boolean value.
1478
1479    Returns:
1480      The bool parsed.
1481
1482    Raises:
1483      ParseError: If a boolean value couldn't be consumed.
1484    """
1485    try:
1486      result = ParseBool(self.token)
1487    except ValueError as e:
1488      raise self.ParseError(str(e))
1489    self.NextToken()
1490    return result
1491
1492  def TryConsumeByteString(self):
1493    try:
1494      self.ConsumeByteString()
1495      return True
1496    except ParseError:
1497      return False
1498
1499  def ConsumeString(self):
1500    """Consumes a string value.
1501
1502    Returns:
1503      The string parsed.
1504
1505    Raises:
1506      ParseError: If a string value couldn't be consumed.
1507    """
1508    the_bytes = self.ConsumeByteString()
1509    try:
1510      return str(the_bytes, 'utf-8')
1511    except UnicodeDecodeError as e:
1512      raise self._StringParseError(e)
1513
1514  def ConsumeByteString(self):
1515    """Consumes a byte array value.
1516
1517    Returns:
1518      The array parsed (as a bytes object).
1519
1520    Raises:
1521      ParseError: If a byte array value couldn't be consumed.
1522    """
1523    the_list = [self._ConsumeSingleByteString()]
1524    while self.token and self.token[0] in _QUOTES:
1525      the_list.append(self._ConsumeSingleByteString())
1526    return b''.join(the_list)
1527
1528  def _ConsumeSingleByteString(self):
1529    """Consume one token of a string literal.
1530
1531    String literals (whether bytes or text) can come in multiple adjacent
1532    tokens which are automatically concatenated, like in C or Python.  This
1533    method only consumes one token.
1534
1535    Returns:
1536      The token parsed.
1537    Raises:
1538      ParseError: When the wrong format data is found.
1539    """
1540    text = self.token
1541    if len(text) < 1 or text[0] not in _QUOTES:
1542      raise self.ParseError('Expected string but found: %r' % (text,))
1543
1544    if len(text) < 2 or text[-1] != text[0]:
1545      raise self.ParseError('String missing ending quote: %r' % (text,))
1546
1547    try:
1548      result = text_encoding.CUnescape(text[1:-1])
1549    except ValueError as e:
1550      raise self.ParseError(str(e))
1551    self.NextToken()
1552    return result
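
  # Illustrative sketch (not part of the original source): adjacent quoted
  # tokens such as "foo" 'bar' are concatenated by ConsumeByteString into
  # b'foobar', and C-style escapes are resolved by text_encoding.CUnescape,
  # so a token like "\303\251" becomes the bytes b'\xc3\xa9', which
  # ConsumeString then decodes to the UTF-8 character 'é'.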
1553
1554  def ConsumeEnum(self, field):
1555    try:
1556      result = ParseEnum(field, self.token)
1557    except ValueError as e:
1558      raise self.ParseError(str(e))
1559    self.NextToken()
1560    return result
1561
1562  def ParseErrorPreviousToken(self, message):
1563    """Creates and *returns* a ParseError for the previously read token.
1564
1565    Args:
1566      message: A message to set for the exception.
1567
1568    Returns:
1569      A ParseError instance.
1570    """
1571    return ParseError(message, self._previous_line + 1,
1572                      self._previous_column + 1)
1573
1574  def ParseError(self, message):
1575    """Creates and *returns* a ParseError for the current token."""
1576    return ParseError('\'' + self._current_line + '\': ' + message,
1577                      self._line + 1, self._column + 1)
1578
1579  def _StringParseError(self, e):
1580    return self.ParseError('Couldn\'t parse string: ' + str(e))
1581
1582  def NextToken(self):
1583    """Reads the next meaningful token."""
1584    self._previous_line = self._line
1585    self._previous_column = self._column
1586
1587    self._column += len(self.token)
1588    self._SkipWhitespace()
1589
1590    if not self._more_lines:
1591      self.token = ''
1592      return
1593
1594    match = self._TOKEN.match(self._current_line, self._column)
1595    if not match and not self._skip_comments:
1596      match = self._COMMENT.match(self._current_line, self._column)
1597    if match:
1598      token = match.group(0)
1599      self.token = token
1600    else:
1601      self.token = self._current_line[self._column]
1602
1603# Aliased so it can still be accessed by current visibility violators.
1604# TODO(dbarnett): Migrate violators to textformat_tokenizer.
1605_Tokenizer = Tokenizer  # pylint: disable=invalid-name
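
# Illustrative usage sketch (not part of the original source; assumes the
# Tokenizer constructor accepts an iterable of input lines, as its use
# elsewhere in this module suggests):
#
#   tokenizer = Tokenizer(['foo: 42'])
#   tokenizer.ConsumeIdentifier()  # -> 'foo'
#   tokenizer.Consume(':')
#   tokenizer.ConsumeInteger()     # -> 42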
1606
1607
1608def _ConsumeInt32(tokenizer):
1609  """Consumes a signed 32bit integer number from tokenizer.
1610
1611  Args:
1612    tokenizer: A tokenizer used to parse the number.
1613
1614  Returns:
1615    The integer parsed.
1616
1617  Raises:
1618    ParseError: If a signed 32bit integer couldn't be consumed.
1619  """
1620  return _ConsumeInteger(tokenizer, is_signed=True, is_long=False)
1621
1622
1623def _ConsumeUint32(tokenizer):
1624  """Consumes an unsigned 32bit integer number from tokenizer.
1625
1626  Args:
1627    tokenizer: A tokenizer used to parse the number.
1628
1629  Returns:
1630    The integer parsed.
1631
1632  Raises:
1633    ParseError: If an unsigned 32bit integer couldn't be consumed.
1634  """
1635  return _ConsumeInteger(tokenizer, is_signed=False, is_long=False)
1636
1637
1638def _TryConsumeInt64(tokenizer):
1639  try:
1640    _ConsumeInt64(tokenizer)
1641    return True
1642  except ParseError:
1643    return False
1644
1645
1646def _ConsumeInt64(tokenizer):
1647  """Consumes a signed 32bit integer number from tokenizer.
1648
1649  Args:
1650    tokenizer: A tokenizer used to parse the number.
1651
1652  Returns:
1653    The integer parsed.
1654
1655  Raises:
1656    ParseError: If a signed 64bit integer couldn't be consumed.
1657  """
1658  return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)
1659
1660
1661def _TryConsumeUint64(tokenizer):
1662  try:
1663    _ConsumeUint64(tokenizer)
1664    return True
1665  except ParseError:
1666    return False
1667
1668
1669def _ConsumeUint64(tokenizer):
1670  """Consumes an unsigned 64bit integer number from tokenizer.
1671
1672  Args:
1673    tokenizer: A tokenizer used to parse the number.
1674
1675  Returns:
1676    The integer parsed.
1677
1678  Raises:
1679    ParseError: If an unsigned 64bit integer couldn't be consumed.
1680  """
1681  return _ConsumeInteger(tokenizer, is_signed=False, is_long=True)
1682
1683
1684def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
1685  """Consumes an integer number from tokenizer.
1686
1687  Args:
1688    tokenizer: A tokenizer used to parse the number.
1689    is_signed: True if a signed integer must be parsed.
1690    is_long: True if a long integer must be parsed.
1691
1692  Returns:
1693    The integer parsed.
1694
1695  Raises:
1696    ParseError: If an integer with given characteristics couldn't be consumed.
1697  """
1698  try:
1699    result = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long)
1700  except ValueError as e:
1701    raise tokenizer.ParseError(str(e))
1702  tokenizer.NextToken()
1703  return result
1704
1705
1706def ParseInteger(text, is_signed=False, is_long=False):
1707  """Parses an integer.
1708
1709  Args:
1710    text: The text to parse.
1711    is_signed: True if a signed integer must be parsed.
1712    is_long: True if a long integer must be parsed.
1713
1714  Returns:
1715    The integer value.
1716
1717  Raises:
1718    ValueError: If the text is not a valid integer.
1719  """
1720  # Do the actual parsing. Exception handling is propagated to caller.
1721  result = _ParseAbstractInteger(text)
1722
1723  # Check if the integer is sane. Exceptions handled by callers.
1724  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
1725  checker.CheckValue(result)
1726  return result
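
# Illustrative sketch (not part of the original source): the range checker
# picked by ParseInteger follows the ordering of _INTEGER_CHECKERS above
# (uint32, int32, uint64, int64), e.g.
#
#   ParseInteger('4294967295')          # -> 4294967295 (fits in uint32)
#   ParseInteger('-1', is_signed=True)  # -> -1
#   ParseInteger('-1')                  # raises ValueError (uint32 range)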
1727
1728
1729def _ParseAbstractInteger(text):
1730  """Parses an integer without checking size/signedness.
1731
1732  Args:
1733    text: The text to parse.
1734
1735  Returns:
1736    The integer value.
1737
1738  Raises:
1739    ValueError: If the text is not a valid integer.
1740  """
1741  # Do the actual parsing. Exception handling is propagated to caller.
1742  orig_text = text
1743  c_octal_match = re.match(r'(-?)0(\d+)$', text)
1744  if c_octal_match:
1745    # Python 3 no longer supports 0755 octal syntax without the 'o', so
1746    # we always use the '0o' prefix for multi-digit numbers starting with 0.
1747    text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
1748  try:
1749    return int(text, 0)
1750  except ValueError:
1751    raise ValueError('Couldn\'t parse integer: %s' % orig_text)
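
# Illustrative sketch (not part of the original source): _ParseAbstractInteger
# relies on int(text, 0) for the usual base prefixes and rewrites C-style
# octal literals so a leading zero still means octal, e.g.
#
#   _ParseAbstractInteger('0x1f')  # -> 31
#   _ParseAbstractInteger('0755')  # -> 493 (rewritten to '0o755')
#   _ParseAbstractInteger('-010')  # -> -8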
1752
1753
1754def ParseFloat(text):
1755  """Parse a floating point number.
1756
1757  Args:
1758    text: Text to parse.
1759
1760  Returns:
1761    The number parsed.
1762
1763  Raises:
1764    ValueError: If a floating point number couldn't be parsed.
1765  """
1766  try:
1767    # Assume Python compatible syntax.
1768    return float(text)
1769  except ValueError:
1770    # Check alternative spellings.
1771    if _FLOAT_INFINITY.match(text):
1772      if text[0] == '-':
1773        return float('-inf')
1774      else:
1775        return float('inf')
1776    elif _FLOAT_NAN.match(text):
1777      return float('nan')
1778    else:
1779      # assume '1.0f' format
1780      try:
1781        return float(text.rstrip('f'))
1782      except ValueError:
1783        raise ValueError('Couldn\'t parse float: %s' % text)
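
# Illustrative sketch (not part of the original source): ParseFloat accepts
# Python float syntax plus the text-format spellings handled above, e.g.
#
#   ParseFloat('1.5')    # -> 1.5
#   ParseFloat('1.5f')   # -> 1.5 (trailing 'f' stripped)
#   ParseFloat('-inff')  # -> float('-inf')
#   ParseFloat('nanf')   # -> NaN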
1784
1785
1786def ParseBool(text):
1787  """Parse a boolean value.
1788
1789  Args:
1790    text: Text to parse.
1791
1792  Returns:
1793    The boolean value parsed.
1794
1795  Raises:
1796    ValueError: If text is not a valid boolean.
1797  """
1798  if text in ('true', 't', '1', 'True'):
1799    return True
1800  elif text in ('false', 'f', '0', 'False'):
1801    return False
1802  else:
1803    raise ValueError('Expected "true" or "false".')
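
# Illustrative sketch (not part of the original source): only the spellings
# listed above are accepted; anything else raises ValueError, e.g.
#
#   ParseBool('true')  # -> True
#   ParseBool('0')     # -> False
#   ParseBool('TRUE')  # raises ValueError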
1804
1805
1806def ParseEnum(field, value):
1807  """Parse an enum value.
1808
1809  The value can be specified by a number (the enum value), or by
1810  a string literal (the enum name).
1811
1812  Args:
1813    field: Enum field descriptor.
1814    value: String value.
1815
1816  Returns:
1817    Enum value number.
1818
1819  Raises:
1820    ValueError: If the enum value could not be parsed.
1821  """
1822  enum_descriptor = field.enum_type
1823  try:
1824    number = int(value, 0)
1825  except ValueError:
1826    # Identifier.
1827    enum_value = enum_descriptor.values_by_name.get(value, None)
1828    if enum_value is None:
1829      raise ValueError('Enum type "%s" has no value named %s.' %
1830                       (enum_descriptor.full_name, value))
1831  else:
1832    # Numeric value.
1833    if hasattr(field.file, 'syntax'):
1834      # The attribute check is for compatibility with older descriptors.
1835      if field.file.syntax == 'proto3':
1836        # Proto3 accepts unknown numeric enum values.
1837        return number
1838    enum_value = enum_descriptor.values_by_number.get(number, None)
1839    if enum_value is None:
1840      raise ValueError('Enum type "%s" has no value with number %d.' %
1841                       (enum_descriptor.full_name, number))
1842  return enum_value.number
1843
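# Illustrative sketch (not part of the original source; my_proto_pb2 and its
# 'color' enum field are hypothetical): ParseEnum accepts either the enum
# value name or its number, e.g.
#
#   field = my_proto_pb2.MyMessage.DESCRIPTOR.fields_by_name['color']
#   ParseEnum(field, 'RED')  # -> the number of the RED value
#   ParseEnum(field, '2')    # -> 2, if 2 is a defined value (or proto3)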