1# Protocol Buffers - Google's data interchange format 2# Copyright 2008 Google Inc. All rights reserved. 3# https://developers.google.com/protocol-buffers/ 4# 5# Redistribution and use in source and binary forms, with or without 6# modification, are permitted provided that the following conditions are 7# met: 8# 9# * Redistributions of source code must retain the above copyright 10# notice, this list of conditions and the following disclaimer. 11# * Redistributions in binary form must reproduce the above 12# copyright notice, this list of conditions and the following disclaimer 13# in the documentation and/or other materials provided with the 14# distribution. 15# * Neither the name of Google Inc. nor the names of its 16# contributors may be used to endorse or promote products derived from 17# this software without specific prior written permission. 18# 19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31"""Contains routines for printing protocol messages in text format. 32 33Simple usage example:: 34 35 # Create a proto object and serialize it to a text proto string. 36 message = my_proto_pb2.MyMessage(foo='bar') 37 text_proto = text_format.MessageToString(message) 38 39 # Parse a text proto string. 
  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
"""

__author__ = '[email protected] (Kenton Varda)'

# TODO(b/129989314) Import thread contention leads to test failures.
import encodings.raw_unicode_escape  # pylint: disable=unused-import
import encodings.unicode_escape  # pylint: disable=unused-import
import io
import math
import re

from google.protobuf.internal import decoder
from google.protobuf.internal import type_checkers
from google.protobuf import descriptor
from google.protobuf import text_encoding
from google.protobuf import unknown_fields

# pylint: disable=g-import-not-at-top
__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge', 'MessageToBytes']

_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Matches "inf", "infinity", "-inf", ... with an optional trailing 'f',
# case-insensitively, as allowed by the text format for float fields.
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
_QUOTES = frozenset(("'", '"'))
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'


class Error(Exception):
  """Top-level module error for text_format."""


class ParseError(Error):
  """Thrown in case of text parsing or tokenizing error."""

  def __init__(self, message=None, line=None, column=None):
    # When both a message and a location are given, prefix the message with
    # "line[:column] : " so the error pinpoints the offending input.
    if message is not None and line is not None:
      loc = str(line)
      if column is not None:
        loc += ':{0}'.format(column)
      message = '{0} : {1}'.format(loc, message)
    if message is not None:
      super(ParseError, self).__init__(message)
    else:
      super(ParseError, self).__init__()
    self._line = line
    self._column = column

  def GetLine(self):
    return self._line

  def GetColumn(self):
    return self._column


class TextWriter(object):
  """In-memory text writer used to accumulate the formatted output."""

  def __init__(self, as_utf8):
    # NOTE(review): as_utf8 is accepted but not used here; presumably kept
    # for backward compatibility with a Python 2 code path — confirm before
    # removing.
    self._writer = io.StringIO()

  def write(self, val):
    return self._writer.write(val)

  def close(self):
    return self._writer.close()

  def getvalue(self):
    return self._writer.getvalue()


def MessageToString(
    message,
    as_utf8=False,
    as_one_line=False,
    use_short_repeated_primitives=False,
    pointy_brackets=False,
    use_index_order=False,
    float_format=None,
    double_format=None,
    use_field_number=False,
    descriptor_pool=None,
    indent=0,
    message_formatter=None,
    print_unknown_fields=False,
    force_colon=False):
  # type: (...) -> str
  """Convert protobuf message to text format.

  Double values can be formatted compactly with 15 digits of
  precision (which is the most that IEEE 754 "double" can guarantee)
  using double_format='.15g'. To ensure that converting to text and back to a
  proto will result in an identical value, double_format='.17g' should be used.

  Args:
    message: The protocol buffers message.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
    as_one_line: Don't introduce newlines between fields.
    use_short_repeated_primitives: Use short repeated format for primitives.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, fields of a proto message will be printed using
      the order defined in source code instead of the field number, extensions
      will be printed at the end of the message and their relative order is
      determined by the extension number. By default, use the field number
      order.
    float_format (str): If set, use this to specify float field formatting
      (per the "Format Specification Mini-Language"); otherwise, shortest
      float that has same value in wire will be printed. Also affect double
      field if double_format is not set but float_format is set.
    double_format (str): If set, use this to specify double field formatting
      (per the "Format Specification Mini-Language"); if it is not set but
      float_format is set, use float_format. Otherwise, use ``str()``
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    indent (int): The initial indent level, in terms of spaces, for pretty
      print.
    message_formatter (function(message, indent, as_one_line) -> unicode|None):
      Custom formatter for selected sub-messages (usually based on message
      type). Use to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields will be printed.
    force_colon: If set, a colon will be added after the field name even if the
      field is a proto message.

  Returns:
    str: A string of the text formatted protocol buffer message.
  """
  out = TextWriter(as_utf8)
  printer = _Printer(
      out,
      indent,
      as_utf8,
      as_one_line,
      use_short_repeated_primitives,
      pointy_brackets,
      use_index_order,
      float_format,
      double_format,
      use_field_number,
      descriptor_pool,
      message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintMessage(message)
  result = out.getvalue()
  out.close()
  if as_one_line:
    # One-line output always ends with a trailing separator space; drop it.
    return result.rstrip()
  return result


def MessageToBytes(message, **kwargs):
  # type: (...) -> bytes
  """Convert protobuf message to encoded text format.  See MessageToString."""
  text = MessageToString(message, **kwargs)
  if isinstance(text, bytes):
    return text
  codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
  return text.encode(codec)


def _IsMapEntry(field):
  """True iff the field is the synthesized entry type of a map field."""
  return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
          field.message_type.has_options and
          field.message_type.GetOptions().map_entry)


def PrintMessage(message,
                 out,
                 indent=0,
                 as_utf8=False,
                 as_one_line=False,
                 use_short_repeated_primitives=False,
                 pointy_brackets=False,
                 use_index_order=False,
                 float_format=None,
                 double_format=None,
                 use_field_number=False,
                 descriptor_pool=None,
                 message_formatter=None,
                 print_unknown_fields=False,
                 force_colon=False):
  """Convert the message to text format and write it to the out stream.

  Args:
    message: The Message object to convert to text format.
    out: A file handle to write the message to.
    indent: The initial indent level for pretty print.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
    as_one_line: Don't introduce newlines between fields.
    use_short_repeated_primitives: Use short repeated format for primitives.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, print fields of a proto message using the order
      defined in source code instead of the field number. By default, use the
      field number order.
    float_format: If set, use this to specify float field formatting
      (per the "Format Specification Mini-Language"); otherwise, shortest
      float that has same value in wire will be printed. Also affect double
      field if double_format is not set but float_format is set.
    double_format: If set, use this to specify double field formatting
      (per the "Format Specification Mini-Language"); if it is not set but
      float_format is set, use float_format. Otherwise, str() is used.
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    message_formatter: A function(message, indent, as_one_line): unicode|None
      to custom format selected sub-messages (usually based on message type).
      Use to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields will be printed.
    force_colon: If set, a colon will be added after the field name even if
      the field is a proto message.
  """
  printer = _Printer(
      out=out, indent=indent, as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      use_field_number=use_field_number,
      descriptor_pool=descriptor_pool,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintMessage(message)


def PrintField(field,
               value,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               use_short_repeated_primitives=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               double_format=None,
               message_formatter=None,
               print_unknown_fields=False,
               force_colon=False):
  """Print a single field name/value pair."""
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintField(field, value)


def PrintFieldValue(field,
                    value,
                    out,
                    indent=0,
                    as_utf8=False,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    double_format=None,
                    message_formatter=None,
                    print_unknown_fields=False,
                    force_colon=False):
  """Print a single field value (not including name)."""
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintFieldValue(field, value)


def _BuildMessageFromTypeName(type_name, descriptor_pool):
  """Returns a protobuf message instance.

  Args:
    type_name: Fully-qualified protobuf message type name string.
    descriptor_pool: DescriptorPool instance.

  Returns:
    A Message instance of type matching type_name, or None if a Descriptor
    wasn't found matching type_name.
  """
  # pylint: disable=g-import-not-at-top
  if descriptor_pool is None:
    from google.protobuf import descriptor_pool as pool_mod
    descriptor_pool = pool_mod.Default()
  from google.protobuf import symbol_database
  database = symbol_database.Default()
  try:
    message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
  except KeyError:
    return None
  message_type = database.GetPrototype(message_descriptor)
  return message_type()


# These values must match WireType enum in google/protobuf/wire_format.h.
WIRETYPE_LENGTH_DELIMITED = 2
WIRETYPE_START_GROUP = 3


class _Printer(object):
  """Text format printer for protocol message."""

  def __init__(
      self,
      out,
      indent=0,
      as_utf8=False,
      as_one_line=False,
      use_short_repeated_primitives=False,
      pointy_brackets=False,
      use_index_order=False,
      float_format=None,
      double_format=None,
      use_field_number=False,
      descriptor_pool=None,
      message_formatter=None,
      print_unknown_fields=False,
      force_colon=False):
    """Initialize the Printer.

    Double values can be formatted compactly with 15 digits of precision
    (which is the most that IEEE 754 "double" can guarantee) using
    double_format='.15g'. To ensure that converting to text and back to a proto
    will result in an identical value, double_format='.17g' should be used.

    Args:
      out: To record the text format result.
      indent: The initial indent level for pretty print.
      as_utf8: Return unescaped Unicode for non-ASCII characters.
      as_one_line: Don't introduce newlines between fields.
      use_short_repeated_primitives: Use short repeated format for primitives.
      pointy_brackets: If True, use angle brackets instead of curly braces for
        nesting.
      use_index_order: If True, print fields of a proto message using the order
        defined in source code instead of the field number. By default, use the
        field number order.
      float_format: If set, use this to specify float field formatting
        (per the "Format Specification Mini-Language"); otherwise, shortest
        float that has same value in wire will be printed. Also affect double
        field if double_format is not set but float_format is set.
      double_format: If set, use this to specify double field formatting
        (per the "Format Specification Mini-Language"); if it is not set but
        float_format is set, use float_format. Otherwise, str() is used.
      use_field_number: If True, print field numbers instead of names.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      message_formatter: A function(message, indent, as_one_line): unicode|None
        to custom format selected sub-messages (usually based on message type).
        Use to pretty print parts of the protobuf for easier diffing.
      print_unknown_fields: If True, unknown fields will be printed.
      force_colon: If set, a colon will be added after the field name even if
        the field is a proto message.
    """
    self.out = out
    self.indent = indent
    self.as_utf8 = as_utf8
    self.as_one_line = as_one_line
    self.use_short_repeated_primitives = use_short_repeated_primitives
    self.pointy_brackets = pointy_brackets
    self.use_index_order = use_index_order
    self.float_format = float_format
    # Doubles fall back to the float format when no explicit double format
    # was requested.
    if double_format is not None:
      self.double_format = double_format
    else:
      self.double_format = float_format
    self.use_field_number = use_field_number
    self.descriptor_pool = descriptor_pool
    self.message_formatter = message_formatter
    self.print_unknown_fields = print_unknown_fields
    self.force_colon = force_colon

  def _TryPrintAsAnyMessage(self, message):
    """Serializes if message is a google.protobuf.Any field."""
    if '/' not in message.type_url:
      return False
    packed_message = _BuildMessageFromTypeName(message.TypeName(),
                                               self.descriptor_pool)
    if packed_message:
      packed_message.MergeFromString(message.value)
      colon = ':' if self.force_colon else ''
      self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
      self._PrintMessageFieldValue(packed_message)
      self.out.write(' ' if self.as_one_line else '\n')
      return True
    else:
      # Type not found in the pool: fall back to printing the raw Any fields.
      return False

  def _TryCustomFormatMessage(self, message):
    """Prints the message via self.message_formatter; False if it declined."""
    formatted = self.message_formatter(message, self.indent, self.as_one_line)
    if formatted is None:
      return False

    out = self.out
    out.write(' ' * self.indent)
    out.write(formatted)
    out.write(' ' if self.as_one_line else '\n')
    return True

  def PrintMessage(self, message):
    """Convert protobuf message to text format.

    Args:
      message: The protocol buffers message.
    """
    if self.message_formatter and self._TryCustomFormatMessage(message):
      return
    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
        self._TryPrintAsAnyMessage(message)):
      return
    fields = message.ListFields()
    if self.use_index_order:
      fields.sort(
          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
    for field, value in fields:
      if _IsMapEntry(field):
        for key in sorted(value):
          # This is slow for maps with submessage entries because it copies the
          # entire tree.  Unfortunately this would take significant refactoring
          # of this file to work around.
          #
          # TODO(haberman): refactor and optimize if this becomes an issue.
          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
          self.PrintField(field, entry_submsg)
      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
        if (self.use_short_repeated_primitives
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
          self._PrintShortRepeatedPrimitivesValue(field, value)
        else:
          for element in value:
            self.PrintField(field, element)
      else:
        self.PrintField(field, value)

    if self.print_unknown_fields:
      self._PrintUnknownFields(unknown_fields.UnknownFieldSet(message))

  def _PrintUnknownFields(self, unknown_field_set):
    """Print unknown fields."""
    out = self.out
    for field in unknown_field_set:
      out.write(' ' * self.indent)
      out.write(str(field.field_number))
      if field.wire_type == WIRETYPE_START_GROUP:
        if self.as_one_line:
          out.write(' { ')
        else:
          out.write(' {\n')
          self.indent += 2

        self._PrintUnknownFields(field.data)

        if self.as_one_line:
          out.write('} ')
        else:
          self.indent -= 2
          out.write(' ' * self.indent + '}\n')
      elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
        try:
          # If this field is parseable as a Message, it is probably
          # an embedded message.
          # pylint: disable=protected-access
          (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
              memoryview(field.data), 0, len(field.data))
        except Exception:  # pylint: disable=broad-except
          pos = 0

        # Only treat it as an embedded message if the decoder consumed the
        # whole payload; otherwise print it as a string/bytes value.
        if pos == len(field.data):
          if self.as_one_line:
            out.write(' { ')
          else:
            out.write(' {\n')
            self.indent += 2

          self._PrintUnknownFields(embedded_unknown_message)

          if self.as_one_line:
            out.write('} ')
          else:
            self.indent -= 2
            out.write(' ' * self.indent + '}\n')
        else:
          # A string or bytes field. self.as_utf8 may not work.
          out.write(': \"')
          out.write(text_encoding.CEscape(field.data, False))
          out.write('\" ' if self.as_one_line else '\"\n')
      else:
        # varint, fixed32, fixed64
        out.write(': ')
        out.write(str(field.data))
        out.write(' ' if self.as_one_line else '\n')

  def _PrintFieldName(self, field):
    """Print field name."""
    out = self.out
    out.write(' ' * self.indent)
    if self.use_field_number:
      out.write(str(field.number))
    else:
      if field.is_extension:
        out.write('[')
        if (field.containing_type.GetOptions().message_set_wire_format and
            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
          out.write(field.message_type.full_name)
        else:
          out.write(field.full_name)
        out.write(']')
      elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
        # For groups, use the capitalized name.
        out.write(field.message_type.name)
      else:
        out.write(field.name)

    if (self.force_colon or
        field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
      # The colon is optional in this case, but our cross-language golden files
      # don't include it. Here, the colon is only included if force_colon is
      # set to True
      out.write(':')

  def PrintField(self, field, value):
    """Print a single field name/value pair."""
    self._PrintFieldName(field)
    self.out.write(' ')
    self.PrintFieldValue(field, value)
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintShortRepeatedPrimitivesValue(self, field, value):
    """Prints short repeated primitives value."""
    # Note: this is called only when value has at least one element.
    self._PrintFieldName(field)
    self.out.write(' [')
    for i in range(len(value) - 1):
      self.PrintFieldValue(field, value[i])
      self.out.write(', ')
    self.PrintFieldValue(field, value[-1])
    self.out.write(']')
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintMessageFieldValue(self, value):
    """Prints a nested message value wrapped in {} or <> brackets."""
    if self.pointy_brackets:
      openb = '<'
      closeb = '>'
    else:
      openb = '{'
      closeb = '}'

    if self.as_one_line:
      self.out.write('%s ' % openb)
      self.PrintMessage(value)
      self.out.write(closeb)
    else:
      self.out.write('%s\n' % openb)
      self.indent += 2
      self.PrintMessage(value)
      self.indent -= 2
      self.out.write(' ' * self.indent + closeb)

  def PrintFieldValue(self, field, value):
    """Print a single field value (not including name).

    For repeated fields, the value should be a single element.

    Args:
      field: The descriptor of the field to be printed.
      value: The value of the field.
    """
    out = self.out
    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      self._PrintMessageFieldValue(value)
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
      enum_value = field.enum_type.values_by_number.get(value, None)
      if enum_value is not None:
        out.write(enum_value.name)
      else:
        # Unknown enum number (e.g. proto3 open enums): print the raw number.
        out.write(str(value))
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
      out.write('\"')
      if isinstance(value, str) and not self.as_utf8:
        out_value = value.encode('utf-8')
      else:
        out_value = value
      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
        # We always need to escape all binary data in TYPE_BYTES fields.
        out_as_utf8 = False
      else:
        out_as_utf8 = self.as_utf8
      out.write(text_encoding.CEscape(out_value, out_as_utf8))
      out.write('\"')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
      if value:
        out.write('true')
      else:
        out.write('false')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
      if self.float_format is not None:
        out.write('{1:{0}}'.format(self.float_format, value))
      else:
        if math.isnan(value):
          out.write(str(value))
        else:
          out.write(str(type_checkers.ToShortestFloat(value)))
    elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
          self.double_format is not None):
      out.write('{1:{0}}'.format(self.double_format, value))
    else:
      out.write(str(value))


def Parse(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  NOTE: for historical reasons this function does not clear the input
  message. This is different from what the binary msg.ParseFrom(...) does.
  If text contains a field already set in message, the value is appended if the
  field is repeated.
  Otherwise, an error is raised.

  Example::

    a = MyProto()
    a.repeated_field.append('test')
    b = MyProto()

    # Repeated fields are combined
    text_format.Parse(repr(a), b)
    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]

    # Non-repeated fields cannot be overwritten
    a.singular_field = 1
    b.singular_field = 2
    text_format.Parse(repr(a), b) # ParseError

    # Binary version:
    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"

  Caller is responsible for clearing the message as needed.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'),
                    message,
                    allow_unknown_extension,
                    allow_field_number,
                    descriptor_pool=descriptor_pool,
                    allow_unknown_field=allow_unknown_field)


def Merge(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  Like Parse(), but allows repeated values for a non-repeated field, and uses
  the last one. This means any non-repeated, top-level fields specified in text
  replace those in the message.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return MergeLines(
      text.split(b'\n' if isinstance(text, bytes) else u'\n'),
      message,
      allow_unknown_extension,
      allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field)


def ParseLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  See Parse() for caveats.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  parser = _Parser(allow_unknown_extension,
                   allow_field_number,
                   descriptor_pool=descriptor_pool,
                   allow_unknown_field=allow_unknown_field)
  return parser.ParseLines(lines, message)


def MergeLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  See Merge() for more details.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  parser = _Parser(allow_unknown_extension,
                   allow_field_number,
                   descriptor_pool=descriptor_pool,
                   allow_unknown_field=allow_unknown_field)
  return parser.MergeLines(lines, message)


class _Parser(object):
  """Text format parser for protocol message."""

  def __init__(self,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
    self.allow_unknown_extension = allow_unknown_extension
    self.allow_field_number = allow_field_number
    self.descriptor_pool = descriptor_pool
    self.allow_unknown_field = allow_unknown_field

  def ParseLines(self, lines, message):
    """Parses a text representation of a protocol message into a message."""
    # Parse semantics: duplicate scalar values for the same field are an error.
    self._allow_multiple_scalars = False
    self._ParseOrMerge(lines, message)
    return message

  def MergeLines(self, lines, message):
    """Merges a text representation of a protocol message into a message."""
    # Merge semantics: the last value wins for repeated scalar occurrences.
    self._allow_multiple_scalars = True
    self._ParseOrMerge(lines, message)
    return message

  def _ParseOrMerge(self, lines, message):
    """Converts a text representation of a protocol message into a message.

    Args:
      lines: Lines of a message's text representation.
      message: A protocol buffer message to merge into.

    Raises:
      ParseError: On text parsing problems.
    """
    # Tokenize expects native str lines.
    str_lines = (
        line if isinstance(line, str) else line.decode('utf-8')
        for line in lines)
    tokenizer = Tokenizer(str_lines)
    while not tokenizer.AtEnd():
      self._MergeField(tokenizer, message)

  def _MergeField(self, tokenizer, message):
    """Merges a single protocol message field into a message.

    Args:
      tokenizer: A tokenizer to parse the field name and values.
      message: A protocol message to record the data.

    Raises:
      ParseError: In case of text parsing problems.
    """
    message_descriptor = message.DESCRIPTOR
    # Expanded Any syntax: "[type.googleapis.com/pkg.Type] { ... }".
    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
        tokenizer.TryConsume('[')):
      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
      tokenizer.Consume(']')
      tokenizer.TryConsume(':')
      if tokenizer.TryConsume('<'):
        expanded_any_end_token = '>'
      else:
        tokenizer.Consume('{')
        expanded_any_end_token = '}'
      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
                                                           self.descriptor_pool)
      # Direct comparison with None is used instead of implicit bool conversion
      # to avoid false positives with falsy initial values, e.g. for
      # google.protobuf.ListValue.
      if expanded_any_sub_message is None:
        raise ParseError('Type %s not found in descriptor pool' %
                         packed_type_name)
      while not tokenizer.TryConsume(expanded_any_end_token):
        if tokenizer.AtEnd():
          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
                                                  (expanded_any_end_token,))
        self._MergeField(tokenizer, expanded_any_sub_message)
      deterministic = False

      message.Pack(expanded_any_sub_message,
                   type_url_prefix=type_url_prefix,
                   deterministic=deterministic)
      return

    # Extension field: "[pkg.full.name]".
    if tokenizer.TryConsume('['):
      name = [tokenizer.ConsumeIdentifier()]
      while tokenizer.TryConsume('.'):
        name.append(tokenizer.ConsumeIdentifier())
      name = '.'.join(name)

      if not message_descriptor.is_extendable:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" does not have extensions.' %
            message_descriptor.full_name)
      # pylint: disable=protected-access
      field = message.Extensions._FindExtensionByName(name)
      # pylint: enable=protected-access

      if not field:
        if self.allow_unknown_extension:
          field = None
        else:
          raise tokenizer.ParseErrorPreviousToken(
              'Extension "%s" not registered. '
              'Did you import the _pb2 module which defines it? '
              'If you are trying to place the extension in the MessageSet '
              'field of another message that is in an Any or MessageSet field, '
              'that message\'s _pb2 module must be imported as well' % name)
      elif message_descriptor != field.containing_type:
        raise tokenizer.ParseErrorPreviousToken(
            'Extension "%s" does not extend message type "%s".' %
            (name, message_descriptor.full_name))

      tokenizer.Consume(']')

    else:
      name = tokenizer.ConsumeIdentifierOrNumber()
      if self.allow_field_number and name.isdigit():
        number = ParseInteger(name, True, True)
        field = message_descriptor.fields_by_number.get(number, None)
        if not field and message_descriptor.is_extendable:
          field = message.Extensions._FindExtensionByNumber(number)
      else:
        field = message_descriptor.fields_by_name.get(name, None)

        # Group names are expected to be capitalized as they appear in the
        # .proto file, which actually matches their type names, not their field
        # names.
        if not field:
          field = message_descriptor.fields_by_name.get(name.lower(), None)
          if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
            field = None

        if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
            field.message_type.name != name):
          field = None

      if not field and not self.allow_unknown_field:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" has no field named "%s".' %
            (message_descriptor.full_name, name))

    if field:
      if not self._allow_multiple_scalars and field.containing_oneof:
        # Check if there's a different field set in this oneof.
        # Note that we ignore the case if the same field was set before, and we
        # apply _allow_multiple_scalars to non-scalar fields as well.
        which_oneof = message.WhichOneof(field.containing_oneof.name)
        if which_oneof is not None and which_oneof != field.name:
          raise tokenizer.ParseErrorPreviousToken(
              'Field "%s" is specified along with field "%s", another member '
              'of oneof "%s" for message type "%s".' %
              (field.name, which_oneof, field.containing_oneof.name,
               message_descriptor.full_name))

      # The colon before a message-typed value is optional; it is required
      # before scalar values.
      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        tokenizer.TryConsume(':')
        merger = self._MergeMessageField
      else:
        tokenizer.Consume(':')
        merger = self._MergeScalarField

      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
          tokenizer.TryConsume('[')):
        # Short repeated format, e.g. "foo: [1, 2, 3]"
        if not tokenizer.TryConsume(']'):
          while True:
            merger(tokenizer, message, field)
            if tokenizer.TryConsume(']'):
              break
            tokenizer.Consume(',')

      else:
        merger(tokenizer, message, field)

    else:  # Proto field is unknown.
      assert (self.allow_unknown_extension or self.allow_unknown_field)
      _SkipFieldContents(tokenizer)

    # For historical reasons, fields may optionally be separated by commas or
    # semicolons.
    if not tokenizer.TryConsume(','):
      tokenizer.TryConsume(';')

  def _ConsumeAnyTypeUrl(self, tokenizer):
    """Consumes a google.protobuf.Any type URL and returns the type name."""
    # Consume "type.googleapis.com/".
    prefix = [tokenizer.ConsumeIdentifier()]
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('/')
    # Consume the fully-qualified type name.
1019 name = [tokenizer.ConsumeIdentifier()] 1020 while tokenizer.TryConsume('.'): 1021 name.append(tokenizer.ConsumeIdentifier()) 1022 return '.'.join(prefix), '.'.join(name) 1023 1024 def _MergeMessageField(self, tokenizer, message, field): 1025 """Merges a single scalar field into a message. 1026 1027 Args: 1028 tokenizer: A tokenizer to parse the field value. 1029 message: The message of which field is a member. 1030 field: The descriptor of the field to be merged. 1031 1032 Raises: 1033 ParseError: In case of text parsing problems. 1034 """ 1035 is_map_entry = _IsMapEntry(field) 1036 1037 if tokenizer.TryConsume('<'): 1038 end_token = '>' 1039 else: 1040 tokenizer.Consume('{') 1041 end_token = '}' 1042 1043 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED: 1044 if field.is_extension: 1045 sub_message = message.Extensions[field].add() 1046 elif is_map_entry: 1047 sub_message = getattr(message, field.name).GetEntryClass()() 1048 else: 1049 sub_message = getattr(message, field.name).add() 1050 else: 1051 if field.is_extension: 1052 if (not self._allow_multiple_scalars and 1053 message.HasExtension(field)): 1054 raise tokenizer.ParseErrorPreviousToken( 1055 'Message type "%s" should not have multiple "%s" extensions.' % 1056 (message.DESCRIPTOR.full_name, field.full_name)) 1057 sub_message = message.Extensions[field] 1058 else: 1059 # Also apply _allow_multiple_scalars to message field. 1060 # TODO(jieluo): Change to _allow_singular_overwrites. 1061 if (not self._allow_multiple_scalars and 1062 message.HasField(field.name)): 1063 raise tokenizer.ParseErrorPreviousToken( 1064 'Message type "%s" should not have multiple "%s" fields.' % 1065 (message.DESCRIPTOR.full_name, field.name)) 1066 sub_message = getattr(message, field.name) 1067 sub_message.SetInParent() 1068 1069 while not tokenizer.TryConsume(end_token): 1070 if tokenizer.AtEnd(): 1071 raise tokenizer.ParseErrorPreviousToken('Expected "%s".' 
% (end_token,)) 1072 self._MergeField(tokenizer, sub_message) 1073 1074 if is_map_entry: 1075 value_cpptype = field.message_type.fields_by_name['value'].cpp_type 1076 if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE: 1077 value = getattr(message, field.name)[sub_message.key] 1078 value.CopyFrom(sub_message.value) 1079 else: 1080 getattr(message, field.name)[sub_message.key] = sub_message.value 1081 1082 @staticmethod 1083 def _IsProto3Syntax(message): 1084 message_descriptor = message.DESCRIPTOR 1085 return (hasattr(message_descriptor, 'syntax') and 1086 message_descriptor.syntax == 'proto3') 1087 1088 def _MergeScalarField(self, tokenizer, message, field): 1089 """Merges a single scalar field into a message. 1090 1091 Args: 1092 tokenizer: A tokenizer to parse the field value. 1093 message: A protocol message to record the data. 1094 field: The descriptor of the field to be merged. 1095 1096 Raises: 1097 ParseError: In case of text parsing problems. 1098 RuntimeError: On runtime errors. 
1099 """ 1100 _ = self.allow_unknown_extension 1101 value = None 1102 1103 if field.type in (descriptor.FieldDescriptor.TYPE_INT32, 1104 descriptor.FieldDescriptor.TYPE_SINT32, 1105 descriptor.FieldDescriptor.TYPE_SFIXED32): 1106 value = _ConsumeInt32(tokenizer) 1107 elif field.type in (descriptor.FieldDescriptor.TYPE_INT64, 1108 descriptor.FieldDescriptor.TYPE_SINT64, 1109 descriptor.FieldDescriptor.TYPE_SFIXED64): 1110 value = _ConsumeInt64(tokenizer) 1111 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32, 1112 descriptor.FieldDescriptor.TYPE_FIXED32): 1113 value = _ConsumeUint32(tokenizer) 1114 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64, 1115 descriptor.FieldDescriptor.TYPE_FIXED64): 1116 value = _ConsumeUint64(tokenizer) 1117 elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT, 1118 descriptor.FieldDescriptor.TYPE_DOUBLE): 1119 value = tokenizer.ConsumeFloat() 1120 elif field.type == descriptor.FieldDescriptor.TYPE_BOOL: 1121 value = tokenizer.ConsumeBool() 1122 elif field.type == descriptor.FieldDescriptor.TYPE_STRING: 1123 value = tokenizer.ConsumeString() 1124 elif field.type == descriptor.FieldDescriptor.TYPE_BYTES: 1125 value = tokenizer.ConsumeByteString() 1126 elif field.type == descriptor.FieldDescriptor.TYPE_ENUM: 1127 value = tokenizer.ConsumeEnum(field) 1128 else: 1129 raise RuntimeError('Unknown field type %d' % field.type) 1130 1131 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED: 1132 if field.is_extension: 1133 message.Extensions[field].append(value) 1134 else: 1135 getattr(message, field.name).append(value) 1136 else: 1137 if field.is_extension: 1138 if (not self._allow_multiple_scalars and 1139 not self._IsProto3Syntax(message) and 1140 message.HasExtension(field)): 1141 raise tokenizer.ParseErrorPreviousToken( 1142 'Message type "%s" should not have multiple "%s" extensions.' 
% 1143 (message.DESCRIPTOR.full_name, field.full_name)) 1144 else: 1145 message.Extensions[field] = value 1146 else: 1147 duplicate_error = False 1148 if not self._allow_multiple_scalars: 1149 if self._IsProto3Syntax(message): 1150 # Proto3 doesn't represent presence so we try best effort to check 1151 # multiple scalars by compare to default values. 1152 duplicate_error = bool(getattr(message, field.name)) 1153 else: 1154 duplicate_error = message.HasField(field.name) 1155 1156 if duplicate_error: 1157 raise tokenizer.ParseErrorPreviousToken( 1158 'Message type "%s" should not have multiple "%s" fields.' % 1159 (message.DESCRIPTOR.full_name, field.name)) 1160 else: 1161 setattr(message, field.name, value) 1162 1163 1164def _SkipFieldContents(tokenizer): 1165 """Skips over contents (value or message) of a field. 1166 1167 Args: 1168 tokenizer: A tokenizer to parse the field name and values. 1169 """ 1170 # Try to guess the type of this field. 1171 # If this field is not a message, there should be a ":" between the 1172 # field name and the field value and also the field value should not 1173 # start with "{" or "<" which indicates the beginning of a message body. 1174 # If there is no ":" or there is a "{" or "<" after ":", this field has 1175 # to be a message or the input is ill-formed. 1176 if tokenizer.TryConsume( 1177 ':') and not tokenizer.LookingAt('{') and not tokenizer.LookingAt('<'): 1178 if tokenizer.LookingAt('['): 1179 _SkipRepeatedFieldValue(tokenizer) 1180 else: 1181 _SkipFieldValue(tokenizer) 1182 else: 1183 _SkipFieldMessage(tokenizer) 1184 1185 1186def _SkipField(tokenizer): 1187 """Skips over a complete field (name and value/message). 1188 1189 Args: 1190 tokenizer: A tokenizer to parse the field name and values. 1191 """ 1192 if tokenizer.TryConsume('['): 1193 # Consume extension name. 
1194 tokenizer.ConsumeIdentifier() 1195 while tokenizer.TryConsume('.'): 1196 tokenizer.ConsumeIdentifier() 1197 tokenizer.Consume(']') 1198 else: 1199 tokenizer.ConsumeIdentifierOrNumber() 1200 1201 _SkipFieldContents(tokenizer) 1202 1203 # For historical reasons, fields may optionally be separated by commas or 1204 # semicolons. 1205 if not tokenizer.TryConsume(','): 1206 tokenizer.TryConsume(';') 1207 1208 1209def _SkipFieldMessage(tokenizer): 1210 """Skips over a field message. 1211 1212 Args: 1213 tokenizer: A tokenizer to parse the field name and values. 1214 """ 1215 1216 if tokenizer.TryConsume('<'): 1217 delimiter = '>' 1218 else: 1219 tokenizer.Consume('{') 1220 delimiter = '}' 1221 1222 while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'): 1223 _SkipField(tokenizer) 1224 1225 tokenizer.Consume(delimiter) 1226 1227 1228def _SkipFieldValue(tokenizer): 1229 """Skips over a field value. 1230 1231 Args: 1232 tokenizer: A tokenizer to parse the field name and values. 1233 1234 Raises: 1235 ParseError: In case an invalid field value is found. 1236 """ 1237 # String/bytes tokens can come in multiple adjacent string literals. 1238 # If we can consume one, consume as many as we can. 1239 if tokenizer.TryConsumeByteString(): 1240 while tokenizer.TryConsumeByteString(): 1241 pass 1242 return 1243 1244 if (not tokenizer.TryConsumeIdentifier() and 1245 not _TryConsumeInt64(tokenizer) and not _TryConsumeUint64(tokenizer) and 1246 not tokenizer.TryConsumeFloat()): 1247 raise ParseError('Invalid field value: ' + tokenizer.token) 1248 1249 1250def _SkipRepeatedFieldValue(tokenizer): 1251 """Skips over a repeated field value. 1252 1253 Args: 1254 tokenizer: A tokenizer to parse the field value. 
1255 """ 1256 tokenizer.Consume('[') 1257 if not tokenizer.LookingAt(']'): 1258 _SkipFieldValue(tokenizer) 1259 while tokenizer.TryConsume(','): 1260 _SkipFieldValue(tokenizer) 1261 tokenizer.Consume(']') 1262 1263 1264class Tokenizer(object): 1265 """Protocol buffer text representation tokenizer. 1266 1267 This class handles the lower level string parsing by splitting it into 1268 meaningful tokens. 1269 1270 It was directly ported from the Java protocol buffer API. 1271 """ 1272 1273 _WHITESPACE = re.compile(r'\s+') 1274 _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE) 1275 _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE) 1276 _TOKEN = re.compile('|'.join([ 1277 r'[a-zA-Z_][0-9a-zA-Z_+-]*', # an identifier 1278 r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*', # a number 1279 ] + [ # quoted str for each quote mark 1280 # Avoid backtracking! https://stackoverflow.com/a/844267 1281 r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark) 1282 for mark in _QUOTES 1283 ])) 1284 1285 _IDENTIFIER = re.compile(r'[^\d\W]\w*') 1286 _IDENTIFIER_OR_NUMBER = re.compile(r'\w+') 1287 1288 def __init__(self, lines, skip_comments=True): 1289 self._position = 0 1290 self._line = -1 1291 self._column = 0 1292 self._token_start = None 1293 self.token = '' 1294 self._lines = iter(lines) 1295 self._current_line = '' 1296 self._previous_line = 0 1297 self._previous_column = 0 1298 self._more_lines = True 1299 self._skip_comments = skip_comments 1300 self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT 1301 or self._WHITESPACE) 1302 self._SkipWhitespace() 1303 self.NextToken() 1304 1305 def LookingAt(self, token): 1306 return self.token == token 1307 1308 def AtEnd(self): 1309 """Checks the end of the text was reached. 1310 1311 Returns: 1312 True iff the end was reached. 
1313 """ 1314 return not self.token 1315 1316 def _PopLine(self): 1317 while len(self._current_line) <= self._column: 1318 try: 1319 self._current_line = next(self._lines) 1320 except StopIteration: 1321 self._current_line = '' 1322 self._more_lines = False 1323 return 1324 else: 1325 self._line += 1 1326 self._column = 0 1327 1328 def _SkipWhitespace(self): 1329 while True: 1330 self._PopLine() 1331 match = self._whitespace_pattern.match(self._current_line, self._column) 1332 if not match: 1333 break 1334 length = len(match.group(0)) 1335 self._column += length 1336 1337 def TryConsume(self, token): 1338 """Tries to consume a given piece of text. 1339 1340 Args: 1341 token: Text to consume. 1342 1343 Returns: 1344 True iff the text was consumed. 1345 """ 1346 if self.token == token: 1347 self.NextToken() 1348 return True 1349 return False 1350 1351 def Consume(self, token): 1352 """Consumes a piece of text. 1353 1354 Args: 1355 token: Text to consume. 1356 1357 Raises: 1358 ParseError: If the text couldn't be consumed. 1359 """ 1360 if not self.TryConsume(token): 1361 raise self.ParseError('Expected "%s".' % token) 1362 1363 def ConsumeComment(self): 1364 result = self.token 1365 if not self._COMMENT.match(result): 1366 raise self.ParseError('Expected comment.') 1367 self.NextToken() 1368 return result 1369 1370 def ConsumeCommentOrTrailingComment(self): 1371 """Consumes a comment, returns a 2-tuple (trailing bool, comment str).""" 1372 1373 # Tokenizer initializes _previous_line and _previous_column to 0. As the 1374 # tokenizer starts, it looks like there is a previous token on the line. 1375 just_started = self._line == 0 and self._column == 0 1376 1377 before_parsing = self._previous_line 1378 comment = self.ConsumeComment() 1379 1380 # A trailing comment is a comment on the same line than the previous token. 
1381 trailing = (self._previous_line == before_parsing 1382 and not just_started) 1383 1384 return trailing, comment 1385 1386 def TryConsumeIdentifier(self): 1387 try: 1388 self.ConsumeIdentifier() 1389 return True 1390 except ParseError: 1391 return False 1392 1393 def ConsumeIdentifier(self): 1394 """Consumes protocol message field identifier. 1395 1396 Returns: 1397 Identifier string. 1398 1399 Raises: 1400 ParseError: If an identifier couldn't be consumed. 1401 """ 1402 result = self.token 1403 if not self._IDENTIFIER.match(result): 1404 raise self.ParseError('Expected identifier.') 1405 self.NextToken() 1406 return result 1407 1408 def TryConsumeIdentifierOrNumber(self): 1409 try: 1410 self.ConsumeIdentifierOrNumber() 1411 return True 1412 except ParseError: 1413 return False 1414 1415 def ConsumeIdentifierOrNumber(self): 1416 """Consumes protocol message field identifier. 1417 1418 Returns: 1419 Identifier string. 1420 1421 Raises: 1422 ParseError: If an identifier couldn't be consumed. 1423 """ 1424 result = self.token 1425 if not self._IDENTIFIER_OR_NUMBER.match(result): 1426 raise self.ParseError('Expected identifier or number, got %s.' % result) 1427 self.NextToken() 1428 return result 1429 1430 def TryConsumeInteger(self): 1431 try: 1432 self.ConsumeInteger() 1433 return True 1434 except ParseError: 1435 return False 1436 1437 def ConsumeInteger(self): 1438 """Consumes an integer number. 1439 1440 Returns: 1441 The integer parsed. 1442 1443 Raises: 1444 ParseError: If an integer couldn't be consumed. 1445 """ 1446 try: 1447 result = _ParseAbstractInteger(self.token) 1448 except ValueError as e: 1449 raise self.ParseError(str(e)) 1450 self.NextToken() 1451 return result 1452 1453 def TryConsumeFloat(self): 1454 try: 1455 self.ConsumeFloat() 1456 return True 1457 except ParseError: 1458 return False 1459 1460 def ConsumeFloat(self): 1461 """Consumes an floating point number. 1462 1463 Returns: 1464 The number parsed. 
1465 1466 Raises: 1467 ParseError: If a floating point number couldn't be consumed. 1468 """ 1469 try: 1470 result = ParseFloat(self.token) 1471 except ValueError as e: 1472 raise self.ParseError(str(e)) 1473 self.NextToken() 1474 return result 1475 1476 def ConsumeBool(self): 1477 """Consumes a boolean value. 1478 1479 Returns: 1480 The bool parsed. 1481 1482 Raises: 1483 ParseError: If a boolean value couldn't be consumed. 1484 """ 1485 try: 1486 result = ParseBool(self.token) 1487 except ValueError as e: 1488 raise self.ParseError(str(e)) 1489 self.NextToken() 1490 return result 1491 1492 def TryConsumeByteString(self): 1493 try: 1494 self.ConsumeByteString() 1495 return True 1496 except ParseError: 1497 return False 1498 1499 def ConsumeString(self): 1500 """Consumes a string value. 1501 1502 Returns: 1503 The string parsed. 1504 1505 Raises: 1506 ParseError: If a string value couldn't be consumed. 1507 """ 1508 the_bytes = self.ConsumeByteString() 1509 try: 1510 return str(the_bytes, 'utf-8') 1511 except UnicodeDecodeError as e: 1512 raise self._StringParseError(e) 1513 1514 def ConsumeByteString(self): 1515 """Consumes a byte array value. 1516 1517 Returns: 1518 The array parsed (as a string). 1519 1520 Raises: 1521 ParseError: If a byte array value couldn't be consumed. 1522 """ 1523 the_list = [self._ConsumeSingleByteString()] 1524 while self.token and self.token[0] in _QUOTES: 1525 the_list.append(self._ConsumeSingleByteString()) 1526 return b''.join(the_list) 1527 1528 def _ConsumeSingleByteString(self): 1529 """Consume one token of a string literal. 1530 1531 String literals (whether bytes or text) can come in multiple adjacent 1532 tokens which are automatically concatenated, like in C or Python. This 1533 method only consumes one token. 1534 1535 Returns: 1536 The token parsed. 1537 Raises: 1538 ParseError: When the wrong format data is found. 
1539 """ 1540 text = self.token 1541 if len(text) < 1 or text[0] not in _QUOTES: 1542 raise self.ParseError('Expected string but found: %r' % (text,)) 1543 1544 if len(text) < 2 or text[-1] != text[0]: 1545 raise self.ParseError('String missing ending quote: %r' % (text,)) 1546 1547 try: 1548 result = text_encoding.CUnescape(text[1:-1]) 1549 except ValueError as e: 1550 raise self.ParseError(str(e)) 1551 self.NextToken() 1552 return result 1553 1554 def ConsumeEnum(self, field): 1555 try: 1556 result = ParseEnum(field, self.token) 1557 except ValueError as e: 1558 raise self.ParseError(str(e)) 1559 self.NextToken() 1560 return result 1561 1562 def ParseErrorPreviousToken(self, message): 1563 """Creates and *returns* a ParseError for the previously read token. 1564 1565 Args: 1566 message: A message to set for the exception. 1567 1568 Returns: 1569 A ParseError instance. 1570 """ 1571 return ParseError(message, self._previous_line + 1, 1572 self._previous_column + 1) 1573 1574 def ParseError(self, message): 1575 """Creates and *returns* a ParseError for the current token.""" 1576 return ParseError('\'' + self._current_line + '\': ' + message, 1577 self._line + 1, self._column + 1) 1578 1579 def _StringParseError(self, e): 1580 return self.ParseError('Couldn\'t parse string: ' + str(e)) 1581 1582 def NextToken(self): 1583 """Reads the next meaningful token.""" 1584 self._previous_line = self._line 1585 self._previous_column = self._column 1586 1587 self._column += len(self.token) 1588 self._SkipWhitespace() 1589 1590 if not self._more_lines: 1591 self.token = '' 1592 return 1593 1594 match = self._TOKEN.match(self._current_line, self._column) 1595 if not match and not self._skip_comments: 1596 match = self._COMMENT.match(self._current_line, self._column) 1597 if match: 1598 token = match.group(0) 1599 self.token = token 1600 else: 1601 self.token = self._current_line[self._column] 1602 1603# Aliased so it can still be accessed by current visibility violators. 
# TODO(dbarnett): Migrate violators to textformat_tokenizer.
_Tokenizer = Tokenizer  # pylint: disable=invalid-name


def _ConsumeInt32(tokenizer):
  """Consumes a signed 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=False)


def _ConsumeUint32(tokenizer):
  """Consumes an unsigned 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=False, is_long=False)


def _TryConsumeInt64(tokenizer):
  """Non-raising variant of _ConsumeInt64; returns success bool."""
  try:
    _ConsumeInt64(tokenizer)
    return True
  except ParseError:
    return False


def _ConsumeInt64(tokenizer):
  """Consumes a signed 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)


def _TryConsumeUint64(tokenizer):
  """Non-raising variant of _ConsumeUint64; returns success bool."""
  try:
    _ConsumeUint64(tokenizer)
    return True
  except ParseError:
    return False


def _ConsumeUint64(tokenizer):
  """Consumes an unsigned 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=False, is_long=True)


def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Consumes an integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an integer with given characteristics couldn't be consumed.
  """
  try:
    result = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long)
  except ValueError as e:
    raise tokenizer.ParseError(str(e))
  tokenizer.NextToken()
  return result


def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: If the text is not a valid integer, or is out of range for
      the requested signedness/width.
  """
  # Do the actual parsing. Exception handling is propagated to caller.
  result = _ParseAbstractInteger(text)

  # Check if the integer is sane. Exceptions handled by callers.
  # Index layout: [uint32, int32, uint64, int64] (see _INTEGER_CHECKERS).
  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
  checker.CheckValue(result)
  return result


def _ParseAbstractInteger(text):
  """Parses an integer without checking size/signedness.

  Args:
    text: The text to parse.

  Returns:
    The integer value.

  Raises:
    ValueError: If the text is not a valid integer.
  """
  # Do the actual parsing. Exception handling is propagated to caller.
  orig_text = text
  c_octal_match = re.match(r'(-?)0(\d+)$', text)
  if c_octal_match:
    # Python 3 no longer supports 0755 octal syntax without the 'o', so
    # we always use the '0o' prefix for multi-digit numbers starting with 0.
    text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
  try:
    return int(text, 0)
  except ValueError:
    raise ValueError('Couldn\'t parse integer: %s' % orig_text)


def ParseFloat(text):
  """Parse a floating point number.

  Accepts Python float syntax plus text-format spellings: 'inf'/'infinity'
  (optionally signed, optional trailing 'f'), 'nan', and a trailing 'f'
  suffix as in '1.0f'.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Assume Python compatible syntax.
    return float(text)
  except ValueError:
    # Check alternative spellings.
    if _FLOAT_INFINITY.match(text):
      if text[0] == '-':
        return float('-inf')
      else:
        return float('inf')
    elif _FLOAT_NAN.match(text):
      return float('nan')
    else:
      # assume '1.0f' format
      try:
        return float(text.rstrip('f'))
      except ValueError:
        raise ValueError('Couldn\'t parse float: %s' % text)


def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  if text in ('true', 't', '1', 'True'):
    return True
  elif text in ('false', 'f', '0', 'False'):
    return False
  else:
    raise ValueError('Expected "true" or "false".')


def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Identifier.
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value named %s.' %
                       (enum_descriptor.full_name, value))
  else:
    # Numeric value.
    if hasattr(field.file, 'syntax'):
      # Attribute is checked for compatibility.
      if field.file.syntax == 'proto3':
        # Proto3 accept numeric unknown enums.
        return number
    enum_value = enum_descriptor.values_by_number.get(number, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value with number %d.' %
                       (enum_descriptor.full_name, number))
  return enum_value.number