1 '''"Executable documentation" for the pickle module.
2 
3 Extensive comments about the pickle protocols and pickle-machine opcodes
4 can be found here.  Some functions meant for external use:
5 
6 genops(pickle)
7    Generate all the opcodes in a pickle, as (opcode, arg, position) triples.
8 
9 dis(pickle, out=None, memo=None, indentlevel=4)
10    Print a symbolic disassembly of a pickle.
11 '''
12 
13 import codecs
14 import io
15 import pickle
16 import re
17 import sys
18 
# Public names exported by a star-import of this module.
__all__ = ['dis', 'genops', 'optimize']

# Module-level alias for pickle.bytes_types.
bytes_types = pickle.bytes_types
22 
23 # Other ideas:
24 #
25 # - A pickle verifier:  read a pickle and check it exhaustively for
26 #   well-formedness.  dis() does a lot of this already.
27 #
28 # - A protocol identifier:  examine a pickle and return its protocol number
29 #   (== the highest .proto attr value among all the opcodes in the pickle).
30 #   dis() already prints this info at the end.
31 #
32 # - A pickle optimizer:  for example, tuple-building code is sometimes more
33 #   elaborate than necessary, catering for the possibility that the tuple
34 #   is recursive.  Or lots of times a PUT is generated that's never accessed
35 #   by a later GET.
36 
37 
38 # "A pickle" is a program for a virtual pickle machine (PM, but more accurately
39 # called an unpickling machine).  It's a sequence of opcodes, interpreted by the
40 # PM, building an arbitrarily complex Python object.
41 #
42 # For the most part, the PM is very simple:  there are no looping, testing, or
43 # conditional instructions, no arithmetic and no function calls.  Opcodes are
44 # executed once each, from first to last, until a STOP opcode is reached.
45 #
46 # The PM has two data areas, "the stack" and "the memo".
47 #
48 # Many opcodes push Python objects onto the stack; e.g., INT pushes a Python
49 # integer object on the stack, whose value is gotten from a decimal string
50 # literal immediately following the INT opcode in the pickle bytestream.  Other
51 # opcodes take Python objects off the stack.  The result of unpickling is
52 # whatever object is left on the stack when the final STOP opcode is executed.
53 #
54 # The memo is simply an array of objects, or it can be implemented as a dict
55 # mapping little integers to objects.  The memo serves as the PM's "long term
56 # memory", and the little integers indexing the memo are akin to variable
57 # names.  Some opcodes pop a stack object into the memo at a given index,
58 # and others push a memo object at a given index onto the stack again.
59 #
60 # At heart, that's all the PM has.  Subtleties arise for these reasons:
61 #
62 # + Object identity.  Objects can be arbitrarily complex, and subobjects
63 #   may be shared (for example, the list [a, a] refers to the same object a
64 #   twice).  It can be vital that unpickling recreate an isomorphic object
65 #   graph, faithfully reproducing sharing.
66 #
67 # + Recursive objects.  For example, after "L = []; L.append(L)", L is a
68 #   list, and L[0] is the same list.  This is related to the object identity
69 #   point, and some sequences of pickle opcodes are subtle in order to
70 #   get the right result in all cases.
71 #
72 # + Things pickle doesn't know everything about.  Examples of things pickle
73 #   does know everything about are Python's builtin scalar and container
74 #   types, like ints and tuples.  They generally have opcodes dedicated to
75 #   them.  For things like module references and instances of user-defined
76 #   classes, pickle's knowledge is limited.  Historically, many enhancements
77 #   have been made to the pickle protocol in order to do a better (faster,
78 #   and/or more compact) job on those.
79 #
80 # + Backward compatibility and micro-optimization.  As explained below,
81 #   pickle opcodes never go away, not even when better ways to do a thing
82 #   get invented.  The repertoire of the PM just keeps growing over time.
83 #   For example, protocol 0 had two opcodes for building Python integers (INT
84 #   and LONG), protocol 1 added three more for more-efficient pickling of short
85 #   integers, and protocol 2 added two more for more-efficient pickling of
86 #   long integers (before protocol 2, the only ways to pickle a Python long
87 #   took time quadratic in the number of digits, for both pickling and
88 #   unpickling).  "Opcode bloat" isn't so much a subtlety as a source of
89 #   wearying complication.
90 #
91 #
92 # Pickle protocols:
93 #
94 # For compatibility, the meaning of a pickle opcode never changes.  Instead new
95 # pickle opcodes get added, and each version's unpickler can handle all the
96 # pickle opcodes in all protocol versions to date.  So old pickles continue to
97 # be readable forever.  The pickler can generally be told to restrict itself to
98 # the subset of opcodes available under previous protocol versions too, so that
99 # users can create pickles under the current version readable by older
100 # versions.  However, a pickle does not contain its version number embedded
101 # within it.  If an older unpickler tries to read a pickle using a later
102 # protocol, the result is most likely an exception due to seeing an unknown (in
103 # the older unpickler) opcode.
104 #
105 # The original pickle used what's now called "protocol 0", and what was called
106 # "text mode" before Python 2.3.  The entire pickle bytestream is made up of
107 # printable 7-bit ASCII characters, plus the newline character, in protocol 0.
108 # That's why it was called text mode.  Protocol 0 is small and elegant, but
109 # sometimes painfully inefficient.
110 #
111 # The second major set of additions is now called "protocol 1", and was called
112 # "binary mode" before Python 2.3.  This added many opcodes with arguments
113 # consisting of arbitrary bytes, including NUL bytes and unprintable "high bit"
114 # bytes.  Binary mode pickles can be substantially smaller than equivalent
115 # text mode pickles, and sometimes faster too; e.g., BININT represents a 4-byte
116 # int as 4 bytes following the opcode, which is cheaper to unpickle than the
117 # (perhaps) 11-character decimal string attached to INT.  Protocol 1 also added
118 # a number of opcodes that operate on many stack elements at once (like APPENDS
119 # and SETITEMS), and "shortcut" opcodes (like EMPTY_DICT and EMPTY_TUPLE).
120 #
121 # The third major set of additions came in Python 2.3, and is called "protocol
122 # 2".  This added:
123 #
124 # - A better way to pickle instances of new-style classes (NEWOBJ).
125 #
126 # - A way for a pickle to identify its protocol (PROTO).
127 #
128 # - Time- and space- efficient pickling of long ints (LONG{1,4}).
129 #
# - Shortcuts for small tuples (TUPLE{1,2,3}).
131 #
132 # - Dedicated opcodes for bools (NEWTRUE, NEWFALSE).
133 #
134 # - The "extension registry", a vector of popular objects that can be pushed
135 #   efficiently by index (EXT{1,2,4}).  This is akin to the memo and GET, but
136 #   the registry contents are predefined (there's nothing akin to the memo's
137 #   PUT).
138 #
139 # Another independent change with Python 2.3 is the abandonment of any
140 # pretense that it might be safe to load pickles received from untrusted
141 # parties -- no sufficient security analysis has been done to guarantee
142 # this and there isn't a use case that warrants the expense of such an
143 # analysis.
144 #
145 # To this end, all tests for __safe_for_unpickling__ or for
146 # copyreg.safe_constructors are removed from the unpickling code.
147 # References to these variables in the descriptions below are to be seen
148 # as describing unpickling in Python 2.2 and before.
149 
150 
151 # Meta-rule:  Descriptions are stored in instances of descriptor objects,
152 # with plain constructors.  No meta-language is defined from which
153 # descriptors could be constructed.  If you want, e.g., XML, write a little
154 # program to generate XML from the objects.
155 
156 ##############################################################################
157 # Some pickle opcodes have an argument, following the opcode in the
158 # bytestream.  An argument is of a specific type, described by an instance
159 # of ArgumentDescriptor.  These are not to be confused with arguments taken
160 # off the stack -- ArgumentDescriptor applies only to arguments embedded in
161 # the opcode stream, immediately following an opcode.
162 
# Marker values for ArgumentDescriptor.n when an argument has no fixed
# length.  All of them are negative, so they can never be mistaken for a
# real (non-negative) byte count.

# Represents the number of bytes consumed by an argument delimited by the
# next newline character.
UP_TO_NEWLINE = -1

# Represents the number of bytes consumed by a two-argument opcode where
# the first argument gives the number of bytes in the second argument.
# The numeric suffix is the width in bytes of that first (count) argument;
# a trailing "U" marks an unsigned count.
TAKEN_FROM_ARGUMENT1  = -2   # num bytes is 1-byte unsigned int
TAKEN_FROM_ARGUMENT4  = -3   # num bytes is 4-byte signed little-endian int
TAKEN_FROM_ARGUMENT4U = -4   # num bytes is 4-byte unsigned little-endian int
TAKEN_FROM_ARGUMENT8U = -5   # num bytes is 8-byte unsigned little-endian int
173 
class ArgumentDescriptor(object):
    """Describe one kind of argument embedded in the pickle opcode stream.

    Instances carry exactly four attributes:

    name    Name of the descriptor record, which is also the name of the
            module global bound to it; a string.

    n       Length of the argument in bytes; an int.  Non-negative values
            are fixed lengths.  UP_TO_NEWLINE and the
            TAKEN_FROM_ARGUMENT{1,4,4U,8U} constants are negative markers
            for the variable-length cases.

    reader  Function taking a file-like object.  It reads this kind of
            argument at the current position, advances the position past
            the argument, and returns the argument's value.

    doc     Human-readable documentation for this descriptor; a string.
    """

    __slots__ = ('name', 'n', 'reader', 'doc')

    def __init__(self, name, n, reader, doc):
        # Check every argument before storing anything.
        assert isinstance(name, str)
        assert isinstance(n, int)
        # A negative length is only legal when it is one of the markers.
        assert n >= 0 or n in (UP_TO_NEWLINE,
                               TAKEN_FROM_ARGUMENT1,
                               TAKEN_FROM_ARGUMENT4,
                               TAKEN_FROM_ARGUMENT4U,
                               TAKEN_FROM_ARGUMENT8U)
        assert isinstance(doc, str)

        self.name, self.n, self.reader, self.doc = name, n, reader, doc
209 
210 from struct import unpack as _unpack
211 
def read_uint1(f):
    r"""Read one byte from f and return it as an unsigned int.

    Raises ValueError if the stream is already exhausted.

    >>> import io
    >>> read_uint1(io.BytesIO(b'\xff'))
    255
    """

    chunk = f.read(1)
    if not chunk:
        raise ValueError("not enough data in stream to read uint1")
    return chunk[0]
223 
# Fixed-length argument: a single unsigned byte.
uint1 = ArgumentDescriptor(
    name='uint1',
    n=1,
    reader=read_uint1,
    doc="One-byte unsigned integer.",
)
229 
230 
def read_uint2(f):
    r"""Read a two-byte little-endian unsigned integer from f.

    Raises ValueError if fewer than two bytes remain in the stream.

    >>> import io
    >>> read_uint2(io.BytesIO(b'\xff\x00'))
    255
    >>> read_uint2(io.BytesIO(b'\xff\xff'))
    65535
    """

    raw = f.read(2)
    if len(raw) != 2:
        raise ValueError("not enough data in stream to read uint2")
    (value,) = _unpack("<H", raw)
    return value
244 
# Fixed-length argument: two bytes, unsigned, little-endian.
uint2 = ArgumentDescriptor(
    name='uint2',
    n=2,
    reader=read_uint2,
    doc="Two-byte unsigned integer, little-endian.",
)
250 
251 
def read_int4(f):
    r"""Read a four-byte little-endian signed integer from f.

    Raises ValueError if fewer than four bytes remain in the stream.

    >>> import io
    >>> read_int4(io.BytesIO(b'\xff\x00\x00\x00'))
    255
    >>> read_int4(io.BytesIO(b'\x00\x00\x00\x80')) == -(2**31)
    True
    """

    raw = f.read(4)
    if len(raw) != 4:
        raise ValueError("not enough data in stream to read int4")
    (value,) = _unpack("<i", raw)
    return value
265 
# Fixed-length argument: four bytes, signed, little-endian.
int4 = ArgumentDescriptor(
    name='int4',
    n=4,
    reader=read_int4,
    doc="Four-byte signed integer, little-endian, 2's complement.",
)
271 
272 
def read_uint4(f):
    r"""Read a four-byte little-endian unsigned integer from f.

    Raises ValueError if fewer than four bytes remain in the stream.

    >>> import io
    >>> read_uint4(io.BytesIO(b'\xff\x00\x00\x00'))
    255
    >>> read_uint4(io.BytesIO(b'\x00\x00\x00\x80')) == 2**31
    True
    """

    raw = f.read(4)
    if len(raw) != 4:
        raise ValueError("not enough data in stream to read uint4")
    (value,) = _unpack("<I", raw)
    return value
286 
# Fixed-length argument: four bytes, unsigned, little-endian.
uint4 = ArgumentDescriptor(
    name='uint4',
    n=4,
    reader=read_uint4,
    doc="Four-byte unsigned integer, little-endian.",
)
292 
293 
def read_uint8(f):
    r"""Read an eight-byte little-endian unsigned integer from f.

    Raises ValueError if fewer than eight bytes remain in the stream.

    >>> import io
    >>> read_uint8(io.BytesIO(b'\xff\x00\x00\x00\x00\x00\x00\x00'))
    255
    >>> read_uint8(io.BytesIO(b'\xff' * 8)) == 2**64-1
    True
    """

    raw = f.read(8)
    if len(raw) != 8:
        raise ValueError("not enough data in stream to read uint8")
    (value,) = _unpack("<Q", raw)
    return value
307 
# Fixed-length argument: eight bytes, unsigned, little-endian.
uint8 = ArgumentDescriptor(
    name='uint8',
    n=8,
    reader=read_uint8,
    doc="Eight-byte unsigned integer, little-endian.",
)
313 
314 
def read_stringnl(f, decode=True, stripquotes=True):
    r"""Read one newline-terminated string argument from f.

    f is a binary file-like object; one line is consumed with readline()
    and its trailing newline is dropped.

    If stripquotes is true, the line must be wrapped in a matching pair of
    single or double quotes, which are removed.  A lone quote character is
    rejected, because one byte cannot be both the opening and the closing
    quote.  If decode is true, embedded escapes are undone and the result
    is returned as an ASCII-decoded str; otherwise the bytes are returned
    unchanged.

    Raises ValueError if the line has no terminating newline, or if
    stripquotes is true and the bracketing quotes are missing or
    unbalanced.

    >>> import io
    >>> read_stringnl(io.BytesIO(b"'abcd'\nefg\n"))
    'abcd'

    >>> read_stringnl(io.BytesIO(b"\n"))
    Traceback (most recent call last):
    ...
    ValueError: no string quotes around b''

    >>> read_stringnl(io.BytesIO(b"\n"), stripquotes=False)
    ''

    >>> read_stringnl(io.BytesIO(b"''\n"))
    ''

    >>> read_stringnl(io.BytesIO(b"'\n"))
    Traceback (most recent call last):
    ...
    ValueError: string quote b"'" not found at both ends of b"'"

    >>> read_stringnl(io.BytesIO(b'"abcd"'))
    Traceback (most recent call last):
    ...
    ValueError: no newline found when trying to read stringnl

    Embedded escapes are undone in the result.
    >>> read_stringnl(io.BytesIO(br"'a\n\\b\x00c\td'" + b"\n'e'"))
    'a\n\\b\x00c\td'
    """

    data = f.readline()
    if not data.endswith(b'\n'):
        raise ValueError("no newline found when trying to read stringnl")
    data = data[:-1]    # lose the newline

    if stripquotes:
        for q in (b'"', b"'"):
            if data.startswith(q):
                # A one-byte line satisfies both startswith() and
                # endswith(), so the length must be checked as well.
                if len(data) < 2 or not data.endswith(q):
                    raise ValueError("string quote %r not found at both "
                                     "ends of %r" % (q, data))
                data = data[1:-1]
                break
        else:
            # Neither quote character opens the line.
            raise ValueError("no string quotes around %r" % data)

    if decode:
        data = codecs.escape_decode(data)[0].decode("ascii")
    return data
361 
# Descriptor for a quoted, repr-style string argument.  n is UP_TO_NEWLINE
# because the argument extends to the next newline character.
stringnl = ArgumentDescriptor(
               name='stringnl',
               n=UP_TO_NEWLINE,
               reader=read_stringnl,
               doc="""A newline-terminated string.

                   This is a repr-style string, with embedded escapes, and
                   bracketing quotes.
                   """)
371 
def read_stringnl_noescape(f):
    """Read a newline-terminated string that has no bracketing quotes.

    Delegates to read_stringnl() with quote stripping turned off; decoding
    stays at its default.
    """
    return read_stringnl(f, decode=True, stripquotes=False)
374 
# Descriptor for an unquoted string argument that extends to the next
# newline (n is UP_TO_NEWLINE).
stringnl_noescape = ArgumentDescriptor(
                        name='stringnl_noescape',
                        n=UP_TO_NEWLINE,
                        reader=read_stringnl_noescape,
                        doc="""A newline-terminated string.

                        This is a str-style string, without embedded escapes,
                        or bracketing quotes.  It should consist solely of
                        printable ASCII characters.
                        """)
385 
def read_stringnl_noescape_pair(f):
    r"""Read two consecutive unquoted lines and join them with one blank.

    >>> import io
    >>> read_stringnl_noescape_pair(io.BytesIO(b"Queue\nEmpty\njunk"))
    'Queue Empty'
    """

    # The lines are consumed strictly in stream order: first, then second.
    first = read_stringnl_noescape(f)
    second = read_stringnl_noescape(f)
    return "%s %s" % (first, second)
394 
# Descriptor for two consecutive unquoted, newline-terminated strings that
# the reader returns as one blank-separated string.
stringnl_noescape_pair = ArgumentDescriptor(
                             name='stringnl_noescape_pair',
                             n=UP_TO_NEWLINE,
                             reader=read_stringnl_noescape_pair,
                             doc="""A pair of newline-terminated strings.

                             These are str-style strings, without embedded
                             escapes, or bracketing quotes.  They should
                             consist solely of printable ASCII characters.
                             The pair is returned as a single string, with
                             a single blank separating the two strings.
                             """)
407 
408 
def read_string1(f):
    r"""Read a string preceded by a one-byte unsigned length.

    The payload bytes are decoded as latin-1.  Raises ValueError if fewer
    bytes remain in the stream than the length announces.

    >>> import io
    >>> read_string1(io.BytesIO(b"\x00"))
    ''
    >>> read_string1(io.BytesIO(b"\x03abcdef"))
    'abc'
    """

    size = read_uint1(f)
    assert size >= 0
    payload = f.read(size)
    if len(payload) != size:
        raise ValueError(
            "expected %d bytes in a string1, but only %d remain" %
            (size, len(payload)))
    return payload.decode("latin-1")
425 
# Descriptor for a string whose byte count is given by a preceding 1-byte
# unsigned argument (n is TAKEN_FROM_ARGUMENT1).
string1 = ArgumentDescriptor(
              name="string1",
              n=TAKEN_FROM_ARGUMENT1,
              reader=read_string1,
              doc="""A counted string.

              The first argument is a 1-byte unsigned int giving the number
              of bytes in the string, and the second argument is that many
              bytes.
              """)
436 
437 
def read_string4(f):
    r"""Read a string preceded by a four-byte signed little-endian length.

    The payload bytes are decoded as latin-1.  Raises ValueError if the
    length is negative or if fewer bytes remain than it announces.

    >>> import io
    >>> read_string4(io.BytesIO(b"\x00\x00\x00\x00abc"))
    ''
    >>> read_string4(io.BytesIO(b"\x03\x00\x00\x00abcdef"))
    'abc'
    >>> read_string4(io.BytesIO(b"\x00\x00\x00\x03abcdef"))
    Traceback (most recent call last):
    ...
    ValueError: expected 50331648 bytes in a string4, but only 6 remain
    """

    size = read_int4(f)
    if size < 0:
        raise ValueError("string4 byte count < 0: %d" % size)
    payload = f.read(size)
    if len(payload) != size:
        raise ValueError(
            "expected %d bytes in a string4, but only %d remain" %
            (size, len(payload)))
    return payload.decode("latin-1")
459 
# Descriptor for a string whose byte count is given by a preceding 4-byte
# signed argument (n is TAKEN_FROM_ARGUMENT4).
string4 = ArgumentDescriptor(
              name="string4",
              n=TAKEN_FROM_ARGUMENT4,
              reader=read_string4,
              doc="""A counted string.

              The first argument is a 4-byte little-endian signed int giving
              the number of bytes in the string, and the second argument is
              that many bytes.
              """)
470 
471 
def read_bytes1(f):
    r"""Read a bytes object preceded by a one-byte unsigned length.

    Raises ValueError if fewer bytes remain in the stream than the length
    announces.

    >>> import io
    >>> read_bytes1(io.BytesIO(b"\x00"))
    b''
    >>> read_bytes1(io.BytesIO(b"\x03abcdef"))
    b'abc'
    """

    size = read_uint1(f)
    assert size >= 0
    payload = f.read(size)
    if len(payload) != size:
        raise ValueError(
            "expected %d bytes in a bytes1, but only %d remain" %
            (size, len(payload)))
    return payload
488 
# Descriptor for raw bytes whose count is given by a preceding 1-byte
# unsigned argument (n is TAKEN_FROM_ARGUMENT1).
bytes1 = ArgumentDescriptor(
              name="bytes1",
              n=TAKEN_FROM_ARGUMENT1,
              reader=read_bytes1,
              doc="""A counted bytes string.

              The first argument is a 1-byte unsigned int giving the number
              of bytes, and the second argument is that many bytes.
              """)
498 
499 
def read_bytes4(f):
    r"""Read a bytes object preceded by a four-byte unsigned length.

    The length is little-endian.  Raises ValueError if it exceeds
    sys.maxsize or if fewer bytes remain than it announces.

    >>> import io
    >>> read_bytes4(io.BytesIO(b"\x00\x00\x00\x00abc"))
    b''
    >>> read_bytes4(io.BytesIO(b"\x03\x00\x00\x00abcdef"))
    b'abc'
    >>> read_bytes4(io.BytesIO(b"\x00\x00\x00\x03abcdef"))
    Traceback (most recent call last):
    ...
    ValueError: expected 50331648 bytes in a bytes4, but only 6 remain
    """

    size = read_uint4(f)
    assert size >= 0
    if size > sys.maxsize:
        raise ValueError("bytes4 byte count > sys.maxsize: %d" % size)
    payload = f.read(size)
    if len(payload) != size:
        raise ValueError(
            "expected %d bytes in a bytes4, but only %d remain" %
            (size, len(payload)))
    return payload
522 
# Descriptor for raw bytes whose count is given by a preceding 4-byte
# unsigned argument (n is TAKEN_FROM_ARGUMENT4U).
bytes4 = ArgumentDescriptor(
              name="bytes4",
              n=TAKEN_FROM_ARGUMENT4U,
              reader=read_bytes4,
              doc="""A counted bytes string.

              The first argument is a 4-byte little-endian unsigned int giving
              the number of bytes, and the second argument is that many bytes.
              """)
532 
533 
def read_bytes8(f):
    r"""Read a bytes object preceded by an eight-byte unsigned length.

    The length is little-endian.  Raises ValueError if it exceeds
    sys.maxsize or if fewer bytes remain than it announces.

    >>> import io, struct, sys
    >>> read_bytes8(io.BytesIO(b"\x00\x00\x00\x00\x00\x00\x00\x00abc"))
    b''
    >>> read_bytes8(io.BytesIO(b"\x03\x00\x00\x00\x00\x00\x00\x00abcdef"))
    b'abc'
    >>> bigsize8 = struct.pack("<Q", sys.maxsize//3)
    >>> read_bytes8(io.BytesIO(bigsize8 + b"abcdef"))  #doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    ValueError: expected ... bytes in a bytes8, but only 6 remain
    """

    size = read_uint8(f)
    assert size >= 0
    if size > sys.maxsize:
        raise ValueError("bytes8 byte count > sys.maxsize: %d" % size)
    payload = f.read(size)
    if len(payload) != size:
        raise ValueError(
            "expected %d bytes in a bytes8, but only %d remain" %
            (size, len(payload)))
    return payload
557 
# Descriptor for raw bytes whose count is given by a preceding 8-byte
# unsigned argument (n is TAKEN_FROM_ARGUMENT8U).
bytes8 = ArgumentDescriptor(
              name="bytes8",
              n=TAKEN_FROM_ARGUMENT8U,
              reader=read_bytes8,
              doc="""A counted bytes string.

              The first argument is an 8-byte little-endian unsigned int giving
              the number of bytes, and the second argument is that many bytes.
              """)
567 
568 
def read_bytearray8(f):
    r"""Read a bytearray preceded by an eight-byte unsigned length.

    The length is little-endian.  Raises ValueError if it exceeds
    sys.maxsize or if fewer bytes remain than it announces.

    >>> import io, struct, sys
    >>> read_bytearray8(io.BytesIO(b"\x00\x00\x00\x00\x00\x00\x00\x00abc"))
    bytearray(b'')
    >>> read_bytearray8(io.BytesIO(b"\x03\x00\x00\x00\x00\x00\x00\x00abcdef"))
    bytearray(b'abc')
    >>> bigsize8 = struct.pack("<Q", sys.maxsize//3)
    >>> read_bytearray8(io.BytesIO(bigsize8 + b"abcdef"))  #doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    ValueError: expected ... bytes in a bytearray8, but only 6 remain
    """

    size = read_uint8(f)
    assert size >= 0
    if size > sys.maxsize:
        raise ValueError("bytearray8 byte count > sys.maxsize: %d" % size)
    payload = f.read(size)
    if len(payload) != size:
        raise ValueError(
            "expected %d bytes in a bytearray8, but only %d remain" %
            (size, len(payload)))
    return bytearray(payload)
592 
# Descriptor for a bytearray whose byte count is given by a preceding
# 8-byte unsigned argument (n is TAKEN_FROM_ARGUMENT8U).
bytearray8 = ArgumentDescriptor(
              name="bytearray8",
              n=TAKEN_FROM_ARGUMENT8U,
              reader=read_bytearray8,
              doc="""A counted bytearray.

              The first argument is an 8-byte little-endian unsigned int giving
              the number of bytes, and the second argument is that many bytes.
              """)
602 
def read_unicodestringnl(f):
    r"""Read a newline-terminated, raw-unicode-escape encoded string.

    One line is consumed from f, its trailing newline is dropped, and the
    rest is decoded.  Raises ValueError if the line has no newline.

    >>> import io
    >>> read_unicodestringnl(io.BytesIO(b"abc\\uabcd\njunk")) == 'abc\uabcd'
    True
    """

    line = f.readline()
    if line.endswith(b'\n'):
        # Everything except the final newline is the encoded text.
        return str(line[:-1], 'raw-unicode-escape')
    raise ValueError("no newline found when trying to read "
                     "unicodestringnl")
616 
# Descriptor for a raw-unicode-escape encoded string that extends to the
# next newline (n is UP_TO_NEWLINE).
unicodestringnl = ArgumentDescriptor(
                      name='unicodestringnl',
                      n=UP_TO_NEWLINE,
                      reader=read_unicodestringnl,
                      doc="""A newline-terminated Unicode string.

                      This is raw-unicode-escape encoded, so consists of
                      printable ASCII characters, and may contain embedded
                      escape sequences.
                      """)
627 
628 
def read_unicodestring1(f):
    r"""Read a UTF-8 string preceded by a one-byte unsigned length.

    The length counts encoded bytes, not characters.  Decoding uses the
    'surrogatepass' error handler.  Raises ValueError if fewer bytes
    remain in the stream than the length announces.

    >>> import io
    >>> s = 'abcd\uabcd'
    >>> enc = s.encode('utf-8')
    >>> enc
    b'abcd\xea\xaf\x8d'
    >>> n = bytes([len(enc)])  # little-endian 1-byte length
    >>> t = read_unicodestring1(io.BytesIO(n + enc + b'junk'))
    >>> s == t
    True

    >>> read_unicodestring1(io.BytesIO(n + enc[:-1]))
    Traceback (most recent call last):
    ...
    ValueError: expected 7 bytes in a unicodestring1, but only 6 remain
    """

    size = read_uint1(f)
    assert size >= 0
    encoded = f.read(size)
    if len(encoded) != size:
        raise ValueError(
            "expected %d bytes in a unicodestring1, but only %d remain" %
            (size, len(encoded)))
    return str(encoded, 'utf-8', 'surrogatepass')
654 
# Descriptor for a UTF-8 encoded string preceded by a one-byte count.
# The count is parsed by read_unicodestring1() with read_uint1(), so it is
# unsigned; the doc text says so accordingly.
unicodestring1 = ArgumentDescriptor(
                    name="unicodestring1",
                    n=TAKEN_FROM_ARGUMENT1,
                    reader=read_unicodestring1,
                    doc="""A counted Unicode string.

                    The first argument is a 1-byte unsigned int giving the
                    number of bytes in the string, and the second argument
                    -- the UTF-8 encoding of the Unicode string -- contains
                    that many bytes.
                    """)
666 
667 
def read_unicodestring4(f):
    r"""Read a UTF-8 string preceded by a four-byte unsigned length.

    The little-endian length counts encoded bytes, not characters.
    Decoding uses the 'surrogatepass' error handler.  Raises ValueError if
    the length exceeds sys.maxsize or if fewer bytes remain than it
    announces.

    >>> import io
    >>> s = 'abcd\uabcd'
    >>> enc = s.encode('utf-8')
    >>> enc
    b'abcd\xea\xaf\x8d'
    >>> n = bytes([len(enc), 0, 0, 0])  # little-endian 4-byte length
    >>> t = read_unicodestring4(io.BytesIO(n + enc + b'junk'))
    >>> s == t
    True

    >>> read_unicodestring4(io.BytesIO(n + enc[:-1]))
    Traceback (most recent call last):
    ...
    ValueError: expected 7 bytes in a unicodestring4, but only 6 remain
    """

    size = read_uint4(f)
    assert size >= 0
    if size > sys.maxsize:
        raise ValueError("unicodestring4 byte count > sys.maxsize: %d" % size)
    encoded = f.read(size)
    if len(encoded) != size:
        raise ValueError(
            "expected %d bytes in a unicodestring4, but only %d remain" %
            (size, len(encoded)))
    return str(encoded, 'utf-8', 'surrogatepass')
695 
# Descriptor for a UTF-8 encoded string preceded by a four-byte count.
# The count is parsed by read_unicodestring4() with read_uint4(), so it is
# unsigned; the doc text says so accordingly.
unicodestring4 = ArgumentDescriptor(
                    name="unicodestring4",
                    n=TAKEN_FROM_ARGUMENT4U,
                    reader=read_unicodestring4,
                    doc="""A counted Unicode string.

                    The first argument is a 4-byte little-endian unsigned int
                    giving the number of bytes in the string, and the second
                    argument-- the UTF-8 encoding of the Unicode string --
                    contains that many bytes.
                    """)
707 
708 
def read_unicodestring8(f):
    r"""Read a UTF-8 string preceded by an eight-byte unsigned length.

    The little-endian length counts encoded bytes, not characters.
    Decoding uses the 'surrogatepass' error handler.  Raises ValueError if
    the length exceeds sys.maxsize or if fewer bytes remain than it
    announces.

    >>> import io
    >>> s = 'abcd\uabcd'
    >>> enc = s.encode('utf-8')
    >>> enc
    b'abcd\xea\xaf\x8d'
    >>> n = bytes([len(enc)]) + b'\0' * 7  # little-endian 8-byte length
    >>> t = read_unicodestring8(io.BytesIO(n + enc + b'junk'))
    >>> s == t
    True

    >>> read_unicodestring8(io.BytesIO(n + enc[:-1]))
    Traceback (most recent call last):
    ...
    ValueError: expected 7 bytes in a unicodestring8, but only 6 remain
    """

    size = read_uint8(f)
    assert size >= 0
    if size > sys.maxsize:
        raise ValueError("unicodestring8 byte count > sys.maxsize: %d" % size)
    encoded = f.read(size)
    if len(encoded) != size:
        raise ValueError(
            "expected %d bytes in a unicodestring8, but only %d remain" %
            (size, len(encoded)))
    return str(encoded, 'utf-8', 'surrogatepass')
736 
# Descriptor for a UTF-8 encoded string preceded by an eight-byte count.
# The count is parsed by read_unicodestring8() with read_uint8(), so it is
# unsigned; the doc text says so accordingly.
unicodestring8 = ArgumentDescriptor(
                    name="unicodestring8",
                    n=TAKEN_FROM_ARGUMENT8U,
                    reader=read_unicodestring8,
                    doc="""A counted Unicode string.

                    The first argument is an 8-byte little-endian unsigned int
                    giving the number of bytes in the string, and the second
                    argument-- the UTF-8 encoding of the Unicode string --
                    contains that many bytes.
                    """)
748 
749 
def read_decimalnl_short(f):
    r"""Read a newline-terminated decimal literal without a trailing 'L'.

    The exact spellings "00" and "01" are returned as False and True;
    any other line is converted with int().

    >>> import io
    >>> read_decimalnl_short(io.BytesIO(b"1234\n56"))
    1234

    >>> read_decimalnl_short(io.BytesIO(b"1234L\n56"))
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: b'1234L'
    """

    text = read_stringnl(f, decode=False, stripquotes=False)

    # There's a hack for True and False here: the two-character spellings
    # b"00" and b"01" stand for the bools rather than the ints 0 and 1.
    if text in (b"00", b"01"):
        return text == b"01"

    return int(text)
771 
def read_decimalnl_long(f):
    r"""Read a newline-terminated decimal literal, allowing a trailing 'L'.

    A single trailing 'L', if present, is removed before the line is
    converted with int().

    >>> import io

    >>> read_decimalnl_long(io.BytesIO(b"1234L\n56"))
    1234

    >>> read_decimalnl_long(io.BytesIO(b"123456789012345678901234L\n6"))
    123456789012345678901234
    """

    digits = read_stringnl(f, decode=False, stripquotes=False)
    if digits.endswith(b'L'):
        digits = digits[:-1]
    return int(digits)
787 
788 
# Descriptor for a decimal integer literal that extends to the next
# newline (n is UP_TO_NEWLINE) and carries no trailing 'L'.
decimalnl_short = ArgumentDescriptor(
                      name='decimalnl_short',
                      n=UP_TO_NEWLINE,
                      reader=read_decimalnl_short,
                      doc="""A newline-terminated decimal integer literal.

                          This never has a trailing 'L', and the integer fit
                          in a short Python int on the box where the pickle
                          was written -- but there's no guarantee it will fit
                          in a short Python int on the box where the pickle
                          is read.
                          """)
801 
# Descriptor for a decimal integer literal that extends to the next
# newline (n is UP_TO_NEWLINE); its reader strips one trailing 'L'.
decimalnl_long = ArgumentDescriptor(
                     name='decimalnl_long',
                     n=UP_TO_NEWLINE,
                     reader=read_decimalnl_long,
                     doc="""A newline-terminated decimal integer literal.

                         This has a trailing 'L', and can represent integers
                         of any size.
                         """)
811 
812 
def read_floatnl(f):
    r"""Read a newline-terminated decimal float literal from f.

    The raw line (without its newline) is handed to float().

    >>> import io
    >>> read_floatnl(io.BytesIO(b"-1.25\n6"))
    -1.25
    """
    return float(read_stringnl(f, decode=False, stripquotes=False))
821 
# Descriptor for a decimal float literal that extends to the next newline
# (n is UP_TO_NEWLINE).
floatnl = ArgumentDescriptor(
              name='floatnl',
              n=UP_TO_NEWLINE,
              reader=read_floatnl,
              doc="""A newline-terminated decimal floating literal.

              In general this requires 17 significant digits for roundtrip
              identity, and pickling then unpickling infinities, NaNs, and
              minus zero doesn't work across boxes, or on some boxes even
              on itself (e.g., Windows can't read the strings it produces
              for infinities or NaNs).
              """)
834 
def read_float8(f):
    r"""Read an eight-byte big-endian binary float from f.

    Raises ValueError if fewer than eight bytes remain in the stream.

    >>> import io, struct
    >>> raw = struct.pack(">d", -1.25)
    >>> raw
    b'\xbf\xf4\x00\x00\x00\x00\x00\x00'
    >>> read_float8(io.BytesIO(raw + b"\n"))
    -1.25
    """

    raw = f.read(8)
    if len(raw) != 8:
        raise ValueError("not enough data in stream to read float8")
    (value,) = _unpack(">d", raw)
    return value
849 
850 
# Fixed-width (8-byte) binary float argument (used by the BINFLOAT opcode),
# parsed by read_float8.
float8 = ArgumentDescriptor(
    name="float8",
    n=8,
    reader=read_float8,
    doc="""An 8-byte binary representation of a float, big-endian.

             The format is unique to Python, and shared with the struct
             module (format string '>d') "in theory" (the struct and pickle
             implementations don't share the code -- they should).  It's
             strongly related to the IEEE-754 double format, and, in normal
             cases, is in fact identical to the big-endian 754 double format.
             On other boxes the dynamic range is limited to that of a 754
             double, and "add a half and chop" rounding is used to reduce
             the precision to 53 bits.  However, even on a 754 box,
             infinities, NaNs, and minus zero may not be handled correctly
             (may not survive roundtrip pickling intact).
             """,
)
868 
869 # Protocol 2 formats
870 
871 from pickle import decode_long
872 
def read_long1(f):
    r"""Read a long1 argument: a size from read_uint1, then that many bytes.

    The payload bytes are decoded with decode_long.  Raises ValueError when
    the stream holds fewer payload bytes than the size announces.

    >>> import io
    >>> read_long1(io.BytesIO(b"\x00"))
    0
    >>> read_long1(io.BytesIO(b"\x02\xff\x00"))
    255
    >>> read_long1(io.BytesIO(b"\x02\xff\x7f"))
    32767
    >>> read_long1(io.BytesIO(b"\x02\x00\xff"))
    -256
    >>> read_long1(io.BytesIO(b"\x02\x00\x80"))
    -32768
    """
    size = read_uint1(f)
    payload = f.read(size)
    if len(payload) == size:
        return decode_long(payload)
    raise ValueError("not enough data in stream to read long1")
893 
# Binary long with a one-byte length prefix (used by the LONG1 opcode),
# parsed by read_long1.
long1 = ArgumentDescriptor(
    name="long1",
    n=TAKEN_FROM_ARGUMENT1,
    reader=read_long1,
    doc="""A binary long, little-endian, using 1-byte size.

    This first reads one byte as an unsigned size, then reads that
    many bytes and interprets them as a little-endian 2's-complement long.
    If the size is 0, that's taken as a shortcut for the long 0L.
    """,
)
904 
def read_long4(f):
    r"""Read a long4 argument: a size from read_int4, then that many bytes.

    The size must not be negative; the payload bytes are decoded with
    decode_long.  Raises ValueError for a negative size, or when the stream
    holds fewer payload bytes than the size announces.

    >>> import io
    >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\xff\x00"))
    255
    >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\xff\x7f"))
    32767
    >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\x00\xff"))
    -256
    >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\x00\x80"))
    -32768
    >>> read_long4(io.BytesIO(b"\x00\x00\x00\x00"))
    0
    """

    n = read_int4(f)
    # read_int4 yields a signed value, so a negative count must be rejected
    # explicitly before it is used as a read length.
    if n < 0:
        raise ValueError("long4 byte count < 0: %d" % n)
    data = f.read(n)
    if len(data) != n:
        raise ValueError("not enough data in stream to read long4")
    return decode_long(data)
927 
# Binary long with a four-byte length prefix (used by the LONG4 opcode),
# parsed by read_long4.
long4 = ArgumentDescriptor(
    name="long4",
    n=TAKEN_FROM_ARGUMENT4,
    reader=read_long4,
    doc="""A binary representation of a long, little-endian.

    This first reads four bytes as a signed size (but requires the
    size to be >= 0), then reads that many bytes and interprets them
    as a little-endian 2's-complement long.  If the size is 0, that's taken
    as a shortcut for the int 0, although LONG1 should really be used
    then instead (and in any case where # of bytes < 256).
    """,
)
940 
941 
942 ##############################################################################
943 # Object descriptors.  The stack used by the pickle machine holds objects,
944 # and in the stack_before and stack_after attributes of OpcodeInfo
945 # descriptors we need names to describe the various types of objects that can
946 # appear on the stack.
947 
class StackObject(object):
    """Descriptor naming one kind of object that can appear on the stack.

    Instances carry three attributes (name, obtype and doc), each checked
    with an assertion at construction time; repr() of an instance is its
    name.
    """

    __slots__ = (
        'name',    # name of descriptor record, for info only
        'obtype',  # type of object, or tuple of type objects (meaning the
                   # object can be of any type in the tuple)
        'doc',     # human-readable docs for this kind of stack object; a str
    )

    def __init__(self, name, obtype, doc):
        """Validate the three fields with assertions and store them."""
        assert isinstance(name, str)
        self.name = name

        # Either a single type or a tuple is accepted; every member of a
        # tuple must itself be a type.
        assert isinstance(obtype, (type, tuple))
        if isinstance(obtype, tuple):
            assert all(isinstance(member, type) for member in obtype)
        self.obtype = obtype

        assert isinstance(doc, str)
        self.doc = doc

    def __repr__(self):
        return self.name
976 
977 
# Shared descriptor records; the opcode table refers to them by these
# module-level names in its stack_before / stack_after lists.

# pyint and pylong are two names bound to one and the same descriptor.
pyint = pylong = StackObject(
    name="int",
    obtype=int,
    doc="A Python integer object.",
)

pyinteger_or_bool = StackObject(
    name="int_or_bool",
    obtype=(int, bool),
    doc="A Python integer or boolean object.",
)

pybool = StackObject(
    name="bool",
    obtype=bool,
    doc="A Python boolean object.",
)

pyfloat = StackObject(
    name="float",
    obtype=float,
    doc="A Python float object.",
)

# pystring is a second name for the bytes-or-str descriptor.
pybytes_or_str = pystring = StackObject(
    name="bytes_or_str",
    obtype=(bytes, str),
    doc="A Python bytes or (Unicode) string object.",
)

pybytes = StackObject(
    name="bytes",
    obtype=bytes,
    doc="A Python bytes object.",
)

pybytearray = StackObject(
    name="bytearray",
    obtype=bytearray,
    doc="A Python bytearray object.",
)

pyunicode = StackObject(
    name="str",
    obtype=str,
    doc="A Python (Unicode) string object.",
)

pynone = StackObject(
    name="None",
    obtype=type(None),
    doc="The Python None object.",
)

pytuple = StackObject(
    name="tuple",
    obtype=tuple,
    doc="A Python tuple object.",
)

pylist = StackObject(
    name="list",
    obtype=list,
    doc="A Python list object.",
)

pydict = StackObject(
    name="dict",
    obtype=dict,
    doc="A Python dict object.",
)

pyset = StackObject(
    name="set",
    obtype=set,
    doc="A Python set object.",
)
1042 
# The descriptor documents frozenset objects, so obtype is frozenset (it was
# previously recorded as the mutable `set` type).
pyfrozenset = StackObject(
    name="frozenset",
    obtype=frozenset,
    doc="A Python frozenset object.")
1047 
# pybuffer and anyobject both record plain `object` as their obtype; they
# differ only in name and doc.
pybuffer = StackObject(
    name="buffer",
    obtype=object,
    doc="A Python buffer-like object.",
)

anyobject = StackObject(
    name="any",
    obtype=object,
    doc="Any kind of object whatsoever.",
)

# markobject and stackslice describe stack layout; their obtype is the
# StackObject class itself.
markobject = StackObject(
    name="mark",
    obtype=StackObject,
    doc="""'The mark' is a unique object.

Opcodes that operate on a variable number of objects
generally don't embed the count of objects in the opcode,
or pull it off the stack.  Instead the MARK opcode is used
to push a special marker object on the stack, and then
some other opcodes grab all the objects from the top of
the stack down to (but not including) the topmost marker
object.
""",
)

stackslice = StackObject(
    name="stackslice",
    obtype=StackObject,
    doc="""An object representing a contiguous slice of the stack.

This is used in conjunction with markobject, to represent all
of the stack following the topmost markobject.  For example,
the POP_MARK opcode changes the stack from

    [..., markobject, stackslice]
to
    [...]

No matter how many object are on the stack after the topmost
markobject, POP_MARK gets rid of all of them (including the
topmost markobject too).
""",
)
1089 
1090 ##############################################################################
1091 # Descriptors for pickle opcodes.
1092 
class OpcodeInfo(object):
    """Record describing a single pickle opcode.

    Each constructor argument is checked with an assertion and then stored
    under the attribute of the same name.
    """

    __slots__ = (
        # symbolic name of opcode; a string
        'name',

        # the code used in a bytestream to represent the opcode; a
        # one-character string
        'code',

        # None when the opcode has no argument embedded in the bytestream;
        # otherwise an ArgumentDescriptor instance specifying the argument's
        # type.  arg.reader(s) reads and decodes the argument from the
        # bytestream s, and arg.doc documents the format of the raw
        # argument bytes.
        'arg',

        # what the stack looks like before this opcode runs; a list
        'stack_before',

        # what the stack looks like after this opcode runs; a list
        'stack_after',

        # the protocol number in which this opcode was introduced; an int
        'proto',

        # human-readable docs for this opcode; a string
        'doc',
    )

    def __init__(self, name, code, arg,
                 stack_before, stack_after, proto, doc):
        """Validate every field with assertions, then store it."""
        assert isinstance(name, str)
        self.name = name

        # The opcode is spelled as a str of length exactly one.
        assert isinstance(code, str) and len(code) == 1
        self.code = code

        assert arg is None or isinstance(arg, ArgumentDescriptor)
        self.arg = arg

        # Both stack pictures must be real lists holding only StackObjects.
        assert isinstance(stack_before, list)
        assert all(isinstance(item, StackObject) for item in stack_before)
        self.stack_before = stack_before

        assert isinstance(stack_after, list)
        assert all(isinstance(item, StackObject) for item in stack_after)
        self.stack_after = stack_after

        assert isinstance(proto, int)
        assert 0 <= proto <= pickle.HIGHEST_PROTOCOL
        self.proto = proto

        assert isinstance(doc, str)
        self.doc = doc
1151 
1152 I = OpcodeInfo
1153 opcodes = [
1154 
1155     # Ways to spell integers.
1156 
1157     I(name='INT',
1158       code='I',
1159       arg=decimalnl_short,
1160       stack_before=[],
1161       stack_after=[pyinteger_or_bool],
1162       proto=0,
1163       doc="""Push an integer or bool.
1164 
1165       The argument is a newline-terminated decimal literal string.
1166 
1167       The intent may have been that this always fit in a short Python int,
1168       but INT can be generated in pickles written on a 64-bit box that
1169       require a Python long on a 32-bit box.  The difference between this
1170       and LONG then is that INT skips a trailing 'L', and produces a short
1171       int whenever possible.
1172 
1173       Another difference is due to that, when bool was introduced as a
1174       distinct type in 2.3, builtin names True and False were also added to
1175       2.2.2, mapping to ints 1 and 0.  For compatibility in both directions,
1176       True gets pickled as INT + "I01\\n", and False as INT + "I00\\n".
1177       Leading zeroes are never produced for a genuine integer.  The 2.3
1178       (and later) unpicklers special-case these and return bool instead;
1179       earlier unpicklers ignore the leading "0" and return the int.
1180       """),
1181 
1182     I(name='BININT',
1183       code='J',
1184       arg=int4,
1185       stack_before=[],
1186       stack_after=[pyint],
1187       proto=1,
1188       doc="""Push a four-byte signed integer.
1189 
1190       This handles the full range of Python (short) integers on a 32-bit
1191       box, directly as binary bytes (1 for the opcode and 4 for the integer).
1192       If the integer is non-negative and fits in 1 or 2 bytes, pickling via
1193       BININT1 or BININT2 saves space.
1194       """),
1195 
1196     I(name='BININT1',
1197       code='K',
1198       arg=uint1,
1199       stack_before=[],
1200       stack_after=[pyint],
1201       proto=1,
1202       doc="""Push a one-byte unsigned integer.
1203 
1204       This is a space optimization for pickling very small non-negative ints,
1205       in range(256).
1206       """),
1207 
1208     I(name='BININT2',
1209       code='M',
1210       arg=uint2,
1211       stack_before=[],
1212       stack_after=[pyint],
1213       proto=1,
1214       doc="""Push a two-byte unsigned integer.
1215 
1216       This is a space optimization for pickling small positive ints, in
1217       range(256, 2**16).  Integers in range(256) can also be pickled via
1218       BININT2, but BININT1 instead saves a byte.
1219       """),
1220 
1221     I(name='LONG',
1222       code='L',
1223       arg=decimalnl_long,
1224       stack_before=[],
1225       stack_after=[pyint],
1226       proto=0,
1227       doc="""Push a long integer.
1228 
1229       The same as INT, except that the literal ends with 'L', and always
1230       unpickles to a Python long.  There doesn't seem a real purpose to the
1231       trailing 'L'.
1232 
1233       Note that LONG takes time quadratic in the number of digits when
1234       unpickling (this is simply due to the nature of decimal->binary
1235       conversion).  Proto 2 added linear-time (in C; still quadratic-time
1236       in Python) LONG1 and LONG4 opcodes.
1237       """),
1238 
1239     I(name="LONG1",
1240       code='\x8a',
1241       arg=long1,
1242       stack_before=[],
1243       stack_after=[pyint],
1244       proto=2,
1245       doc="""Long integer using one-byte length.
1246 
1247       A more efficient encoding of a Python long; the long1 encoding
1248       says it all."""),
1249 
1250     I(name="LONG4",
1251       code='\x8b',
1252       arg=long4,
1253       stack_before=[],
1254       stack_after=[pyint],
1255       proto=2,
1256       doc="""Long integer using found-byte length.
1257 
1258       A more efficient encoding of a Python long; the long4 encoding
1259       says it all."""),
1260 
1261     # Ways to spell strings (8-bit, not Unicode).
1262 
1263     I(name='STRING',
1264       code='S',
1265       arg=stringnl,
1266       stack_before=[],
1267       stack_after=[pybytes_or_str],
1268       proto=0,
1269       doc="""Push a Python string object.
1270 
1271       The argument is a repr-style string, with bracketing quote characters,
1272       and perhaps embedded escapes.  The argument extends until the next
1273       newline character.  These are usually decoded into a str instance
1274       using the encoding given to the Unpickler constructor. or the default,
1275       'ASCII'.  If the encoding given was 'bytes' however, they will be
1276       decoded as bytes object instead.
1277       """),
1278 
1279     I(name='BINSTRING',
1280       code='T',
1281       arg=string4,
1282       stack_before=[],
1283       stack_after=[pybytes_or_str],
1284       proto=1,
1285       doc="""Push a Python string object.
1286 
1287       There are two arguments: the first is a 4-byte little-endian
1288       signed int giving the number of bytes in the string, and the
1289       second is that many bytes, which are taken literally as the string
1290       content.  These are usually decoded into a str instance using the
1291       encoding given to the Unpickler constructor. or the default,
1292       'ASCII'.  If the encoding given was 'bytes' however, they will be
1293       decoded as bytes object instead.
1294       """),
1295 
1296     I(name='SHORT_BINSTRING',
1297       code='U',
1298       arg=string1,
1299       stack_before=[],
1300       stack_after=[pybytes_or_str],
1301       proto=1,
1302       doc="""Push a Python string object.
1303 
1304       There are two arguments: the first is a 1-byte unsigned int giving
1305       the number of bytes in the string, and the second is that many
1306       bytes, which are taken literally as the string content.  These are
1307       usually decoded into a str instance using the encoding given to
1308       the Unpickler constructor. or the default, 'ASCII'.  If the
1309       encoding given was 'bytes' however, they will be decoded as bytes
1310       object instead.
1311       """),
1312 
1313     # Bytes (protocol 3 and higher)
1314 
1315     I(name='BINBYTES',
1316       code='B',
1317       arg=bytes4,
1318       stack_before=[],
1319       stack_after=[pybytes],
1320       proto=3,
1321       doc="""Push a Python bytes object.
1322 
1323       There are two arguments:  the first is a 4-byte little-endian unsigned int
1324       giving the number of bytes, and the second is that many bytes, which are
1325       taken literally as the bytes content.
1326       """),
1327 
1328     I(name='SHORT_BINBYTES',
1329       code='C',
1330       arg=bytes1,
1331       stack_before=[],
1332       stack_after=[pybytes],
1333       proto=3,
1334       doc="""Push a Python bytes object.
1335 
1336       There are two arguments:  the first is a 1-byte unsigned int giving
1337       the number of bytes, and the second is that many bytes, which are taken
1338       literally as the string content.
1339       """),
1340 
1341     I(name='BINBYTES8',
1342       code='\x8e',
1343       arg=bytes8,
1344       stack_before=[],
1345       stack_after=[pybytes],
1346       proto=4,
1347       doc="""Push a Python bytes object.
1348 
1349       There are two arguments:  the first is an 8-byte unsigned int giving
1350       the number of bytes in the string, and the second is that many bytes,
1351       which are taken literally as the string content.
1352       """),
1353 
1354     # Bytearray (protocol 5 and higher)
1355 
1356     I(name='BYTEARRAY8',
1357       code='\x96',
1358       arg=bytearray8,
1359       stack_before=[],
1360       stack_after=[pybytearray],
1361       proto=5,
1362       doc="""Push a Python bytearray object.
1363 
1364       There are two arguments:  the first is an 8-byte unsigned int giving
1365       the number of bytes in the bytearray, and the second is that many bytes,
1366       which are taken literally as the bytearray content.
1367       """),
1368 
1369     # Out-of-band buffer (protocol 5 and higher)
1370 
1371     I(name='NEXT_BUFFER',
1372       code='\x97',
1373       arg=None,
1374       stack_before=[],
1375       stack_after=[pybuffer],
1376       proto=5,
1377       doc="Push an out-of-band buffer object."),
1378 
1379     I(name='READONLY_BUFFER',
1380       code='\x98',
1381       arg=None,
1382       stack_before=[pybuffer],
1383       stack_after=[pybuffer],
1384       proto=5,
1385       doc="Make an out-of-band buffer object read-only."),
1386 
1387     # Ways to spell None.
1388 
1389     I(name='NONE',
1390       code='N',
1391       arg=None,
1392       stack_before=[],
1393       stack_after=[pynone],
1394       proto=0,
1395       doc="Push None on the stack."),
1396 
1397     # Ways to spell bools, starting with proto 2.  See INT for how this was
1398     # done before proto 2.
1399 
1400     I(name='NEWTRUE',
1401       code='\x88',
1402       arg=None,
1403       stack_before=[],
1404       stack_after=[pybool],
1405       proto=2,
1406       doc="Push True onto the stack."),
1407 
1408     I(name='NEWFALSE',
1409       code='\x89',
1410       arg=None,
1411       stack_before=[],
1412       stack_after=[pybool],
1413       proto=2,
1414       doc="Push False onto the stack."),
1415 
1416     # Ways to spell Unicode strings.
1417 
1418     I(name='UNICODE',
1419       code='V',
1420       arg=unicodestringnl,
1421       stack_before=[],
1422       stack_after=[pyunicode],
1423       proto=0,  # this may be pure-text, but it's a later addition
1424       doc="""Push a Python Unicode string object.
1425 
1426       The argument is a raw-unicode-escape encoding of a Unicode string,
1427       and so may contain embedded escape sequences.  The argument extends
1428       until the next newline character.
1429       """),
1430 
1431     I(name='SHORT_BINUNICODE',
1432       code='\x8c',
1433       arg=unicodestring1,
1434       stack_before=[],
1435       stack_after=[pyunicode],
1436       proto=4,
1437       doc="""Push a Python Unicode string object.
1438 
1439       There are two arguments:  the first is a 1-byte little-endian signed int
1440       giving the number of bytes in the string.  The second is that many
1441       bytes, and is the UTF-8 encoding of the Unicode string.
1442       """),
1443 
1444     I(name='BINUNICODE',
1445       code='X',
1446       arg=unicodestring4,
1447       stack_before=[],
1448       stack_after=[pyunicode],
1449       proto=1,
1450       doc="""Push a Python Unicode string object.
1451 
1452       There are two arguments:  the first is a 4-byte little-endian unsigned int
1453       giving the number of bytes in the string.  The second is that many
1454       bytes, and is the UTF-8 encoding of the Unicode string.
1455       """),
1456 
1457     I(name='BINUNICODE8',
1458       code='\x8d',
1459       arg=unicodestring8,
1460       stack_before=[],
1461       stack_after=[pyunicode],
1462       proto=4,
1463       doc="""Push a Python Unicode string object.
1464 
1465       There are two arguments:  the first is an 8-byte little-endian signed int
1466       giving the number of bytes in the string.  The second is that many
1467       bytes, and is the UTF-8 encoding of the Unicode string.
1468       """),
1469 
1470     # Ways to spell floats.
1471 
1472     I(name='FLOAT',
1473       code='F',
1474       arg=floatnl,
1475       stack_before=[],
1476       stack_after=[pyfloat],
1477       proto=0,
1478       doc="""Newline-terminated decimal float literal.
1479 
1480       The argument is repr(a_float), and in general requires 17 significant
1481       digits for roundtrip conversion to be an identity (this is so for
1482       IEEE-754 double precision values, which is what Python float maps to
1483       on most boxes).
1484 
1485       In general, FLOAT cannot be used to transport infinities, NaNs, or
1486       minus zero across boxes (or even on a single box, if the platform C
1487       library can't read the strings it produces for such things -- Windows
1488       is like that), but may do less damage than BINFLOAT on boxes with
1489       greater precision or dynamic range than IEEE-754 double.
1490       """),
1491 
1492     I(name='BINFLOAT',
1493       code='G',
1494       arg=float8,
1495       stack_before=[],
1496       stack_after=[pyfloat],
1497       proto=1,
1498       doc="""Float stored in binary form, with 8 bytes of data.
1499 
1500       This generally requires less than half the space of FLOAT encoding.
1501       In general, BINFLOAT cannot be used to transport infinities, NaNs, or
1502       minus zero, raises an exception if the exponent exceeds the range of
1503       an IEEE-754 double, and retains no more than 53 bits of precision (if
1504       there are more than that, "add a half and chop" rounding is used to
1505       cut it back to 53 significant bits).
1506       """),
1507 
1508     # Ways to build lists.
1509 
1510     I(name='EMPTY_LIST',
1511       code=']',
1512       arg=None,
1513       stack_before=[],
1514       stack_after=[pylist],
1515       proto=1,
1516       doc="Push an empty list."),
1517 
1518     I(name='APPEND',
1519       code='a',
1520       arg=None,
1521       stack_before=[pylist, anyobject],
1522       stack_after=[pylist],
1523       proto=0,
1524       doc="""Append an object to a list.
1525 
1526       Stack before:  ... pylist anyobject
1527       Stack after:   ... pylist+[anyobject]
1528 
1529       although pylist is really extended in-place.
1530       """),
1531 
1532     I(name='APPENDS',
1533       code='e',
1534       arg=None,
1535       stack_before=[pylist, markobject, stackslice],
1536       stack_after=[pylist],
1537       proto=1,
1538       doc="""Extend a list by a slice of stack objects.
1539 
1540       Stack before:  ... pylist markobject stackslice
1541       Stack after:   ... pylist+stackslice
1542 
1543       although pylist is really extended in-place.
1544       """),
1545 
1546     I(name='LIST',
1547       code='l',
1548       arg=None,
1549       stack_before=[markobject, stackslice],
1550       stack_after=[pylist],
1551       proto=0,
1552       doc="""Build a list out of the topmost stack slice, after markobject.
1553 
1554       All the stack entries following the topmost markobject are placed into
1555       a single Python list, which single list object replaces all of the
1556       stack from the topmost markobject onward.  For example,
1557 
1558       Stack before: ... markobject 1 2 3 'abc'
1559       Stack after:  ... [1, 2, 3, 'abc']
1560       """),
1561 
1562     # Ways to build tuples.
1563 
1564     I(name='EMPTY_TUPLE',
1565       code=')',
1566       arg=None,
1567       stack_before=[],
1568       stack_after=[pytuple],
1569       proto=1,
1570       doc="Push an empty tuple."),
1571 
1572     I(name='TUPLE',
1573       code='t',
1574       arg=None,
1575       stack_before=[markobject, stackslice],
1576       stack_after=[pytuple],
1577       proto=0,
1578       doc="""Build a tuple out of the topmost stack slice, after markobject.
1579 
1580       All the stack entries following the topmost markobject are placed into
1581       a single Python tuple, which single tuple object replaces all of the
1582       stack from the topmost markobject onward.  For example,
1583 
1584       Stack before: ... markobject 1 2 3 'abc'
1585       Stack after:  ... (1, 2, 3, 'abc')
1586       """),
1587 
1588     I(name='TUPLE1',
1589       code='\x85',
1590       arg=None,
1591       stack_before=[anyobject],
1592       stack_after=[pytuple],
1593       proto=2,
1594       doc="""Build a one-tuple out of the topmost item on the stack.
1595 
1596       This code pops one value off the stack and pushes a tuple of
1597       length 1 whose one item is that value back onto it.  In other
1598       words:
1599 
1600           stack[-1] = tuple(stack[-1:])
1601       """),
1602 
1603     I(name='TUPLE2',
1604       code='\x86',
1605       arg=None,
1606       stack_before=[anyobject, anyobject],
1607       stack_after=[pytuple],
1608       proto=2,
1609       doc="""Build a two-tuple out of the top two items on the stack.
1610 
1611       This code pops two values off the stack and pushes a tuple of
1612       length 2 whose items are those values back onto it.  In other
1613       words:
1614 
1615           stack[-2:] = [tuple(stack[-2:])]
1616       """),
1617 
1618     I(name='TUPLE3',
1619       code='\x87',
1620       arg=None,
1621       stack_before=[anyobject, anyobject, anyobject],
1622       stack_after=[pytuple],
1623       proto=2,
1624       doc="""Build a three-tuple out of the top three items on the stack.
1625 
1626       This code pops three values off the stack and pushes a tuple of
1627       length 3 whose items are those values back onto it.  In other
1628       words:
1629 
1630           stack[-3:] = [tuple(stack[-3:])]
1631       """),
1632 
1633     # Ways to build dicts.
1634 
1635     I(name='EMPTY_DICT',
1636       code='}',
1637       arg=None,
1638       stack_before=[],
1639       stack_after=[pydict],
1640       proto=1,
1641       doc="Push an empty dict."),
1642 
1643     I(name='DICT',
1644       code='d',
1645       arg=None,
1646       stack_before=[markobject, stackslice],
1647       stack_after=[pydict],
1648       proto=0,
1649       doc="""Build a dict out of the topmost stack slice, after markobject.
1650 
1651       All the stack entries following the topmost markobject are placed into
1652       a single Python dict, which single dict object replaces all of the
1653       stack from the topmost markobject onward.  The stack slice alternates
1654       key, value, key, value, ....  For example,
1655 
1656       Stack before: ... markobject 1 2 3 'abc'
1657       Stack after:  ... {1: 2, 3: 'abc'}
1658       """),
1659 
1660     I(name='SETITEM',
1661       code='s',
1662       arg=None,
1663       stack_before=[pydict, anyobject, anyobject],
1664       stack_after=[pydict],
1665       proto=0,
1666       doc="""Add a key+value pair to an existing dict.
1667 
1668       Stack before:  ... pydict key value
1669       Stack after:   ... pydict
1670 
1671       where pydict has been modified via pydict[key] = value.
1672       """),
1673 
1674     I(name='SETITEMS',
1675       code='u',
1676       arg=None,
1677       stack_before=[pydict, markobject, stackslice],
1678       stack_after=[pydict],
1679       proto=1,
1680       doc="""Add an arbitrary number of key+value pairs to an existing dict.
1681 
1682       The slice of the stack following the topmost markobject is taken as
1683       an alternating sequence of keys and values, added to the dict
1684       immediately under the topmost markobject.  Everything at and after the
1685       topmost markobject is popped, leaving the mutated dict at the top
1686       of the stack.
1687 
1688       Stack before:  ... pydict markobject key_1 value_1 ... key_n value_n
1689       Stack after:   ... pydict
1690 
1691       where pydict has been modified via pydict[key_i] = value_i for i in
1692       1, 2, ..., n, and in that order.
1693       """),
1694 
1695     # Ways to build sets
1696 
1697     I(name='EMPTY_SET',
1698       code='\x8f',
1699       arg=None,
1700       stack_before=[],
1701       stack_after=[pyset],
1702       proto=4,
1703       doc="Push an empty set."),
1704 
1705     I(name='ADDITEMS',
1706       code='\x90',
1707       arg=None,
1708       stack_before=[pyset, markobject, stackslice],
1709       stack_after=[pyset],
1710       proto=4,
1711       doc="""Add an arbitrary number of items to an existing set.
1712 
1713       The slice of the stack following the topmost markobject is taken as
1714       a sequence of items, added to the set immediately under the topmost
1715       markobject.  Everything at and after the topmost markobject is popped,
1716       leaving the mutated set at the top of the stack.
1717 
1718       Stack before:  ... pyset markobject item_1 ... item_n
1719       Stack after:   ... pyset
1720 
1721       where pyset has been modified via pyset.add(item_i) = item_i for i in
1722       1, 2, ..., n, and in that order.
1723       """),
1724 
1725     # Way to build frozensets
1726 
1727     I(name='FROZENSET',
1728       code='\x91',
1729       arg=None,
1730       stack_before=[markobject, stackslice],
1731       stack_after=[pyfrozenset],
1732       proto=4,
1733       doc="""Build a frozenset out of the topmost slice, after markobject.
1734 
1735       All the stack entries following the topmost markobject are placed into
1736       a single Python frozenset, which single frozenset object replaces all
1737       of the stack from the topmost markobject onward.  For example,
1738 
1739       Stack before: ... markobject 1 2 3
1740       Stack after:  ... frozenset({1, 2, 3})
1741       """),
1742 
1743     # Stack manipulation.
1744 
1745     I(name='POP',
1746       code='0',
1747       arg=None,
1748       stack_before=[anyobject],
1749       stack_after=[],
1750       proto=0,
1751       doc="Discard the top stack item, shrinking the stack by one item."),
1752 
1753     I(name='DUP',
1754       code='2',
1755       arg=None,
1756       stack_before=[anyobject],
1757       stack_after=[anyobject, anyobject],
1758       proto=0,
1759       doc="Push the top stack item onto the stack again, duplicating it."),
1760 
1761     I(name='MARK',
1762       code='(',
1763       arg=None,
1764       stack_before=[],
1765       stack_after=[markobject],
1766       proto=0,
1767       doc="""Push markobject onto the stack.
1768 
1769       markobject is a unique object, used by other opcodes to identify a
1770       region of the stack containing a variable number of objects for them
1771       to work on.  See markobject.doc for more detail.
1772       """),
1773 
1774     I(name='POP_MARK',
1775       code='1',
1776       arg=None,
1777       stack_before=[markobject, stackslice],
1778       stack_after=[],
1779       proto=1,
1780       doc="""Pop all the stack objects at and above the topmost markobject.
1781 
1782       When an opcode using a variable number of stack objects is done,
1783       POP_MARK is used to remove those objects, and to remove the markobject
1784       that delimited their starting position on the stack.
1785       """),
1786 
1787     # Memo manipulation.  There are really only two operations (get and put),
1788     # each in all-text, "short binary", and "long binary" flavors.
1789 
1790     I(name='GET',
1791       code='g',
1792       arg=decimalnl_short,
1793       stack_before=[],
1794       stack_after=[anyobject],
1795       proto=0,
1796       doc="""Read an object from the memo and push it on the stack.
1797 
1798       The index of the memo object to push is given by the newline-terminated
1799       decimal string following.  BINGET and LONG_BINGET are space-optimized
1800       versions.
1801       """),
1802 
1803     I(name='BINGET',
1804       code='h',
1805       arg=uint1,
1806       stack_before=[],
1807       stack_after=[anyobject],
1808       proto=1,
1809       doc="""Read an object from the memo and push it on the stack.
1810 
1811       The index of the memo object to push is given by the 1-byte unsigned
1812       integer following.
1813       """),
1814 
1815     I(name='LONG_BINGET',
1816       code='j',
1817       arg=uint4,
1818       stack_before=[],
1819       stack_after=[anyobject],
1820       proto=1,
1821       doc="""Read an object from the memo and push it on the stack.
1822 
1823       The index of the memo object to push is given by the 4-byte unsigned
1824       little-endian integer following.
1825       """),
1826 
1827     I(name='PUT',
1828       code='p',
1829       arg=decimalnl_short,
1830       stack_before=[],
1831       stack_after=[],
1832       proto=0,
1833       doc="""Store the stack top into the memo.  The stack is not popped.
1834 
1835       The index of the memo location to write into is given by the newline-
1836       terminated decimal string following.  BINPUT and LONG_BINPUT are
1837       space-optimized versions.
1838       """),
1839 
1840     I(name='BINPUT',
1841       code='q',
1842       arg=uint1,
1843       stack_before=[],
1844       stack_after=[],
1845       proto=1,
1846       doc="""Store the stack top into the memo.  The stack is not popped.
1847 
1848       The index of the memo location to write into is given by the 1-byte
1849       unsigned integer following.
1850       """),
1851 
1852     I(name='LONG_BINPUT',
1853       code='r',
1854       arg=uint4,
1855       stack_before=[],
1856       stack_after=[],
1857       proto=1,
1858       doc="""Store the stack top into the memo.  The stack is not popped.
1859 
1860       The index of the memo location to write into is given by the 4-byte
1861       unsigned little-endian integer following.
1862       """),
1863 
1864     I(name='MEMOIZE',
1865       code='\x94',
1866       arg=None,
1867       stack_before=[anyobject],
1868       stack_after=[anyobject],
1869       proto=4,
1870       doc="""Store the stack top into the memo.  The stack is not popped.
1871 
1872       The index of the memo location to write is the number of
1873       elements currently present in the memo.
1874       """),
1875 
1876     # Access the extension registry (predefined objects).  Akin to the GET
1877     # family.
1878 
1879     I(name='EXT1',
1880       code='\x82',
1881       arg=uint1,
1882       stack_before=[],
1883       stack_after=[anyobject],
1884       proto=2,
1885       doc="""Extension code.
1886 
1887       This code and the similar EXT2 and EXT4 allow using a registry
1888       of popular objects that are pickled by name, typically classes.
1889       It is envisioned that through a global negotiation and
1890       registration process, third parties can set up a mapping between
1891       ints and object names.
1892 
1893       In order to guarantee pickle interchangeability, the extension
1894       code registry ought to be global, although a range of codes may
1895       be reserved for private use.
1896 
1897       EXT1 has a 1-byte integer argument.  This is used to index into the
1898       extension registry, and the object at that index is pushed on the stack.
1899       """),
1900 
1901     I(name='EXT2',
1902       code='\x83',
1903       arg=uint2,
1904       stack_before=[],
1905       stack_after=[anyobject],
1906       proto=2,
1907       doc="""Extension code.
1908 
1909       See EXT1.  EXT2 has a two-byte integer argument.
1910       """),
1911 
1912     I(name='EXT4',
1913       code='\x84',
1914       arg=int4,
1915       stack_before=[],
1916       stack_after=[anyobject],
1917       proto=2,
1918       doc="""Extension code.
1919 
1920       See EXT1.  EXT4 has a four-byte integer argument.
1921       """),
1922 
1923     # Push a class object, or module function, on the stack, via its module
1924     # and name.
1925 
1926     I(name='GLOBAL',
1927       code='c',
1928       arg=stringnl_noescape_pair,
1929       stack_before=[],
1930       stack_after=[anyobject],
1931       proto=0,
1932       doc="""Push a global object (module.attr) on the stack.
1933 
1934       Two newline-terminated strings follow the GLOBAL opcode.  The first is
1935       taken as a module name, and the second as a class name.  The class
1936       object module.class is pushed on the stack.  More accurately, the
1937       object returned by self.find_class(module, class) is pushed on the
1938       stack, so unpickling subclasses can override this form of lookup.
1939       """),
1940 
1941     I(name='STACK_GLOBAL',
1942       code='\x93',
1943       arg=None,
1944       stack_before=[pyunicode, pyunicode],
1945       stack_after=[anyobject],
1946       proto=4,
1947       doc="""Push a global object (module.attr) on the stack.
1948       """),
1949 
1950     # Ways to build objects of classes pickle doesn't know about directly
1951     # (user-defined classes).  I despair of documenting this accurately
1952     # and comprehensibly -- you really have to read the pickle code to
1953     # find all the special cases.
1954 
1955     I(name='REDUCE',
1956       code='R',
1957       arg=None,
1958       stack_before=[anyobject, anyobject],
1959       stack_after=[anyobject],
1960       proto=0,
1961       doc="""Push an object built from a callable and an argument tuple.
1962 
1963       The opcode is named to remind of the __reduce__() method.
1964 
1965       Stack before: ... callable pytuple
1966       Stack after:  ... callable(*pytuple)
1967 
1968       The callable and the argument tuple are the first two items returned
1969       by a __reduce__ method.  Applying the callable to the argtuple is
1970       supposed to reproduce the original object, or at least get it started.
1971       If the __reduce__ method returns a 3-tuple, the last component is an
1972       argument to be passed to the object's __setstate__, and then the REDUCE
1973       opcode is followed by code to create setstate's argument, and then a
1974       BUILD opcode to apply  __setstate__ to that argument.
1975 
1976       If not isinstance(callable, type), REDUCE complains unless the
1977       callable has been registered with the copyreg module's
1978       safe_constructors dict, or the callable has a magic
1979       '__safe_for_unpickling__' attribute with a true value.  I'm not sure
1980       why it does this, but I've sure seen this complaint often enough when
1981       I didn't want to <wink>.
1982       """),
1983 
1984     I(name='BUILD',
1985       code='b',
1986       arg=None,
1987       stack_before=[anyobject, anyobject],
1988       stack_after=[anyobject],
1989       proto=0,
1990       doc="""Finish building an object, via __setstate__ or dict update.
1991 
1992       Stack before: ... anyobject argument
1993       Stack after:  ... anyobject
1994 
1995       where anyobject may have been mutated, as follows:
1996 
1997       If the object has a __setstate__ method,
1998 
1999           anyobject.__setstate__(argument)
2000 
2001       is called.
2002 
2003       Else the argument must be a dict, the object must have a __dict__, and
2004       the object is updated via
2005 
2006           anyobject.__dict__.update(argument)
2007       """),
2008 
2009     I(name='INST',
2010       code='i',
2011       arg=stringnl_noescape_pair,
2012       stack_before=[markobject, stackslice],
2013       stack_after=[anyobject],
2014       proto=0,
2015       doc="""Build a class instance.
2016 
2017       This is the protocol 0 version of protocol 1's OBJ opcode.
2018       INST is followed by two newline-terminated strings, giving a
2019       module and class name, just as for the GLOBAL opcode (and see
2020       GLOBAL for more details about that).  self.find_class(module, name)
2021       is used to get a class object.
2022 
2023       In addition, all the objects on the stack following the topmost
2024       markobject are gathered into a tuple and popped (along with the
2025       topmost markobject), just as for the TUPLE opcode.
2026 
2027       Now it gets complicated.  If all of these are true:
2028 
2029         + The argtuple is empty (markobject was at the top of the stack
2030           at the start).
2031 
2032         + The class object does not have a __getinitargs__ attribute.
2033 
2034       then we want to create an old-style class instance without invoking
2035       its __init__() method (pickle has waffled on this over the years; not
2036       calling __init__() is current wisdom).  In this case, an instance of
2037       an old-style dummy class is created, and then we try to rebind its
2038       __class__ attribute to the desired class object.  If this succeeds,
2039       the new instance object is pushed on the stack, and we're done.
2040 
2041       Else (the argtuple is not empty, it's not an old-style class object,
2042       or the class object does have a __getinitargs__ attribute), the code
2043       first insists that the class object have a __safe_for_unpickling__
2044       attribute.  Unlike as for the __safe_for_unpickling__ check in REDUCE,
2045       it doesn't matter whether this attribute has a true or false value, it
2046       only matters whether it exists (XXX this is a bug).  If
2047       __safe_for_unpickling__ doesn't exist, UnpicklingError is raised.
2048 
2049       Else (the class object does have a __safe_for_unpickling__ attr),
2050       the class object obtained from INST's arguments is applied to the
2051       argtuple obtained from the stack, and the resulting instance object
2052       is pushed on the stack.
2053 
2054       NOTE:  checks for __safe_for_unpickling__ went away in Python 2.3.
2055       NOTE:  the distinction between old-style and new-style classes does
2056              not make sense in Python 3.
2057       """),
2058 
2059     I(name='OBJ',
2060       code='o',
2061       arg=None,
2062       stack_before=[markobject, anyobject, stackslice],
2063       stack_after=[anyobject],
2064       proto=1,
2065       doc="""Build a class instance.
2066 
2067       This is the protocol 1 version of protocol 0's INST opcode, and is
2068       very much like it.  The major difference is that the class object
2069       is taken off the stack, allowing it to be retrieved from the memo
2070       repeatedly if several instances of the same class are created.  This
2071       can be much more efficient (in both time and space) than repeatedly
2072       embedding the module and class names in INST opcodes.
2073 
2074       Unlike INST, OBJ takes no arguments from the opcode stream.  Instead
2075       the class object is taken off the stack, immediately above the
2076       topmost markobject:
2077 
2078       Stack before: ... markobject classobject stackslice
2079       Stack after:  ... new_instance_object
2080 
2081       As for INST, the remainder of the stack above the markobject is
2082       gathered into an argument tuple, and then the logic seems identical,
2083       except that no __safe_for_unpickling__ check is done (XXX this is
2084       a bug).  See INST for the gory details.
2085 
2086       NOTE:  In Python 2.3, INST and OBJ are identical except for how they
2087       get the class object.  That was always the intent; the implementations
2088       had diverged for accidental reasons.
2089       """),
2090 
2091     I(name='NEWOBJ',
2092       code='\x81',
2093       arg=None,
2094       stack_before=[anyobject, anyobject],
2095       stack_after=[anyobject],
2096       proto=2,
2097       doc="""Build an object instance.
2098 
2099       The stack before should be thought of as containing a class
2100       object followed by an argument tuple (the tuple being the stack
2101       top).  Call these cls and args.  They are popped off the stack,
2102       and the value returned by cls.__new__(cls, *args) is pushed back
2103       onto the stack.
2104       """),
2105 
2106     I(name='NEWOBJ_EX',
2107       code='\x92',
2108       arg=None,
2109       stack_before=[anyobject, anyobject, anyobject],
2110       stack_after=[anyobject],
2111       proto=4,
2112       doc="""Build an object instance.
2113 
2114       The stack before should be thought of as containing a class
2115       object followed by an argument tuple and by a keyword argument dict
2116       (the dict being the stack top).  Call these cls and args.  They are
2117       popped off the stack, and the value returned by
2118       cls.__new__(cls, *args, *kwargs) is  pushed back  onto the stack.
2119       """),
2120 
2121     # Machine control.
2122 
2123     I(name='PROTO',
2124       code='\x80',
2125       arg=uint1,
2126       stack_before=[],
2127       stack_after=[],
2128       proto=2,
2129       doc="""Protocol version indicator.
2130 
2131       For protocol 2 and above, a pickle must start with this opcode.
2132       The argument is the protocol version, an int in range(2, 256).
2133       """),
2134 
2135     I(name='STOP',
2136       code='.',
2137       arg=None,
2138       stack_before=[anyobject],
2139       stack_after=[],
2140       proto=0,
2141       doc="""Stop the unpickling machine.
2142 
2143       Every pickle ends with this opcode.  The object at the top of the stack
2144       is popped, and that's the result of unpickling.  The stack should be
2145       empty then.
2146       """),
2147 
2148     # Framing support.
2149 
2150     I(name='FRAME',
2151       code='\x95',
2152       arg=uint8,
2153       stack_before=[],
2154       stack_after=[],
2155       proto=4,
2156       doc="""Indicate the beginning of a new frame.
2157 
2158       The unpickler may use this opcode to safely prefetch data from its
2159       underlying stream.
2160       """),
2161 
2162     # Ways to deal with persistent IDs.
2163 
2164     I(name='PERSID',
2165       code='P',
2166       arg=stringnl_noescape,
2167       stack_before=[],
2168       stack_after=[anyobject],
2169       proto=0,
2170       doc="""Push an object identified by a persistent ID.
2171 
2172       The pickle module doesn't define what a persistent ID means.  PERSID's
2173       argument is a newline-terminated str-style (no embedded escapes, no
2174       bracketing quote characters) string, which *is* "the persistent ID".
2175       The unpickler passes this string to self.persistent_load().  Whatever
2176       object that returns is pushed on the stack.  There is no implementation
2177       of persistent_load() in Python's unpickler:  it must be supplied by an
2178       unpickler subclass.
2179       """),
2180 
2181     I(name='BINPERSID',
2182       code='Q',
2183       arg=None,
2184       stack_before=[anyobject],
2185       stack_after=[anyobject],
2186       proto=1,
2187       doc="""Push an object identified by a persistent ID.
2188 
2189       Like PERSID, except the persistent ID is popped off the stack (instead
2190       of being a string embedded in the opcode bytestream).  The persistent
2191       ID is passed to self.persistent_load(), and whatever object that
2192       returns is pushed on the stack.  See PERSID for more detail.
2193       """),
2194 ]
2195 del I
2196 
# Verify uniqueness of .name and .code members.
def _check_unique_names_and_codes(table):
    """Raise ValueError if two records in 'table' share a .name or a .code."""
    first_index_of_name = {}
    first_index_of_code = {}
    for position, info in enumerate(table):
        if info.name in first_index_of_name:
            raise ValueError("repeated name %r at indices %d and %d" %
                             (info.name,
                              first_index_of_name[info.name],
                              position))
        if info.code in first_index_of_code:
            raise ValueError("repeated code %r at indices %d and %d" %
                             (info.code,
                              first_index_of_code[info.code],
                              position))
        # Only record the entry once both checks have passed.
        first_index_of_name[info.name] = position
        first_index_of_code[info.code] = position

_check_unique_names_and_codes(opcodes)
# The helper is only needed once; don't leave it in the module namespace.
del _check_unique_names_and_codes
2213 
2214 ##############################################################################
2215 # Build a code2op dict, mapping opcode characters to OpcodeInfo records.
2216 # Also ensure we've got the same stuff as pickle.py, although the
2217 # introspection here is dicey.
2218 
# One entry per opcode record, keyed by the record's one-character code.
code2op = {info.code: info for info in opcodes}
2223 
def assure_pickle_consistency(verbose=False):
    """Cross-check the code2op table against the names exported by pickle.

    Each name in pickle.__all__ that looks like an opcode name and is bound
    to a bytes object of length 1 must have an entry in code2op under that
    code, carrying the same name.  Conversely, every entry of code2op must
    be matched by such a name.  ValueError is raised for the first
    disagreement found.

    If 'verbose' is true, a line is printed for every name that is skipped
    or checked.
    """
    looks_like_opcode_name = re.compile("[A-Z][A-Z0-9_]+$").match

    # Work on a private copy:  entries are crossed off as they are matched.
    unmatched = dict(code2op)
    for name in pickle.__all__:
        if looks_like_opcode_name(name) is None:
            if verbose:
                print("skipping %r: it doesn't look like an opcode name" % name)
            continue

        picklecode = getattr(pickle, name)
        if not (isinstance(picklecode, bytes) and len(picklecode) == 1):
            if verbose:
                print("skipping %r: value %r doesn't look like a pickle "
                      "code" % (name, picklecode))
            continue

        picklecode = picklecode.decode("latin-1")
        if picklecode not in unmatched:
            raise ValueError("pickle.py appears to have a pickle opcode with "
                             "name %r and code %r, but we don't" %
                             (name, picklecode))

        if verbose:
            print("checking name %r w/ code %r for consistency" % (
                  name, picklecode))
        info = unmatched[picklecode]
        if info.name != name:
            raise ValueError("for pickle code %r, pickle.py uses name %r "
                             "but we're using name %r" % (picklecode,
                                                          name,
                                                          info.name))
        # Matched.  Whatever is still in 'unmatched' after the loop is a
        # problem of a different kind.
        del unmatched[picklecode]

    if unmatched:
        report = ["we appear to have pickle opcodes that pickle.py doesn't have:"]
        report.extend("    name %r with code %r" % (info.name, code)
                      for code, info in unmatched.items())
        raise ValueError("\n".join(report))
2261 
# Run the consistency check once, at import time, then remove the helper
# from the module namespace.
assure_pickle_consistency()
del assure_pickle_consistency
2264 
2265 ##############################################################################
2266 # A pickle opcode generator.
2267 
def _genops(data, yield_end_pos=False):
    """Generate the opcodes of a pickle, up to and including STOP.

    'data' is either a bytes-like object (one of bytes_types), which gets
    wrapped in a BytesIO, or an object with a read() method.

    Yields (opcode, arg, pos) triples, or (opcode, arg, pos, end_pos)
    4-tuples if 'yield_end_pos' is true.  opcode is the OpcodeInfo record
    found in code2op, arg is the decoded argument (None if the opcode takes
    no argument), and pos/end_pos are the stream positions before and after
    the opcode.  Both positions are None if the stream has no tell().

    Raises ValueError if the stream ends before STOP or if an unknown
    opcode byte is read.
    """
    stream = io.BytesIO(data) if isinstance(data, bytes_types) else data

    if hasattr(stream, "tell"):
        where = stream.tell
    else:
        def where():
            return None

    while True:
        start = where()
        raw = stream.read(1)
        info = code2op.get(raw.decode("latin-1"))
        if info is None:
            if raw == b"":
                raise ValueError("pickle exhausted before seeing STOP")
            raise ValueError("at position %s, opcode %r unknown" % (
                             "<unknown>" if start is None else start,
                             raw))

        arg = None if info.arg is None else info.arg.reader(stream)

        if yield_end_pos:
            yield info, arg, start, where()
        else:
            yield info, arg, start

        if raw == b'.':
            assert info.name == 'STOP'
            return
2299 
def genops(pickle):
    """Iterate over the opcodes in a pickle.

    'pickle' is a bytes object, or a file-like object, holding the pickle.
    Iteration starts at the current position and ends once a STOP opcode
    has been delivered.

    Each item produced is a triple

        opcode, arg, pos

    where:

    + opcode is the OpcodeInfo record describing the opcode that was read.

    + arg is the decoded value of the argument embedded in the pickle, as a
      Python object, or None for an opcode that takes no argument.

    + pos is the position at which the opcode starts:  the result of
      pickle.tell() just before the opcode was read.  A bytes object is
      wrapped in a BytesIO first, and that object's tell() is used.  If
      the pickle has no tell() method, pos is None.
    """
    return _genops(pickle, yield_end_pos=False)
2324 
2325 ##############################################################################
2326 # A pickle optimizer.
2327 
def optimize(p):
    """Return a copy of the pickle 'p' without its unused PUT opcodes.

    'p' must be sliceable (it is indexed as p[start:stop]); it is scanned
    with _genops().  A memo store (any *PUT opcode, or MEMOIZE) survives
    only if some GET-family opcode refers to its id, and surviving ids are
    renumbered from 0 in order of appearance.  FRAME opcodes are not
    copied; framing is restarted on the output if the protocol number
    deduced from the PROTO and GET opcodes is at least 4.
    """
    STORE, FETCH, RAW = range(3)

    # Pass 1:  classify every opcode and note which memo ids are fetched.
    stored_ids = set()      # every id written by a PUT or MEMOIZE
    remap = {}              # fetched old id -> new id (None until assigned)
    plan = []               # (STORE, id, None), (FETCH, id, None)
                            # or (RAW, start, stop)
    highest = 0
    header = b''
    for info, arg, start, stop in _genops(p, yield_end_pos=True):
        name = info.name
        if 'PUT' in name:
            stored_ids.add(arg)
            plan.append((STORE, arg, None))
        elif name == 'MEMOIZE':
            # MEMOIZE has no argument; its id is the number of ids so far.
            implied = len(stored_ids)
            stored_ids.add(implied)
            plan.append((STORE, implied, None))
        elif 'FRAME' in name:
            continue
        elif 'GET' in name:
            highest = max(highest, info.proto)
            remap[arg] = None
            plan.append((FETCH, arg, None))
        elif name == 'PROTO':
            highest = max(highest, arg)
            if start == 0:
                # Kept aside so it can be written ahead of any framing.
                header = p[start:stop]
            else:
                plan.append((RAW, start, stop))
        else:
            plan.append((RAW, start, stop))

    # Pass 2:  replay the plan, skipping stores that nothing fetches.
    out = io.BytesIO()
    out.write(header)
    pickler = pickle._Pickler(out, highest)
    framer = pickler.framer
    if highest >= 4:
        framer.start_framing()
    next_id = 0
    for kind, first, second in plan:
        oversized = False
        if kind == STORE:
            if first not in remap:
                continue
            chunk = pickler.put(next_id)
            remap[first] = next_id
            next_id += 1
        elif kind == FETCH:
            chunk = pickler.get(remap[first])
        else:
            chunk = p[first:second]
            # Chunks above the frame size target bypass the frame buffer.
            oversized = len(chunk) > framer._FRAME_SIZE_TARGET
        framer.commit_frame(force=oversized)
        if oversized:
            framer.file_write(chunk)
        else:
            pickler.write(chunk)
    framer.end_framing()
    return out.getvalue()
2391 
2392 ##############################################################################
2393 # A symbolic pickle disassembler.
2394 
def dis(pickle, out=None, memo=None, indentlevel=4, annotate=0):
    """Produce a symbolic disassembly of a pickle.

    'pickle' is a file-like object, or bytes object, containing a (at least
    one) pickle.  The pickle is disassembled from the current position,
    through the first STOP opcode encountered.

    Optional arg 'out' is a file-like object to which the disassembly is
    printed.  It defaults to sys.stdout.

    Optional arg 'memo' is a Python dict, used as the pickle's memo.  It
    may be mutated by dis(), if the pickle contains PUT, BINPUT, LONG_BINPUT
    or MEMOIZE opcodes.  Passing the same memo object to another dis() call
    then allows disassembly to proceed across multiple pickles that were all
    created by the same pickler with the same memo.  Ordinarily you don't
    need to worry about this.

    Optional arg 'indentlevel' is the number of blanks by which to indent
    a new MARK level.  It defaults to 4.

    Optional arg 'annotate' if nonzero instructs dis() to add short
    description of the opcode on each line of disassembled output.
    The value given to 'annotate' must be an integer and is used as a
    hint for the column where annotation should start.  The default
    value is 0, meaning no annotations.

    In addition to printing the disassembly, some sanity checks are made:

    + All embedded opcode arguments "make sense".

    + Explicit and implicit pop operations have enough items on the stack.

    + When an opcode implicitly refers to a markobject, a markobject is
      actually on the stack.

    + A memo entry isn't referenced before it's defined.

    + The markobject isn't stored in the memo.

    + A memo entry isn't redefined.

    ValueError is raised when one of these checks fails (after the line for
    the offending opcode has been printed), and also if the emulated stack
    is not empty after STOP.
    """

    # Most of the hair here is for sanity checks, but most of it is needed
    # anyway to detect when a protocol 0 POP takes a MARK off the stack
    # (which in turn is needed to indent MARK blocks correctly).

    stack = []          # crude emulation of unpickler stack
    if memo is None:
        memo = {}       # crude emulation of unpickler memo
    maxproto = -1       # max protocol number seen
    markstack = []      # bytecode positions of MARK opcodes
    indentchunk = ' ' * indentlevel
    errormsg = None
    annocol = annotate  # column hint for annotations
    for opcode, arg, pos in genops(pickle):
        if pos is not None:
            print("%5d:" % pos, end=' ', file=out)

        line = "%-4s %s%s" % (repr(opcode.code)[1:-1],
                              indentchunk * len(markstack),
                              opcode.name)

        maxproto = max(maxproto, opcode.proto)
        before = opcode.stack_before    # don't mutate
        after = opcode.stack_after      # don't mutate
        numtopop = len(before)

        # See whether a MARK should be popped.
        markmsg = None
        if markobject in before or (opcode.name == "POP" and
                                    stack and
                                    stack[-1] is markobject):
            assert markobject not in after
            if __debug__:
                if markobject in before:
                    assert before[-1] is stackslice
            if markstack:
                markpos = markstack.pop()
                if markpos is None:
                    markmsg = "(MARK at unknown opcode offset)"
                else:
                    markmsg = "(MARK at %d)" % markpos
                # Pop everything at and after the topmost markobject.
                while stack[-1] is not markobject:
                    stack.pop()
                stack.pop()
                # Stop later code from popping too much.
                try:
                    numtopop = before.index(markobject)
                except ValueError:
                    assert opcode.name == "POP"
                    numtopop = 0
            else:
                errormsg = markmsg = "no MARK exists on stack"

        # Check for correct memo usage.
        if opcode.name in ("PUT", "BINPUT", "LONG_BINPUT", "MEMOIZE"):
            if opcode.name == "MEMOIZE":
                # MEMOIZE carries no argument:  it stores at the next index.
                memo_idx = len(memo)
                markmsg = "(as %d)" % memo_idx
            else:
                assert arg is not None
                memo_idx = arg
            if memo_idx in memo:
                # Report memo_idx rather than arg:  arg is None for MEMOIZE,
                # and the two are equal for the PUT family.
                errormsg = "memo key %r already defined" % memo_idx
            elif not stack:
                errormsg = "stack is empty -- can't store into memo"
            elif stack[-1] is markobject:
                errormsg = "can't store markobject in the memo"
            else:
                memo[memo_idx] = stack[-1]
        elif opcode.name in ("GET", "BINGET", "LONG_BINGET"):
            if arg in memo:
                assert len(after) == 1
                after = [memo[arg]]     # for better stack emulation
            else:
                errormsg = "memo key %r has never been stored into" % arg

        if arg is not None or markmsg:
            # make a mild effort to align arguments
            line += ' ' * (10 - len(opcode.name))
            if arg is not None:
                line += ' ' + repr(arg)
            if markmsg:
                line += ' ' + markmsg
        if annotate:
            line += ' ' * (annocol - len(line))
            # make a mild effort to align annotations
            annocol = len(line)
            if annocol > 50:
                annocol = annotate
            # Only the first line of the opcode's doc is used.
            line += ' ' + opcode.doc.split('\n', 1)[0]
        print(line, file=out)

        if errormsg:
            # Note that we delayed complaining until the offending opcode
            # was printed.
            raise ValueError(errormsg)

        # Emulate the stack effects.
        if len(stack) < numtopop:
            raise ValueError("tries to pop %d items from stack with "
                             "only %d items" % (numtopop, len(stack)))
        if numtopop:
            del stack[-numtopop:]
        if markobject in after:
            assert markobject not in before
            markstack.append(pos)

        stack.extend(after)

    print("highest protocol among opcodes =", maxproto, file=out)
    if stack:
        raise ValueError("stack not empty after STOP: %r" % stack)
2548 
2549 # For use in the doctest, simply as an example of a class to pickle.
class _Example:
    """Minimal class that stores one value, used as a pickling example."""

    def __init__(self, value):
        # Keep the constructor argument as the instance's only attribute.
        self.value = value
2553 
# Doctest text exercising dis() on pickles written with protocols 0 through 3
# (plain containers, an _Example instance, and recursive list/tuple objects).
# It is registered in the __test__ mapping further down so that the doctest
# run started by _test() picks it up.  The literal is a raw string so that
# backslash sequences such as \x80 in the expected output stay verbatim.
_dis_test = r"""
>>> import pickle
>>> x = [1, 2, (3, 4), {b'abc': "def"}]
>>> pkl0 = pickle.dumps(x, 0)
>>> dis(pkl0)
    0: (    MARK
    1: l        LIST       (MARK at 0)
    2: p    PUT        0
    5: I    INT        1
    8: a    APPEND
    9: I    INT        2
   12: a    APPEND
   13: (    MARK
   14: I        INT        3
   17: I        INT        4
   20: t        TUPLE      (MARK at 13)
   21: p    PUT        1
   24: a    APPEND
   25: (    MARK
   26: d        DICT       (MARK at 25)
   27: p    PUT        2
   30: c    GLOBAL     '_codecs encode'
   46: p    PUT        3
   49: (    MARK
   50: V        UNICODE    'abc'
   55: p        PUT        4
   58: V        UNICODE    'latin1'
   66: p        PUT        5
   69: t        TUPLE      (MARK at 49)
   70: p    PUT        6
   73: R    REDUCE
   74: p    PUT        7
   77: V    UNICODE    'def'
   82: p    PUT        8
   85: s    SETITEM
   86: a    APPEND
   87: .    STOP
highest protocol among opcodes = 0

Try again with a "binary" pickle.

>>> pkl1 = pickle.dumps(x, 1)
>>> dis(pkl1)
    0: ]    EMPTY_LIST
    1: q    BINPUT     0
    3: (    MARK
    4: K        BININT1    1
    6: K        BININT1    2
    8: (        MARK
    9: K            BININT1    3
   11: K            BININT1    4
   13: t            TUPLE      (MARK at 8)
   14: q        BINPUT     1
   16: }        EMPTY_DICT
   17: q        BINPUT     2
   19: c        GLOBAL     '_codecs encode'
   35: q        BINPUT     3
   37: (        MARK
   38: X            BINUNICODE 'abc'
   46: q            BINPUT     4
   48: X            BINUNICODE 'latin1'
   59: q            BINPUT     5
   61: t            TUPLE      (MARK at 37)
   62: q        BINPUT     6
   64: R        REDUCE
   65: q        BINPUT     7
   67: X        BINUNICODE 'def'
   75: q        BINPUT     8
   77: s        SETITEM
   78: e        APPENDS    (MARK at 3)
   79: .    STOP
highest protocol among opcodes = 1

Exercise the INST/OBJ/BUILD family.

>>> import pickletools
>>> dis(pickle.dumps(pickletools.dis, 0))
    0: c    GLOBAL     'pickletools dis'
   17: p    PUT        0
   20: .    STOP
highest protocol among opcodes = 0

>>> from pickletools import _Example
>>> x = [_Example(42)] * 2
>>> dis(pickle.dumps(x, 0))
    0: (    MARK
    1: l        LIST       (MARK at 0)
    2: p    PUT        0
    5: c    GLOBAL     'copy_reg _reconstructor'
   30: p    PUT        1
   33: (    MARK
   34: c        GLOBAL     'pickletools _Example'
   56: p        PUT        2
   59: c        GLOBAL     '__builtin__ object'
   79: p        PUT        3
   82: N        NONE
   83: t        TUPLE      (MARK at 33)
   84: p    PUT        4
   87: R    REDUCE
   88: p    PUT        5
   91: (    MARK
   92: d        DICT       (MARK at 91)
   93: p    PUT        6
   96: V    UNICODE    'value'
  103: p    PUT        7
  106: I    INT        42
  110: s    SETITEM
  111: b    BUILD
  112: a    APPEND
  113: g    GET        5
  116: a    APPEND
  117: .    STOP
highest protocol among opcodes = 0

>>> dis(pickle.dumps(x, 1))
    0: ]    EMPTY_LIST
    1: q    BINPUT     0
    3: (    MARK
    4: c        GLOBAL     'copy_reg _reconstructor'
   29: q        BINPUT     1
   31: (        MARK
   32: c            GLOBAL     'pickletools _Example'
   54: q            BINPUT     2
   56: c            GLOBAL     '__builtin__ object'
   76: q            BINPUT     3
   78: N            NONE
   79: t            TUPLE      (MARK at 31)
   80: q        BINPUT     4
   82: R        REDUCE
   83: q        BINPUT     5
   85: }        EMPTY_DICT
   86: q        BINPUT     6
   88: X        BINUNICODE 'value'
   98: q        BINPUT     7
  100: K        BININT1    42
  102: s        SETITEM
  103: b        BUILD
  104: h        BINGET     5
  106: e        APPENDS    (MARK at 3)
  107: .    STOP
highest protocol among opcodes = 1

Try "the canonical" recursive-object test.

>>> L = []
>>> T = L,
>>> L.append(T)
>>> L[0] is T
True
>>> T[0] is L
True
>>> L[0][0] is L
True
>>> T[0][0] is T
True
>>> dis(pickle.dumps(L, 0))
    0: (    MARK
    1: l        LIST       (MARK at 0)
    2: p    PUT        0
    5: (    MARK
    6: g        GET        0
    9: t        TUPLE      (MARK at 5)
   10: p    PUT        1
   13: a    APPEND
   14: .    STOP
highest protocol among opcodes = 0

>>> dis(pickle.dumps(L, 1))
    0: ]    EMPTY_LIST
    1: q    BINPUT     0
    3: (    MARK
    4: h        BINGET     0
    6: t        TUPLE      (MARK at 3)
    7: q    BINPUT     1
    9: a    APPEND
   10: .    STOP
highest protocol among opcodes = 1

Note that, in the protocol 0 pickle of the recursive tuple, the disassembler
has to emulate the stack in order to realize that the POP opcode at 16 gets
rid of the MARK at 0.

>>> dis(pickle.dumps(T, 0))
    0: (    MARK
    1: (        MARK
    2: l            LIST       (MARK at 1)
    3: p        PUT        0
    6: (        MARK
    7: g            GET        0
   10: t            TUPLE      (MARK at 6)
   11: p        PUT        1
   14: a        APPEND
   15: 0        POP
   16: 0        POP        (MARK at 0)
   17: g    GET        1
   20: .    STOP
highest protocol among opcodes = 0

>>> dis(pickle.dumps(T, 1))
    0: (    MARK
    1: ]        EMPTY_LIST
    2: q        BINPUT     0
    4: (        MARK
    5: h            BINGET     0
    7: t            TUPLE      (MARK at 4)
    8: q        BINPUT     1
   10: a        APPEND
   11: 1        POP_MARK   (MARK at 0)
   12: h    BINGET     1
   14: .    STOP
highest protocol among opcodes = 1

Try protocol 2.

>>> dis(pickle.dumps(L, 2))
    0: \x80 PROTO      2
    2: ]    EMPTY_LIST
    3: q    BINPUT     0
    5: h    BINGET     0
    7: \x85 TUPLE1
    8: q    BINPUT     1
   10: a    APPEND
   11: .    STOP
highest protocol among opcodes = 2

>>> dis(pickle.dumps(T, 2))
    0: \x80 PROTO      2
    2: ]    EMPTY_LIST
    3: q    BINPUT     0
    5: h    BINGET     0
    7: \x85 TUPLE1
    8: q    BINPUT     1
   10: a    APPEND
   11: 0    POP
   12: h    BINGET     1
   14: .    STOP
highest protocol among opcodes = 2

Try protocol 3 with annotations:

>>> dis(pickle.dumps(T, 3), annotate=1)
    0: \x80 PROTO      3 Protocol version indicator.
    2: ]    EMPTY_LIST   Push an empty list.
    3: q    BINPUT     0 Store the stack top into the memo.  The stack is not popped.
    5: h    BINGET     0 Read an object from the memo and push it on the stack.
    7: \x85 TUPLE1       Build a one-tuple out of the topmost item on the stack.
    8: q    BINPUT     1 Store the stack top into the memo.  The stack is not popped.
   10: a    APPEND       Append an object to a list.
   11: 0    POP          Discard the top stack item, shrinking the stack by one item.
   12: h    BINGET     1 Read an object from the memo and push it on the stack.
   14: .    STOP         Stop the unpickling machine.
highest protocol among opcodes = 2

"""
2808 
# Doctest text for the memo= argument of dis(): two pickles of the same list
# are written to one stream by a single Pickler, then disassembled one after
# the other while sharing one memo dict, so the BINGET in the second pickle
# can refer to the BINPUT made in the first.
_memo_test = r"""
>>> import pickle
>>> import io
>>> f = io.BytesIO()
>>> p = pickle.Pickler(f, 2)
>>> x = [1, 2, 3]
>>> p.dump(x)
>>> p.dump(x)
>>> f.seek(0)
0
>>> memo = {}
>>> dis(f, memo=memo)
    0: \x80 PROTO      2
    2: ]    EMPTY_LIST
    3: q    BINPUT     0
    5: (    MARK
    6: K        BININT1    1
    8: K        BININT1    2
   10: K        BININT1    3
   12: e        APPENDS    (MARK at 5)
   13: .    STOP
highest protocol among opcodes = 2
>>> dis(f, memo=memo)
   14: \x80 PROTO      2
   16: h    BINGET     0
   18: .    STOP
highest protocol among opcodes = 2
"""
2837 
# doctest looks for a module-level ``__test__`` mapping and treats each string
# value as an extra docstring to run; the keys only name the tests in
# doctest's reports.
__test__ = {'disassembler_test': _dis_test,
            'disassembler_memo_test': _memo_test,
           }
2841 
def _test():
    """Run the doctests via doctest.testmod() and return its result.

    testmod() is called without arguments, so doctest chooses the module
    to examine and its own verbosity.
    """
    # Imported lazily: doctest is only needed for the self-test run.
    from doctest import testmod
    return testmod()
2845 
if __name__ == "__main__":
    # Command-line front end: disassemble the pickle files named on the
    # command line, or run the module's doctests when -t/--test is given.
    import argparse
    parser = argparse.ArgumentParser(
        description='disassemble one or more pickle files')
    parser.add_argument(
        'pickle_file', type=argparse.FileType('br'),
        nargs='*', help='the pickle file')
    parser.add_argument(
        '-o', '--output', default=sys.stdout, type=argparse.FileType('w'),
        help='the file where the output should be written')
    parser.add_argument(
        '-m', '--memo', action='store_true',
        help='preserve memo between disassemblies')
    parser.add_argument(
        '-l', '--indentlevel', default=4, type=int,
        help='the number of blanks by which to indent a new MARK level')
    parser.add_argument(
        '-a', '--annotate',  action='store_true',
        help='annotate each line with a short opcode description')
    parser.add_argument(
        '-p', '--preamble', default="==> {name} <==",
        help='if more than one pickle file is specified, print this before'
        ' each disassembly')
    parser.add_argument(
        '-t', '--test', action='store_true',
        help='run self-test suite')
    parser.add_argument(
        '-v', action='store_true',
        help='run verbosely; only affects self-test run')
    args = parser.parse_args()
    try:
        if args.test:
            # args.v is not forwarded: per the doctest documentation,
            # testmod() looks for '-v' in sys.argv itself when no explicit
            # verbose argument is passed.
            _test()
        elif not args.pickle_file:
            parser.print_help()
        else:
            # dis() takes the annotation column, not a flag; 0 disables it.
            annotate = 30 if args.annotate else 0
            several = len(args.pickle_file) > 1
            # A shared memo only matters across several disassemblies; a
            # single file always gets a fresh memo (None), as before.
            memo = {} if several and args.memo else None
            for f in args.pickle_file:
                if several:
                    preamble = args.preamble.format(name=f.name)
                    args.output.write(preamble + '\n')
                dis(f, args.output, memo, args.indentlevel, annotate)
    finally:
        # argparse.FileType opens its files while parsing and never closes
        # them, so release them here -- also on the -t path and when dis()
        # raises.  The interpreter's standard streams (the default output,
        # or a '-' argument) must stay open.
        standard = (sys.stdin, sys.stdout, sys.stderr,
                    getattr(sys.stdin, 'buffer', None),
                    getattr(sys.stdout, 'buffer', None))
        for stream in args.pickle_file + [args.output]:
            if not any(stream is std for std in standard):
                stream.close()
2891