1  /*
2  
3  Unicode implementation based on original code by Fredrik Lundh,
4  modified by Marc-Andre Lemburg <mal@lemburg.com>.
5  
6  Major speed upgrades to the method implementations at the Reykjavik
7  NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8  
9  Copyright (c) Corporation for National Research Initiatives.
10  
11  --------------------------------------------------------------------
12  The original string type implementation is:
13  
14    Copyright (c) 1999 by Secret Labs AB
15    Copyright (c) 1999 by Fredrik Lundh
16  
17  By obtaining, using, and/or copying this software and/or its
18  associated documentation, you agree that you have read, understood,
19  and will comply with the following terms and conditions:
20  
21  Permission to use, copy, modify, and distribute this software and its
22  associated documentation for any purpose and without fee is hereby
23  granted, provided that the above copyright notice appears in all
24  copies, and that both that copyright notice and this permission notice
25  appear in supporting documentation, and that the name of Secret Labs
26  AB or the author not be used in advertising or publicity pertaining to
27  distribution of the software without specific, written prior
28  permission.
29  
30  SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31  THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32  FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33  ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36  OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37  --------------------------------------------------------------------
38  
39  */
40  
41  #define PY_SSIZE_T_CLEAN
42  #include "Python.h"
43  #include "pycore_abstract.h"      // _PyIndex_Check()
44  #include "pycore_atomic_funcs.h"  // _Py_atomic_size_get()
45  #include "pycore_bytesobject.h"   // _PyBytes_Repeat()
46  #include "pycore_bytes_methods.h" // _Py_bytes_lower()
47  #include "pycore_format.h"        // F_LJUST
48  #include "pycore_initconfig.h"    // _PyStatus_OK()
49  #include "pycore_interp.h"        // PyInterpreterState.fs_codec
50  #include "pycore_long.h"          // _PyLong_FormatWriter()
51  #include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
52  #include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
53  #include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
54  #include "pycore_pystate.h"       // _PyInterpreterState_GET()
55  #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
56  #include "pycore_unicodeobject.h" // struct _Py_unicode_state
57  #include "stringlib/eq.h"         // unicode_eq()
58  
59  #ifdef MS_WINDOWS
60  #include <windows.h>
61  #endif
62  
63  #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
64  #  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
65  #endif
66  
67  /* Uncomment to display statistics on interned strings at exit
68     in _PyUnicode_ClearInterned(). */
69  /* #define INTERNED_STATS 1 */
70  
71  
72  /*[clinic input]
73  class str "PyObject *" "&PyUnicode_Type"
74  [clinic start generated code]*/
75  /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
76  
77  /*[python input]
78  class Py_UCS4_converter(CConverter):
79      type = 'Py_UCS4'
80      converter = 'convert_uc'
81  
82      def converter_init(self):
83          if self.default is not unspecified:
84              self.c_default = ascii(self.default)
85              if len(self.c_default) > 4 or self.c_default[0] != "'":
86                  self.c_default = hex(ord(self.default))
87  
88  [python start generated code]*/
89  /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
90  
91  /* --- Globals ------------------------------------------------------------
92  
93  NOTE: In the interpreter's initialization phase, some globals are currently
94        initialized dynamically as needed. In the process Unicode objects may
95        be created before the Unicode type is ready.
96  
97  */
98  
99  
100  #ifdef __cplusplus
101  extern "C" {
102  #endif
103  
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
// The value must be the same in fileutils.c.
#define MAX_UNICODE 0x10ffff

// Sanity check used by the accessor macros below: debug builds run
// _PyUnicode_CheckConsistency() with check_content == 0 (no O(n) content
// scan), other builds only test the object's type with PyUnicode_Check().
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
113  
/* Field accessors.  The underscore-prefixed variants expand to the raw
   structure member (usable as an lvalue) and perform no checks unless noted;
   the variants without underscore assert consistency first.  Most of these
   macros evaluate op more than once. */

/* Raw utf8 pointer stored in the compact object header. */
#define _PyUnicode_UTF8(op)                             \
    (_PyCompactUnicodeObject_CAST(op)->utf8)
/* Checked UTF-8 pointer: for a compact ASCII string this is the address
   immediately following the PyASCIIObject header, otherwise the stored
   utf8 member.  The string must be ready (asserted). */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)(_PyASCIIObject_CAST(op) + 1)) :       \
         _PyUnicode_UTF8(op))
/* Raw utf8_length member of the compact object header. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (_PyCompactUnicodeObject_CAST(op)->utf8_length)
/* Checked UTF-8 length: for a compact ASCII string this is the string
   length itself, otherwise the stored utf8_length member.  The string must
   be ready (asserted). */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         _PyASCIIObject_CAST(op)->length :              \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw wstr pointer member. */
#define _PyUnicode_WSTR(op)                             \
    (_PyASCIIObject_CAST(op)->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* Local replacement: the string length for compact ASCII strings, the
   stored wstr_length member otherwise. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     _PyASCIIObject_CAST(op)->length :                  \
     _PyCompactUnicodeObject_CAST(op)->wstr_length)
/* Raw wstr_length member of the compact object header. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (_PyCompactUnicodeObject_CAST(op)->wstr_length)
/* Raw length, state and hash members of the ASCII object header. */
#define _PyUnicode_LENGTH(op)                           \
    (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op)                            \
    (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op)                             \
    (_PyASCIIObject_CAST(op)->hash)
/* state.kind and length members, read after asserting _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     _PyASCIIObject_CAST(op)->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     _PyASCIIObject_CAST(op)->length)
/* Raw data.any pointer of the full (non-compact layout) PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (_PyUnicodeObject_CAST(op)->data.any)
155  
/* Override any earlier definition of PyUnicode_READY() for this file: assert
   consistency, evaluate to 0 when the string is already ready, and otherwise
   to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
162  
/* true if the utf8 pointer is the very same pointer as the string's data
   (PyUnicode_DATA), i.e. no separate UTF-8 buffer is involved.  Must not be
   used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer is the very same pointer as the string's data
   (PyUnicode_DATA), i.e. no separate wchar_t buffer is involved.
   The macro only refers to its own argument "op"; it does not depend on the
   name of any variable at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
170  
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, the utf8
   pointer is non-NULL, and it differs from the string's data pointer.
   Evaluates op several times. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either the
   string is not ready yet or wstr differs from the string's data pointer.
   Evaluates op several times. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
184  
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every element is converted with a plain cast to to_type; no range check
   is performed.  The first _Py_SIZE_ROUND_DOWN(n, 4) elements are copied
   four per iteration, the remaining ones one at a time.  begin, end and to
   are each evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        /* tail: fewer than four elements left */       \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
208  
/* Evaluates to the statically allocated one-character string for ch: code
   points below 128 come from the "ascii" singleton array, the others from
   the "latin1" array (indexed by ch - 128).  The caller must make sure that
   ch is in range(256).  ch is parenthesized so that expression arguments are
   handled correctly, but it is still evaluated more than once. */
#define LATIN1(ch)  \
    ((ch) < 128 \
     ? (PyObject*)&_Py_SINGLETON(strings).ascii[(ch)] \
     : (PyObject*)&_Py_SINGLETON(strings).latin1[(ch) - 128])
213  
/* Overallocation factor.  Judging by the percentages quoted below, a factor
   F stands for growing by 1/F of the size (2 -> 50%, 4 -> 25%).
   NOTE(review): confirm against the code that uses this macro. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% gives the best results */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% gives the best results */
#  define OVERALLOCATE_FACTOR 4
#endif
221  
222  /* This dictionary holds all interned unicode strings.  Note that references
223     to strings in this dictionary are *not* counted in the string's ob_refcnt.
224     When the interned string reaches a refcnt of 0 the string deallocation
225     function will delete the reference from this dictionary.
226  
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
229  */
230  static PyObject *interned = NULL;
231  
232  /* Forward declaration */
233  static inline int
234  _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
235  static inline void
236  _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
237  static PyObject *
238  unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
239                      const char *errors);
240  static PyObject *
241  unicode_decode_utf8(const char *s, Py_ssize_t size,
242                      _Py_error_handler error_handler, const char *errors,
243                      Py_ssize_t *consumed);
244  #ifdef Py_DEBUG
245  static inline int unicode_is_finalizing(void);
246  static int unicode_is_singleton(PyObject *unicode);
247  #endif
248  
249  
250  // Return a borrowed reference to the empty string singleton.
unicode_get_empty(void)251  static inline PyObject* unicode_get_empty(void)
252  {
253      _Py_DECLARE_STR(empty, "");
254      return &_Py_STR(empty);
255  }
256  
257  
258  // Return a strong reference to the empty string singleton.
unicode_new_empty(void)259  static inline PyObject* unicode_new_empty(void)
260  {
261      PyObject *empty = unicode_get_empty();
262      Py_INCREF(empty);
263      return empty;
264  }
265  
/* Return a new strong reference to the empty string singleton from the
   enclosing function.  Note that this macro contains a "return" statement. */
#define _Py_RETURN_UNICODE_EMPTY()   \
    do {                             \
        return unicode_new_empty();  \
    } while (0)
270  
271  static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)272  unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
273               Py_ssize_t start, Py_ssize_t length)
274  {
275      assert(0 <= start);
276      assert(kind != PyUnicode_WCHAR_KIND);
277      switch (kind) {
278      case PyUnicode_1BYTE_KIND: {
279          assert(value <= 0xff);
280          Py_UCS1 ch = (unsigned char)value;
281          Py_UCS1 *to = (Py_UCS1 *)data + start;
282          memset(to, ch, length);
283          break;
284      }
285      case PyUnicode_2BYTE_KIND: {
286          assert(value <= 0xffff);
287          Py_UCS2 ch = (Py_UCS2)value;
288          Py_UCS2 *to = (Py_UCS2 *)data + start;
289          const Py_UCS2 *end = to + length;
290          for (; to < end; ++to) *to = ch;
291          break;
292      }
293      case PyUnicode_4BYTE_KIND: {
294          assert(value <= MAX_UNICODE);
295          Py_UCS4 ch = value;
296          Py_UCS4 * to = (Py_UCS4 *)data + start;
297          const Py_UCS4 *end = to + length;
298          for (; to < end; ++to) *to = ch;
299          break;
300      }
301      default: Py_UNREACHABLE();
302      }
303  }
304  
305  
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per ASCII code (128 entries); an entry is 1
   when the corresponding character is one of the whitespace characters
   named in the comments below and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
336  
337  /* forward */
338  static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
339  static PyObject* get_latin1_char(unsigned char ch);
340  static int unicode_modifiable(PyObject *unicode);
341  
342  
343  static PyObject *
344  _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
345  static PyObject *
346  _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
347  static PyObject *
348  _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
349  
350  static PyObject *
351  unicode_encode_call_errorhandler(const char *errors,
352         PyObject **errorHandler,const char *encoding, const char *reason,
353         PyObject *unicode, PyObject **exceptionObject,
354         Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
355  
356  static void
357  raise_encode_exception(PyObject **exceptionObject,
358                         const char *encoding,
359                         PyObject *unicode,
360                         Py_ssize_t startpos, Py_ssize_t endpos,
361                         const char *reason);
362  
/* Fast detection of ASCII line break characters.

   Lookup table with one entry per ASCII code (128 entries); an entry is 1
   when the corresponding character is one of the line breaks named in the
   comments below and 0 otherwise.  BLOOM_LINEBREAK() indexes this table for
   characters below 128. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
390  
391  static int convert_uc(PyObject *obj, void *addr);
392  
393  struct encoding_map;
394  #include "clinic/unicodeobject.c.h"
395  
396  _Py_error_handler
_Py_GetErrorHandler(const char * errors)397  _Py_GetErrorHandler(const char *errors)
398  {
399      if (errors == NULL || strcmp(errors, "strict") == 0) {
400          return _Py_ERROR_STRICT;
401      }
402      if (strcmp(errors, "surrogateescape") == 0) {
403          return _Py_ERROR_SURROGATEESCAPE;
404      }
405      if (strcmp(errors, "replace") == 0) {
406          return _Py_ERROR_REPLACE;
407      }
408      if (strcmp(errors, "ignore") == 0) {
409          return _Py_ERROR_IGNORE;
410      }
411      if (strcmp(errors, "backslashreplace") == 0) {
412          return _Py_ERROR_BACKSLASHREPLACE;
413      }
414      if (strcmp(errors, "surrogatepass") == 0) {
415          return _Py_ERROR_SURROGATEPASS;
416      }
417      if (strcmp(errors, "xmlcharrefreplace") == 0) {
418          return _Py_ERROR_XMLCHARREFREPLACE;
419      }
420      return _Py_ERROR_OTHER;
421  }
422  
423  
424  static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)425  get_error_handler_wide(const wchar_t *errors)
426  {
427      if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428          return _Py_ERROR_STRICT;
429      }
430      if (wcscmp(errors, L"surrogateescape") == 0) {
431          return _Py_ERROR_SURROGATEESCAPE;
432      }
433      if (wcscmp(errors, L"replace") == 0) {
434          return _Py_ERROR_REPLACE;
435      }
436      if (wcscmp(errors, L"ignore") == 0) {
437          return _Py_ERROR_IGNORE;
438      }
439      if (wcscmp(errors, L"backslashreplace") == 0) {
440          return _Py_ERROR_BACKSLASHREPLACE;
441      }
442      if (wcscmp(errors, L"surrogatepass") == 0) {
443          return _Py_ERROR_SURROGATEPASS;
444      }
445      if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446          return _Py_ERROR_XMLCHARREFREPLACE;
447      }
448      return _Py_ERROR_OTHER;
449  }
450  
451  
452  static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)453  unicode_check_encoding_errors(const char *encoding, const char *errors)
454  {
455      if (encoding == NULL && errors == NULL) {
456          return 0;
457      }
458  
459      PyInterpreterState *interp = _PyInterpreterState_GET();
460  #ifndef Py_DEBUG
461      /* In release mode, only check in development mode (-X dev) */
462      if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
463          return 0;
464      }
465  #else
466      /* Always check in debug mode */
467  #endif
468  
469      /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470         codec registry is ready: before_PyUnicode_InitEncodings() is called. */
471      if (!interp->unicode.fs_codec.encoding) {
472          return 0;
473      }
474  
475      /* Disable checks during Python finalization. For example, it allows to
476         call _PyObject_Dump() during finalization for debugging purpose. */
477      if (interp->finalizing) {
478          return 0;
479      }
480  
481      if (encoding != NULL) {
482          PyObject *handler = _PyCodec_Lookup(encoding);
483          if (handler == NULL) {
484              return -1;
485          }
486          Py_DECREF(handler);
487      }
488  
489      if (errors != NULL) {
490          PyObject *handler = PyCodec_LookupError(errors);
491          if (handler == NULL) {
492              return -1;
493          }
494          Py_DECREF(handler);
495      }
496      return 0;
497  }
498  
499  
/* Check the internal invariants of the Unicode object "op".

   Every invariant is tested with the local CHECK() macro, which reports a
   failed condition through _PyObject_ASSERT_FAILED_MSG() together with the
   stringified expression.  When "check_content" is non-zero (and the string
   is not of wchar kind) the characters are scanned as well, which is an O(n)
   operation.

   Always returns 1 when it returns, so that the call can be wrapped in
   assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    unsigned int kind = ascii->state.kind;

    /* compact ASCII string: must be a ready 1-byte string */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII string: the character data directly
               follows the compact header */
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                                 || kind == PyUnicode_2BYTE_KIND
                                 || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            /* non-compact string: the data lives behind data.any */
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string */
                CHECK(kind == PyUnicode_1BYTE_KIND
                                     || kind == PyUnicode_2BYTE_KIND
                                     || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII: the UTF-8 representation is the data itself */
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        /* wstr must alias the data exactly when the kind has the same width
           as wchar_t (2-byte kind if SIZEOF_WCHAR_T == 2, 4-byte kind
           otherwise) */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* a missing cached representation must have a zero length */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        const void *data;
        Py_UCS4 ch;

        /* find the largest character of the string */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* the largest character must fall into the range of the kind
           (and of the ascii flag for 1-byte strings) */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* the character following the last one must be a null character */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
616  
617  
618  static PyObject*
unicode_result_wchar(PyObject * unicode)619  unicode_result_wchar(PyObject *unicode)
620  {
621  #ifndef Py_DEBUG
622      Py_ssize_t len;
623  
624      len = _PyUnicode_WSTR_LENGTH(unicode);
625      if (len == 0) {
626          Py_DECREF(unicode);
627          _Py_RETURN_UNICODE_EMPTY();
628      }
629  
630      if (len == 1) {
631          wchar_t ch = _PyUnicode_WSTR(unicode)[0];
632          if ((Py_UCS4)ch < 256) {
633              Py_DECREF(unicode);
634              return get_latin1_char((unsigned char)ch);
635          }
636      }
637  
638      if (_PyUnicode_Ready(unicode) < 0) {
639          Py_DECREF(unicode);
640          return NULL;
641      }
642  #else
643      assert(Py_REFCNT(unicode) == 1);
644  
645      /* don't make the result ready in debug mode to ensure that the caller
646         makes the string ready before using it */
647      assert(_PyUnicode_CheckConsistency(unicode, 1));
648  #endif
649      return unicode;
650  }
651  
652  static PyObject*
unicode_result_ready(PyObject * unicode)653  unicode_result_ready(PyObject *unicode)
654  {
655      Py_ssize_t length;
656  
657      length = PyUnicode_GET_LENGTH(unicode);
658      if (length == 0) {
659          PyObject *empty = unicode_get_empty();
660          if (unicode != empty) {
661              Py_DECREF(unicode);
662              Py_INCREF(empty);
663          }
664          return empty;
665      }
666  
667      if (length == 1) {
668          int kind = PyUnicode_KIND(unicode);
669          if (kind == PyUnicode_1BYTE_KIND) {
670              const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
671              Py_UCS1 ch = data[0];
672              PyObject *latin1_char = LATIN1(ch);
673              if (unicode != latin1_char) {
674                  Py_INCREF(latin1_char);
675                  Py_DECREF(unicode);
676              }
677              return latin1_char;
678          }
679      }
680  
681      assert(_PyUnicode_CheckConsistency(unicode, 1));
682      return unicode;
683  }
684  
685  static PyObject*
unicode_result(PyObject * unicode)686  unicode_result(PyObject *unicode)
687  {
688      assert(_PyUnicode_CHECK(unicode));
689      if (PyUnicode_IS_READY(unicode))
690          return unicode_result_ready(unicode);
691      else
692          return unicode_result_wchar(unicode);
693  }
694  
695  static PyObject*
unicode_result_unchanged(PyObject * unicode)696  unicode_result_unchanged(PyObject *unicode)
697  {
698      if (PyUnicode_CheckExact(unicode)) {
699          if (PyUnicode_READY(unicode) == -1)
700              return NULL;
701          Py_INCREF(unicode);
702          return unicode;
703      }
704      else
705          /* Subtype -- return genuine unicode string with the same value. */
706          return _PyUnicode_Copy(unicode);
707  }
708  
709  /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
710     ASCII, Latin1, UTF-8, etc. */
711  static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)712  backslashreplace(_PyBytesWriter *writer, char *str,
713                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
714  {
715      Py_ssize_t size, i;
716      Py_UCS4 ch;
717      enum PyUnicode_Kind kind;
718      const void *data;
719  
720      assert(PyUnicode_IS_READY(unicode));
721      kind = PyUnicode_KIND(unicode);
722      data = PyUnicode_DATA(unicode);
723  
724      size = 0;
725      /* determine replacement size */
726      for (i = collstart; i < collend; ++i) {
727          Py_ssize_t incr;
728  
729          ch = PyUnicode_READ(kind, data, i);
730          if (ch < 0x100)
731              incr = 2+2;
732          else if (ch < 0x10000)
733              incr = 2+4;
734          else {
735              assert(ch <= MAX_UNICODE);
736              incr = 2+8;
737          }
738          if (size > PY_SSIZE_T_MAX - incr) {
739              PyErr_SetString(PyExc_OverflowError,
740                              "encoded result is too long for a Python string");
741              return NULL;
742          }
743          size += incr;
744      }
745  
746      str = _PyBytesWriter_Prepare(writer, str, size);
747      if (str == NULL)
748          return NULL;
749  
750      /* generate replacement */
751      for (i = collstart; i < collend; ++i) {
752          ch = PyUnicode_READ(kind, data, i);
753          *str++ = '\\';
754          if (ch >= 0x00010000) {
755              *str++ = 'U';
756              *str++ = Py_hexdigits[(ch>>28)&0xf];
757              *str++ = Py_hexdigits[(ch>>24)&0xf];
758              *str++ = Py_hexdigits[(ch>>20)&0xf];
759              *str++ = Py_hexdigits[(ch>>16)&0xf];
760              *str++ = Py_hexdigits[(ch>>12)&0xf];
761              *str++ = Py_hexdigits[(ch>>8)&0xf];
762          }
763          else if (ch >= 0x100) {
764              *str++ = 'u';
765              *str++ = Py_hexdigits[(ch>>12)&0xf];
766              *str++ = Py_hexdigits[(ch>>8)&0xf];
767          }
768          else
769              *str++ = 'x';
770          *str++ = Py_hexdigits[(ch>>4)&0xf];
771          *str++ = Py_hexdigits[ch&0xf];
772      }
773      return str;
774  }
775  
776  /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
777     ASCII, Latin1, UTF-8, etc. */
778  static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)779  xmlcharrefreplace(_PyBytesWriter *writer, char *str,
780                    PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
781  {
782      Py_ssize_t size, i;
783      Py_UCS4 ch;
784      enum PyUnicode_Kind kind;
785      const void *data;
786  
787      assert(PyUnicode_IS_READY(unicode));
788      kind = PyUnicode_KIND(unicode);
789      data = PyUnicode_DATA(unicode);
790  
791      size = 0;
792      /* determine replacement size */
793      for (i = collstart; i < collend; ++i) {
794          Py_ssize_t incr;
795  
796          ch = PyUnicode_READ(kind, data, i);
797          if (ch < 10)
798              incr = 2+1+1;
799          else if (ch < 100)
800              incr = 2+2+1;
801          else if (ch < 1000)
802              incr = 2+3+1;
803          else if (ch < 10000)
804              incr = 2+4+1;
805          else if (ch < 100000)
806              incr = 2+5+1;
807          else if (ch < 1000000)
808              incr = 2+6+1;
809          else {
810              assert(ch <= MAX_UNICODE);
811              incr = 2+7+1;
812          }
813          if (size > PY_SSIZE_T_MAX - incr) {
814              PyErr_SetString(PyExc_OverflowError,
815                              "encoded result is too long for a Python string");
816              return NULL;
817          }
818          size += incr;
819      }
820  
821      str = _PyBytesWriter_Prepare(writer, str, size);
822      if (str == NULL)
823          return NULL;
824  
825      /* generate replacement */
826      for (i = collstart; i < collend; ++i) {
827          size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
828          if (size < 0) {
829              return NULL;
830          }
831          str += size;
832      }
833      return str;
834  }
835  
836  /* --- Bloom Filters ----------------------------------------------------- */
837  
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the low bits of each
   Unicode character (ch & (BLOOM_WIDTH - 1)) select the bit index. */
841  
842  /* the linebreak mask is set up by _PyUnicode_Init() below */
843  
/* Width of the bloom mask in bits: 128, 64 or 32, whichever is the largest
   value not exceeding LONG_BIT.  A LONG_BIT below 32 is a compile error. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type used to hold a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for line break characters.  It starts with every bit set, so
   until it is replaced BLOOM(bloom_linebreak, ch) is non-zero for any ch. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by ch & (BLOOM_WIDTH - 1) is set in mask.
   A zero result means ch is definitely not in the set the mask was built
   from; a non-zero result still has to be confirmed by an exact test. */
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up in the
   ascii_linebreak table; others are filtered through the bloom mask before
   calling Py_UNICODE_ISLINEBREAK().  Note that ch is evaluated more than
   once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
863  
864  static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)865  make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
866  {
867  #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
868      do {                                               \
869          TYPE *data = (TYPE *)PTR;                      \
870          TYPE *end = data + LEN;                        \
871          Py_UCS4 ch;                                    \
872          for (; data != end; data++) {                  \
873              ch = *data;                                \
874              MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
875          }                                              \
876          break;                                         \
877      } while (0)
878  
879      /* calculate simple bloom-style bitmask for a given unicode string */
880  
881      BLOOM_MASK mask;
882  
883      mask = 0;
884      switch (kind) {
885      case PyUnicode_1BYTE_KIND:
886          BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
887          break;
888      case PyUnicode_2BYTE_KIND:
889          BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
890          break;
891      case PyUnicode_4BYTE_KIND:
892          BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
893          break;
894      default:
895          Py_UNREACHABLE();
896      }
897      return mask;
898  
899  #undef BLOOM_UPDATE
900  }
901  
902  static int
ensure_unicode(PyObject * obj)903  ensure_unicode(PyObject *obj)
904  {
905      if (!PyUnicode_Check(obj)) {
906          PyErr_Format(PyExc_TypeError,
907                       "must be str, not %.100s",
908                       Py_TYPE(obj)->tp_name);
909          return -1;
910      }
911      return PyUnicode_READY(obj);
912  }
913  
914  /* Compilation of templated routines */
915  
916  #define STRINGLIB_GET_EMPTY() unicode_get_empty()
917  
918  #include "stringlib/asciilib.h"
919  #include "stringlib/fastsearch.h"
920  #include "stringlib/partition.h"
921  #include "stringlib/split.h"
922  #include "stringlib/count.h"
923  #include "stringlib/find.h"
924  #include "stringlib/find_max_char.h"
925  #include "stringlib/undef.h"
926  
927  #include "stringlib/ucs1lib.h"
928  #include "stringlib/fastsearch.h"
929  #include "stringlib/partition.h"
930  #include "stringlib/split.h"
931  #include "stringlib/count.h"
932  #include "stringlib/find.h"
933  #include "stringlib/replace.h"
934  #include "stringlib/find_max_char.h"
935  #include "stringlib/undef.h"
936  
937  #include "stringlib/ucs2lib.h"
938  #include "stringlib/fastsearch.h"
939  #include "stringlib/partition.h"
940  #include "stringlib/split.h"
941  #include "stringlib/count.h"
942  #include "stringlib/find.h"
943  #include "stringlib/replace.h"
944  #include "stringlib/find_max_char.h"
945  #include "stringlib/undef.h"
946  
947  #include "stringlib/ucs4lib.h"
948  #include "stringlib/fastsearch.h"
949  #include "stringlib/partition.h"
950  #include "stringlib/split.h"
951  #include "stringlib/count.h"
952  #include "stringlib/find.h"
953  #include "stringlib/replace.h"
954  #include "stringlib/find_max_char.h"
955  #include "stringlib/undef.h"
956  
957  _Py_COMP_DIAG_PUSH
958  _Py_COMP_DIAG_IGNORE_DEPR_DECLS
959  #include "stringlib/unicodedefs.h"
960  #include "stringlib/fastsearch.h"
961  #include "stringlib/count.h"
962  #include "stringlib/find.h"
963  #include "stringlib/undef.h"
964  _Py_COMP_DIAG_POP
965  
966  #undef STRINGLIB_GET_EMPTY
967  
968  /* --- Unicode Object ----------------------------------------------------- */
969  
970  static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)971  findchar(const void *s, int kind,
972           Py_ssize_t size, Py_UCS4 ch,
973           int direction)
974  {
975      switch (kind) {
976      case PyUnicode_1BYTE_KIND:
977          if ((Py_UCS1) ch != ch)
978              return -1;
979          if (direction > 0)
980              return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
981          else
982              return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
983      case PyUnicode_2BYTE_KIND:
984          if ((Py_UCS2) ch != ch)
985              return -1;
986          if (direction > 0)
987              return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
988          else
989              return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
990      case PyUnicode_4BYTE_KIND:
991          if (direction > 0)
992              return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
993          else
994              return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
995      default:
996          Py_UNREACHABLE();
997      }
998  }
999  
1000  #ifdef Py_DEBUG
1001  /* Fill the data of a Unicode string with invalid characters to detect bugs
1002     earlier.
1003  
1004     _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1005     ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1006     invalid character in Unicode 6.0. */
1007  static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)1008  unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1009  {
1010      int kind = PyUnicode_KIND(unicode);
1011      Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1012      Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1013      if (length <= old_length)
1014          return;
1015      memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1016  }
1017  #endif
1018  
/* Resize a compact Unicode object to hold length characters.

   The object itself is reallocated, so its address may change: callers must
   use the returned pointer instead of the one they passed in.  The asserts
   require the object to be modifiable, ready and compact.

   Returns the (possibly moved) object, or NULL with MemoryError set when
   the requested size would overflow or the reallocation fails. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value doubles as the size in bytes of one character */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* remember whether wstr aliases the character data before reallocating */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* guard struct_size + (length + 1) * char_size against overflow */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* header + characters + one terminating null character */
    new_size = (struct_size + (length + 1) * char_size);

    /* a separately allocated UTF-8 buffer is dropped rather than resized */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* The object may move: undo the reference bookkeeping for the old
       address here; _Py_NewReference() below redoes it for whichever
       address ends up being used. */
#ifdef Py_REF_DEBUG
    _Py_RefTotal--;
#endif
#ifdef Py_TRACE_REFS
    _Py_ForgetReference(unicode);
#endif

    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
    if (new_unicode == NULL) {
        /* the old block is still in use: register it again before failing */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* re-point the shared wstr at the data of the reallocated object */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* a separately allocated wstr buffer is dropped rather than resized */
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    /* mark newly added characters as invalid */
    unicode_fill_invalid(unicode, old_length);
#endif
    /* write the terminating null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1089  
/* Resize a non-compact Unicode object to length characters without moving
   the object itself; only its separately allocated buffers are
   reallocated.  The asserts require a non-compact object with a reference
   count of exactly 1.

   If the object is ready, its character data buffer is resized; if it then
   still owns a distinct wstr buffer, or if it was not ready in the first
   place, the wstr buffer is resized as well.

   Returns 0 on success, -1 with MemoryError set on overflow or allocation
   failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* the kind value doubles as the size in bytes of one character */
        char_size = PyUnicode_KIND(unicode);
        /* remember which cached representations alias the data buffer,
           so they can be re-pointed after the reallocation */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* guard (length + 1) * char_size against overflow */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        /* characters + one terminating null character */
        new_size = (length + 1) * char_size;

        /* a separately allocated UTF-8 buffer is dropped, not resized */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_Free(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        data = (PyObject *)PyObject_Realloc(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        /* mark newly added characters as invalid */
        unicode_fill_invalid(unicode, old_length);
#endif
        /* done unless a distinct wstr buffer still has to be resized */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    /* wide characters + one terminating null wide character */
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_Realloc(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1168  
1169  static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1170  resize_copy(PyObject *unicode, Py_ssize_t length)
1171  {
1172      Py_ssize_t copy_length;
1173      if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1174          PyObject *copy;
1175  
1176          assert(PyUnicode_IS_READY(unicode));
1177  
1178          copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1179          if (copy == NULL)
1180              return NULL;
1181  
1182          copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1183          _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1184          return copy;
1185      }
1186      else {
1187          PyObject *w;
1188  
1189          w = (PyObject*)_PyUnicode_New(length);
1190          if (w == NULL)
1191              return NULL;
1192          copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1193          copy_length = Py_MIN(copy_length, length);
1194          memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1195                    copy_length * sizeof(wchar_t));
1196          return w;
1197      }
1198  }
1199  
/* We allocate room for one extra character to make sure the string is
   always U+0000 terminated; some code (e.g. new_identifier) relies on
   that.

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below one entry.

*/
1208  
1209  static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1210  _PyUnicode_New(Py_ssize_t length)
1211  {
1212      PyUnicodeObject *unicode;
1213      size_t new_size;
1214  
1215      /* Optimization for empty strings */
1216      if (length == 0) {
1217          return (PyUnicodeObject *)unicode_new_empty();
1218      }
1219  
1220      /* Ensure we won't overflow the size. */
1221      if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1222          return (PyUnicodeObject *)PyErr_NoMemory();
1223      }
1224      if (length < 0) {
1225          PyErr_SetString(PyExc_SystemError,
1226                          "Negative size passed to _PyUnicode_New");
1227          return NULL;
1228      }
1229  
1230      unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1231      if (unicode == NULL)
1232          return NULL;
1233      new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1234  
1235      _PyUnicode_WSTR_LENGTH(unicode) = length;
1236      _PyUnicode_HASH(unicode) = -1;
1237      _PyUnicode_STATE(unicode).interned = 0;
1238      _PyUnicode_STATE(unicode).kind = 0;
1239      _PyUnicode_STATE(unicode).compact = 0;
1240      _PyUnicode_STATE(unicode).ready = 0;
1241      _PyUnicode_STATE(unicode).ascii = 0;
1242      _PyUnicode_DATA_ANY(unicode) = NULL;
1243      _PyUnicode_LENGTH(unicode) = 0;
1244      _PyUnicode_UTF8(unicode) = NULL;
1245      _PyUnicode_UTF8_LENGTH(unicode) = 0;
1246  
1247      _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1248      if (!_PyUnicode_WSTR(unicode)) {
1249          Py_DECREF(unicode);
1250          PyErr_NoMemory();
1251          return NULL;
1252      }
1253  
1254      /* Initialize the first element to guard against cases where
1255       * the caller fails before initializing str -- unicode_resize()
1256       * reads str[0], and the Keep-Alive optimization can keep memory
1257       * allocated for str alive across a call to unicode_dealloc(unicode).
1258       * We don't want unicode_resize to read uninitialized memory in
1259       * that case.
1260       */
1261      _PyUnicode_WSTR(unicode)[0] = 0;
1262      _PyUnicode_WSTR(unicode)[length] = 0;
1263  
1264      assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1265      return unicode;
1266  }
1267  
1268  static const char*
unicode_kind_name(PyObject * unicode)1269  unicode_kind_name(PyObject *unicode)
1270  {
1271      /* don't check consistency: unicode_kind_name() is called from
1272         _PyUnicode_Dump() */
1273      if (!PyUnicode_IS_COMPACT(unicode))
1274      {
1275          if (!PyUnicode_IS_READY(unicode))
1276              return "wstr";
1277          switch (PyUnicode_KIND(unicode))
1278          {
1279          case PyUnicode_1BYTE_KIND:
1280              if (PyUnicode_IS_ASCII(unicode))
1281                  return "legacy ascii";
1282              else
1283                  return "legacy latin1";
1284          case PyUnicode_2BYTE_KIND:
1285              return "legacy UCS2";
1286          case PyUnicode_4BYTE_KIND:
1287              return "legacy UCS4";
1288          default:
1289              return "<legacy invalid kind>";
1290          }
1291      }
1292      assert(PyUnicode_IS_READY(unicode));
1293      switch (PyUnicode_KIND(unicode)) {
1294      case PyUnicode_1BYTE_KIND:
1295          if (PyUnicode_IS_ASCII(unicode))
1296              return "ascii";
1297          else
1298              return "latin1";
1299      case PyUnicode_2BYTE_KIND:
1300          return "UCS2";
1301      case PyUnicode_4BYTE_KIND:
1302          return "UCS4";
1303      default:
1304          return "<invalid compact kind>";
1305      }
1306  }
1307  
1308  #ifdef Py_DEBUG
1309  /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1310  const char *_PyUnicode_utf8(void *unicode_raw){
1311      PyObject *unicode = _PyObject_CAST(unicode_raw);
1312      return PyUnicode_UTF8(unicode);
1313  }
1314  
_PyUnicode_compact_data(void * unicode_raw)1315  const void *_PyUnicode_compact_data(void *unicode_raw) {
1316      PyObject *unicode = _PyObject_CAST(unicode_raw);
1317      return _PyUnicode_COMPACT_DATA(unicode);
1318  }
_PyUnicode_data(void * unicode_raw)1319  const void *_PyUnicode_data(void *unicode_raw) {
1320      PyObject *unicode = _PyObject_CAST(unicode_raw);
1321      printf("obj %p\n", (void*)unicode);
1322      printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1323      printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1324      printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1325      printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1326      printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1327      return PyUnicode_DATA(unicode);
1328  }
1329  
1330  void
_PyUnicode_Dump(PyObject * op)1331  _PyUnicode_Dump(PyObject *op)
1332  {
1333      PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1334      PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1335      PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1336      const void *data;
1337  
1338      if (ascii->state.compact)
1339      {
1340          if (ascii->state.ascii)
1341              data = (ascii + 1);
1342          else
1343              data = (compact + 1);
1344      }
1345      else
1346          data = unicode->data.any;
1347      printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1348  
1349      if (ascii->wstr == data)
1350          printf("shared ");
1351      printf("wstr=%p", (void *)ascii->wstr);
1352  
1353      if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1354          printf(" (%zu), ", compact->wstr_length);
1355          if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1356              printf("shared ");
1357          }
1358          printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1359      }
1360      printf(", data=%p\n", data);
1361  }
1362  #endif
1363  
1364  
1365  PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1366  PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1367  {
1368      /* Optimization for empty strings */
1369      if (size == 0) {
1370          return unicode_new_empty();
1371      }
1372  
1373      PyObject *obj;
1374      PyCompactUnicodeObject *unicode;
1375      void *data;
1376      enum PyUnicode_Kind kind;
1377      int is_sharing, is_ascii;
1378      Py_ssize_t char_size;
1379      Py_ssize_t struct_size;
1380  
1381      is_ascii = 0;
1382      is_sharing = 0;
1383      struct_size = sizeof(PyCompactUnicodeObject);
1384      if (maxchar < 128) {
1385          kind = PyUnicode_1BYTE_KIND;
1386          char_size = 1;
1387          is_ascii = 1;
1388          struct_size = sizeof(PyASCIIObject);
1389      }
1390      else if (maxchar < 256) {
1391          kind = PyUnicode_1BYTE_KIND;
1392          char_size = 1;
1393      }
1394      else if (maxchar < 65536) {
1395          kind = PyUnicode_2BYTE_KIND;
1396          char_size = 2;
1397          if (sizeof(wchar_t) == 2)
1398              is_sharing = 1;
1399      }
1400      else {
1401          if (maxchar > MAX_UNICODE) {
1402              PyErr_SetString(PyExc_SystemError,
1403                              "invalid maximum character passed to PyUnicode_New");
1404              return NULL;
1405          }
1406          kind = PyUnicode_4BYTE_KIND;
1407          char_size = 4;
1408          if (sizeof(wchar_t) == 4)
1409              is_sharing = 1;
1410      }
1411  
1412      /* Ensure we won't overflow the size. */
1413      if (size < 0) {
1414          PyErr_SetString(PyExc_SystemError,
1415                          "Negative size passed to PyUnicode_New");
1416          return NULL;
1417      }
1418      if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1419          return PyErr_NoMemory();
1420  
1421      /* Duplicated allocation code from _PyObject_New() instead of a call to
1422       * PyObject_New() so we are able to allocate space for the object and
1423       * it's data buffer.
1424       */
1425      obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1426      if (obj == NULL) {
1427          return PyErr_NoMemory();
1428      }
1429      _PyObject_Init(obj, &PyUnicode_Type);
1430  
1431      unicode = (PyCompactUnicodeObject *)obj;
1432      if (is_ascii)
1433          data = ((PyASCIIObject*)obj) + 1;
1434      else
1435          data = unicode + 1;
1436      _PyUnicode_LENGTH(unicode) = size;
1437      _PyUnicode_HASH(unicode) = -1;
1438      _PyUnicode_STATE(unicode).interned = 0;
1439      _PyUnicode_STATE(unicode).kind = kind;
1440      _PyUnicode_STATE(unicode).compact = 1;
1441      _PyUnicode_STATE(unicode).ready = 1;
1442      _PyUnicode_STATE(unicode).ascii = is_ascii;
1443      if (is_ascii) {
1444          ((char*)data)[size] = 0;
1445          _PyUnicode_WSTR(unicode) = NULL;
1446      }
1447      else if (kind == PyUnicode_1BYTE_KIND) {
1448          ((char*)data)[size] = 0;
1449          _PyUnicode_WSTR(unicode) = NULL;
1450          _PyUnicode_WSTR_LENGTH(unicode) = 0;
1451          unicode->utf8 = NULL;
1452          unicode->utf8_length = 0;
1453      }
1454      else {
1455          unicode->utf8 = NULL;
1456          unicode->utf8_length = 0;
1457          if (kind == PyUnicode_2BYTE_KIND)
1458              ((Py_UCS2*)data)[size] = 0;
1459          else /* kind == PyUnicode_4BYTE_KIND */
1460              ((Py_UCS4*)data)[size] = 0;
1461          if (is_sharing) {
1462              _PyUnicode_WSTR_LENGTH(unicode) = size;
1463              _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1464          }
1465          else {
1466              _PyUnicode_WSTR_LENGTH(unicode) = 0;
1467              _PyUnicode_WSTR(unicode) = NULL;
1468          }
1469      }
1470  #ifdef Py_DEBUG
1471      unicode_fill_invalid((PyObject*)unicode, 0);
1472  #endif
1473      assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1474      return obj;
1475  }
1476  
1477  #if SIZEOF_WCHAR_T == 2
1478  /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1479     will decode surrogate pairs, the other conversions are implemented as macros
1480     for efficiency.
1481  
1482     This function assumes that unicode can hold one more code point than wstr
1483     characters for a terminating null character. */
1484  static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1485  unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1486                                PyObject *unicode)
1487  {
1488      const wchar_t *iter;
1489      Py_UCS4 *ucs4_out;
1490  
1491      assert(unicode != NULL);
1492      assert(_PyUnicode_CHECK(unicode));
1493      assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1494      ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1495  
1496      for (iter = begin; iter < end; ) {
1497          assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1498                             _PyUnicode_GET_LENGTH(unicode)));
1499          if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1500              && (iter+1) < end
1501              && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1502          {
1503              *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1504              iter += 2;
1505          }
1506          else {
1507              *ucs4_out++ = *iter;
1508              iter++;
1509          }
1510      }
1511      assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1512                          _PyUnicode_GET_LENGTH(unicode)));
1513  
1514  }
1515  #endif
1516  
1517  static int
unicode_check_modifiable(PyObject * unicode)1518  unicode_check_modifiable(PyObject *unicode)
1519  {
1520      if (!unicode_modifiable(unicode)) {
1521          PyErr_SetString(PyExc_SystemError,
1522                          "Cannot modify a string currently used");
1523          return -1;
1524      }
1525      return 0;
1526  }
1527  
1528  static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1529  _copy_characters(PyObject *to, Py_ssize_t to_start,
1530                   PyObject *from, Py_ssize_t from_start,
1531                   Py_ssize_t how_many, int check_maxchar)
1532  {
1533      unsigned int from_kind, to_kind;
1534      const void *from_data;
1535      void *to_data;
1536  
1537      assert(0 <= how_many);
1538      assert(0 <= from_start);
1539      assert(0 <= to_start);
1540      assert(PyUnicode_Check(from));
1541      assert(PyUnicode_IS_READY(from));
1542      assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1543  
1544      assert(PyUnicode_Check(to));
1545      assert(PyUnicode_IS_READY(to));
1546      assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1547  
1548      if (how_many == 0)
1549          return 0;
1550  
1551      from_kind = PyUnicode_KIND(from);
1552      from_data = PyUnicode_DATA(from);
1553      to_kind = PyUnicode_KIND(to);
1554      to_data = PyUnicode_DATA(to);
1555  
1556  #ifdef Py_DEBUG
1557      if (!check_maxchar
1558          && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1559      {
1560          Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1561          Py_UCS4 ch;
1562          Py_ssize_t i;
1563          for (i=0; i < how_many; i++) {
1564              ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1565              assert(ch <= to_maxchar);
1566          }
1567      }
1568  #endif
1569  
1570      if (from_kind == to_kind) {
1571          if (check_maxchar
1572              && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1573          {
1574              /* Writing Latin-1 characters into an ASCII string requires to
1575                 check that all written characters are pure ASCII */
1576              Py_UCS4 max_char;
1577              max_char = ucs1lib_find_max_char(from_data,
1578                                               (const Py_UCS1*)from_data + how_many);
1579              if (max_char >= 128)
1580                  return -1;
1581          }
1582          memcpy((char*)to_data + to_kind * to_start,
1583                    (const char*)from_data + from_kind * from_start,
1584                    to_kind * how_many);
1585      }
1586      else if (from_kind == PyUnicode_1BYTE_KIND
1587               && to_kind == PyUnicode_2BYTE_KIND)
1588      {
1589          _PyUnicode_CONVERT_BYTES(
1590              Py_UCS1, Py_UCS2,
1591              PyUnicode_1BYTE_DATA(from) + from_start,
1592              PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1593              PyUnicode_2BYTE_DATA(to) + to_start
1594              );
1595      }
1596      else if (from_kind == PyUnicode_1BYTE_KIND
1597               && to_kind == PyUnicode_4BYTE_KIND)
1598      {
1599          _PyUnicode_CONVERT_BYTES(
1600              Py_UCS1, Py_UCS4,
1601              PyUnicode_1BYTE_DATA(from) + from_start,
1602              PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1603              PyUnicode_4BYTE_DATA(to) + to_start
1604              );
1605      }
1606      else if (from_kind == PyUnicode_2BYTE_KIND
1607               && to_kind == PyUnicode_4BYTE_KIND)
1608      {
1609          _PyUnicode_CONVERT_BYTES(
1610              Py_UCS2, Py_UCS4,
1611              PyUnicode_2BYTE_DATA(from) + from_start,
1612              PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1613              PyUnicode_4BYTE_DATA(to) + to_start
1614              );
1615      }
1616      else {
1617          assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1618  
1619          if (!check_maxchar) {
1620              if (from_kind == PyUnicode_2BYTE_KIND
1621                  && to_kind == PyUnicode_1BYTE_KIND)
1622              {
1623                  _PyUnicode_CONVERT_BYTES(
1624                      Py_UCS2, Py_UCS1,
1625                      PyUnicode_2BYTE_DATA(from) + from_start,
1626                      PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1627                      PyUnicode_1BYTE_DATA(to) + to_start
1628                      );
1629              }
1630              else if (from_kind == PyUnicode_4BYTE_KIND
1631                       && to_kind == PyUnicode_1BYTE_KIND)
1632              {
1633                  _PyUnicode_CONVERT_BYTES(
1634                      Py_UCS4, Py_UCS1,
1635                      PyUnicode_4BYTE_DATA(from) + from_start,
1636                      PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1637                      PyUnicode_1BYTE_DATA(to) + to_start
1638                      );
1639              }
1640              else if (from_kind == PyUnicode_4BYTE_KIND
1641                       && to_kind == PyUnicode_2BYTE_KIND)
1642              {
1643                  _PyUnicode_CONVERT_BYTES(
1644                      Py_UCS4, Py_UCS2,
1645                      PyUnicode_4BYTE_DATA(from) + from_start,
1646                      PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1647                      PyUnicode_2BYTE_DATA(to) + to_start
1648                      );
1649              }
1650              else {
1651                  Py_UNREACHABLE();
1652              }
1653          }
1654          else {
1655              const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1656              Py_UCS4 ch;
1657              Py_ssize_t i;
1658  
1659              for (i=0; i < how_many; i++) {
1660                  ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1661                  if (ch > to_maxchar)
1662                      return -1;
1663                  PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1664              }
1665          }
1666      }
1667      return 0;
1668  }
1669  
1670  void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1671  _PyUnicode_FastCopyCharacters(
1672      PyObject *to, Py_ssize_t to_start,
1673      PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1674  {
1675      (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1676  }
1677  
1678  Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1679  PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1680                           PyObject *from, Py_ssize_t from_start,
1681                           Py_ssize_t how_many)
1682  {
1683      int err;
1684  
1685      if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1686          PyErr_BadInternalCall();
1687          return -1;
1688      }
1689  
1690      if (PyUnicode_READY(from) == -1)
1691          return -1;
1692      if (PyUnicode_READY(to) == -1)
1693          return -1;
1694  
1695      if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1696          PyErr_SetString(PyExc_IndexError, "string index out of range");
1697          return -1;
1698      }
1699      if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1700          PyErr_SetString(PyExc_IndexError, "string index out of range");
1701          return -1;
1702      }
1703      if (how_many < 0) {
1704          PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1705          return -1;
1706      }
1707      how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1708      if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1709          PyErr_Format(PyExc_SystemError,
1710                       "Cannot write %zi characters at %zi "
1711                       "in a string of %zi characters",
1712                       how_many, to_start, PyUnicode_GET_LENGTH(to));
1713          return -1;
1714      }
1715  
1716      if (how_many == 0)
1717          return 0;
1718  
1719      if (unicode_check_modifiable(to))
1720          return -1;
1721  
1722      err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1723      if (err) {
1724          PyErr_Format(PyExc_SystemError,
1725                       "Cannot copy %s characters "
1726                       "into a string of %s characters",
1727                       unicode_kind_name(from),
1728                       unicode_kind_name(to));
1729          return -1;
1730      }
1731      return how_many;
1732  }
1733  
1734  /* Find the maximum code point and count the number of surrogate pairs so a
1735     correct string length can be computed before converting a string to UCS4.
1736     This function counts single surrogates as a character and not as a pair.
1737  
1738     Return 0 on success, or -1 on error. */
1739  static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1740  find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1741                          Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1742  {
1743      const wchar_t *iter;
1744      Py_UCS4 ch;
1745  
1746      assert(num_surrogates != NULL && maxchar != NULL);
1747      *num_surrogates = 0;
1748      *maxchar = 0;
1749  
1750      for (iter = begin; iter < end; ) {
1751  #if SIZEOF_WCHAR_T == 2
1752          if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1753              && (iter+1) < end
1754              && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1755          {
1756              ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1757              ++(*num_surrogates);
1758              iter += 2;
1759          }
1760          else
1761  #endif
1762          {
1763              ch = *iter;
1764              iter++;
1765          }
1766          if (ch > *maxchar) {
1767              *maxchar = ch;
1768              if (*maxchar > MAX_UNICODE) {
1769                  PyErr_Format(PyExc_ValueError,
1770                               "character U+%x is not in range [U+0000; U+%x]",
1771                               ch, MAX_UNICODE);
1772                  return -1;
1773              }
1774          }
1775      }
1776      return 0;
1777  }
1778  
/* Give a legacy string, which so far only has its wchar_t buffer set, a
   canonical representation.

   The wstr buffer is scanned for its largest code point, which selects the
   1-, 2- or 4-byte kind. Depending on sizeof(wchar_t), the canonical data is
   either a newly allocated converted copy (the wstr buffer is then freed) or
   the wstr buffer itself, reused as the data pointer.

   Return 0 on success. Return -1 with an exception set if a character is
   outside the Unicode range or if a memory allocation fails. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One pass over the wstr buffer yields both the widest character and the
       number of surrogate pairs. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Latin-1 range: narrow into a new 1-byte buffer (one extra byte for the
       terminating NUL) and drop the wstr buffer. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the 1-byte data is also used as the UTF-8 buffer. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow into a new 2-byte buffer (2 bytes per character plus a
           terminator), then drop the wstr buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bit, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Guard the 4 * (length + 1) size computation below against
           overflow. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide here: reuse the wstr buffer as the
           canonical data instead of copying it. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Terminate the 4-byte data in both configurations. */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1907  
/* Destroy a str object.

   If the string is a mortal interned string it is first removed from the
   `interned` dict. Then the auxiliary wstr and UTF-8 buffers the object owns
   are freed, as is the data buffer of a non-compact string, and finally the
   object itself is handed to its type's tp_free. */
static void
unicode_dealloc(PyObject *unicode)
{
#ifdef Py_DEBUG
    /* Debug builds abort if a shared singleton is being freed, unless
       unicode_is_finalizing() reports true. */
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
        _Py_FatalRefcountError("deallocating an Unicode singleton");
    }
#endif

    switch (PyUnicode_CHECK_INTERNED(unicode)) {
    case SSTATE_NOT_INTERNED:
        break;
    case SSTATE_INTERNED_MORTAL:
    {
        /* Revive the dead object temporarily. PyDict_DelItem() removes two
           references (key and value) which were ignored by
           PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
           to prevent calling unicode_dealloc() again. Adjust refcnt after
           PyDict_DelItem(). */
        assert(Py_REFCNT(unicode) == 0);
        Py_SET_REFCNT(unicode, 3);
        if (PyDict_DelItem(interned, unicode) != 0) {
            /* A destructor cannot propagate the error: report it as
               unraisable and carry on. */
            _PyErr_WriteUnraisableMsg("deletion of interned string failed",
                                      NULL);
        }
        assert(Py_REFCNT(unicode) == 1);
        Py_SET_REFCNT(unicode, 0);
        break;
    }

    case SSTATE_INTERNED_IMMORTAL:
        /* Immortal strings are never expected to reach this function. */
        _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
        break;

    default:
        Py_UNREACHABLE();
    }

    /* Free only the buffers the _HAS_*_MEMORY macros report as separately
       owned by this object. */
    if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_WSTR(unicode));
    }
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
    }
    /* Only non-compact strings have their character data freed here. */
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
        PyObject_Free(_PyUnicode_DATA_ANY(unicode));
    }

    Py_TYPE(unicode)->tp_free(unicode);
}
1958  
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if unicode is one of the shared singletons
   (the empty string, or the cached object for a single character below 256),
   and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
    /* Only a canonical (non-wstr) string of length 1 can be a cached
       character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && LATIN1(ch) == unicode) ? 1 : 0;
}
#endif
1977  
1978  static int
unicode_modifiable(PyObject * unicode)1979  unicode_modifiable(PyObject *unicode)
1980  {
1981      assert(_PyUnicode_CHECK(unicode));
1982      if (Py_REFCNT(unicode) != 1)
1983          return 0;
1984      if (_PyUnicode_HASH(unicode) != -1)
1985          return 0;
1986      if (PyUnicode_CHECK_INTERNED(unicode))
1987          return 0;
1988      if (!PyUnicode_CheckExact(unicode))
1989          return 0;
1990  #ifdef Py_DEBUG
1991      /* singleton refcount is greater than 1 */
1992      assert(!unicode_is_singleton(unicode));
1993  #endif
1994      return 1;
1995  }
1996  
1997  static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1998  unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1999  {
2000      PyObject *unicode;
2001      Py_ssize_t old_length;
2002  
2003      assert(p_unicode != NULL);
2004      unicode = *p_unicode;
2005  
2006      assert(unicode != NULL);
2007      assert(PyUnicode_Check(unicode));
2008      assert(0 <= length);
2009  
2010      if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2011          old_length = PyUnicode_WSTR_LENGTH(unicode);
2012      else
2013          old_length = PyUnicode_GET_LENGTH(unicode);
2014      if (old_length == length)
2015          return 0;
2016  
2017      if (length == 0) {
2018          PyObject *empty = unicode_new_empty();
2019          Py_SETREF(*p_unicode, empty);
2020          return 0;
2021      }
2022  
2023      if (!unicode_modifiable(unicode)) {
2024          PyObject *copy = resize_copy(unicode, length);
2025          if (copy == NULL)
2026              return -1;
2027          Py_SETREF(*p_unicode, copy);
2028          return 0;
2029      }
2030  
2031      if (PyUnicode_IS_COMPACT(unicode)) {
2032          PyObject *new_unicode = resize_compact(unicode, length);
2033          if (new_unicode == NULL)
2034              return -1;
2035          *p_unicode = new_unicode;
2036          return 0;
2037      }
2038      return resize_inplace(unicode, length);
2039  }
2040  
2041  int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2042  PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2043  {
2044      PyObject *unicode;
2045      if (p_unicode == NULL) {
2046          PyErr_BadInternalCall();
2047          return -1;
2048      }
2049      unicode = *p_unicode;
2050      if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2051      {
2052          PyErr_BadInternalCall();
2053          return -1;
2054      }
2055      return unicode_resize(p_unicode, length);
2056  }
2057  
2058  /* Copy an ASCII or latin1 char* string into a Python Unicode string.
2059  
2060     WARNING: The function doesn't copy the terminating null character and
2061     doesn't check the maximum character (may write a latin1 character in an
2062     ASCII string). */
2063  static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2064  unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2065                     const char *str, Py_ssize_t len)
2066  {
2067      enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2068      const void *data = PyUnicode_DATA(unicode);
2069      const char *end = str + len;
2070  
2071      assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2072      switch (kind) {
2073      case PyUnicode_1BYTE_KIND: {
2074  #ifdef Py_DEBUG
2075          if (PyUnicode_IS_ASCII(unicode)) {
2076              Py_UCS4 maxchar = ucs1lib_find_max_char(
2077                  (const Py_UCS1*)str,
2078                  (const Py_UCS1*)str + len);
2079              assert(maxchar < 128);
2080          }
2081  #endif
2082          memcpy((char *) data + index, str, len);
2083          break;
2084      }
2085      case PyUnicode_2BYTE_KIND: {
2086          Py_UCS2 *start = (Py_UCS2 *)data + index;
2087          Py_UCS2 *ucs2 = start;
2088  
2089          for (; str < end; ++ucs2, ++str)
2090              *ucs2 = (Py_UCS2)*str;
2091  
2092          assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2093          break;
2094      }
2095      case PyUnicode_4BYTE_KIND: {
2096          Py_UCS4 *start = (Py_UCS4 *)data + index;
2097          Py_UCS4 *ucs4 = start;
2098  
2099          for (; str < end; ++ucs4, ++str)
2100              *ucs4 = (Py_UCS4)*str;
2101  
2102          assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2103          break;
2104      }
2105      default:
2106          Py_UNREACHABLE();
2107      }
2108  }
2109  
2110  static PyObject*
get_latin1_char(Py_UCS1 ch)2111  get_latin1_char(Py_UCS1 ch)
2112  {
2113      return Py_NewRef(LATIN1(ch));
2114  }
2115  
2116  static PyObject*
unicode_char(Py_UCS4 ch)2117  unicode_char(Py_UCS4 ch)
2118  {
2119      PyObject *unicode;
2120  
2121      assert(ch <= MAX_UNICODE);
2122  
2123      if (ch < 256) {
2124          return get_latin1_char(ch);
2125      }
2126  
2127      unicode = PyUnicode_New(1, ch);
2128      if (unicode == NULL)
2129          return NULL;
2130  
2131      assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2132      if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2133          PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2134      } else {
2135          assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2136          PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2137      }
2138      assert(_PyUnicode_CheckConsistency(unicode, 1));
2139      return unicode;
2140  }
2141  
2142  PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2143  PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2144  {
2145      if (u == NULL) {
2146          if (size > 0) {
2147              if (PyErr_WarnEx(PyExc_DeprecationWarning,
2148                      "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2149                      "use PyUnicode_New() instead", 1) < 0) {
2150                  return NULL;
2151              }
2152          }
2153          return (PyObject*)_PyUnicode_New(size);
2154      }
2155  
2156      if (size < 0) {
2157          PyErr_BadInternalCall();
2158          return NULL;
2159      }
2160  
2161      return PyUnicode_FromWideChar(u, size);
2162  }
2163  
2164  PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2165  PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2166  {
2167      PyObject *unicode;
2168      Py_UCS4 maxchar = 0;
2169      Py_ssize_t num_surrogates;
2170  
2171      if (u == NULL && size != 0) {
2172          PyErr_BadInternalCall();
2173          return NULL;
2174      }
2175  
2176      if (size == -1) {
2177          size = wcslen(u);
2178      }
2179  
2180      /* If the Unicode data is known at construction time, we can apply
2181         some optimizations which share commonly used objects. */
2182  
2183      /* Optimization for empty strings */
2184      if (size == 0)
2185          _Py_RETURN_UNICODE_EMPTY();
2186  
2187  #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2188      /* Oracle Solaris uses non-Unicode internal wchar_t form for
2189         non-Unicode locales and hence needs conversion to UCS-4 first. */
2190      if (_Py_LocaleUsesNonUnicodeWchar()) {
2191          wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2192          if (!converted) {
2193              return NULL;
2194          }
2195          PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2196          PyMem_Free(converted);
2197          return unicode;
2198      }
2199  #endif
2200  
2201      /* Single character Unicode objects in the Latin-1 range are
2202         shared when using this constructor */
2203      if (size == 1 && (Py_UCS4)*u < 256)
2204          return get_latin1_char((unsigned char)*u);
2205  
2206      /* If not empty and not single character, copy the Unicode data
2207         into the new object */
2208      if (find_maxchar_surrogates(u, u + size,
2209                                  &maxchar, &num_surrogates) == -1)
2210          return NULL;
2211  
2212      unicode = PyUnicode_New(size - num_surrogates, maxchar);
2213      if (!unicode)
2214          return NULL;
2215  
2216      switch (PyUnicode_KIND(unicode)) {
2217      case PyUnicode_1BYTE_KIND:
2218          _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2219                                  u, u + size, PyUnicode_1BYTE_DATA(unicode));
2220          break;
2221      case PyUnicode_2BYTE_KIND:
2222  #if Py_UNICODE_SIZE == 2
2223          memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2224  #else
2225          _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2226                                  u, u + size, PyUnicode_2BYTE_DATA(unicode));
2227  #endif
2228          break;
2229      case PyUnicode_4BYTE_KIND:
2230  #if SIZEOF_WCHAR_T == 2
2231          /* This is the only case which has to process surrogates, thus
2232             a simple copy loop is not enough and we need a function. */
2233          unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2234  #else
2235          assert(num_surrogates == 0);
2236          memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2237  #endif
2238          break;
2239      default:
2240          Py_UNREACHABLE();
2241      }
2242  
2243      return unicode_result(unicode);
2244  }
2245  
2246  PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2247  PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2248  {
2249      if (size < 0) {
2250          PyErr_SetString(PyExc_SystemError,
2251                          "Negative size passed to PyUnicode_FromStringAndSize");
2252          return NULL;
2253      }
2254      if (u != NULL) {
2255          return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2256      }
2257      else {
2258          if (size > 0) {
2259              if (PyErr_WarnEx(PyExc_DeprecationWarning,
2260                      "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2261                      "use PyUnicode_New() instead", 1) < 0) {
2262                  return NULL;
2263              }
2264          }
2265          return (PyObject *)_PyUnicode_New(size);
2266      }
2267  }
2268  
2269  PyObject *
PyUnicode_FromString(const char * u)2270  PyUnicode_FromString(const char *u)
2271  {
2272      size_t size = strlen(u);
2273      if (size > PY_SSIZE_T_MAX) {
2274          PyErr_SetString(PyExc_OverflowError, "input too long");
2275          return NULL;
2276      }
2277      return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2278  }
2279  
2280  
2281  PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2282  _PyUnicode_FromId(_Py_Identifier *id)
2283  {
2284      PyInterpreterState *interp = _PyInterpreterState_GET();
2285      struct _Py_unicode_ids *ids = &interp->unicode.ids;
2286  
2287      Py_ssize_t index = _Py_atomic_size_get(&id->index);
2288      if (index < 0) {
2289          struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2290  
2291          PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2292          // Check again to detect concurrent access. Another thread can have
2293          // initialized the index while this thread waited for the lock.
2294          index = _Py_atomic_size_get(&id->index);
2295          if (index < 0) {
2296              assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2297              index = rt_ids->next_index;
2298              rt_ids->next_index++;
2299              _Py_atomic_size_set(&id->index, index);
2300          }
2301          PyThread_release_lock(rt_ids->lock);
2302      }
2303      assert(index >= 0);
2304  
2305      PyObject *obj;
2306      if (index < ids->size) {
2307          obj = ids->array[index];
2308          if (obj) {
2309              // Return a borrowed reference
2310              return obj;
2311          }
2312      }
2313  
2314      obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2315                                         NULL, NULL);
2316      if (!obj) {
2317          return NULL;
2318      }
2319      PyUnicode_InternInPlace(&obj);
2320  
2321      if (index >= ids->size) {
2322          // Overallocate to reduce the number of realloc
2323          Py_ssize_t new_size = Py_MAX(index * 2, 16);
2324          Py_ssize_t item_size = sizeof(ids->array[0]);
2325          PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2326          if (new_array == NULL) {
2327              PyErr_NoMemory();
2328              return NULL;
2329          }
2330          memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2331          ids->array = new_array;
2332          ids->size = new_size;
2333      }
2334  
2335      // The array stores a strong reference
2336      ids->array[index] = obj;
2337  
2338      // Return a borrowed reference
2339      return obj;
2340  }
2341  
2342  
2343  static void
unicode_clear_identifiers(struct _Py_unicode_state * state)2344  unicode_clear_identifiers(struct _Py_unicode_state *state)
2345  {
2346      struct _Py_unicode_ids *ids = &state->ids;
2347      for (Py_ssize_t i=0; i < ids->size; i++) {
2348          Py_XDECREF(ids->array[i]);
2349      }
2350      ids->size = 0;
2351      PyMem_Free(ids->array);
2352      ids->array = NULL;
2353      // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2354      // after Py_Finalize().
2355  }
2356  
2357  
2358  /* Internal function, doesn't check maximum character */
2359  
2360  PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2361  _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2362  {
2363      const unsigned char *s = (const unsigned char *)buffer;
2364      PyObject *unicode;
2365      if (size == 1) {
2366  #ifdef Py_DEBUG
2367          assert((unsigned char)s[0] < 128);
2368  #endif
2369          return get_latin1_char(s[0]);
2370      }
2371      unicode = PyUnicode_New(size, 127);
2372      if (!unicode)
2373          return NULL;
2374      memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2375      assert(_PyUnicode_CheckConsistency(unicode, 1));
2376      return unicode;
2377  }
2378  
2379  static Py_UCS4
kind_maxchar_limit(unsigned int kind)2380  kind_maxchar_limit(unsigned int kind)
2381  {
2382      switch (kind) {
2383      case PyUnicode_1BYTE_KIND:
2384          return 0x80;
2385      case PyUnicode_2BYTE_KIND:
2386          return 0x100;
2387      case PyUnicode_4BYTE_KIND:
2388          return 0x10000;
2389      default:
2390          Py_UNREACHABLE();
2391      }
2392  }
2393  
2394  static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2395  _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2396  {
2397      PyObject *res;
2398      unsigned char max_char;
2399  
2400      if (size == 0) {
2401          _Py_RETURN_UNICODE_EMPTY();
2402      }
2403      assert(size > 0);
2404      if (size == 1) {
2405          return get_latin1_char(u[0]);
2406      }
2407  
2408      max_char = ucs1lib_find_max_char(u, u + size);
2409      res = PyUnicode_New(size, max_char);
2410      if (!res)
2411          return NULL;
2412      memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2413      assert(_PyUnicode_CheckConsistency(res, 1));
2414      return res;
2415  }
2416  
2417  static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2418  _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2419  {
2420      PyObject *res;
2421      Py_UCS2 max_char;
2422  
2423      if (size == 0)
2424          _Py_RETURN_UNICODE_EMPTY();
2425      assert(size > 0);
2426      if (size == 1)
2427          return unicode_char(u[0]);
2428  
2429      max_char = ucs2lib_find_max_char(u, u + size);
2430      res = PyUnicode_New(size, max_char);
2431      if (!res)
2432          return NULL;
2433      if (max_char >= 256)
2434          memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2435      else {
2436          _PyUnicode_CONVERT_BYTES(
2437              Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2438      }
2439      assert(_PyUnicode_CheckConsistency(res, 1));
2440      return res;
2441  }
2442  
2443  static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2444  _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2445  {
2446      PyObject *res;
2447      Py_UCS4 max_char;
2448  
2449      if (size == 0)
2450          _Py_RETURN_UNICODE_EMPTY();
2451      assert(size > 0);
2452      if (size == 1)
2453          return unicode_char(u[0]);
2454  
2455      max_char = ucs4lib_find_max_char(u, u + size);
2456      res = PyUnicode_New(size, max_char);
2457      if (!res)
2458          return NULL;
2459      if (max_char < 256)
2460          _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2461                                   PyUnicode_1BYTE_DATA(res));
2462      else if (max_char < 0x10000)
2463          _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2464                                   PyUnicode_2BYTE_DATA(res));
2465      else
2466          memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2467      assert(_PyUnicode_CheckConsistency(res, 1));
2468      return res;
2469  }
2470  
2471  PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2472  PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2473  {
2474      if (size < 0) {
2475          PyErr_SetString(PyExc_ValueError, "size must be positive");
2476          return NULL;
2477      }
2478      switch (kind) {
2479      case PyUnicode_1BYTE_KIND:
2480          return _PyUnicode_FromUCS1(buffer, size);
2481      case PyUnicode_2BYTE_KIND:
2482          return _PyUnicode_FromUCS2(buffer, size);
2483      case PyUnicode_4BYTE_KIND:
2484          return _PyUnicode_FromUCS4(buffer, size);
2485      default:
2486          PyErr_SetString(PyExc_SystemError, "invalid kind");
2487          return NULL;
2488      }
2489  }
2490  
2491  Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2492  _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2493  {
2494      enum PyUnicode_Kind kind;
2495      const void *startptr, *endptr;
2496  
2497      assert(PyUnicode_IS_READY(unicode));
2498      assert(0 <= start);
2499      assert(end <= PyUnicode_GET_LENGTH(unicode));
2500      assert(start <= end);
2501  
2502      if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2503          return PyUnicode_MAX_CHAR_VALUE(unicode);
2504  
2505      if (start == end)
2506          return 127;
2507  
2508      if (PyUnicode_IS_ASCII(unicode))
2509          return 127;
2510  
2511      kind = PyUnicode_KIND(unicode);
2512      startptr = PyUnicode_DATA(unicode);
2513      endptr = (char *)startptr + end * kind;
2514      startptr = (char *)startptr + start * kind;
2515      switch(kind) {
2516      case PyUnicode_1BYTE_KIND:
2517          return ucs1lib_find_max_char(startptr, endptr);
2518      case PyUnicode_2BYTE_KIND:
2519          return ucs2lib_find_max_char(startptr, endptr);
2520      case PyUnicode_4BYTE_KIND:
2521          return ucs4lib_find_max_char(startptr, endptr);
2522      default:
2523          Py_UNREACHABLE();
2524      }
2525  }
2526  
2527  /* Ensure that a string uses the most efficient storage, if it is not the
2528     case: create a new string with of the right kind. Write NULL into *p_unicode
2529     on error. */
2530  static void
unicode_adjust_maxchar(PyObject ** p_unicode)2531  unicode_adjust_maxchar(PyObject **p_unicode)
2532  {
2533      PyObject *unicode, *copy;
2534      Py_UCS4 max_char;
2535      Py_ssize_t len;
2536      unsigned int kind;
2537  
2538      assert(p_unicode != NULL);
2539      unicode = *p_unicode;
2540      assert(PyUnicode_IS_READY(unicode));
2541      if (PyUnicode_IS_ASCII(unicode))
2542          return;
2543  
2544      len = PyUnicode_GET_LENGTH(unicode);
2545      kind = PyUnicode_KIND(unicode);
2546      if (kind == PyUnicode_1BYTE_KIND) {
2547          const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2548          max_char = ucs1lib_find_max_char(u, u + len);
2549          if (max_char >= 128)
2550              return;
2551      }
2552      else if (kind == PyUnicode_2BYTE_KIND) {
2553          const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2554          max_char = ucs2lib_find_max_char(u, u + len);
2555          if (max_char >= 256)
2556              return;
2557      }
2558      else if (kind == PyUnicode_4BYTE_KIND) {
2559          const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2560          max_char = ucs4lib_find_max_char(u, u + len);
2561          if (max_char >= 0x10000)
2562              return;
2563      }
2564      else
2565          Py_UNREACHABLE();
2566  
2567      copy = PyUnicode_New(len, max_char);
2568      if (copy != NULL)
2569          _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2570      Py_DECREF(unicode);
2571      *p_unicode = copy;
2572  }
2573  
2574  PyObject*
_PyUnicode_Copy(PyObject * unicode)2575  _PyUnicode_Copy(PyObject *unicode)
2576  {
2577      Py_ssize_t length;
2578      PyObject *copy;
2579  
2580      if (!PyUnicode_Check(unicode)) {
2581          PyErr_BadInternalCall();
2582          return NULL;
2583      }
2584      if (PyUnicode_READY(unicode) == -1)
2585          return NULL;
2586  
2587      length = PyUnicode_GET_LENGTH(unicode);
2588      copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2589      if (!copy)
2590          return NULL;
2591      assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2592  
2593      memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2594                length * PyUnicode_KIND(unicode));
2595      assert(_PyUnicode_CheckConsistency(copy, 1));
2596      return copy;
2597  }
2598  
2599  
2600  /* Widen Unicode objects to larger buffers. Don't write terminating null
2601     character. Return NULL on error. */
2602  
2603  static void*
unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2604  unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2605  {
2606      void *result;
2607  
2608      assert(skind < kind);
2609      switch (kind) {
2610      case PyUnicode_2BYTE_KIND:
2611          result = PyMem_New(Py_UCS2, len);
2612          if (!result)
2613              return PyErr_NoMemory();
2614          assert(skind == PyUnicode_1BYTE_KIND);
2615          _PyUnicode_CONVERT_BYTES(
2616              Py_UCS1, Py_UCS2,
2617              (const Py_UCS1 *)data,
2618              ((const Py_UCS1 *)data) + len,
2619              result);
2620          return result;
2621      case PyUnicode_4BYTE_KIND:
2622          result = PyMem_New(Py_UCS4, len);
2623          if (!result)
2624              return PyErr_NoMemory();
2625          if (skind == PyUnicode_2BYTE_KIND) {
2626              _PyUnicode_CONVERT_BYTES(
2627                  Py_UCS2, Py_UCS4,
2628                  (const Py_UCS2 *)data,
2629                  ((const Py_UCS2 *)data) + len,
2630                  result);
2631          }
2632          else {
2633              assert(skind == PyUnicode_1BYTE_KIND);
2634              _PyUnicode_CONVERT_BYTES(
2635                  Py_UCS1, Py_UCS4,
2636                  (const Py_UCS1 *)data,
2637                  ((const Py_UCS1 *)data) + len,
2638                  result);
2639          }
2640          return result;
2641      default:
2642          Py_UNREACHABLE();
2643          return NULL;
2644      }
2645  }
2646  
2647  static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2648  as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2649          int copy_null)
2650  {
2651      int kind;
2652      const void *data;
2653      Py_ssize_t len, targetlen;
2654      if (PyUnicode_READY(string) == -1)
2655          return NULL;
2656      kind = PyUnicode_KIND(string);
2657      data = PyUnicode_DATA(string);
2658      len = PyUnicode_GET_LENGTH(string);
2659      targetlen = len;
2660      if (copy_null)
2661          targetlen++;
2662      if (!target) {
2663          target = PyMem_New(Py_UCS4, targetlen);
2664          if (!target) {
2665              PyErr_NoMemory();
2666              return NULL;
2667          }
2668      }
2669      else {
2670          if (targetsize < targetlen) {
2671              PyErr_Format(PyExc_SystemError,
2672                           "string is longer than the buffer");
2673              if (copy_null && 0 < targetsize)
2674                  target[0] = 0;
2675              return NULL;
2676          }
2677      }
2678      if (kind == PyUnicode_1BYTE_KIND) {
2679          const Py_UCS1 *start = (const Py_UCS1 *) data;
2680          _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2681      }
2682      else if (kind == PyUnicode_2BYTE_KIND) {
2683          const Py_UCS2 *start = (const Py_UCS2 *) data;
2684          _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2685      }
2686      else if (kind == PyUnicode_4BYTE_KIND) {
2687          memcpy(target, data, len * sizeof(Py_UCS4));
2688      }
2689      else {
2690          Py_UNREACHABLE();
2691      }
2692      if (copy_null)
2693          target[len] = 0;
2694      return target;
2695  }
2696  
2697  Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2698  PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2699                   int copy_null)
2700  {
2701      if (target == NULL || targetsize < 0) {
2702          PyErr_BadInternalCall();
2703          return NULL;
2704      }
2705      return as_ucs4(string, target, targetsize, copy_null);
2706  }
2707  
2708  Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2709  PyUnicode_AsUCS4Copy(PyObject *string)
2710  {
2711      return as_ucs4(string, NULL, 0, 1);
2712  }
2713  
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   This is the size of the on-stack buffers that sprintf() writes formatted
   integers and pointers into in unicode_fromformat_arg(). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2718  
2719  static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2720  unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2721                               Py_ssize_t width, Py_ssize_t precision)
2722  {
2723      Py_ssize_t length, fill, arglen;
2724      Py_UCS4 maxchar;
2725  
2726      if (PyUnicode_READY(str) == -1)
2727          return -1;
2728  
2729      length = PyUnicode_GET_LENGTH(str);
2730      if ((precision == -1 || precision >= length)
2731          && width <= length)
2732          return _PyUnicodeWriter_WriteStr(writer, str);
2733  
2734      if (precision != -1)
2735          length = Py_MIN(precision, length);
2736  
2737      arglen = Py_MAX(length, width);
2738      if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2739          maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2740      else
2741          maxchar = writer->maxchar;
2742  
2743      if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2744          return -1;
2745  
2746      if (width > length) {
2747          fill = width - length;
2748          if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2749              return -1;
2750          writer->pos += fill;
2751      }
2752  
2753      _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2754                                    str, 0, length);
2755      writer->pos += length;
2756      return 0;
2757  }
2758  
2759  static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2760  unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2761                                Py_ssize_t width, Py_ssize_t precision)
2762  {
2763      /* UTF-8 */
2764      Py_ssize_t length;
2765      PyObject *unicode;
2766      int res;
2767  
2768      if (precision == -1) {
2769          length = strlen(str);
2770      }
2771      else {
2772          length = 0;
2773          while (length < precision && str[length]) {
2774              length++;
2775          }
2776      }
2777      unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2778      if (unicode == NULL)
2779          return -1;
2780  
2781      res = unicode_fromformat_write_str(writer, unicode, width, -1);
2782      Py_DECREF(unicode);
2783      return res;
2784  }
2785  
2786  static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2787  unicode_fromformat_arg(_PyUnicodeWriter *writer,
2788                         const char *f, va_list *vargs)
2789  {
2790      const char *p;
2791      Py_ssize_t len;
2792      int zeropad;
2793      Py_ssize_t width;
2794      Py_ssize_t precision;
2795      int longflag;
2796      int longlongflag;
2797      int size_tflag;
2798      Py_ssize_t fill;
2799  
2800      p = f;
2801      f++;
2802      zeropad = 0;
2803      if (*f == '0') {
2804          zeropad = 1;
2805          f++;
2806      }
2807  
2808      /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2809      width = -1;
2810      if (Py_ISDIGIT((unsigned)*f)) {
2811          width = *f - '0';
2812          f++;
2813          while (Py_ISDIGIT((unsigned)*f)) {
2814              if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2815                  PyErr_SetString(PyExc_ValueError,
2816                                  "width too big");
2817                  return NULL;
2818              }
2819              width = (width * 10) + (*f - '0');
2820              f++;
2821          }
2822      }
2823      precision = -1;
2824      if (*f == '.') {
2825          f++;
2826          if (Py_ISDIGIT((unsigned)*f)) {
2827              precision = (*f - '0');
2828              f++;
2829              while (Py_ISDIGIT((unsigned)*f)) {
2830                  if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2831                      PyErr_SetString(PyExc_ValueError,
2832                                      "precision too big");
2833                      return NULL;
2834                  }
2835                  precision = (precision * 10) + (*f - '0');
2836                  f++;
2837              }
2838          }
2839          if (*f == '%') {
2840              /* "%.3%s" => f points to "3" */
2841              f--;
2842          }
2843      }
2844      if (*f == '\0') {
2845          /* bogus format "%.123" => go backward, f points to "3" */
2846          f--;
2847      }
2848  
2849      /* Handle %ld, %lu, %lld and %llu. */
2850      longflag = 0;
2851      longlongflag = 0;
2852      size_tflag = 0;
2853      if (*f == 'l') {
2854          if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2855              longflag = 1;
2856              ++f;
2857          }
2858          else if (f[1] == 'l' &&
2859                   (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2860              longlongflag = 1;
2861              f += 2;
2862          }
2863      }
2864      /* handle the size_t flag. */
2865      else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2866          size_tflag = 1;
2867          ++f;
2868      }
2869  
2870      if (f[1] == '\0')
2871          writer->overallocate = 0;
2872  
2873      switch (*f) {
2874      case 'c':
2875      {
2876          int ordinal = va_arg(*vargs, int);
2877          if (ordinal < 0 || ordinal > MAX_UNICODE) {
2878              PyErr_SetString(PyExc_OverflowError,
2879                              "character argument not in range(0x110000)");
2880              return NULL;
2881          }
2882          if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2883              return NULL;
2884          break;
2885      }
2886  
2887      case 'i':
2888      case 'd':
2889      case 'u':
2890      case 'x':
2891      {
2892          /* used by sprintf */
2893          char buffer[MAX_LONG_LONG_CHARS];
2894          Py_ssize_t arglen;
2895  
2896          if (*f == 'u') {
2897              if (longflag) {
2898                  len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2899              }
2900              else if (longlongflag) {
2901                  len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2902              }
2903              else if (size_tflag) {
2904                  len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2905              }
2906              else {
2907                  len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2908              }
2909          }
2910          else if (*f == 'x') {
2911              len = sprintf(buffer, "%x", va_arg(*vargs, int));
2912          }
2913          else {
2914              if (longflag) {
2915                  len = sprintf(buffer, "%li", va_arg(*vargs, long));
2916              }
2917              else if (longlongflag) {
2918                  len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2919              }
2920              else if (size_tflag) {
2921                  len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2922              }
2923              else {
2924                  len = sprintf(buffer, "%i", va_arg(*vargs, int));
2925              }
2926          }
2927          assert(len >= 0);
2928  
2929          if (precision < len)
2930              precision = len;
2931  
2932          arglen = Py_MAX(precision, width);
2933          if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2934              return NULL;
2935  
2936          if (width > precision) {
2937              Py_UCS4 fillchar;
2938              fill = width - precision;
2939              fillchar = zeropad?'0':' ';
2940              if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2941                  return NULL;
2942              writer->pos += fill;
2943          }
2944          if (precision > len) {
2945              fill = precision - len;
2946              if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2947                  return NULL;
2948              writer->pos += fill;
2949          }
2950  
2951          if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2952              return NULL;
2953          break;
2954      }
2955  
2956      case 'p':
2957      {
2958          char number[MAX_LONG_LONG_CHARS];
2959  
2960          len = sprintf(number, "%p", va_arg(*vargs, void*));
2961          assert(len >= 0);
2962  
2963          /* %p is ill-defined:  ensure leading 0x. */
2964          if (number[1] == 'X')
2965              number[1] = 'x';
2966          else if (number[1] != 'x') {
2967              memmove(number + 2, number,
2968                      strlen(number) + 1);
2969              number[0] = '0';
2970              number[1] = 'x';
2971              len += 2;
2972          }
2973  
2974          if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2975              return NULL;
2976          break;
2977      }
2978  
2979      case 's':
2980      {
2981          /* UTF-8 */
2982          const char *s = va_arg(*vargs, const char*);
2983          if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2984              return NULL;
2985          break;
2986      }
2987  
2988      case 'U':
2989      {
2990          PyObject *obj = va_arg(*vargs, PyObject *);
2991          assert(obj && _PyUnicode_CHECK(obj));
2992  
2993          if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2994              return NULL;
2995          break;
2996      }
2997  
2998      case 'V':
2999      {
3000          PyObject *obj = va_arg(*vargs, PyObject *);
3001          const char *str = va_arg(*vargs, const char *);
3002          if (obj) {
3003              assert(_PyUnicode_CHECK(obj));
3004              if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
3005                  return NULL;
3006          }
3007          else {
3008              assert(str != NULL);
3009              if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
3010                  return NULL;
3011          }
3012          break;
3013      }
3014  
3015      case 'S':
3016      {
3017          PyObject *obj = va_arg(*vargs, PyObject *);
3018          PyObject *str;
3019          assert(obj);
3020          str = PyObject_Str(obj);
3021          if (!str)
3022              return NULL;
3023          if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
3024              Py_DECREF(str);
3025              return NULL;
3026          }
3027          Py_DECREF(str);
3028          break;
3029      }
3030  
3031      case 'R':
3032      {
3033          PyObject *obj = va_arg(*vargs, PyObject *);
3034          PyObject *repr;
3035          assert(obj);
3036          repr = PyObject_Repr(obj);
3037          if (!repr)
3038              return NULL;
3039          if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
3040              Py_DECREF(repr);
3041              return NULL;
3042          }
3043          Py_DECREF(repr);
3044          break;
3045      }
3046  
3047      case 'A':
3048      {
3049          PyObject *obj = va_arg(*vargs, PyObject *);
3050          PyObject *ascii;
3051          assert(obj);
3052          ascii = PyObject_ASCII(obj);
3053          if (!ascii)
3054              return NULL;
3055          if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
3056              Py_DECREF(ascii);
3057              return NULL;
3058          }
3059          Py_DECREF(ascii);
3060          break;
3061      }
3062  
3063      case '%':
3064          if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
3065              return NULL;
3066          break;
3067  
3068      default:
3069          /* if we stumble upon an unknown formatting code, copy the rest
3070             of the format string to the output string. (we cannot just
3071             skip the code, since there's no way to know what's in the
3072             argument list) */
3073          len = strlen(p);
3074          if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
3075              return NULL;
3076          f = p+len;
3077          return f;
3078      }
3079  
3080      f++;
3081      return f;
3082  }
3083  
3084  PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3085  PyUnicode_FromFormatV(const char *format, va_list vargs)
3086  {
3087      va_list vargs2;
3088      const char *f;
3089      _PyUnicodeWriter writer;
3090  
3091      _PyUnicodeWriter_Init(&writer);
3092      writer.min_length = strlen(format) + 100;
3093      writer.overallocate = 1;
3094  
3095      // Copy varags to be able to pass a reference to a subfunction.
3096      va_copy(vargs2, vargs);
3097  
3098      for (f = format; *f; ) {
3099          if (*f == '%') {
3100              f = unicode_fromformat_arg(&writer, f, &vargs2);
3101              if (f == NULL)
3102                  goto fail;
3103          }
3104          else {
3105              const char *p;
3106              Py_ssize_t len;
3107  
3108              p = f;
3109              do
3110              {
3111                  if ((unsigned char)*p > 127) {
3112                      PyErr_Format(PyExc_ValueError,
3113                          "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3114                          "string, got a non-ASCII byte: 0x%02x",
3115                          (unsigned char)*p);
3116                      goto fail;
3117                  }
3118                  p++;
3119              }
3120              while (*p != '\0' && *p != '%');
3121              len = p - f;
3122  
3123              if (*p == '\0')
3124                  writer.overallocate = 0;
3125  
3126              if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3127                  goto fail;
3128  
3129              f = p;
3130          }
3131      }
3132      va_end(vargs2);
3133      return _PyUnicodeWriter_Finish(&writer);
3134  
3135    fail:
3136      va_end(vargs2);
3137      _PyUnicodeWriter_Dealloc(&writer);
3138      return NULL;
3139  }
3140  
3141  PyObject *
PyUnicode_FromFormat(const char * format,...)3142  PyUnicode_FromFormat(const char *format, ...)
3143  {
3144      PyObject* ret;
3145      va_list vargs;
3146  
3147  #ifdef HAVE_STDARG_PROTOTYPES
3148      va_start(vargs, format);
3149  #else
3150      va_start(vargs);
3151  #endif
3152      ret = PyUnicode_FromFormatV(format, vargs);
3153      va_end(vargs);
3154      return ret;
3155  }
3156  
3157  static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3158  unicode_get_widechar_size(PyObject *unicode)
3159  {
3160      Py_ssize_t res;
3161  
3162      assert(unicode != NULL);
3163      assert(_PyUnicode_CHECK(unicode));
3164  
3165  #if USE_UNICODE_WCHAR_CACHE
3166      if (_PyUnicode_WSTR(unicode) != NULL) {
3167          return PyUnicode_WSTR_LENGTH(unicode);
3168      }
3169  #endif /* USE_UNICODE_WCHAR_CACHE */
3170      assert(PyUnicode_IS_READY(unicode));
3171  
3172      res = _PyUnicode_LENGTH(unicode);
3173  #if SIZEOF_WCHAR_T == 2
3174      if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3175          const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3176          const Py_UCS4 *end = s + res;
3177          for (; s < end; ++s) {
3178              if (*s > 0xFFFF) {
3179                  ++res;
3180              }
3181          }
3182      }
3183  #endif
3184      return res;
3185  }
3186  
3187  static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3188  unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3189  {
3190      assert(unicode != NULL);
3191      assert(_PyUnicode_CHECK(unicode));
3192  
3193  #if USE_UNICODE_WCHAR_CACHE
3194      const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3195      if (wstr != NULL) {
3196          memcpy(w, wstr, size * sizeof(wchar_t));
3197          return;
3198      }
3199  #else /* USE_UNICODE_WCHAR_CACHE */
3200      if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3201          memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3202          return;
3203      }
3204  #endif /* USE_UNICODE_WCHAR_CACHE */
3205      assert(PyUnicode_IS_READY(unicode));
3206  
3207      if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3208          const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3209          for (; size--; ++s, ++w) {
3210              *w = *s;
3211          }
3212      }
3213      else {
3214  #if SIZEOF_WCHAR_T == 4
3215          assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3216          const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3217          for (; size--; ++s, ++w) {
3218              *w = *s;
3219          }
3220  #else
3221          assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3222          const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3223          for (; size--; ++s, ++w) {
3224              Py_UCS4 ch = *s;
3225              if (ch > 0xFFFF) {
3226                  assert(ch <= MAX_UNICODE);
3227                  /* encode surrogate pair in this case */
3228                  *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3229                  if (!size--)
3230                      break;
3231                  *w = Py_UNICODE_LOW_SURROGATE(ch);
3232              }
3233              else {
3234                  *w = ch;
3235              }
3236          }
3237  #endif
3238      }
3239  }
3240  
3241  #ifdef HAVE_WCHAR_H
3242  
3243  /* Convert a Unicode object to a wide character string.
3244  
3245     - If w is NULL: return the number of wide characters (including the null
3246       character) required to convert the unicode object. Ignore size argument.
3247  
3248     - Otherwise: return the number of wide characters (excluding the null
3249       character) written into w. Write at most size wide characters (including
3250       the null character). */
3251  Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3252  PyUnicode_AsWideChar(PyObject *unicode,
3253                       wchar_t *w,
3254                       Py_ssize_t size)
3255  {
3256      Py_ssize_t res;
3257  
3258      if (unicode == NULL) {
3259          PyErr_BadInternalCall();
3260          return -1;
3261      }
3262      if (!PyUnicode_Check(unicode)) {
3263          PyErr_BadArgument();
3264          return -1;
3265      }
3266  
3267      res = unicode_get_widechar_size(unicode);
3268      if (w == NULL) {
3269          return res + 1;
3270      }
3271  
3272      if (size > res) {
3273          size = res + 1;
3274      }
3275      else {
3276          res = size;
3277      }
3278      unicode_copy_as_widechar(unicode, w, size);
3279  
3280  #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3281      /* Oracle Solaris uses non-Unicode internal wchar_t form for
3282         non-Unicode locales and hence needs conversion first. */
3283      if (_Py_LocaleUsesNonUnicodeWchar()) {
3284          if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3285              return -1;
3286          }
3287      }
3288  #endif
3289  
3290      return res;
3291  }
3292  
3293  wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3294  PyUnicode_AsWideCharString(PyObject *unicode,
3295                             Py_ssize_t *size)
3296  {
3297      wchar_t *buffer;
3298      Py_ssize_t buflen;
3299  
3300      if (unicode == NULL) {
3301          PyErr_BadInternalCall();
3302          return NULL;
3303      }
3304      if (!PyUnicode_Check(unicode)) {
3305          PyErr_BadArgument();
3306          return NULL;
3307      }
3308  
3309      buflen = unicode_get_widechar_size(unicode);
3310      buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3311      if (buffer == NULL) {
3312          PyErr_NoMemory();
3313          return NULL;
3314      }
3315      unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3316  
3317  #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3318      /* Oracle Solaris uses non-Unicode internal wchar_t form for
3319         non-Unicode locales and hence needs conversion first. */
3320      if (_Py_LocaleUsesNonUnicodeWchar()) {
3321          if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3322              return NULL;
3323          }
3324      }
3325  #endif
3326  
3327      if (size != NULL) {
3328          *size = buflen;
3329      }
3330      else if (wcslen(buffer) != (size_t)buflen) {
3331          PyMem_Free(buffer);
3332          PyErr_SetString(PyExc_ValueError,
3333                          "embedded null character");
3334          return NULL;
3335      }
3336      return buffer;
3337  }
3338  
3339  #endif /* HAVE_WCHAR_H */
3340  
3341  int
_PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3342  _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3343  {
3344      wchar_t **p = (wchar_t **)ptr;
3345      if (obj == NULL) {
3346  #if !USE_UNICODE_WCHAR_CACHE
3347          PyMem_Free(*p);
3348  #endif /* USE_UNICODE_WCHAR_CACHE */
3349          *p = NULL;
3350          return 1;
3351      }
3352      if (PyUnicode_Check(obj)) {
3353  #if USE_UNICODE_WCHAR_CACHE
3354          *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3355          if (*p == NULL) {
3356              return 0;
3357          }
3358          return 1;
3359  #else /* USE_UNICODE_WCHAR_CACHE */
3360          *p = PyUnicode_AsWideCharString(obj, NULL);
3361          if (*p == NULL) {
3362              return 0;
3363          }
3364          return Py_CLEANUP_SUPPORTED;
3365  #endif /* USE_UNICODE_WCHAR_CACHE */
3366      }
3367      PyErr_Format(PyExc_TypeError,
3368                   "argument must be str, not %.50s",
3369                   Py_TYPE(obj)->tp_name);
3370      return 0;
3371  }
3372  
3373  int
_PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3374  _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3375  {
3376      wchar_t **p = (wchar_t **)ptr;
3377      if (obj == NULL) {
3378  #if !USE_UNICODE_WCHAR_CACHE
3379          PyMem_Free(*p);
3380  #endif /* USE_UNICODE_WCHAR_CACHE */
3381          *p = NULL;
3382          return 1;
3383      }
3384      if (obj == Py_None) {
3385          *p = NULL;
3386          return 1;
3387      }
3388      if (PyUnicode_Check(obj)) {
3389  #if USE_UNICODE_WCHAR_CACHE
3390          *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3391          if (*p == NULL) {
3392              return 0;
3393          }
3394          return 1;
3395  #else /* USE_UNICODE_WCHAR_CACHE */
3396          *p = PyUnicode_AsWideCharString(obj, NULL);
3397          if (*p == NULL) {
3398              return 0;
3399          }
3400          return Py_CLEANUP_SUPPORTED;
3401  #endif /* USE_UNICODE_WCHAR_CACHE */
3402      }
3403      PyErr_Format(PyExc_TypeError,
3404                   "argument must be str or None, not %.50s",
3405                   Py_TYPE(obj)->tp_name);
3406      return 0;
3407  }
3408  
3409  PyObject *
PyUnicode_FromOrdinal(int ordinal)3410  PyUnicode_FromOrdinal(int ordinal)
3411  {
3412      if (ordinal < 0 || ordinal > MAX_UNICODE) {
3413          PyErr_SetString(PyExc_ValueError,
3414                          "chr() arg not in range(0x110000)");
3415          return NULL;
3416      }
3417  
3418      return unicode_char((Py_UCS4)ordinal);
3419  }
3420  
3421  PyObject *
PyUnicode_FromObject(PyObject * obj)3422  PyUnicode_FromObject(PyObject *obj)
3423  {
3424      /* XXX Perhaps we should make this API an alias of
3425         PyObject_Str() instead ?! */
3426      if (PyUnicode_CheckExact(obj)) {
3427          if (PyUnicode_READY(obj) == -1)
3428              return NULL;
3429          Py_INCREF(obj);
3430          return obj;
3431      }
3432      if (PyUnicode_Check(obj)) {
3433          /* For a Unicode subtype that's not a Unicode object,
3434             return a true Unicode object with the same data. */
3435          return _PyUnicode_Copy(obj);
3436      }
3437      PyErr_Format(PyExc_TypeError,
3438                   "Can't convert '%.100s' object to str implicitly",
3439                   Py_TYPE(obj)->tp_name);
3440      return NULL;
3441  }
3442  
3443  PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3444  PyUnicode_FromEncodedObject(PyObject *obj,
3445                              const char *encoding,
3446                              const char *errors)
3447  {
3448      Py_buffer buffer;
3449      PyObject *v;
3450  
3451      if (obj == NULL) {
3452          PyErr_BadInternalCall();
3453          return NULL;
3454      }
3455  
3456      /* Decoding bytes objects is the most common case and should be fast */
3457      if (PyBytes_Check(obj)) {
3458          if (PyBytes_GET_SIZE(obj) == 0) {
3459              if (unicode_check_encoding_errors(encoding, errors) < 0) {
3460                  return NULL;
3461              }
3462              _Py_RETURN_UNICODE_EMPTY();
3463          }
3464          return PyUnicode_Decode(
3465                  PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3466                  encoding, errors);
3467      }
3468  
3469      if (PyUnicode_Check(obj)) {
3470          PyErr_SetString(PyExc_TypeError,
3471                          "decoding str is not supported");
3472          return NULL;
3473      }
3474  
3475      /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3476      if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3477          PyErr_Format(PyExc_TypeError,
3478                       "decoding to str: need a bytes-like object, %.80s found",
3479                       Py_TYPE(obj)->tp_name);
3480          return NULL;
3481      }
3482  
3483      if (buffer.len == 0) {
3484          PyBuffer_Release(&buffer);
3485          if (unicode_check_encoding_errors(encoding, errors) < 0) {
3486              return NULL;
3487          }
3488          _Py_RETURN_UNICODE_EMPTY();
3489      }
3490  
3491      v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3492      PyBuffer_Release(&buffer);
3493      return v;
3494  }
3495  
/* Normalize an encoding name into the caller-supplied buffer `lower`
   (capacity `lower_len` bytes, including the terminating NUL).

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is collapsed into a single '_', except that a run at the
   very start or at the very end of the name produces nothing.

   Similar to encodings.normalize_encoding(), but also converts to lowercase.
   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* Last slot of the buffer: reserved for the terminating NUL. */
    char *out_limit = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (in = encoding; *in != 0; in++) {
        char ch = *in;

        if (!(Py_ISALNUM(ch) || ch == '.')) {
            /* Remember the separator; it is only emitted if a kept
               character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(ch);
    }

    *out = '\0';
    return 1;
}
3544  
3545  PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3546  PyUnicode_Decode(const char *s,
3547                   Py_ssize_t size,
3548                   const char *encoding,
3549                   const char *errors)
3550  {
3551      PyObject *buffer = NULL, *unicode;
3552      Py_buffer info;
3553      char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3554  
3555      if (unicode_check_encoding_errors(encoding, errors) < 0) {
3556          return NULL;
3557      }
3558  
3559      if (size == 0) {
3560          _Py_RETURN_UNICODE_EMPTY();
3561      }
3562  
3563      if (encoding == NULL) {
3564          return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3565      }
3566  
3567      /* Shortcuts for common default encodings */
3568      if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3569          char *lower = buflower;
3570  
3571          /* Fast paths */
3572          if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3573              lower += 3;
3574              if (*lower == '_') {
3575                  /* Match "utf8" and "utf_8" */
3576                  lower++;
3577              }
3578  
3579              if (lower[0] == '8' && lower[1] == 0) {
3580                  return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3581              }
3582              else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3583                  return PyUnicode_DecodeUTF16(s, size, errors, 0);
3584              }
3585              else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3586                  return PyUnicode_DecodeUTF32(s, size, errors, 0);
3587              }
3588          }
3589          else {
3590              if (strcmp(lower, "ascii") == 0
3591                  || strcmp(lower, "us_ascii") == 0) {
3592                  return PyUnicode_DecodeASCII(s, size, errors);
3593              }
3594      #ifdef MS_WINDOWS
3595              else if (strcmp(lower, "mbcs") == 0) {
3596                  return PyUnicode_DecodeMBCS(s, size, errors);
3597              }
3598      #endif
3599              else if (strcmp(lower, "latin1") == 0
3600                       || strcmp(lower, "latin_1") == 0
3601                       || strcmp(lower, "iso_8859_1") == 0
3602                       || strcmp(lower, "iso8859_1") == 0) {
3603                  return PyUnicode_DecodeLatin1(s, size, errors);
3604              }
3605          }
3606      }
3607  
3608      /* Decode via the codec registry */
3609      buffer = NULL;
3610      if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3611          goto onError;
3612      buffer = PyMemoryView_FromBuffer(&info);
3613      if (buffer == NULL)
3614          goto onError;
3615      unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3616      if (unicode == NULL)
3617          goto onError;
3618      if (!PyUnicode_Check(unicode)) {
3619          PyErr_Format(PyExc_TypeError,
3620                       "'%.400s' decoder returned '%.400s' instead of 'str'; "
3621                       "use codecs.decode() to decode to arbitrary types",
3622                       encoding,
3623                       Py_TYPE(unicode)->tp_name);
3624          Py_DECREF(unicode);
3625          goto onError;
3626      }
3627      Py_DECREF(buffer);
3628      return unicode_result(unicode);
3629  
3630    onError:
3631      Py_XDECREF(buffer);
3632      return NULL;
3633  }
3634  
3635  PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3636  PyUnicode_AsDecodedObject(PyObject *unicode,
3637                            const char *encoding,
3638                            const char *errors)
3639  {
3640      if (!PyUnicode_Check(unicode)) {
3641          PyErr_BadArgument();
3642          return NULL;
3643      }
3644  
3645      if (PyErr_WarnEx(PyExc_DeprecationWarning,
3646                       "PyUnicode_AsDecodedObject() is deprecated; "
3647                       "use PyCodec_Decode() to decode from str", 1) < 0)
3648          return NULL;
3649  
3650      if (encoding == NULL)
3651          encoding = PyUnicode_GetDefaultEncoding();
3652  
3653      /* Decode via the codec registry */
3654      return PyCodec_Decode(unicode, encoding, errors);
3655  }
3656  
3657  PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3658  PyUnicode_AsDecodedUnicode(PyObject *unicode,
3659                             const char *encoding,
3660                             const char *errors)
3661  {
3662      PyObject *v;
3663  
3664      if (!PyUnicode_Check(unicode)) {
3665          PyErr_BadArgument();
3666          goto onError;
3667      }
3668  
3669      if (PyErr_WarnEx(PyExc_DeprecationWarning,
3670                       "PyUnicode_AsDecodedUnicode() is deprecated; "
3671                       "use PyCodec_Decode() to decode from str to str", 1) < 0)
3672          return NULL;
3673  
3674      if (encoding == NULL)
3675          encoding = PyUnicode_GetDefaultEncoding();
3676  
3677      /* Decode via the codec registry */
3678      v = PyCodec_Decode(unicode, encoding, errors);
3679      if (v == NULL)
3680          goto onError;
3681      if (!PyUnicode_Check(v)) {
3682          PyErr_Format(PyExc_TypeError,
3683                       "'%.400s' decoder returned '%.400s' instead of 'str'; "
3684                       "use codecs.decode() to decode to arbitrary types",
3685                       encoding,
3686                       Py_TYPE(unicode)->tp_name);
3687          Py_DECREF(v);
3688          goto onError;
3689      }
3690      return unicode_result(v);
3691  
3692    onError:
3693      return NULL;
3694  }
3695  
3696  PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3697  PyUnicode_AsEncodedObject(PyObject *unicode,
3698                            const char *encoding,
3699                            const char *errors)
3700  {
3701      PyObject *v;
3702  
3703      if (!PyUnicode_Check(unicode)) {
3704          PyErr_BadArgument();
3705          goto onError;
3706      }
3707  
3708      if (PyErr_WarnEx(PyExc_DeprecationWarning,
3709                       "PyUnicode_AsEncodedObject() is deprecated; "
3710                       "use PyUnicode_AsEncodedString() to encode from str to bytes "
3711                       "or PyCodec_Encode() for generic encoding", 1) < 0)
3712          return NULL;
3713  
3714      if (encoding == NULL)
3715          encoding = PyUnicode_GetDefaultEncoding();
3716  
3717      /* Encode via the codec registry */
3718      v = PyCodec_Encode(unicode, encoding, errors);
3719      if (v == NULL)
3720          goto onError;
3721      return v;
3722  
3723    onError:
3724      return NULL;
3725  }
3726  
3727  
3728  static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3729  unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3730                        int current_locale)
3731  {
3732      Py_ssize_t wlen;
3733      wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3734      if (wstr == NULL) {
3735          return NULL;
3736      }
3737  
3738      if ((size_t)wlen != wcslen(wstr)) {
3739          PyErr_SetString(PyExc_ValueError, "embedded null character");
3740          PyMem_Free(wstr);
3741          return NULL;
3742      }
3743  
3744      char *str;
3745      size_t error_pos;
3746      const char *reason;
3747      int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3748                                   current_locale, error_handler);
3749      PyMem_Free(wstr);
3750  
3751      if (res != 0) {
3752          if (res == -2) {
3753              PyObject *exc;
3754              exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3755                      "locale", unicode,
3756                      (Py_ssize_t)error_pos,
3757                      (Py_ssize_t)(error_pos+1),
3758                      reason);
3759              if (exc != NULL) {
3760                  PyCodec_StrictErrors(exc);
3761                  Py_DECREF(exc);
3762              }
3763          }
3764          else if (res == -3) {
3765              PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3766          }
3767          else {
3768              PyErr_NoMemory();
3769          }
3770          return NULL;
3771      }
3772  
3773      PyObject *bytes = PyBytes_FromString(str);
3774      PyMem_RawFree(str);
3775      return bytes;
3776  }
3777  
3778  PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3779  PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3780  {
3781      _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3782      return unicode_encode_locale(unicode, error_handler, 1);
3783  }
3784  
3785  PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3786  PyUnicode_EncodeFSDefault(PyObject *unicode)
3787  {
3788      PyInterpreterState *interp = _PyInterpreterState_GET();
3789      struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3790      if (fs_codec->utf8) {
3791          return unicode_encode_utf8(unicode,
3792                                     fs_codec->error_handler,
3793                                     fs_codec->errors);
3794      }
3795  #ifndef _Py_FORCE_UTF8_FS_ENCODING
3796      else if (fs_codec->encoding) {
3797          return PyUnicode_AsEncodedString(unicode,
3798                                           fs_codec->encoding,
3799                                           fs_codec->errors);
3800      }
3801  #endif
3802      else {
3803          /* Before _PyUnicode_InitEncodings() is called, the Python codec
3804             machinery is not ready and so cannot be used:
3805             use wcstombs() in this case. */
3806          const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3807          const wchar_t *filesystem_errors = config->filesystem_errors;
3808          assert(filesystem_errors != NULL);
3809          _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3810          assert(errors != _Py_ERROR_UNKNOWN);
3811  #ifdef _Py_FORCE_UTF8_FS_ENCODING
3812          return unicode_encode_utf8(unicode, errors, NULL);
3813  #else
3814          return unicode_encode_locale(unicode, errors, 0);
3815  #endif
3816      }
3817  }
3818  
3819  PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3820  PyUnicode_AsEncodedString(PyObject *unicode,
3821                            const char *encoding,
3822                            const char *errors)
3823  {
3824      PyObject *v;
3825      char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3826  
3827      if (!PyUnicode_Check(unicode)) {
3828          PyErr_BadArgument();
3829          return NULL;
3830      }
3831  
3832      if (unicode_check_encoding_errors(encoding, errors) < 0) {
3833          return NULL;
3834      }
3835  
3836      if (encoding == NULL) {
3837          return _PyUnicode_AsUTF8String(unicode, errors);
3838      }
3839  
3840      /* Shortcuts for common default encodings */
3841      if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3842          char *lower = buflower;
3843  
3844          /* Fast paths */
3845          if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3846              lower += 3;
3847              if (*lower == '_') {
3848                  /* Match "utf8" and "utf_8" */
3849                  lower++;
3850              }
3851  
3852              if (lower[0] == '8' && lower[1] == 0) {
3853                  return _PyUnicode_AsUTF8String(unicode, errors);
3854              }
3855              else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3856                  return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3857              }
3858              else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3859                  return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3860              }
3861          }
3862          else {
3863              if (strcmp(lower, "ascii") == 0
3864                  || strcmp(lower, "us_ascii") == 0) {
3865                  return _PyUnicode_AsASCIIString(unicode, errors);
3866              }
3867  #ifdef MS_WINDOWS
3868              else if (strcmp(lower, "mbcs") == 0) {
3869                  return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3870              }
3871  #endif
3872              else if (strcmp(lower, "latin1") == 0 ||
3873                       strcmp(lower, "latin_1") == 0 ||
3874                       strcmp(lower, "iso_8859_1") == 0 ||
3875                       strcmp(lower, "iso8859_1") == 0) {
3876                  return _PyUnicode_AsLatin1String(unicode, errors);
3877              }
3878          }
3879      }
3880  
3881      /* Encode via the codec registry */
3882      v = _PyCodec_EncodeText(unicode, encoding, errors);
3883      if (v == NULL)
3884          return NULL;
3885  
3886      /* The normal path */
3887      if (PyBytes_Check(v))
3888          return v;
3889  
3890      /* If the codec returns a buffer, raise a warning and convert to bytes */
3891      if (PyByteArray_Check(v)) {
3892          int error;
3893          PyObject *b;
3894  
3895          error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3896              "encoder %s returned bytearray instead of bytes; "
3897              "use codecs.encode() to encode to arbitrary types",
3898              encoding);
3899          if (error) {
3900              Py_DECREF(v);
3901              return NULL;
3902          }
3903  
3904          b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3905                                        PyByteArray_GET_SIZE(v));
3906          Py_DECREF(v);
3907          return b;
3908      }
3909  
3910      PyErr_Format(PyExc_TypeError,
3911                   "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3912                   "use codecs.encode() to encode to arbitrary types",
3913                   encoding,
3914                   Py_TYPE(v)->tp_name);
3915      Py_DECREF(v);
3916      return NULL;
3917  }
3918  
3919  PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3920  PyUnicode_AsEncodedUnicode(PyObject *unicode,
3921                             const char *encoding,
3922                             const char *errors)
3923  {
3924      PyObject *v;
3925  
3926      if (!PyUnicode_Check(unicode)) {
3927          PyErr_BadArgument();
3928          goto onError;
3929      }
3930  
3931      if (PyErr_WarnEx(PyExc_DeprecationWarning,
3932                       "PyUnicode_AsEncodedUnicode() is deprecated; "
3933                       "use PyCodec_Encode() to encode from str to str", 1) < 0)
3934          return NULL;
3935  
3936      if (encoding == NULL)
3937          encoding = PyUnicode_GetDefaultEncoding();
3938  
3939      /* Encode via the codec registry */
3940      v = PyCodec_Encode(unicode, encoding, errors);
3941      if (v == NULL)
3942          goto onError;
3943      if (!PyUnicode_Check(v)) {
3944          PyErr_Format(PyExc_TypeError,
3945                       "'%.400s' encoder returned '%.400s' instead of 'str'; "
3946                       "use codecs.encode() to encode to arbitrary types",
3947                       encoding,
3948                       Py_TYPE(v)->tp_name);
3949          Py_DECREF(v);
3950          goto onError;
3951      }
3952      return v;
3953  
3954    onError:
3955      return NULL;
3956  }
3957  
3958  static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3959  unicode_decode_locale(const char *str, Py_ssize_t len,
3960                        _Py_error_handler errors, int current_locale)
3961  {
3962      if (str[len] != '\0' || (size_t)len != strlen(str))  {
3963          PyErr_SetString(PyExc_ValueError, "embedded null byte");
3964          return NULL;
3965      }
3966  
3967      wchar_t *wstr;
3968      size_t wlen;
3969      const char *reason;
3970      int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3971                                   current_locale, errors);
3972      if (res != 0) {
3973          if (res == -2) {
3974              PyObject *exc;
3975              exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3976                                          "locale", str, len,
3977                                          (Py_ssize_t)wlen,
3978                                          (Py_ssize_t)(wlen + 1),
3979                                          reason);
3980              if (exc != NULL) {
3981                  PyCodec_StrictErrors(exc);
3982                  Py_DECREF(exc);
3983              }
3984          }
3985          else if (res == -3) {
3986              PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3987          }
3988          else {
3989              PyErr_NoMemory();
3990          }
3991          return NULL;
3992      }
3993  
3994      PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3995      PyMem_RawFree(wstr);
3996      return unicode;
3997  }
3998  
3999  PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)4000  PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4001                                const char *errors)
4002  {
4003      _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4004      return unicode_decode_locale(str, len, error_handler, 1);
4005  }
4006  
4007  PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)4008  PyUnicode_DecodeLocale(const char *str, const char *errors)
4009  {
4010      Py_ssize_t size = (Py_ssize_t)strlen(str);
4011      _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4012      return unicode_decode_locale(str, size, error_handler, 1);
4013  }
4014  
4015  
4016  PyObject*
PyUnicode_DecodeFSDefault(const char * s)4017  PyUnicode_DecodeFSDefault(const char *s) {
4018      Py_ssize_t size = (Py_ssize_t)strlen(s);
4019      return PyUnicode_DecodeFSDefaultAndSize(s, size);
4020  }
4021  
4022  PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)4023  PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4024  {
4025      PyInterpreterState *interp = _PyInterpreterState_GET();
4026      struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4027      if (fs_codec->utf8) {
4028          return unicode_decode_utf8(s, size,
4029                                     fs_codec->error_handler,
4030                                     fs_codec->errors,
4031                                     NULL);
4032      }
4033  #ifndef _Py_FORCE_UTF8_FS_ENCODING
4034      else if (fs_codec->encoding) {
4035          return PyUnicode_Decode(s, size,
4036                                  fs_codec->encoding,
4037                                  fs_codec->errors);
4038      }
4039  #endif
4040      else {
4041          /* Before _PyUnicode_InitEncodings() is called, the Python codec
4042             machinery is not ready and so cannot be used:
4043             use mbstowcs() in this case. */
4044          const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4045          const wchar_t *filesystem_errors = config->filesystem_errors;
4046          assert(filesystem_errors != NULL);
4047          _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4048          assert(errors != _Py_ERROR_UNKNOWN);
4049  #ifdef _Py_FORCE_UTF8_FS_ENCODING
4050          return unicode_decode_utf8(s, size, errors, NULL, NULL);
4051  #else
4052          return unicode_decode_locale(s, size, errors, 0);
4053  #endif
4054      }
4055  }
4056  
4057  
4058  int
PyUnicode_FSConverter(PyObject * arg,void * addr)4059  PyUnicode_FSConverter(PyObject* arg, void* addr)
4060  {
4061      PyObject *path = NULL;
4062      PyObject *output = NULL;
4063      Py_ssize_t size;
4064      const char *data;
4065      if (arg == NULL) {
4066          Py_DECREF(*(PyObject**)addr);
4067          *(PyObject**)addr = NULL;
4068          return 1;
4069      }
4070      path = PyOS_FSPath(arg);
4071      if (path == NULL) {
4072          return 0;
4073      }
4074      if (PyBytes_Check(path)) {
4075          output = path;
4076      }
4077      else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4078          output = PyUnicode_EncodeFSDefault(path);
4079          Py_DECREF(path);
4080          if (!output) {
4081              return 0;
4082          }
4083          assert(PyBytes_Check(output));
4084      }
4085  
4086      size = PyBytes_GET_SIZE(output);
4087      data = PyBytes_AS_STRING(output);
4088      if ((size_t)size != strlen(data)) {
4089          PyErr_SetString(PyExc_ValueError, "embedded null byte");
4090          Py_DECREF(output);
4091          return 0;
4092      }
4093      *(PyObject**)addr = output;
4094      return Py_CLEANUP_SUPPORTED;
4095  }
4096  
4097  
4098  int
PyUnicode_FSDecoder(PyObject * arg,void * addr)4099  PyUnicode_FSDecoder(PyObject* arg, void* addr)
4100  {
4101      int is_buffer = 0;
4102      PyObject *path = NULL;
4103      PyObject *output = NULL;
4104      if (arg == NULL) {
4105          Py_DECREF(*(PyObject**)addr);
4106          *(PyObject**)addr = NULL;
4107          return 1;
4108      }
4109  
4110      is_buffer = PyObject_CheckBuffer(arg);
4111      if (!is_buffer) {
4112          path = PyOS_FSPath(arg);
4113          if (path == NULL) {
4114              return 0;
4115          }
4116      }
4117      else {
4118          path = arg;
4119          Py_INCREF(arg);
4120      }
4121  
4122      if (PyUnicode_Check(path)) {
4123          output = path;
4124      }
4125      else if (PyBytes_Check(path) || is_buffer) {
4126          PyObject *path_bytes = NULL;
4127  
4128          if (!PyBytes_Check(path) &&
4129              PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4130              "path should be string, bytes, or os.PathLike, not %.200s",
4131              Py_TYPE(arg)->tp_name)) {
4132                  Py_DECREF(path);
4133              return 0;
4134          }
4135          path_bytes = PyBytes_FromObject(path);
4136          Py_DECREF(path);
4137          if (!path_bytes) {
4138              return 0;
4139          }
4140          output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4141                                                    PyBytes_GET_SIZE(path_bytes));
4142          Py_DECREF(path_bytes);
4143          if (!output) {
4144              return 0;
4145          }
4146      }
4147      else {
4148          PyErr_Format(PyExc_TypeError,
4149                       "path should be string, bytes, or os.PathLike, not %.200s",
4150                       Py_TYPE(arg)->tp_name);
4151          Py_DECREF(path);
4152          return 0;
4153      }
4154      if (PyUnicode_READY(output) == -1) {
4155          Py_DECREF(output);
4156          return 0;
4157      }
4158      if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4159                   PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4160          PyErr_SetString(PyExc_ValueError, "embedded null character");
4161          Py_DECREF(output);
4162          return 0;
4163      }
4164      *(PyObject**)addr = output;
4165      return Py_CLEANUP_SUPPORTED;
4166  }
4167  
4168  
4169  static int unicode_fill_utf8(PyObject *unicode);
4170  
4171  const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4172  PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4173  {
4174      if (!PyUnicode_Check(unicode)) {
4175          PyErr_BadArgument();
4176          return NULL;
4177      }
4178      if (PyUnicode_READY(unicode) == -1)
4179          return NULL;
4180  
4181      if (PyUnicode_UTF8(unicode) == NULL) {
4182          if (unicode_fill_utf8(unicode) == -1) {
4183              return NULL;
4184          }
4185      }
4186  
4187      if (psize)
4188          *psize = PyUnicode_UTF8_LENGTH(unicode);
4189      return PyUnicode_UTF8(unicode);
4190  }
4191  
4192  const char *
PyUnicode_AsUTF8(PyObject * unicode)4193  PyUnicode_AsUTF8(PyObject *unicode)
4194  {
4195      return PyUnicode_AsUTF8AndSize(unicode, NULL);
4196  }
4197  
4198  Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4199  PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4200  {
4201      if (!PyUnicode_Check(unicode)) {
4202          PyErr_BadArgument();
4203          return NULL;
4204      }
4205      Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4206      if (w == NULL) {
4207          /* Non-ASCII compact unicode object */
4208          assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4209          assert(PyUnicode_IS_READY(unicode));
4210  
4211          Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4212          if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4213              PyErr_NoMemory();
4214              return NULL;
4215          }
4216          w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
4217          if (w == NULL) {
4218              PyErr_NoMemory();
4219              return NULL;
4220          }
4221          unicode_copy_as_widechar(unicode, w, wlen + 1);
4222          _PyUnicode_WSTR(unicode) = w;
4223          if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4224              _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4225          }
4226      }
4227      if (size != NULL)
4228          *size = PyUnicode_WSTR_LENGTH(unicode);
4229      return w;
4230  }
4231  
4232  /* Deprecated APIs */
4233  
4234  _Py_COMP_DIAG_PUSH
4235  _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4236  
4237  Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4238  PyUnicode_AsUnicode(PyObject *unicode)
4239  {
4240      return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4241  }
4242  
4243  const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4244  _PyUnicode_AsUnicode(PyObject *unicode)
4245  {
4246      Py_ssize_t size;
4247      const Py_UNICODE *wstr;
4248  
4249      wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4250      if (wstr && wcslen(wstr) != (size_t)size) {
4251          PyErr_SetString(PyExc_ValueError, "embedded null character");
4252          return NULL;
4253      }
4254      return wstr;
4255  }
4256  
4257  
4258  Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4259  PyUnicode_GetSize(PyObject *unicode)
4260  {
4261      if (!PyUnicode_Check(unicode)) {
4262          PyErr_BadArgument();
4263          goto onError;
4264      }
4265      if (_PyUnicode_WSTR(unicode) == NULL) {
4266          if (PyUnicode_AsUnicode(unicode) == NULL)
4267              goto onError;
4268      }
4269      return PyUnicode_WSTR_LENGTH(unicode);
4270  
4271    onError:
4272      return -1;
4273  }
4274  
4275  _Py_COMP_DIAG_POP
4276  
4277  Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4278  PyUnicode_GetLength(PyObject *unicode)
4279  {
4280      if (!PyUnicode_Check(unicode)) {
4281          PyErr_BadArgument();
4282          return -1;
4283      }
4284      if (PyUnicode_READY(unicode) == -1)
4285          return -1;
4286      return PyUnicode_GET_LENGTH(unicode);
4287  }
4288  
4289  Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4290  PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4291  {
4292      const void *data;
4293      int kind;
4294  
4295      if (!PyUnicode_Check(unicode)) {
4296          PyErr_BadArgument();
4297          return (Py_UCS4)-1;
4298      }
4299      if (PyUnicode_READY(unicode) == -1) {
4300          return (Py_UCS4)-1;
4301      }
4302      if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4303          PyErr_SetString(PyExc_IndexError, "string index out of range");
4304          return (Py_UCS4)-1;
4305      }
4306      data = PyUnicode_DATA(unicode);
4307      kind = PyUnicode_KIND(unicode);
4308      return PyUnicode_READ(kind, data, index);
4309  }
4310  
4311  int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4312  PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4313  {
4314      if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4315          PyErr_BadArgument();
4316          return -1;
4317      }
4318      assert(PyUnicode_IS_READY(unicode));
4319      if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4320          PyErr_SetString(PyExc_IndexError, "string index out of range");
4321          return -1;
4322      }
4323      if (unicode_check_modifiable(unicode))
4324          return -1;
4325      if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4326          PyErr_SetString(PyExc_ValueError, "character out of range");
4327          return -1;
4328      }
4329      PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4330                      index, ch);
4331      return 0;
4332  }
4333  
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4339  
4340  /* create or adjust a UnicodeDecodeError */
4341  static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4342  make_decode_exception(PyObject **exceptionObject,
4343                        const char *encoding,
4344                        const char *input, Py_ssize_t length,
4345                        Py_ssize_t startpos, Py_ssize_t endpos,
4346                        const char *reason)
4347  {
4348      if (*exceptionObject == NULL) {
4349          *exceptionObject = PyUnicodeDecodeError_Create(
4350              encoding, input, length, startpos, endpos, reason);
4351      }
4352      else {
4353          if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4354              goto onError;
4355          if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4356              goto onError;
4357          if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4358              goto onError;
4359      }
4360      return;
4361  
4362  onError:
4363      Py_CLEAR(*exceptionObject);
4364  }
4365  
4366  #ifdef MS_WINDOWS
4367  static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4368  widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4369  {
4370      if (newsize > *size) {
4371          wchar_t *newbuf = *buf;
4372          if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4373              PyErr_NoMemory();
4374              return -1;
4375          }
4376          *buf = newbuf;
4377      }
4378      *size = newsize;
4379      return 0;
4380  }
4381  
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   Parameters (all pointer arguments are in/out state owned by the caller):
     errors, errorHandler   name of the error handler; *errorHandler caches
                            the callable found by PyCodec_LookupError() so
                            the lookup happens only once per decode call.
     encoding, reason       stored in the UnicodeDecodeError.
     input, inend           start/end of the byte buffer being decoded;
                            reloaded from the exception after the callback.
     startinpos, endinpos   offsets of the undecodable range; *endinpos is
                            set to the resume position on success.
     exceptionObject        cached UnicodeDecodeError (created on demand).
     inptr                  set to *input + resume position on success.
     buf, bufsize, outpos   wchar_t output buffer, its size and the current
                            write position; the buffer is grown as needed.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple format: "U" (str) and "n" (Py_ssize_t); the text
       after ';' is the error message.  &argparse[3] skips the "Un;" prefix
       so the same message can be reused for the explicit tuple check. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    /* Look the handler up lazily and cache it for subsequent errors. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    /* Creates the exception on first use, otherwise updates it in place;
       leaves *exceptionObject NULL on failure. */
    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative resume position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* Number of wchar_t units needed for the replacement string. */
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    repwlen = PyUnicode_GetSize(repunicode);
    if (repwlen < 0)
        goto onError;
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    /* With a NULL buffer PyUnicode_AsWideChar() reports the required size
       including the trailing null, hence the decrement below. */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    repwlen--;
#endif /* USE_UNICODE_WCHAR_CACHE */
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    /* Each addition is checked first so requiredsize cannot overflow. */
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    /* NOTE(review): the return value of this copy is not checked. */
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4500  #endif   /* MS_WINDOWS */
4501  
4502  static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4503  unicode_decode_call_errorhandler_writer(
4504      const char *errors, PyObject **errorHandler,
4505      const char *encoding, const char *reason,
4506      const char **input, const char **inend, Py_ssize_t *startinpos,
4507      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4508      _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4509  {
4510      static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4511  
4512      PyObject *restuple = NULL;
4513      PyObject *repunicode = NULL;
4514      Py_ssize_t insize;
4515      Py_ssize_t newpos;
4516      Py_ssize_t replen;
4517      Py_ssize_t remain;
4518      PyObject *inputobj = NULL;
4519      int need_to_grow = 0;
4520      const char *new_inptr;
4521  
4522      if (*errorHandler == NULL) {
4523          *errorHandler = PyCodec_LookupError(errors);
4524          if (*errorHandler == NULL)
4525              goto onError;
4526      }
4527  
4528      make_decode_exception(exceptionObject,
4529          encoding,
4530          *input, *inend - *input,
4531          *startinpos, *endinpos,
4532          reason);
4533      if (*exceptionObject == NULL)
4534          goto onError;
4535  
4536      restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4537      if (restuple == NULL)
4538          goto onError;
4539      if (!PyTuple_Check(restuple)) {
4540          PyErr_SetString(PyExc_TypeError, &argparse[3]);
4541          goto onError;
4542      }
4543      if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4544          goto onError;
4545  
4546      /* Copy back the bytes variables, which might have been modified by the
4547         callback */
4548      inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4549      if (!inputobj)
4550          goto onError;
4551      remain = *inend - *input - *endinpos;
4552      *input = PyBytes_AS_STRING(inputobj);
4553      insize = PyBytes_GET_SIZE(inputobj);
4554      *inend = *input + insize;
4555      /* we can DECREF safely, as the exception has another reference,
4556         so the object won't go away. */
4557      Py_DECREF(inputobj);
4558  
4559      if (newpos<0)
4560          newpos = insize+newpos;
4561      if (newpos<0 || newpos>insize) {
4562          PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4563          goto onError;
4564      }
4565  
4566      replen = PyUnicode_GET_LENGTH(repunicode);
4567      if (replen > 1) {
4568          writer->min_length += replen - 1;
4569          need_to_grow = 1;
4570      }
4571      new_inptr = *input + newpos;
4572      if (*inend - new_inptr > remain) {
4573          /* We don't know the decoding algorithm here so we make the worst
4574             assumption that one byte decodes to one unicode character.
4575             If unfortunately one byte could decode to more unicode characters,
4576             the decoder may write out-of-bound then.  Is it possible for the
4577             algorithms using this function? */
4578          writer->min_length += *inend - new_inptr - remain;
4579          need_to_grow = 1;
4580      }
4581      if (need_to_grow) {
4582          writer->overallocate = 1;
4583          if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4584                              PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4585              goto onError;
4586      }
4587      if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4588          goto onError;
4589  
4590      *endinpos = newpos;
4591      *inptr = new_inptr;
4592  
4593      /* we made it! */
4594      Py_DECREF(restuple);
4595      return 0;
4596  
4597    onError:
4598      Py_XDECREF(restuple);
4599      return -1;
4600  }
4601  
4602  /* --- UTF-7 Codec -------------------------------------------------------- */
4603  
4604  /* See RFC2152 for details.  We encode conservatively and decode liberally. */
4605  
4606  /* Three simple macros defining base-64. */
4607  
/* True if c is in the base-64 alphabet: ASCII letters and digits plus
   '+' and '/'.  The argument is evaluated more than once. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))
4615  
/* Map a base-64 character to its 6-bit value ('A'..'Z' -> 0..25,
   'a'..'z' -> 26..51, '0'..'9' -> 52..61, '+' -> 62, '/' -> 63).
   The caller must already know that c is a base-64 character: anything
   else also yields 63.  The argument is evaluated more than once. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)
4623  
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
4628  
/* DECODE_DIRECT: true if this byte, met outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4636  
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0-127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup and is therefore const-qualified.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4670  
/* ENCODE_DIRECT: true if character c may be written to the UTF-7 output
 * as itself.  Only non-NUL ASCII can qualify.  Set D characters always
 * do; Set O characters only when directO is true and whitespace only
 * when directWS is true.  RFC2152 leaves those two choices to the
 * application, which is why they are parameters here.  */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directO && (utf7_category[(c)] == 1)) ||         \
      (directWS && (utf7_category[(c)] == 2))))
4682  
4683  PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4684  PyUnicode_DecodeUTF7(const char *s,
4685                       Py_ssize_t size,
4686                       const char *errors)
4687  {
4688      return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4689  }
4690  
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size   the UTF-7 encoded bytes.
 * errors    name of the error handler, passed on to
 *           unicode_decode_call_errorhandler_writer().
 * consumed  if NULL, an inconsistent shift sequence left open at the
 *           end of the input is reported as an error.  If non-NULL,
 *           an open shift sequence is not an error: *consumed receives
 *           the offset of the '+' that opened it and the output is cut
 *           back to that point; otherwise *consumed receives the number
 *           of bytes read.
 *
 * Returns a new str object, or NULL with an exception set. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                    /* currently inside a '+...' section? */
    Py_ssize_t shiftOutStart;           /* writer.pos when the section began */
    unsigned int base64bits = 0;        /* number of valid bits in base64buffer */
    unsigned long base64buffer = 0;     /* accumulated base-64 bits */
    Py_UCS4 surrogate = 0;              /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Also entered via goto from the end-of-input error path below,
           when the error handler moved the read position back. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Unpaired high surrogate: emit it as is and
                               handle outCh normally below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is only emitted when the
                   terminating byte decodes as itself; otherwise it is
                   dropped by the reset below. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts: used for error reporting
               and as the back-off position in stateful mode. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Common error exit of the loop body: the handler may replace the
           input buffer and reposition s, so starts, e and s are passed by
           address. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may ask to resume before the end of the input. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Report only the input before the open shift sequence. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Characters were already written inside the open sequence
                   and the writer holds non-ASCII data: build the result
                   from the first shiftOutStart characters instead of
                   truncating the writer.
                   NOTE(review): presumably so the result's maximum
                   character reflects only the retained text - confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4894  
4895  
/* Encode the str object `str` as UTF-7 and return a new bytes object,
   or NULL with an exception set.

   base64SetO        if true, "Set O" characters are base64 encoded
                     instead of being written as themselves.
   base64WhiteSpace  if true, whitespace is base64 encoded instead of
                     being written as itself.
   errors            not used by this function body. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;                    /* inside a '+...' base64 section? */
    Py_ssize_t i;
    unsigned int base64bits = 0;        /* bits in base64buffer not yet output */
    unsigned long base64buffer = 0;     /* bit accumulator; only the low
                                           base64bits bits are meaningful */
    char * out;
    const char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    /* The output buffer is sized at 8 bytes per input character; the
       check keeps len * 8 from overflowing. */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    /* Left-align the leftover bits, padding with zeros. */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* A literal '+' is written as "+-". */
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* Open a base64 section and encode this character in it. */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
        /* Append ch to the base64 stream as one 16-bit unit, or as two
           (a surrogate pair) when it lies above the BMP. */
encode_char:
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        /* Emit every complete 6-bit group; up to 5 bits stay buffered. */
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* Flush leftover bits (zero padded) and close an open section. */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    /* Trim the over-allocated buffer to the bytes actually written. */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
4997  
/* The UTF-7 helper macros are not needed past this point. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
5003  
5004  /* --- UTF-8 Codec -------------------------------------------------------- */
5005  
5006  PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)5007  PyUnicode_DecodeUTF8(const char *s,
5008                       Py_ssize_t size,
5009                       const char *errors)
5010  {
5011      return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5012  }
5013  
5014  #include "stringlib/asciilib.h"
5015  #include "stringlib/codecs.h"
5016  #include "stringlib/undef.h"
5017  
5018  #include "stringlib/ucs1lib.h"
5019  #include "stringlib/codecs.h"
5020  #include "stringlib/undef.h"
5021  
5022  #include "stringlib/ucs2lib.h"
5023  #include "stringlib/codecs.h"
5024  #include "stringlib/undef.h"
5025  
5026  #include "stringlib/ucs4lib.h"
5027  #include "stringlib/codecs.h"
5028  #include "stringlib/undef.h"
5029  
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char.  It has the top bit (0x80) of every
   byte set, so `value & ASCII_CHAR_MASK` is non-zero exactly when at
   least one byte of the word is >= 0x80.  Only 4- and 8-byte size_t
   are supported; anything else is a compile-time error. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
#else
# error C 'size_t' size should be either 4 or 8!
#endif
5039  
/* Copy the leading run of ASCII bytes from [start, end) into dest.

   Scanning stops at the first byte with the high bit (0x80) set or at
   `end`.  Returns the number of bytes copied, which is also the offset
   of the first non-ASCII byte (or end - start if there is none).
   dest must have room for end - start bytes. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;

#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
    /* Copying path: used only when the source is size_t-aligned; dest is
       asserted to be aligned as well, so whole words can be stored. */
    assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* Copy one size_t at a time while every byte of it is ASCII. */
        while (_p + SIZEOF_SIZE_T <= end) {
            size_t value = *(const size_t *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((size_t *)q) = value;
            _p += SIZEOF_SIZE_T;
            q += SIZEOF_SIZE_T;
        }
        p = _p;
        /* Finish byte by byte: the tail shorter than a word, or the
           ASCII bytes in front of the first non-ASCII byte. */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
    /* Scanning path: first find the end of the ASCII run, then copy it
       with a single memcpy. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            /* Help allocation */
            const char *_p = p;
            /* Skip whole words that contain only ASCII bytes. */
            while (_p + SIZEOF_SIZE_T <= end) {
                size_t value = *(const size_t *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_SIZE_T;
            }
            p = _p;
            /* Must not dereference p below once the end is reached. */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
5093  
5094  static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)5095  unicode_decode_utf8(const char *s, Py_ssize_t size,
5096                      _Py_error_handler error_handler, const char *errors,
5097                      Py_ssize_t *consumed)
5098  {
5099      if (size == 0) {
5100          if (consumed)
5101              *consumed = 0;
5102          _Py_RETURN_UNICODE_EMPTY();
5103      }
5104  
5105      /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5106      if (size == 1 && (unsigned char)s[0] < 128) {
5107          if (consumed) {
5108              *consumed = 1;
5109          }
5110          return get_latin1_char((unsigned char)s[0]);
5111      }
5112  
5113      const char *starts = s;
5114      const char *end = s + size;
5115  
5116      // fast path: try ASCII string.
5117      PyObject *u = PyUnicode_New(size, 127);
5118      if (u == NULL) {
5119          return NULL;
5120      }
5121      s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5122      if (s == end) {
5123          return u;
5124      }
5125  
5126      // Use _PyUnicodeWriter after fast path is failed.
5127      _PyUnicodeWriter writer;
5128      _PyUnicodeWriter_InitWithBuffer(&writer, u);
5129      writer.pos = s - starts;
5130  
5131      Py_ssize_t startinpos, endinpos;
5132      const char *errmsg = "";
5133      PyObject *error_handler_obj = NULL;
5134      PyObject *exc = NULL;
5135  
5136      while (s < end) {
5137          Py_UCS4 ch;
5138          int kind = writer.kind;
5139  
5140          if (kind == PyUnicode_1BYTE_KIND) {
5141              if (PyUnicode_IS_ASCII(writer.buffer))
5142                  ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5143              else
5144                  ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5145          } else if (kind == PyUnicode_2BYTE_KIND) {
5146              ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5147          } else {
5148              assert(kind == PyUnicode_4BYTE_KIND);
5149              ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5150          }
5151  
5152          switch (ch) {
5153          case 0:
5154              if (s == end || consumed)
5155                  goto End;
5156              errmsg = "unexpected end of data";
5157              startinpos = s - starts;
5158              endinpos = end - starts;
5159              break;
5160          case 1:
5161              errmsg = "invalid start byte";
5162              startinpos = s - starts;
5163              endinpos = startinpos + 1;
5164              break;
5165          case 2:
5166              if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5167                  && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5168              {
5169                  /* Truncated surrogate code in range D800-DFFF */
5170                  goto End;
5171              }
5172              /* fall through */
5173          case 3:
5174          case 4:
5175              errmsg = "invalid continuation byte";
5176              startinpos = s - starts;
5177              endinpos = startinpos + ch - 1;
5178              break;
5179          default:
5180              if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5181                  goto onError;
5182              continue;
5183          }
5184  
5185          if (error_handler == _Py_ERROR_UNKNOWN)
5186              error_handler = _Py_GetErrorHandler(errors);
5187  
5188          switch (error_handler) {
5189          case _Py_ERROR_IGNORE:
5190              s += (endinpos - startinpos);
5191              break;
5192  
5193          case _Py_ERROR_REPLACE:
5194              if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5195                  goto onError;
5196              s += (endinpos - startinpos);
5197              break;
5198  
5199          case _Py_ERROR_SURROGATEESCAPE:
5200          {
5201              Py_ssize_t i;
5202  
5203              if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5204                  goto onError;
5205              for (i=startinpos; i<endinpos; i++) {
5206                  ch = (Py_UCS4)(unsigned char)(starts[i]);
5207                  PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5208                                  ch + 0xdc00);
5209                  writer.pos++;
5210              }
5211              s += (endinpos - startinpos);
5212              break;
5213          }
5214  
5215          default:
5216              if (unicode_decode_call_errorhandler_writer(
5217                      errors, &error_handler_obj,
5218                      "utf-8", errmsg,
5219                      &starts, &end, &startinpos, &endinpos, &exc, &s,
5220                      &writer))
5221                  goto onError;
5222          }
5223      }
5224  
5225  End:
5226      if (consumed)
5227          *consumed = s - starts;
5228  
5229      Py_XDECREF(error_handler_obj);
5230      Py_XDECREF(exc);
5231      return _PyUnicodeWriter_Finish(&writer);
5232  
5233  onError:
5234      Py_XDECREF(error_handler_obj);
5235      Py_XDECREF(exc);
5236      _PyUnicodeWriter_Dealloc(&writer);
5237      return NULL;
5238  }
5239  
5240  
/* Decode `size` bytes of UTF-8 starting at `s` into a str object.

   The error handler is passed down as _Py_ERROR_UNKNOWN, so
   unicode_decode_utf8() only resolves it from the `errors` name once a
   decoding error is actually hit.

   If `consumed` is not NULL, an incomplete multi-byte sequence at the end of
   the input is not an error: decoding stops in front of it and the number of
   bytes decoded is reported through *consumed.
   NOTE(review): in the visible part of unicode_decode_utf8() the pure-ASCII
   fast path returns without writing *consumed -- confirm whether callers
   must pre-initialise it.

   Return NULL on failure. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
}
5249  
5250  
5251  /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5252     non-zero, use strict error handler otherwise.
5253  
5254     On success, write a pointer to a newly allocated wide character string into
5255     *wstr (use PyMem_RawFree() to free the memory) and write the output length
5256     (in number of wchar_t units) into *wlen (if wlen is set).
5257  
5258     On memory allocation failure, return -1.
5259  
5260     On decoding error (if surrogateescape is zero), return -2. If wlen is
5261     non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5262     is not NULL, write the decoding error message into *reason. */
5263  int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5264  _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5265                   const char **reason, _Py_error_handler errors)
5266  {
5267      const char *orig_s = s;
5268      const char *e;
5269      wchar_t *unicode;
5270      Py_ssize_t outpos;
5271  
5272      int surrogateescape = 0;
5273      int surrogatepass = 0;
5274      switch (errors)
5275      {
5276      case _Py_ERROR_STRICT:
5277          break;
5278      case _Py_ERROR_SURROGATEESCAPE:
5279          surrogateescape = 1;
5280          break;
5281      case _Py_ERROR_SURROGATEPASS:
5282          surrogatepass = 1;
5283          break;
5284      default:
5285          return -3;
5286      }
5287  
5288      /* Note: size will always be longer than the resulting Unicode
5289         character count */
5290      if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5291          return -1;
5292      }
5293  
5294      unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5295      if (!unicode) {
5296          return -1;
5297      }
5298  
5299      /* Unpack UTF-8 encoded data */
5300      e = s + size;
5301      outpos = 0;
5302      while (s < e) {
5303          Py_UCS4 ch;
5304  #if SIZEOF_WCHAR_T == 4
5305          ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5306  #else
5307          ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5308  #endif
5309          if (ch > 0xFF) {
5310  #if SIZEOF_WCHAR_T == 4
5311              Py_UNREACHABLE();
5312  #else
5313              assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5314              /* write a surrogate pair */
5315              unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5316              unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5317  #endif
5318          }
5319          else {
5320              if (!ch && s == e) {
5321                  break;
5322              }
5323  
5324              if (surrogateescape) {
5325                  unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5326              }
5327              else {
5328                  /* Is it a valid three-byte code? */
5329                  if (surrogatepass
5330                      && (e - s) >= 3
5331                      && (s[0] & 0xf0) == 0xe0
5332                      && (s[1] & 0xc0) == 0x80
5333                      && (s[2] & 0xc0) == 0x80)
5334                  {
5335                      ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5336                      s += 3;
5337                      unicode[outpos++] = ch;
5338                  }
5339                  else {
5340                      PyMem_RawFree(unicode );
5341                      if (reason != NULL) {
5342                          switch (ch) {
5343                          case 0:
5344                              *reason = "unexpected end of data";
5345                              break;
5346                          case 1:
5347                              *reason = "invalid start byte";
5348                              break;
5349                          /* 2, 3, 4 */
5350                          default:
5351                              *reason = "invalid continuation byte";
5352                              break;
5353                          }
5354                      }
5355                      if (wlen != NULL) {
5356                          *wlen = s - orig_s;
5357                      }
5358                      return -2;
5359                  }
5360              }
5361          }
5362      }
5363      unicode[outpos] = L'\0';
5364      if (wlen) {
5365          *wlen = outpos;
5366      }
5367      *wstr = unicode;
5368      return 0;
5369  }
5370  
5371  
5372  wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5373  _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5374                                 size_t *wlen)
5375  {
5376      wchar_t *wstr;
5377      int res = _Py_DecodeUTF8Ex(arg, arglen,
5378                                 &wstr, wlen,
5379                                 NULL, _Py_ERROR_SURROGATEESCAPE);
5380      if (res != 0) {
5381          /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5382          assert(res != -3);
5383          if (wlen) {
5384              *wlen = (size_t)res;
5385          }
5386          return NULL;
5387      }
5388      return wstr;
5389  }
5390  
5391  
5392  /* UTF-8 encoder using the surrogateescape error handler .
5393  
5394     On success, return 0 and write the newly allocated character string (use
5395     PyMem_Free() to free the memory) into *str.
5396  
5397     On encoding failure, return -2 and write the position of the invalid
5398     surrogate character into *error_pos (if error_pos is set) and the decoding
5399     error message into *reason (if reason is set).
5400  
5401     On memory allocation failure, return -1. */
5402  int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5403  _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5404                   const char **reason, int raw_malloc, _Py_error_handler errors)
5405  {
5406      const Py_ssize_t max_char_size = 4;
5407      Py_ssize_t len = wcslen(text);
5408  
5409      assert(len >= 0);
5410  
5411      int surrogateescape = 0;
5412      int surrogatepass = 0;
5413      switch (errors)
5414      {
5415      case _Py_ERROR_STRICT:
5416          break;
5417      case _Py_ERROR_SURROGATEESCAPE:
5418          surrogateescape = 1;
5419          break;
5420      case _Py_ERROR_SURROGATEPASS:
5421          surrogatepass = 1;
5422          break;
5423      default:
5424          return -3;
5425      }
5426  
5427      if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5428          return -1;
5429      }
5430      char *bytes;
5431      if (raw_malloc) {
5432          bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5433      }
5434      else {
5435          bytes = PyMem_Malloc((len + 1) * max_char_size);
5436      }
5437      if (bytes == NULL) {
5438          return -1;
5439      }
5440  
5441      char *p = bytes;
5442      Py_ssize_t i;
5443      for (i = 0; i < len; ) {
5444          Py_ssize_t ch_pos = i;
5445          Py_UCS4 ch = text[i];
5446          i++;
5447  #if Py_UNICODE_SIZE == 2
5448          if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5449              && i < len
5450              && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5451          {
5452              ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5453              i++;
5454          }
5455  #endif
5456  
5457          if (ch < 0x80) {
5458              /* Encode ASCII */
5459              *p++ = (char) ch;
5460  
5461          }
5462          else if (ch < 0x0800) {
5463              /* Encode Latin-1 */
5464              *p++ = (char)(0xc0 | (ch >> 6));
5465              *p++ = (char)(0x80 | (ch & 0x3f));
5466          }
5467          else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5468              /* surrogateescape error handler */
5469              if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5470                  if (error_pos != NULL) {
5471                      *error_pos = (size_t)ch_pos;
5472                  }
5473                  if (reason != NULL) {
5474                      *reason = "encoding error";
5475                  }
5476                  if (raw_malloc) {
5477                      PyMem_RawFree(bytes);
5478                  }
5479                  else {
5480                      PyMem_Free(bytes);
5481                  }
5482                  return -2;
5483              }
5484              *p++ = (char)(ch & 0xff);
5485          }
5486          else if (ch < 0x10000) {
5487              *p++ = (char)(0xe0 | (ch >> 12));
5488              *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5489              *p++ = (char)(0x80 | (ch & 0x3f));
5490          }
5491          else {  /* ch >= 0x10000 */
5492              assert(ch <= MAX_UNICODE);
5493              /* Encode UCS4 Unicode ordinals */
5494              *p++ = (char)(0xf0 | (ch >> 18));
5495              *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5496              *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5497              *p++ = (char)(0x80 | (ch & 0x3f));
5498          }
5499      }
5500      *p++ = '\0';
5501  
5502      size_t final_size = (p - bytes);
5503      char *bytes2;
5504      if (raw_malloc) {
5505          bytes2 = PyMem_RawRealloc(bytes, final_size);
5506      }
5507      else {
5508          bytes2 = PyMem_Realloc(bytes, final_size);
5509      }
5510      if (bytes2 == NULL) {
5511          if (error_pos != NULL) {
5512              *error_pos = (size_t)-1;
5513          }
5514          if (raw_malloc) {
5515              PyMem_RawFree(bytes);
5516          }
5517          else {
5518              PyMem_Free(bytes);
5519          }
5520          return -1;
5521      }
5522      *str = bytes2;
5523      return 0;
5524  }
5525  
5526  
5527  /* Primary internal function which creates utf8 encoded bytes objects.
5528  
5529     Allocation strategy:  if the string is short, convert into a stack buffer
5530     and allocate exactly as much space needed at the end.  Else allocate the
5531     maximum possible needed (4 result bytes per Unicode character), and return
5532     the excess memory at the end.
5533  */
5534  static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5535  unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5536                      const char *errors)
5537  {
5538      if (!PyUnicode_Check(unicode)) {
5539          PyErr_BadArgument();
5540          return NULL;
5541      }
5542  
5543      if (PyUnicode_READY(unicode) == -1)
5544          return NULL;
5545  
5546      if (PyUnicode_UTF8(unicode))
5547          return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5548                                           PyUnicode_UTF8_LENGTH(unicode));
5549  
5550      enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5551      const void *data = PyUnicode_DATA(unicode);
5552      Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5553  
5554      _PyBytesWriter writer;
5555      char *end;
5556  
5557      switch (kind) {
5558      default:
5559          Py_UNREACHABLE();
5560      case PyUnicode_1BYTE_KIND:
5561          /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5562          assert(!PyUnicode_IS_ASCII(unicode));
5563          end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5564          break;
5565      case PyUnicode_2BYTE_KIND:
5566          end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5567          break;
5568      case PyUnicode_4BYTE_KIND:
5569          end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5570          break;
5571      }
5572  
5573      if (end == NULL) {
5574          _PyBytesWriter_Dealloc(&writer);
5575          return NULL;
5576      }
5577      return _PyBytesWriter_Finish(&writer, end);
5578  }
5579  
5580  static int
unicode_fill_utf8(PyObject * unicode)5581  unicode_fill_utf8(PyObject *unicode)
5582  {
5583      /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5584      assert(!PyUnicode_IS_ASCII(unicode));
5585  
5586      enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5587      const void *data = PyUnicode_DATA(unicode);
5588      Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5589  
5590      _PyBytesWriter writer;
5591      char *end;
5592  
5593      switch (kind) {
5594      default:
5595          Py_UNREACHABLE();
5596      case PyUnicode_1BYTE_KIND:
5597          end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5598                                     _Py_ERROR_STRICT, NULL);
5599          break;
5600      case PyUnicode_2BYTE_KIND:
5601          end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5602                                     _Py_ERROR_STRICT, NULL);
5603          break;
5604      case PyUnicode_4BYTE_KIND:
5605          end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5606                                     _Py_ERROR_STRICT, NULL);
5607          break;
5608      }
5609      if (end == NULL) {
5610          _PyBytesWriter_Dealloc(&writer);
5611          return -1;
5612      }
5613  
5614      const char *start = writer.use_small_buffer ? writer.small_buffer :
5615                      PyBytes_AS_STRING(writer.buffer);
5616      Py_ssize_t len = end - start;
5617  
5618      char *cache = PyObject_Malloc(len + 1);
5619      if (cache == NULL) {
5620          _PyBytesWriter_Dealloc(&writer);
5621          PyErr_NoMemory();
5622          return -1;
5623      }
5624      _PyUnicode_UTF8(unicode) = cache;
5625      _PyUnicode_UTF8_LENGTH(unicode) = len;
5626      memcpy(cache, start, len);
5627      cache[len] = '\0';
5628      _PyBytesWriter_Dealloc(&writer);
5629      return 0;
5630  }
5631  
/* Encode 'unicode' to a UTF-8 bytes object using the error handler named by
   'errors'.

   The handler is forwarded as _Py_ERROR_UNKNOWN together with its name.
   NOTE(review): resolving the name presumably happens inside the stringlib
   encoders called by unicode_encode_utf8() -- not visible here.

   Return NULL on failure. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
}
5637  
5638  
/* Encode 'unicode' to a UTF-8 bytes object by calling
   _PyUnicode_AsUTF8String() with errors == NULL.
   NOTE(review): a NULL 'errors' presumably selects the default (strict)
   error handling -- confirm against the encoder. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    return _PyUnicode_AsUTF8String(unicode, NULL);
}
5644  
5645  /* --- UTF-32 Codec ------------------------------------------------------- */
5646  
/* Decode 'size' bytes of UTF-32 data at 's'.

   Same as PyUnicode_DecodeUTF32Stateful() with consumed == NULL: the whole
   buffer must decode, so trailing bytes that do not form a complete 4-byte
   unit are reported to the error handler as "truncated data". */
PyObject *
PyUnicode_DecodeUTF32(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}
5655  
/* Decode 'size' bytes of UTF-32 data at 's' into a str object.

   Byte order: if 'byteorder' is not NULL, *byteorder is the requested order
   (negative: little-endian, positive: big-endian, 0: detect).  With order 0
   and at least four input bytes, a leading BOM is looked for; if found it is
   skipped and selects the order.  In that case (BOM found or not) the
   resulting value is written back through 'byteorder'.  If the order is
   still 0 afterwards, the platform's native byte order is used.

   Errors: surrogate code points, values >= 0x110000 and (when 'consumed' is
   NULL) trailing bytes that do not form a full 4-byte unit are passed to the
   error handler named by 'errors'.

   If 'consumed' is not NULL, trailing incomplete bytes are not an error:
   decoding stops in front of them and the number of input bytes processed is
   stored in *consumed.

   Return NULL on failure. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* The first four bytes are always assembled as little-endian, so a
           big-endian BOM shows up byte-swapped as 0xFFFE0000. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (left) to decode: empty input or a lone BOM. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means "native": little-endian unless
       WORDS_BIGENDIAN is defined. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* One output character per (possibly partial) 4-byte input unit. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        /* Largest code point the writer's current buffer can store. */
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast loop: write code points straight into the buffer until
               one does not fit (ch > maxch), is a surrogate, or fewer than
               four bytes remain.  On an early break 'q' still points at the
               offending unit and 'ch' holds its value. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    /* For a 1-byte buffer the 'ch > maxch' test above
                       already rejects surrogates. */
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Classify why the fast loop stopped (or never ran). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* The last unit read (if any) was written; fewer than four
               bytes are left.  Done if the input is exhausted or the
               caller accepts partial input. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch does not fit in the current buffer: if it is a valid code
               point, write it through the writer API and resume the fast
               loop. */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5800  
/* Encode the str object 'str' to a UTF-32 bytes object.

   'byteorder' selects the output: -1 little-endian, 1 big-endian, and 0
   native byte order with a BOM (U+FEFF) written first.

   Code points the stringlib encoders stop at are passed to the error handler
   named by 'errors' with the message "surrogates not allowed".  The
   replacement returned by the handler must be either a bytes object whose
   length is a multiple of 4 (copied verbatim) or an ASCII-only str (encoded
   with the selected byte order); anything else raises the encode exception.

   Return NULL on failure. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    uint32_t *out;
    /* byteorder == 0 counts as native on either kind of platform. */
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Four bytes per code point, plus one extra unit for the BOM when
       byteorder == 0; refuse lengths for which that would overflow. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL)
        return NULL;

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0)
        *out++ = 0xFEFF;
    if (len == 0)
        goto done;

    if (byteorder == -1)
        encoding = "utf-32-le";
    else if (byteorder == 1)
        encoding = "utf-32-be";
    else
        encoding = "utf-32";

    /* 1-byte strings are encoded in a single call with no error handling
       afterwards. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* Encode as much as possible; the helper's return value is added to
           'pos', and stopping short of 'len' means a code point at 'pos'
           needs the error handler. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            /* a bytes replacement must be a whole number of 4-byte units */
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            /* a str replacement must be pure ASCII */
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* The input range [pos, newpos) handled by the error handler already
           has one output unit reserved per code point; only the difference
           has to be added (newpos may also lie before pos, which increases
           the amount needed). */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            /* Remember the write position as an index: resizing may move
               the buffer. */
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                goto error;
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* The return value of the resize is not checked here.
       NOTE(review): this relies on _PyBytes_Resize() setting 'v' to NULL
       (with an exception set) when it fails -- confirm. */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
5949  
/* Encode 'unicode' to UTF-32 by calling _PyUnicode_EncodeUTF32() with
   errors == NULL and byteorder == 0, i.e. native byte order with a leading
   BOM.
   NOTE(review): a NULL 'errors' presumably selects the default (strict)
   error handling -- confirm against the error-handler helper. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
}
5955  
5956  /* --- UTF-16 Codec ------------------------------------------------------- */
5957  
/* Decode 'size' bytes of UTF-16 data at 's'.

   Same as PyUnicode_DecodeUTF16Stateful() with consumed == NULL, so
   incomplete data at the end of the buffer is reported to the error handler
   ("truncated data") instead of being left for a later call. */
PyObject *
PyUnicode_DecodeUTF16(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
}
5966  
/* Decode `size` bytes starting at `s` as UTF-16 and return the object
   produced by _PyUnicodeWriter_Finish(), or NULL after an error.

   errors:    error-handler name, forwarded unchanged to
              unicode_decode_call_errorhandler_writer().
   byteorder: may be NULL.  When non-NULL, *byteorder supplies the initial
              byte order: < 0 little-endian, > 0 big-endian, 0 undecided
              (host order unless a BOM says otherwise).  Only when the
              order is undecided and at least two bytes are available is
              the input probed for a BOM; *byteorder is then overwritten
              with the outcome (it stays 0 if no BOM was found).
   consumed:  may be NULL.  When non-NULL, input that stops in the middle
              of a code unit or of a surrogate pair is not treated as an
              error: decoding ends there and *consumed receives the number
              of bytes that were used. */
5967  PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5968  PyUnicode_DecodeUTF16Stateful(const char *s,
5969                                Py_ssize_t size,
5970                                const char *errors,
5971                                int *byteorder,
5972                                Py_ssize_t *consumed)
5973  {
5974      const char *starts = s;
5975      Py_ssize_t startinpos;
5976      Py_ssize_t endinpos;
5977      _PyUnicodeWriter writer;
5978      const unsigned char *q, *e;
5979      int bo = 0;       /* assume native ordering by default */
5980      int native_ordering;
5981      const char *errmsg = "";
5982      PyObject *errorHandler = NULL;
5983      PyObject *exc = NULL;
5984      const char *encoding;
5985  
5986      q = (const unsigned char *)s;
5987      e = q + size;
5988  
5989      if (byteorder)
5990          bo = *byteorder;
5991  
5992      /* Check for BOM marks (U+FEFF) in the input and adjust current
5993         byte order setting accordingly. In native mode, the leading BOM
5994         mark is skipped, in all other modes, it is copied to the output
5995         stream as-is (giving a ZWNBSP character). */
5996      if (bo == 0 && size >= 2) {
          /* The first two bytes are combined as a little-endian 16-bit
             value, so the byte sequence FF FE reads as 0xFEFF (selects
             little-endian) and FE FF reads as 0xFFFE (selects big-endian). */
5997          const Py_UCS4 bom = (q[1] << 8) | q[0];
5998          if (bom == 0xFEFF) {
5999              q += 2;
6000              bo = -1;
6001          }
6002          else if (bom == 0xFFFE) {
6003              q += 2;
6004              bo = 1;
6005          }
6006          if (byteorder)
6007              *byteorder = bo;
6008      }
6009  
      /* Nothing left after the optional BOM: report the whole input as
         consumed and return the empty string. */
6010      if (q == e) {
6011          if (consumed)
6012              *consumed = size;
6013          _Py_RETURN_UNICODE_EMPTY();
6014      }
6015  
      /* bo == 0 at this point means "no BOM seen": use the host byte order.
         `encoding` is the codec name handed to the error handler. */
6016  #if PY_LITTLE_ENDIAN
6017      native_ordering = bo <= 0;
6018      encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6019  #else
6020      native_ordering = bo >= 0;
6021      encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6022  #endif
6023  
6024      /* Note: size will always be longer than the resulting Unicode
6025         character count normally.  Error handler will take care of
6026         resizing when needed. */
6027      _PyUnicodeWriter_Init(&writer);
6028      writer.min_length = (e - q + 1) / 2;
6029      if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6030          goto onError;
6031  
      /* Each iteration runs the decode helper that matches the writer's
         current buffer kind.  The helper advances q and writer.pos and
         returns a value that the switch below interprets: 0..3 are status
         codes, anything else is a character that is appended through the
         writer.  ch stays 0 when fewer than two bytes remain. */
6032      while (1) {
6033          Py_UCS4 ch = 0;
6034          if (e - q >= 2) {
6035              int kind = writer.kind;
6036              if (kind == PyUnicode_1BYTE_KIND) {
6037                  if (PyUnicode_IS_ASCII(writer.buffer))
6038                      ch = asciilib_utf16_decode(&q, e,
6039                              (Py_UCS1*)writer.data, &writer.pos,
6040                              native_ordering);
6041                  else
6042                      ch = ucs1lib_utf16_decode(&q, e,
6043                              (Py_UCS1*)writer.data, &writer.pos,
6044                              native_ordering);
6045              } else if (kind == PyUnicode_2BYTE_KIND) {
6046                  ch = ucs2lib_utf16_decode(&q, e,
6047                          (Py_UCS2*)writer.data, &writer.pos,
6048                          native_ordering);
6049              } else {
6050                  assert(kind == PyUnicode_4BYTE_KIND);
6051                  ch = ucs4lib_utf16_decode(&q, e,
6052                          (Py_UCS4*)writer.data, &writer.pos,
6053                          native_ordering);
6054              }
6055          }
6056  
6057          switch (ch)
6058          {
6059          case 0:
6060              /* remaining byte at the end? (size should be even) */
6061              if (q == e || consumed)
6062                  goto End;
6063              errmsg = "truncated data";
6064              startinpos = ((const char *)q) - starts;
6065              endinpos = ((const char *)e) - starts;
6066              break;
6067              /* The remaining input chars are ignored if the callback
6068                 chooses to skip the input */
6069          case 1:
              /* Step back over the two bytes already read so that they are
                 not counted in *consumed and are part of the error range. */
6070              q -= 2;
6071              if (consumed)
6072                  goto End;
6073              errmsg = "unexpected end of data";
6074              startinpos = ((const char *)q) - starts;
6075              endinpos = ((const char *)e) - starts;
6076              break;
6077          case 2:
              /* The error range is the two bytes just before q. */
6078              errmsg = "illegal encoding";
6079              startinpos = ((const char *)q) - 2 - starts;
6080              endinpos = startinpos + 2;
6081              break;
6082          case 3:
              /* The error range is the first two of the four bytes just
                 before q. */
6083              errmsg = "illegal UTF-16 surrogate";
6084              startinpos = ((const char *)q) - 4 - starts;
6085              endinpos = startinpos + 2;
6086              break;
6087          default:
6088              if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6089                  goto onError;
6090              continue;
6091          }
6092  
          /* Only the error cases reach this point.  The handler is given
             the addresses of starts, e and q -- presumably so that it can
             move them; the loop then continues from wherever q was left. */
6093          if (unicode_decode_call_errorhandler_writer(
6094                  errors,
6095                  &errorHandler,
6096                  encoding, errmsg,
6097                  &starts,
6098                  (const char **)&e,
6099                  &startinpos,
6100                  &endinpos,
6101                  &exc,
6102                  (const char **)&q,
6103                  &writer))
6104              goto onError;
6105      }
6106  
6107  End:
6108      if (consumed)
6109          *consumed = (const char *)q-starts;
6110  
6111      Py_XDECREF(errorHandler);
6112      Py_XDECREF(exc);
6113      return _PyUnicodeWriter_Finish(&writer);
6114  
6115    onError:
6116      _PyUnicodeWriter_Dealloc(&writer);
6117      Py_XDECREF(errorHandler);
6118      Py_XDECREF(exc);
6119      return NULL;
6120  }
6121  
6122  PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)6123  _PyUnicode_EncodeUTF16(PyObject *str,
6124                         const char *errors,
6125                         int byteorder)
6126  {
6127      enum PyUnicode_Kind kind;
6128      const void *data;
6129      Py_ssize_t len;
6130      PyObject *v;
6131      unsigned short *out;
6132      Py_ssize_t pairs;
6133  #if PY_BIG_ENDIAN
6134      int native_ordering = byteorder >= 0;
6135  #else
6136      int native_ordering = byteorder <= 0;
6137  #endif
6138      const char *encoding;
6139      Py_ssize_t nsize, pos;
6140      PyObject *errorHandler = NULL;
6141      PyObject *exc = NULL;
6142      PyObject *rep = NULL;
6143  
6144      if (!PyUnicode_Check(str)) {
6145          PyErr_BadArgument();
6146          return NULL;
6147      }
6148      if (PyUnicode_READY(str) == -1)
6149          return NULL;
6150      kind = PyUnicode_KIND(str);
6151      data = PyUnicode_DATA(str);
6152      len = PyUnicode_GET_LENGTH(str);
6153  
6154      pairs = 0;
6155      if (kind == PyUnicode_4BYTE_KIND) {
6156          const Py_UCS4 *in = (const Py_UCS4 *)data;
6157          const Py_UCS4 *end = in + len;
6158          while (in < end) {
6159              if (*in++ >= 0x10000) {
6160                  pairs++;
6161              }
6162          }
6163      }
6164      if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6165          return PyErr_NoMemory();
6166      }
6167      nsize = len + pairs + (byteorder == 0);
6168      v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6169      if (v == NULL) {
6170          return NULL;
6171      }
6172  
6173      /* output buffer is 2-bytes aligned */
6174      assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6175      out = (unsigned short *)PyBytes_AS_STRING(v);
6176      if (byteorder == 0) {
6177          *out++ = 0xFEFF;
6178      }
6179      if (len == 0) {
6180          goto done;
6181      }
6182  
6183      if (kind == PyUnicode_1BYTE_KIND) {
6184          ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6185          goto done;
6186      }
6187  
6188      if (byteorder < 0) {
6189          encoding = "utf-16-le";
6190      }
6191      else if (byteorder > 0) {
6192          encoding = "utf-16-be";
6193      }
6194      else {
6195          encoding = "utf-16";
6196      }
6197  
6198      pos = 0;
6199      while (pos < len) {
6200          Py_ssize_t newpos, repsize, moreunits;
6201  
6202          if (kind == PyUnicode_2BYTE_KIND) {
6203              pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6204                                          &out, native_ordering);
6205          }
6206          else {
6207              assert(kind == PyUnicode_4BYTE_KIND);
6208              pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6209                                          &out, native_ordering);
6210          }
6211          if (pos == len)
6212              break;
6213  
6214          rep = unicode_encode_call_errorhandler(
6215                  errors, &errorHandler,
6216                  encoding, "surrogates not allowed",
6217                  str, &exc, pos, pos + 1, &newpos);
6218          if (!rep)
6219              goto error;
6220  
6221          if (PyBytes_Check(rep)) {
6222              repsize = PyBytes_GET_SIZE(rep);
6223              if (repsize & 1) {
6224                  raise_encode_exception(&exc, encoding,
6225                                         str, pos, pos + 1,
6226                                         "surrogates not allowed");
6227                  goto error;
6228              }
6229              moreunits = repsize / 2;
6230          }
6231          else {
6232              assert(PyUnicode_Check(rep));
6233              if (PyUnicode_READY(rep) < 0)
6234                  goto error;
6235              moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6236              if (!PyUnicode_IS_ASCII(rep)) {
6237                  raise_encode_exception(&exc, encoding,
6238                                         str, pos, pos + 1,
6239                                         "surrogates not allowed");
6240                  goto error;
6241              }
6242          }
6243          moreunits += pos - newpos;
6244          pos = newpos;
6245  
6246          /* two bytes are reserved for each surrogate */
6247          if (moreunits > 0) {
6248              Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6249              if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6250                  /* integer overflow */
6251                  PyErr_NoMemory();
6252                  goto error;
6253              }
6254              if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
6255                  goto error;
6256              out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6257          }
6258  
6259          if (PyBytes_Check(rep)) {
6260              memcpy(out, PyBytes_AS_STRING(rep), repsize);
6261              out += repsize / 2;
6262          } else /* rep is unicode */ {
6263              assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6264              ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6265                                   &out, native_ordering);
6266          }
6267  
6268          Py_CLEAR(rep);
6269      }
6270  
6271      /* Cut back to size actually needed. This is necessary for, for example,
6272      encoding of a string containing isolated surrogates and the 'ignore' handler
6273      is used. */
6274      nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6275      if (nsize != PyBytes_GET_SIZE(v))
6276        _PyBytes_Resize(&v, nsize);
6277      Py_XDECREF(errorHandler);
6278      Py_XDECREF(exc);
6279    done:
6280      return v;
6281    error:
6282      Py_XDECREF(rep);
6283      Py_XDECREF(errorHandler);
6284      Py_XDECREF(exc);
6285      Py_XDECREF(v);
6286      return NULL;
6287  #undef STORECHAR
6288  }
6289  
6290  PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6291  PyUnicode_AsUTF16String(PyObject *unicode)
6292  {
6293      return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6294  }
6295  
6296  /* --- Unicode Escape Codec ----------------------------------------------- */
6297  
6298  static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6299  
/* Decode `size` bytes at `s` using the unicode-escape rules and return the
   object produced by _PyUnicodeWriter_Finish(), or NULL after an error.

   Bytes other than a backslash are written as the code point with the same
   value.  Recognised escapes are backslash-newline (produces nothing),
   the single-character escapes handled in the switch below, one to three
   octal digits, \xXX, \uXXXX, \UXXXXXXXX and \N{name}.  An unrecognised
   escape is written as a backslash followed by the character.

   errors:   error-handler name, forwarded to
             unicode_decode_call_errorhandler_writer().
   consumed: may be NULL.  When non-NULL, an escape that is cut off by the
             end of the input stops decoding without an error and
             *consumed receives the offset of its backslash.  Apart from
             that, *consumed is written only for empty input (set to 0);
             presumably callers preset it to `size` -- TODO confirm.
   first_invalid_escape: dereferenced unconditionally, so it must not be
             NULL.  Reset to NULL on entry, then pointed at the character
             after the backslash of the first unrecognised escape, or at
             the first digit of the first octal escape above 0377.

   NOTE(review): the error handler is given &starts, &end and &s.  If it
   can repoint them at a different buffer, a pointer stored earlier in
   *first_invalid_escape would still refer to the old one -- confirm that
   buffer's lifetime against the callers. */
6300  PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed,const char ** first_invalid_escape)6301  _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6302                                 Py_ssize_t size,
6303                                 const char *errors,
6304                                 Py_ssize_t *consumed,
6305                                 const char **first_invalid_escape)
6306  {
6307      const char *starts = s;
6308      _PyUnicodeWriter writer;
6309      const char *end;
6310      PyObject *errorHandler = NULL;
6311      PyObject *exc = NULL;
6312  
6313      // so we can remember if we've seen an invalid escape char or not
6314      *first_invalid_escape = NULL;
6315  
6316      if (size == 0) {
6317          if (consumed) {
6318              *consumed = 0;
6319          }
6320          _Py_RETURN_UNICODE_EMPTY();
6321      }
6322      /* Escaped strings will always be longer than the resulting
6323         Unicode string, so we start with size here and then reduce the
6324         length after conversion to the true value.
6325         (but if the error callback returns a long replacement string
6326         we'll have to allocate more space) */
6327      _PyUnicodeWriter_Init(&writer);
6328      writer.min_length = size;
6329      if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6330          goto onError;
6331      }
6332  
6333      end = s + size;
6334      while (s < end) {
6335          unsigned char c = (unsigned char) *s++;
6336          Py_UCS4 ch;
6337          int count;
6338          const char *message;
6339  
          /* WRITE_ASCII_CHAR stores directly into the writer's buffer.
             WRITE_CHAR does the same while ch fits writer.maxchar and
             otherwise goes through _PyUnicodeWriter_WriteCharInline().
             Both macros are #undef'd at the bottom of the loop body. */
6340  #define WRITE_ASCII_CHAR(ch)                                                  \
6341              do {                                                              \
6342                  assert(ch <= 127);                                            \
6343                  assert(writer.pos < writer.size);                             \
6344                  PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6345              } while(0)
6346  
6347  #define WRITE_CHAR(ch)                                                        \
6348              do {                                                              \
6349                  if (ch <= writer.maxchar) {                                   \
6350                      assert(writer.pos < writer.size);                         \
6351                      PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6352                  }                                                             \
6353                  else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6354                      goto onError;                                             \
6355                  }                                                             \
6356              } while(0)
6357  
6358          /* Non-escape characters are interpreted as Unicode ordinals */
6359          if (c != '\\') {
6360              WRITE_CHAR(c);
6361              continue;
6362          }
6363  
          /* Offset of the backslash that introduced this escape; used as
             the start of the error range and as the *consumed value. */
6364          Py_ssize_t startinpos = s - starts - 1;
6365          /* \ - Escapes */
6366          if (s >= end) {
6367              message = "\\ at end of string";
6368              goto incomplete;
6369          }
6370          c = (unsigned char) *s++;
6371  
6372          assert(writer.pos < writer.size);
6373          switch (c) {
6374  
6375              /* \x escapes */
              /* backslash-newline produces no output */
6376          case '\n': continue;
6377          case '\\': WRITE_ASCII_CHAR('\\'); continue;
6378          case '\'': WRITE_ASCII_CHAR('\''); continue;
6379          case '\"': WRITE_ASCII_CHAR('\"'); continue;
6380          case 'b': WRITE_ASCII_CHAR('\b'); continue;
6381          /* FF */
6382          case 'f': WRITE_ASCII_CHAR('\014'); continue;
6383          case 't': WRITE_ASCII_CHAR('\t'); continue;
6384          case 'n': WRITE_ASCII_CHAR('\n'); continue;
6385          case 'r': WRITE_ASCII_CHAR('\r'); continue;
6386          /* VT */
6387          case 'v': WRITE_ASCII_CHAR('\013'); continue;
6388          /* BEL, not classic C */
6389          case 'a': WRITE_ASCII_CHAR('\007'); continue;
6390  
6391              /* \OOO (octal) escapes */
6392          case '0': case '1': case '2': case '3':
6393          case '4': case '5': case '6': case '7':
6394              ch = c - '0';
6395              if (s < end && '0' <= *s && *s <= '7') {
6396                  ch = (ch<<3) + *s++ - '0';
6397                  if (s < end && '0' <= *s && *s <= '7') {
6398                      ch = (ch<<3) + *s++ - '0';
6399                  }
6400              }
              /* A value above 0377 needs all three digits, so s-3 is the
                 first digit.  The value is still written; the escape is
                 only remembered as invalid. */
6401              if (ch > 0377) {
6402                  if (*first_invalid_escape == NULL) {
6403                      *first_invalid_escape = s-3; /* Back up 3 chars, since we've
6404                                                      already incremented s. */
6405                  }
6406              }
6407              WRITE_CHAR(ch);
6408              continue;
6409  
6410              /* hex escapes */
6411              /* \xXX */
6412          case 'x':
6413              count = 2;
6414              message = "truncated \\xXX escape";
6415              goto hexescape;
6416  
6417              /* \uXXXX */
6418          case 'u':
6419              count = 4;
6420              message = "truncated \\uXXXX escape";
6421              goto hexescape;
6422  
6423              /* \UXXXXXXXX */
6424          case 'U':
6425              count = 8;
6426              message = "truncated \\UXXXXXXXX escape";
6427          hexescape:
              /* s is advanced only past digits that were accepted, so on
                 a bad digit the error range ends just before that digit. */
6428              for (ch = 0; count; ++s, --count) {
6429                  if (s >= end) {
6430                      goto incomplete;
6431                  }
6432                  c = (unsigned char)*s;
6433                  ch <<= 4;
6434                  if (c >= '0' && c <= '9') {
6435                      ch += c - '0';
6436                  }
6437                  else if (c >= 'a' && c <= 'f') {
6438                      ch += c - ('a' - 10);
6439                  }
6440                  else if (c >= 'A' && c <= 'F') {
6441                      ch += c - ('A' - 10);
6442                  }
6443                  else {
6444                      goto error;
6445                  }
6446              }
6447  
6448              /* when we get here, ch is a 32-bit unicode character */
6449              if (ch > MAX_UNICODE) {
6450                  message = "illegal Unicode character";
6451                  goto error;
6452              }
6453  
6454              WRITE_CHAR(ch);
6455              continue;
6456  
6457              /* \N{name} */
6458          case 'N':
              /* The name-lookup API is imported on first use and cached in
                 the file-level ucnhash_capi pointer. */
6459              if (ucnhash_capi == NULL) {
6460                  /* load the unicode data module */
6461                  ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6462                                                  PyUnicodeData_CAPSULE_NAME, 1);
6463                  if (ucnhash_capi == NULL) {
6464                      PyErr_SetString(
6465                          PyExc_UnicodeError,
6466                          "\\N escapes not supported (can't load unicodedata module)"
6467                          );
6468                      goto onError;
6469                  }
6470              }
6471  
6472              message = "malformed \\N character escape";
6473              if (s >= end) {
6474                  goto incomplete;
6475              }
              /* A missing '{' and an empty name both fall through to the
                 "malformed" error below; a missing '}' is treated as
                 truncated input. */
6476              if (*s == '{') {
6477                  const char *start = ++s;
6478                  size_t namelen;
6479                  /* look for the closing brace */
6480                  while (s < end && *s != '}')
6481                      s++;
6482                  if (s >= end) {
6483                      goto incomplete;
6484                  }
6485                  namelen = s - start;
6486                  if (namelen) {
6487                      /* found a name.  look it up in the unicode database */
6488                      s++;
6489                      ch = 0xffffffff; /* in case 'getcode' messes up */
6490                      if (namelen <= INT_MAX &&
6491                          ucnhash_capi->getcode(start, (int)namelen,
6492                                                &ch, 0)) {
6493                          assert(ch <= MAX_UNICODE);
6494                          WRITE_CHAR(ch);
6495                          continue;
6496                      }
6497                      message = "unknown Unicode character name";
6498                  }
6499              }
6500              goto error;
6501  
6502          default:
6503              if (*first_invalid_escape == NULL) {
6504                  *first_invalid_escape = s-1; /* Back up one char, since we've
6505                                                  already incremented s. */
6506              }
6507              WRITE_ASCII_CHAR('\\');
6508              WRITE_CHAR(c);
6509              continue;
6510          }
6511  
          /* Truncated escape.  With `consumed` set, report the offset of
             the backslash and stop; otherwise fall through and treat the
             truncation as an error. */
6512        incomplete:
6513          if (consumed) {
6514              *consumed = startinpos;
6515              break;
6516          }
          /* The error range is [startinpos, s).  writer.min_length is
             refreshed to the remaining input plus the output written so
             far before the handler is called. */
6517        error:;
6518          Py_ssize_t endinpos = s-starts;
6519          writer.min_length = end - s + writer.pos;
6520          if (unicode_decode_call_errorhandler_writer(
6521                  errors, &errorHandler,
6522                  "unicodeescape", message,
6523                  &starts, &end, &startinpos, &endinpos, &exc, &s,
6524                  &writer)) {
6525              goto onError;
6526          }
6527          assert(end - s <= writer.size - writer.pos);
6528  
6529  #undef WRITE_ASCII_CHAR
6530  #undef WRITE_CHAR
6531      }
6532  
6533      Py_XDECREF(errorHandler);
6534      Py_XDECREF(exc);
6535      return _PyUnicodeWriter_Finish(&writer);
6536  
6537    onError:
6538      _PyUnicodeWriter_Dealloc(&writer);
6539      Py_XDECREF(errorHandler);
6540      Py_XDECREF(exc);
6541      return NULL;
6542  }
6543  
6544  PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6545  _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6546                                Py_ssize_t size,
6547                                const char *errors,
6548                                Py_ssize_t *consumed)
6549  {
6550      const char *first_invalid_escape;
6551      PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6552                                                        consumed,
6553                                                        &first_invalid_escape);
6554      if (result == NULL)
6555          return NULL;
6556      if (first_invalid_escape != NULL) {
6557          unsigned char c = *first_invalid_escape;
6558          if ('4' <= c && c <= '7') {
6559              if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6560                                   "invalid octal escape sequence '\\%.3s'",
6561                                   first_invalid_escape) < 0)
6562              {
6563                  Py_DECREF(result);
6564                  return NULL;
6565              }
6566          }
6567          else {
6568              if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6569                                   "invalid escape sequence '\\%c'",
6570                                   c) < 0)
6571              {
6572                  Py_DECREF(result);
6573                  return NULL;
6574              }
6575          }
6576      }
6577      return result;
6578  }
6579  
6580  PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6581  PyUnicode_DecodeUnicodeEscape(const char *s,
6582                                Py_ssize_t size,
6583                                const char *errors)
6584  {
6585      return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6586  }
6587  
6588  /* Return a Unicode-Escape string version of the Unicode object. */
6589  
6590  PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6591  PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6592  {
6593      Py_ssize_t i, len;
6594      PyObject *repr;
6595      char *p;
6596      enum PyUnicode_Kind kind;
6597      const void *data;
6598      Py_ssize_t expandsize;
6599  
6600      /* Initial allocation is based on the longest-possible character
6601         escape.
6602  
6603         For UCS1 strings it's '\xxx', 4 bytes per source character.
6604         For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6605         For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6606      */
6607  
6608      if (!PyUnicode_Check(unicode)) {
6609          PyErr_BadArgument();
6610          return NULL;
6611      }
6612      if (PyUnicode_READY(unicode) == -1) {
6613          return NULL;
6614      }
6615  
6616      len = PyUnicode_GET_LENGTH(unicode);
6617      if (len == 0) {
6618          return PyBytes_FromStringAndSize(NULL, 0);
6619      }
6620  
6621      kind = PyUnicode_KIND(unicode);
6622      data = PyUnicode_DATA(unicode);
6623      /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6624         bytes, and 1 byte characters 4. */
6625      expandsize = kind * 2 + 2;
6626      if (len > PY_SSIZE_T_MAX / expandsize) {
6627          return PyErr_NoMemory();
6628      }
6629      repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6630      if (repr == NULL) {
6631          return NULL;
6632      }
6633  
6634      p = PyBytes_AS_STRING(repr);
6635      for (i = 0; i < len; i++) {
6636          Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6637  
6638          /* U+0000-U+00ff range */
6639          if (ch < 0x100) {
6640              if (ch >= ' ' && ch < 127) {
6641                  if (ch != '\\') {
6642                      /* Copy printable US ASCII as-is */
6643                      *p++ = (char) ch;
6644                  }
6645                  /* Escape backslashes */
6646                  else {
6647                      *p++ = '\\';
6648                      *p++ = '\\';
6649                  }
6650              }
6651  
6652              /* Map special whitespace to '\t', \n', '\r' */
6653              else if (ch == '\t') {
6654                  *p++ = '\\';
6655                  *p++ = 't';
6656              }
6657              else if (ch == '\n') {
6658                  *p++ = '\\';
6659                  *p++ = 'n';
6660              }
6661              else if (ch == '\r') {
6662                  *p++ = '\\';
6663                  *p++ = 'r';
6664              }
6665  
6666              /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6667              else {
6668                  *p++ = '\\';
6669                  *p++ = 'x';
6670                  *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6671                  *p++ = Py_hexdigits[ch & 0x000F];
6672              }
6673          }
6674          /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6675          else if (ch < 0x10000) {
6676              *p++ = '\\';
6677              *p++ = 'u';
6678              *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6679              *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6680              *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6681              *p++ = Py_hexdigits[ch & 0x000F];
6682          }
6683          /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6684          else {
6685  
6686              /* Make sure that the first two digits are zero */
6687              assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6688              *p++ = '\\';
6689              *p++ = 'U';
6690              *p++ = '0';
6691              *p++ = '0';
6692              *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6693              *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6694              *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6695              *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6696              *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6697              *p++ = Py_hexdigits[ch & 0x0000000F];
6698          }
6699      }
6700  
6701      assert(p - PyBytes_AS_STRING(repr) > 0);
6702      if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6703          return NULL;
6704      }
6705      return repr;
6706  }
6707  
6708  /* --- Raw Unicode Escape Codec ------------------------------------------- */
6709  
/* Decode `size` bytes at `s` using the raw-unicode-escape rules and return
   the object produced by _PyUnicodeWriter_Finish(), or NULL after an
   error.

   Only \uXXXX and \UXXXXXXXX are treated as escapes.  Every other byte is
   written as the code point with the same value, and a backslash followed
   by any other character is written as those two characters.

   errors:   error-handler name, forwarded to
             unicode_decode_call_errorhandler_writer().
   consumed: may be NULL.  When NULL, a backslash that is the very last
             byte is written literally, and a \u or \U escape cut short by
             the end of the input goes to the error handler.  When
             non-NULL, both situations stop decoding instead and *consumed
             receives the offset of the backslash.  Apart from that,
             *consumed is written only for empty input (set to 0);
             presumably callers preset it to `size` -- TODO confirm. */
6710  PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6711  _PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6712                                            Py_ssize_t size,
6713                                            const char *errors,
6714                                            Py_ssize_t *consumed)
6715  {
6716      const char *starts = s;
6717      _PyUnicodeWriter writer;
6718      const char *end;
6719      PyObject *errorHandler = NULL;
6720      PyObject *exc = NULL;
6721  
6722      if (size == 0) {
6723          if (consumed) {
6724              *consumed = 0;
6725          }
6726          _Py_RETURN_UNICODE_EMPTY();
6727      }
6728  
6729      /* Escaped strings will always be longer than the resulting
6730         Unicode string, so we start with size here and then reduce the
6731         length after conversion to the true value. (But decoding error
6732         handler might have to resize the string) */
6733      _PyUnicodeWriter_Init(&writer);
6734      writer.min_length = size;
6735      if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6736          goto onError;
6737      }
6738  
6739      end = s + size;
6740      while (s < end) {
6741          unsigned char c = (unsigned char) *s++;
6742          Py_UCS4 ch;
6743          int count;
6744          const char *message;
6745  
          /* WRITE_CHAR stores directly into the writer's buffer while ch
             fits writer.maxchar and otherwise goes through
             _PyUnicodeWriter_WriteCharInline().  It is #undef'd at the
             bottom of the loop body. */
6746  #define WRITE_CHAR(ch)                                                        \
6747              do {                                                              \
6748                  if (ch <= writer.maxchar) {                                   \
6749                      assert(writer.pos < writer.size);                         \
6750                      PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6751                  }                                                             \
6752                  else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6753                      goto onError;                                             \
6754                  }                                                             \
6755              } while(0)
6756  
6757          /* Non-escape characters are interpreted as Unicode ordinals */
          /* A backslash that ends the input is also copied literally,
             but only when no `consumed` pointer was supplied. */
6758          if (c != '\\' || (s >= end && !consumed)) {
6759              WRITE_CHAR(c);
6760              continue;
6761          }
6762  
          /* Offset of the backslash that introduced this escape; used as
             the start of the error range and as the *consumed value. */
6763          Py_ssize_t startinpos = s - starts - 1;
6764          /* \ - Escapes */
6765          if (s >= end) {
6766              assert(consumed);
6767              // Set message to silent compiler warning.
6768              // Actually it is never used.
6769              message = "\\ at end of string";
6770              goto incomplete;
6771          }
6772  
6773          c = (unsigned char) *s++;
6774          if (c == 'u') {
6775              count = 4;
6776              message = "truncated \\uXXXX escape";
6777          }
6778          else if (c == 'U') {
6779              count = 8;
6780              message = "truncated \\UXXXXXXXX escape";
6781          }
6782          else {
              /* Not an escape: emit the backslash and the character that
                 followed it unchanged. */
6783              assert(writer.pos < writer.size);
6784              PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6785              WRITE_CHAR(c);
6786              continue;
6787          }
6788  
6789          /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
          /* s is advanced only past digits that were accepted, so on a
             bad digit the error range ends just before that digit. */
6790          for (ch = 0; count; ++s, --count) {
6791              if (s >= end) {
6792                  goto incomplete;
6793              }
6794              c = (unsigned char)*s;
6795              ch <<= 4;
6796              if (c >= '0' && c <= '9') {
6797                  ch += c - '0';
6798              }
6799              else if (c >= 'a' && c <= 'f') {
6800                  ch += c - ('a' - 10);
6801              }
6802              else if (c >= 'A' && c <= 'F') {
6803                  ch += c - ('A' - 10);
6804              }
6805              else {
6806                  goto error;
6807              }
6808          }
6809          if (ch > MAX_UNICODE) {
6810              message = "\\Uxxxxxxxx out of range";
6811              goto error;
6812          }
6813          WRITE_CHAR(ch);
6814          continue;
6815  
          /* Truncated escape.  With `consumed` set, report the offset of
             the backslash and stop; otherwise fall through and treat the
             truncation as an error. */
6816        incomplete:
6817          if (consumed) {
6818              *consumed = startinpos;
6819              break;
6820          }
          /* The error range is [startinpos, s).  writer.min_length is
             refreshed to the remaining input plus the output written so
             far before the handler is called. */
6821        error:;
6822          Py_ssize_t endinpos = s-starts;
6823          writer.min_length = end - s + writer.pos;
6824          if (unicode_decode_call_errorhandler_writer(
6825                  errors, &errorHandler,
6826                  "rawunicodeescape", message,
6827                  &starts, &end, &startinpos, &endinpos, &exc, &s,
6828                  &writer)) {
6829              goto onError;
6830          }
6831          assert(end - s <= writer.size - writer.pos);
6832  
6833  #undef WRITE_CHAR
6834      }
6835      Py_XDECREF(errorHandler);
6836      Py_XDECREF(exc);
6837      return _PyUnicodeWriter_Finish(&writer);
6838  
6839    onError:
6840      _PyUnicodeWriter_Dealloc(&writer);
6841      Py_XDECREF(errorHandler);
6842      Py_XDECREF(exc);
6843      return NULL;
6844  }
6845  
/* Decode `size` bytes starting at `s` with the raw-unicode-escape codec.

   This is the one-shot entry point: it forwards to
   _PyUnicode_DecodeRawUnicodeEscapeStateful() with a NULL `consumed`
   pointer and returns that call's result unchanged.  `errors` is passed
   through as the error handler name. */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
}
6853  
6854  
6855  PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6856  PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6857  {
6858      PyObject *repr;
6859      char *p;
6860      Py_ssize_t expandsize, pos;
6861      int kind;
6862      const void *data;
6863      Py_ssize_t len;
6864  
6865      if (!PyUnicode_Check(unicode)) {
6866          PyErr_BadArgument();
6867          return NULL;
6868      }
6869      if (PyUnicode_READY(unicode) == -1) {
6870          return NULL;
6871      }
6872      kind = PyUnicode_KIND(unicode);
6873      data = PyUnicode_DATA(unicode);
6874      len = PyUnicode_GET_LENGTH(unicode);
6875      if (kind == PyUnicode_1BYTE_KIND) {
6876          return PyBytes_FromStringAndSize(data, len);
6877      }
6878  
6879      /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6880         bytes, and 1 byte characters 4. */
6881      expandsize = kind * 2 + 2;
6882  
6883      if (len > PY_SSIZE_T_MAX / expandsize) {
6884          return PyErr_NoMemory();
6885      }
6886      repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6887      if (repr == NULL) {
6888          return NULL;
6889      }
6890      if (len == 0) {
6891          return repr;
6892      }
6893  
6894      p = PyBytes_AS_STRING(repr);
6895      for (pos = 0; pos < len; pos++) {
6896          Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6897  
6898          /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6899          if (ch < 0x100) {
6900              *p++ = (char) ch;
6901          }
6902          /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6903          else if (ch < 0x10000) {
6904              *p++ = '\\';
6905              *p++ = 'u';
6906              *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6907              *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6908              *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6909              *p++ = Py_hexdigits[ch & 15];
6910          }
6911          /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6912          else {
6913              assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6914              *p++ = '\\';
6915              *p++ = 'U';
6916              *p++ = '0';
6917              *p++ = '0';
6918              *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6919              *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6920              *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6921              *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6922              *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6923              *p++ = Py_hexdigits[ch & 15];
6924          }
6925      }
6926  
6927      assert(p > PyBytes_AS_STRING(repr));
6928      if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6929          return NULL;
6930      }
6931      return repr;
6932  }
6933  
6934  /* --- Latin-1 Codec ------------------------------------------------------ */
6935  
/* Decode `size` bytes starting at `s` as Latin-1.

   The bytes are handed to _PyUnicode_FromUCS1() unchanged and its result
   is returned.  The `errors` argument is not referenced by this
   function. */
PyObject *
PyUnicode_DecodeLatin1(const char *s,
                       Py_ssize_t size,
                       const char *errors)
{
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
}
6944  
6945  /* create or adjust a UnicodeEncodeError */
6946  static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6947  make_encode_exception(PyObject **exceptionObject,
6948                        const char *encoding,
6949                        PyObject *unicode,
6950                        Py_ssize_t startpos, Py_ssize_t endpos,
6951                        const char *reason)
6952  {
6953      if (*exceptionObject == NULL) {
6954          *exceptionObject = PyObject_CallFunction(
6955              PyExc_UnicodeEncodeError, "sOnns",
6956              encoding, unicode, startpos, endpos, reason);
6957      }
6958      else {
6959          if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6960              goto onError;
6961          if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6962              goto onError;
6963          if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6964              goto onError;
6965          return;
6966        onError:
6967          Py_CLEAR(*exceptionObject);
6968      }
6969  }
6970  
6971  /* raises a UnicodeEncodeError */
6972  static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6973  raise_encode_exception(PyObject **exceptionObject,
6974                         const char *encoding,
6975                         PyObject *unicode,
6976                         Py_ssize_t startpos, Py_ssize_t endpos,
6977                         const char *reason)
6978  {
6979      make_encode_exception(exceptionObject,
6980                            encoding, unicode, startpos, endpos, reason);
6981      if (*exceptionObject != NULL)
6982          PyCodec_StrictErrors(*exceptionObject);
6983  }
6984  
6985  /* error handling callback helper:
6986     build arguments, call the callback and check the arguments,
6987     put the result into newpos and return the replacement string, which
6988     has to be freed by the caller */
6989  static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6990  unicode_encode_call_errorhandler(const char *errors,
6991                                   PyObject **errorHandler,
6992                                   const char *encoding, const char *reason,
6993                                   PyObject *unicode, PyObject **exceptionObject,
6994                                   Py_ssize_t startpos, Py_ssize_t endpos,
6995                                   Py_ssize_t *newpos)
6996  {
6997      static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6998      Py_ssize_t len;
6999      PyObject *restuple;
7000      PyObject *resunicode;
7001  
7002      if (*errorHandler == NULL) {
7003          *errorHandler = PyCodec_LookupError(errors);
7004          if (*errorHandler == NULL)
7005              return NULL;
7006      }
7007  
7008      if (PyUnicode_READY(unicode) == -1)
7009          return NULL;
7010      len = PyUnicode_GET_LENGTH(unicode);
7011  
7012      make_encode_exception(exceptionObject,
7013                            encoding, unicode, startpos, endpos, reason);
7014      if (*exceptionObject == NULL)
7015          return NULL;
7016  
7017      restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7018      if (restuple == NULL)
7019          return NULL;
7020      if (!PyTuple_Check(restuple)) {
7021          PyErr_SetString(PyExc_TypeError, &argparse[3]);
7022          Py_DECREF(restuple);
7023          return NULL;
7024      }
7025      if (!PyArg_ParseTuple(restuple, argparse,
7026                            &resunicode, newpos)) {
7027          Py_DECREF(restuple);
7028          return NULL;
7029      }
7030      if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7031          PyErr_SetString(PyExc_TypeError, &argparse[3]);
7032          Py_DECREF(restuple);
7033          return NULL;
7034      }
7035      if (*newpos<0)
7036          *newpos = len + *newpos;
7037      if (*newpos<0 || *newpos>len) {
7038          PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7039          Py_DECREF(restuple);
7040          return NULL;
7041      }
7042      Py_INCREF(resunicode);
7043      Py_DECREF(restuple);
7044      return resunicode;
7045  }
7046  
/* Shared encoder for the "latin-1" (limit == 256) and "ascii" (any other
   limit; callers in this file pass 128) codecs.

   Every character below `limit` is written as a single byte.  Runs of
   characters at or above `limit` are handled according to `errors`: the
   handlers known to _Py_GetErrorHandler() are implemented inline, any
   other handler goes through unicode_encode_call_errorhandler().

   Returns a new bytes object, or NULL with an exception set. */
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
                    const char *errors,
                    const Py_UCS4 limit)
{
    /* input state */
    Py_ssize_t pos=0, size;
    int kind;
    const void *data;
    /* pointer into the output */
    char *str;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    /* handler callable, exception instance and handler kind are all
       created lazily on the first error and reused afterwards */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    /* replacement returned by a custom handler; released in onError if
       an error occurs while it is still held */
    PyObject *rep = NULL;
    /* output object */
    _PyBytesWriter writer;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    size = PyUnicode_GET_LENGTH(unicode);
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    _PyBytesWriter_Init(&writer);
    str = _PyBytesWriter_Alloc(&writer, size);
    if (str == NULL)
        return NULL;

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* can we encode this? */
        if (ch < limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)ch;
            ++pos;
        }
        else {
            Py_ssize_t newpos, i;
            /* startpos for collecting unencodable chars */
            Py_ssize_t collstart = pos;
            Py_ssize_t collend = collstart + 1;
            /* find all unencodable characters: [collstart, collend) is
               the maximal run of characters >= limit starting at pos */

            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                ++collend;

            /* Only overallocate the buffer if it's not the last write */
            writer.overallocate = (collend < size);

            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (error_handler == _Py_ERROR_UNKNOWN)
                error_handler = _Py_GetErrorHandler(errors);

            switch (error_handler) {
            case _Py_ERROR_STRICT:
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                goto onError;

            case _Py_ERROR_REPLACE:
                /* one '?' per unencodable character */
                memset(str, '?', collend - collstart);
                str += (collend - collstart);
                /* fall through */
            case _Py_ERROR_IGNORE:
                /* skip the whole run */
                pos = collend;
                break;

            case _Py_ERROR_BACKSLASHREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = backslashreplace(&writer, str,
                                       unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_XMLCHARREFREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = xmlcharrefreplace(&writer, str,
                                        unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_SURROGATEESCAPE:
                /* Characters in U+DC80..U+DCFF are written as the single
                   byte ch - 0xdc00 (0x80..0xff).  The loop stops at the
                   first character outside that range. */
                for (i = collstart; i < collend; ++i) {
                    ch = PyUnicode_READ(kind, data, i);
                    if (ch < 0xdc80 || 0xdcff < ch) {
                        /* Not a UTF-8b surrogate */
                        break;
                    }
                    *str++ = (char)(ch - 0xdc00);
                    ++pos;
                }
                /* whole run consumed: done with this error */
                if (i >= collend)
                    break;
                /* otherwise report the remainder [pos, collend) through
                   the generic handler path below */
                collstart = pos;
                assert(collstart != collend);
                /* fall through */

            default:
                /* Any other handler: call it and splice in what it
                   returns, then resume at the position it reports. */
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
                                                       encoding, reason, unicode, &exc,
                                                       collstart, collend, &newpos);
                if (rep == NULL)
                    goto onError;

                if (newpos < collstart) {
                    /* The handler moved the position backwards: request
                       collstart - newpos additional bytes -- presumably
                       because the characters in [newpos, collstart) will
                       be encoded a second time. */
                    writer.overallocate = 1;
                    str = _PyBytesWriter_Prepare(&writer, str,
                                                 collstart - newpos);
                    if (str == NULL)
                        goto onError;
                }
                else {
                    /* subtract preallocated bytes */
                    writer.min_size -= newpos - collstart;
                    /* Only overallocate the buffer if it's not the last write */
                    writer.overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    /* Directly copy bytes result to output. */
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyBytes_AS_STRING(rep),
                                                    PyBytes_GET_SIZE(rep));
                }
                else {
                    assert(PyUnicode_Check(rep));

                    if (PyUnicode_READY(rep) < 0)
                        goto onError;

                    /* A str replacement must itself be encodable: for
                       limit 256 it has to be of the 1-byte kind,
                       otherwise it has to be pure ASCII. */
                    if (limit == 256 ?
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
                        !PyUnicode_IS_ASCII(rep))
                    {
                        /* Not all characters are smaller than limit */
                        raise_encode_exception(&exc, encoding, unicode,
                                               collstart, collend, reason);
                        goto onError;
                    }
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
                    /* 1-byte kind: the character data can be copied
                       byte for byte */
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyUnicode_DATA(rep),
                                                    PyUnicode_GET_LENGTH(rep));
                }
                if (str == NULL)
                    goto onError;

                pos = newpos;
                Py_CLEAR(rep);
            }

            /* If overallocation was disabled, ensure that it was the last
               write. Otherwise, we missed an optimization */
            assert(writer.overallocate || pos == size);
        }
    }

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyBytesWriter_Finish(&writer, str);

  onError:
    Py_XDECREF(rep);
    _PyBytesWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7228  
7229  PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7230  _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7231  {
7232      if (!PyUnicode_Check(unicode)) {
7233          PyErr_BadArgument();
7234          return NULL;
7235      }
7236      if (PyUnicode_READY(unicode) == -1)
7237          return NULL;
7238      /* Fast path: if it is a one-byte string, construct
7239         bytes object directly. */
7240      if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7241          return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7242                                           PyUnicode_GET_LENGTH(unicode));
7243      /* Non-Latin-1 characters present. Defer to above function to
7244         raise the exception. */
7245      return unicode_encode_ucs1(unicode, errors, 256);
7246  }
7247  
/* Encode a str object to Latin-1: forwards to _PyUnicode_AsLatin1String()
   with a NULL `errors` argument and returns its result. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    return _PyUnicode_AsLatin1String(unicode, NULL);
}
7253  
7254  /* --- 7-bit ASCII Codec -------------------------------------------------- */
7255  
7256  PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7257  PyUnicode_DecodeASCII(const char *s,
7258                        Py_ssize_t size,
7259                        const char *errors)
7260  {
7261      const char *starts = s;
7262      const char *e = s + size;
7263      PyObject *error_handler_obj = NULL;
7264      PyObject *exc = NULL;
7265      _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7266  
7267      if (size == 0)
7268          _Py_RETURN_UNICODE_EMPTY();
7269  
7270      /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7271      if (size == 1 && (unsigned char)s[0] < 128) {
7272          return get_latin1_char((unsigned char)s[0]);
7273      }
7274  
7275      // Shortcut for simple case
7276      PyObject *u = PyUnicode_New(size, 127);
7277      if (u == NULL) {
7278          return NULL;
7279      }
7280      Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7281      if (outpos == size) {
7282          return u;
7283      }
7284  
7285      _PyUnicodeWriter writer;
7286      _PyUnicodeWriter_InitWithBuffer(&writer, u);
7287      writer.pos = outpos;
7288  
7289      s += outpos;
7290      int kind = writer.kind;
7291      void *data = writer.data;
7292      Py_ssize_t startinpos, endinpos;
7293  
7294      while (s < e) {
7295          unsigned char c = (unsigned char)*s;
7296          if (c < 128) {
7297              PyUnicode_WRITE(kind, data, writer.pos, c);
7298              writer.pos++;
7299              ++s;
7300              continue;
7301          }
7302  
7303          /* byte outsize range 0x00..0x7f: call the error handler */
7304  
7305          if (error_handler == _Py_ERROR_UNKNOWN)
7306              error_handler = _Py_GetErrorHandler(errors);
7307  
7308          switch (error_handler)
7309          {
7310          case _Py_ERROR_REPLACE:
7311          case _Py_ERROR_SURROGATEESCAPE:
7312              /* Fast-path: the error handler only writes one character,
7313                 but we may switch to UCS2 at the first write */
7314              if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7315                  goto onError;
7316              kind = writer.kind;
7317              data = writer.data;
7318  
7319              if (error_handler == _Py_ERROR_REPLACE)
7320                  PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7321              else
7322                  PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7323              writer.pos++;
7324              ++s;
7325              break;
7326  
7327          case _Py_ERROR_IGNORE:
7328              ++s;
7329              break;
7330  
7331          default:
7332              startinpos = s-starts;
7333              endinpos = startinpos + 1;
7334              if (unicode_decode_call_errorhandler_writer(
7335                      errors, &error_handler_obj,
7336                      "ascii", "ordinal not in range(128)",
7337                      &starts, &e, &startinpos, &endinpos, &exc, &s,
7338                      &writer))
7339                  goto onError;
7340              kind = writer.kind;
7341              data = writer.data;
7342          }
7343      }
7344      Py_XDECREF(error_handler_obj);
7345      Py_XDECREF(exc);
7346      return _PyUnicodeWriter_Finish(&writer);
7347  
7348    onError:
7349      _PyUnicodeWriter_Dealloc(&writer);
7350      Py_XDECREF(error_handler_obj);
7351      Py_XDECREF(exc);
7352      return NULL;
7353  }
7354  
7355  PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7356  _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7357  {
7358      if (!PyUnicode_Check(unicode)) {
7359          PyErr_BadArgument();
7360          return NULL;
7361      }
7362      if (PyUnicode_READY(unicode) == -1)
7363          return NULL;
7364      /* Fast path: if it is an ASCII-only string, construct bytes object
7365         directly. Else defer to above function to raise the exception. */
7366      if (PyUnicode_IS_ASCII(unicode))
7367          return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7368                                           PyUnicode_GET_LENGTH(unicode));
7369      return unicode_encode_ucs1(unicode, errors, 128);
7370  }
7371  
/* Encode a str object to ASCII: forwards to _PyUnicode_AsASCIIString()
   with a NULL `errors` argument and returns its result. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    return _PyUnicode_AsASCIIString(unicode, NULL);
}
7377  
7378  #ifdef MS_WINDOWS
7379  
7380  /* --- MBCS codecs for Windows -------------------------------------------- */
7381  
/* NEED_RETRY is defined when an int is narrower than a size_t.
   decode_code_page_stateful() tests it to decide whether inputs larger
   than DECODING_CHUNK_SIZE have to be decoded in several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk size (INT_MAX / 2 when
   transcoding from UTF-16).  INT_MAX / 4 is used instead: it performs
   better in both cases and keeps partial characters from overrunning
   the length limit of MultiByteToWideChar on Windows. */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for headers that do not provide
   WC_ERR_INVALID_CHARS. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7395  
7396  static const char*
code_page_name(UINT code_page,PyObject ** obj)7397  code_page_name(UINT code_page, PyObject **obj)
7398  {
7399      *obj = NULL;
7400      if (code_page == CP_ACP)
7401          return "mbcs";
7402      if (code_page == CP_UTF7)
7403          return "CP_UTF7";
7404      if (code_page == CP_UTF8)
7405          return "CP_UTF8";
7406  
7407      *obj = PyBytes_FromFormat("cp%u", code_page);
7408      if (*obj == NULL)
7409          return NULL;
7410      return PyBytes_AS_STRING(*obj);
7411  }
7412  
7413  static DWORD
decode_code_page_flags(UINT code_page)7414  decode_code_page_flags(UINT code_page)
7415  {
7416      if (code_page == CP_UTF7) {
7417          /* The CP_UTF7 decoder only supports flags=0 */
7418          return 0;
7419      }
7420      else
7421          return MB_ERR_INVALID_CHARS;
7422  }
7423  
7424  /*
7425   * Decode a byte string from a Windows code page into unicode object in strict
7426   * mode.
7427   *
7428   * Returns consumed size if succeed, returns -2 on decode error, or raise an
7429   * OSError and returns -1 on other error.
7430   */
7431  static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7432  decode_code_page_strict(UINT code_page,
7433                          wchar_t **buf,
7434                          Py_ssize_t *bufsize,
7435                          const char *in,
7436                          int insize)
7437  {
7438      DWORD flags = MB_ERR_INVALID_CHARS;
7439      wchar_t *out;
7440      DWORD outsize;
7441  
7442      /* First get the size of the result */
7443      assert(insize > 0);
7444      while ((outsize = MultiByteToWideChar(code_page, flags,
7445                                            in, insize, NULL, 0)) <= 0)
7446      {
7447          if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7448              goto error;
7449          }
7450          /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7451          flags = 0;
7452      }
7453  
7454      /* Extend a wchar_t* buffer */
7455      Py_ssize_t n = *bufsize;   /* Get the current length */
7456      if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7457          return -1;
7458      }
7459      out = *buf + n;
7460  
7461      /* Do the conversion */
7462      outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7463      if (outsize <= 0)
7464          goto error;
7465      return insize;
7466  
7467  error:
7468      if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7469          return -2;
7470      PyErr_SetFromWindowsErr(0);
7471      return -1;
7472  }
7473  
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * The input is decoded one character at a time and appended to the wchar_t
 * buffer *buf (current length *bufsize); undecodable bytes are passed to
 * the error handler named by `errors`.  When `final` is false, an
 * undecodable sequence reaching the end of the input is left unconsumed
 * instead of being reported.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    /* stays -1 unless the whole function succeeds */
    int ret = -1;

    assert(size > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer: reserve room for Py_ARRAY_LENGTH(buffer)
       wide characters per input byte, after checking that the new total
       does not exceed PY_SSIZE_T_MAX */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character: try 1 byte first and add one byte per
           failed attempt */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                /* retry with the same insize */
                continue;
            }
            /* any other failure than "cannot decode" / "output too
               small" is reported as an OSError */
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? then stop here and leave
               the remaining bytes unconsumed */
            if (in + insize >= endin && !final)
                break;

            /* report a single undecodable byte to the error handler,
               which may move `in` and reallocate *buf */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            out = *buf + outpos;
        }
        else {
            /* success: consume the bytes and append the 1 or 2 decoded
               wide characters */
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* shared exit: also reached on success, with ret already set */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7603  
/* Decode `size` bytes at `s` from the Windows code page `code_page`.

   Each chunk is first tried with decode_code_page_strict(); if that
   reports undecodable input (-2), the chunk is decoded again with
   decode_code_page_errors(), which applies the `errors` handler.  When
   NEED_RETRY is defined, inputs larger than DECODING_CHUNK_SIZE are
   processed in several chunks.

   If `consumed` is not NULL it receives the number of bytes decoded, and
   the last chunk is decoded as non-final.

   Returns a new str object, or NULL with an exception set. */
static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    /* growing wchar_t output buffer and its current length */
    wchar_t *buf = NULL;
    Py_ssize_t bufsize = 0;
    int chunk_size, final, converted, done;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }
    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (consumed)
        *consumed = 0;

    do
    {
        /* When NEED_RETRY is not defined, only the braced block below is
           compiled and the loop body runs exactly once. */
#ifdef NEED_RETRY
        if (size > DECODING_CHUNK_SIZE) {
            chunk_size = DECODING_CHUNK_SIZE;
            final = 0;
            done = 0;
        }
        else
#endif
        {
            /* last (or only) chunk */
            chunk_size = (int)size;
            final = (consumed == NULL);
            done = 1;
        }

        /* Nothing left to decode: return the empty string unless earlier
           chunks already produced output. */
        if (chunk_size == 0 && done) {
            if (buf != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        converted = decode_code_page_strict(code_page, &buf, &bufsize,
                                            s, chunk_size);
        /* -2: undecodable input, retry with the error handler */
        if (converted == -2)
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
                                                s, chunk_size,
                                                errors, final);
        assert(converted != 0 || done);

        if (converted < 0) {
            PyMem_Free(buf);
            return NULL;
        }

        if (consumed)
            *consumed += converted;

        /* advance past the bytes actually consumed */
        s += converted;
        size -= converted;
    } while (!done);

    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
    PyMem_Free(buf);
    return v;
}
7671  
7672  PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7673  PyUnicode_DecodeCodePageStateful(int code_page,
7674                                   const char *s,
7675                                   Py_ssize_t size,
7676                                   const char *errors,
7677                                   Py_ssize_t *consumed)
7678  {
7679      return decode_code_page_stateful(code_page, s, size, errors, consumed);
7680  }
7681  
7682  PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7683  PyUnicode_DecodeMBCSStateful(const char *s,
7684                               Py_ssize_t size,
7685                               const char *errors,
7686                               Py_ssize_t *consumed)
7687  {
7688      return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7689  }
7690  
7691  PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7692  PyUnicode_DecodeMBCS(const char *s,
7693                       Py_ssize_t size,
7694                       const char *errors)
7695  {
7696      return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7697  }
7698  
7699  static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7700  encode_code_page_flags(UINT code_page, const char *errors)
7701  {
7702      if (code_page == CP_UTF8) {
7703          return WC_ERR_INVALID_CHARS;
7704      }
7705      else if (code_page == CP_UTF7) {
7706          /* CP_UTF7 only supports flags=0 */
7707          return 0;
7708      }
7709      else {
7710          if (errors != NULL && strcmp(errors, "replace") == 0)
7711              return 0;
7712          else
7713              return WC_NO_BEST_FIT_CHARS;
7714      }
7715  }
7716  
7717  /*
7718   * Encode a Unicode string to a Windows code page into a byte string in strict
7719   * mode.
7720   *
7721   * Returns consumed characters if succeed, returns -2 on encode error, or raise
7722   * an OSError and returns -1 on other error.
7723   */
7724  static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7725  encode_code_page_strict(UINT code_page, PyObject **outbytes,
7726                          PyObject *unicode, Py_ssize_t offset, int len,
7727                          const char* errors)
7728  {
7729      BOOL usedDefaultChar = FALSE;
7730      BOOL *pusedDefaultChar = &usedDefaultChar;
7731      int outsize;
7732      wchar_t *p;
7733      Py_ssize_t size;
7734      const DWORD flags = encode_code_page_flags(code_page, NULL);
7735      char *out;
7736      /* Create a substring so that we can get the UTF-16 representation
7737         of just the slice under consideration. */
7738      PyObject *substring;
7739      int ret = -1;
7740  
7741      assert(len > 0);
7742  
7743      if (code_page != CP_UTF8 && code_page != CP_UTF7)
7744          pusedDefaultChar = &usedDefaultChar;
7745      else
7746          pusedDefaultChar = NULL;
7747  
7748      substring = PyUnicode_Substring(unicode, offset, offset+len);
7749      if (substring == NULL)
7750          return -1;
7751  #if USE_UNICODE_WCHAR_CACHE
7752  _Py_COMP_DIAG_PUSH
7753  _Py_COMP_DIAG_IGNORE_DEPR_DECLS
7754      p = PyUnicode_AsUnicodeAndSize(substring, &size);
7755      if (p == NULL) {
7756          Py_DECREF(substring);
7757          return -1;
7758      }
7759  _Py_COMP_DIAG_POP
7760  #else /* USE_UNICODE_WCHAR_CACHE */
7761      p = PyUnicode_AsWideCharString(substring, &size);
7762      Py_CLEAR(substring);
7763      if (p == NULL) {
7764          return -1;
7765      }
7766  #endif /* USE_UNICODE_WCHAR_CACHE */
7767      assert(size <= INT_MAX);
7768  
7769      /* First get the size of the result */
7770      outsize = WideCharToMultiByte(code_page, flags,
7771                                    p, (int)size,
7772                                    NULL, 0,
7773                                    NULL, pusedDefaultChar);
7774      if (outsize <= 0)
7775          goto error;
7776      /* If we used a default char, then we failed! */
7777      if (pusedDefaultChar && *pusedDefaultChar) {
7778          ret = -2;
7779          goto done;
7780      }
7781  
7782      if (*outbytes == NULL) {
7783          /* Create string object */
7784          *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7785          if (*outbytes == NULL) {
7786              goto done;
7787          }
7788          out = PyBytes_AS_STRING(*outbytes);
7789      }
7790      else {
7791          /* Extend string object */
7792          const Py_ssize_t n = PyBytes_Size(*outbytes);
7793          if (outsize > PY_SSIZE_T_MAX - n) {
7794              PyErr_NoMemory();
7795              goto done;
7796          }
7797          if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7798              goto done;
7799          }
7800          out = PyBytes_AS_STRING(*outbytes) + n;
7801      }
7802  
7803      /* Do the conversion */
7804      outsize = WideCharToMultiByte(code_page, flags,
7805                                    p, (int)size,
7806                                    out, outsize,
7807                                    NULL, pusedDefaultChar);
7808      if (outsize <= 0)
7809          goto error;
7810      if (pusedDefaultChar && *pusedDefaultChar) {
7811          ret = -2;
7812          goto done;
7813      }
7814      ret = 0;
7815  
7816  done:
7817  #if USE_UNICODE_WCHAR_CACHE
7818      Py_DECREF(substring);
7819  #else /* USE_UNICODE_WCHAR_CACHE */
7820      PyMem_Free(p);
7821  #endif /* USE_UNICODE_WCHAR_CACHE */
7822      return ret;
7823  
7824  error:
7825      if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7826          ret = -2;
7827          goto done;
7828      }
7829      PyErr_SetFromWindowsErr(0);
7830      goto done;
7831  }
7832  
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The slice unicode[unicode_offset:unicode_offset+insize] is converted one
 * code point at a time with WideCharToMultiByte(); every code point that
 * cannot be converted is passed to the error handler named by errors.  The
 * output is appended to *outbytes, which is created when it is NULL.
 *
 * Returns 0 on success, or returns -1 on error.  If errors is NULL or
 * "strict", a UnicodeEncodeError is raised and -1 is returned.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* No "used default char" indicator is requested for CP_UTF8/CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) output bytes per input character, guarding
       the multiplication against overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Code points >= 0x10000 are passed as a surrogate pair. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Converted without substituting the default character:
               copy the bytes and move to the next code point. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            /* Any failure other than "no translation" is an OS error. */
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The code point at pos could not be encoded: ask the handler. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;

        /* Extra room is needed only if the replacement is longer than the
           number of input characters the handler skipped (newpos - pos). */
        Py_ssize_t morebytes = pos - newpos;
        if (PyBytes_Check(rep)) {
            outsize = PyBytes_GET_SIZE(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                /* Remember the write offset and re-derive out from the
                   bytes object after resizing it. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            const void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                /* Same offset-preserving resize as in the bytes branch. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            /* A str replacement is written one byte per character and must
               therefore consist of code points <= 127 only. */
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        /* Resume at the position chosen by the error handler. */
        pos = newpos;
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Shrink the bytes object to the number of bytes actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Shared exit path: also reached on success with ret == 0. */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
8023  
8024  static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)8025  encode_code_page(int code_page,
8026                   PyObject *unicode,
8027                   const char *errors)
8028  {
8029      Py_ssize_t len;
8030      PyObject *outbytes = NULL;
8031      Py_ssize_t offset;
8032      int chunk_len, ret, done;
8033  
8034      if (!PyUnicode_Check(unicode)) {
8035          PyErr_BadArgument();
8036          return NULL;
8037      }
8038  
8039      if (PyUnicode_READY(unicode) == -1)
8040          return NULL;
8041      len = PyUnicode_GET_LENGTH(unicode);
8042  
8043      if (code_page < 0) {
8044          PyErr_SetString(PyExc_ValueError, "invalid code page number");
8045          return NULL;
8046      }
8047  
8048      if (len == 0)
8049          return PyBytes_FromStringAndSize(NULL, 0);
8050  
8051      offset = 0;
8052      do
8053      {
8054  #ifdef NEED_RETRY
8055          if (len > DECODING_CHUNK_SIZE) {
8056              chunk_len = DECODING_CHUNK_SIZE;
8057              done = 0;
8058          }
8059          else
8060  #endif
8061          {
8062              chunk_len = (int)len;
8063              done = 1;
8064          }
8065  
8066          ret = encode_code_page_strict(code_page, &outbytes,
8067                                        unicode, offset, chunk_len,
8068                                        errors);
8069          if (ret == -2)
8070              ret = encode_code_page_errors(code_page, &outbytes,
8071                                            unicode, offset,
8072                                            chunk_len, errors);
8073          if (ret < 0) {
8074              Py_XDECREF(outbytes);
8075              return NULL;
8076          }
8077  
8078          offset += chunk_len;
8079          len -= chunk_len;
8080      } while (!done);
8081  
8082      return outbytes;
8083  }
8084  
8085  PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)8086  PyUnicode_EncodeCodePage(int code_page,
8087                           PyObject *unicode,
8088                           const char *errors)
8089  {
8090      return encode_code_page(code_page, unicode, errors);
8091  }
8092  
8093  PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)8094  PyUnicode_AsMBCSString(PyObject *unicode)
8095  {
8096      return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8097  }
8098  
8099  #undef NEED_RETRY
8100  
8101  #endif /* MS_WINDOWS */
8102  
8103  /* --- Character Mapping Codec -------------------------------------------- */
8104  
/* Decode the size bytes at s through mapping, a str used as a lookup table
   indexed by byte value, appending the result to writer.

   A table entry of 0xFFFE, or a byte value >= len(mapping), is treated as
   an undefined mapping and handed to the error handler named by errors.

   Returns 0 on success and -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        /* Characters are stored directly into the writer's buffer; the
           only caller visible here, PyUnicode_DecodeCharmap(), prepares
           the writer for size characters beforehand. */
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maxchar to 0xff, then reload the
                   cached maxchar and data pointer from the writer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL here: nothing to release. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a 2-byte table covering all byte values.
               They run until the input ends (break) or until a character
               needs the generic handling at the Error label below. */
            enum PyUnicode_Kind outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* x does not fit the current buffer: leave the fast
                       loop with x already loaded. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Only the "undefined" marker needs the slow path. */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: bounds-checked table lookup of one byte. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
        /* Entry point for the fast loops above; x holds the mapped value
           for the byte at s. */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is passed by address to the error handler call, so it is
               not advanced here. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8223  
8224  static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8225  charmap_decode_mapping(const char *s,
8226                         Py_ssize_t size,
8227                         PyObject *mapping,
8228                         const char *errors,
8229                         _PyUnicodeWriter *writer)
8230  {
8231      const char *starts = s;
8232      const char *e;
8233      Py_ssize_t startinpos, endinpos;
8234      PyObject *errorHandler = NULL, *exc = NULL;
8235      unsigned char ch;
8236      PyObject *key, *item = NULL;
8237  
8238      e = s + size;
8239  
8240      while (s < e) {
8241          ch = *s;
8242  
8243          /* Get mapping (char ordinal -> integer, Unicode char or None) */
8244          key = PyLong_FromLong((long)ch);
8245          if (key == NULL)
8246              goto onError;
8247  
8248          item = PyObject_GetItem(mapping, key);
8249          Py_DECREF(key);
8250          if (item == NULL) {
8251              if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8252                  /* No mapping found means: mapping is undefined. */
8253                  PyErr_Clear();
8254                  goto Undefined;
8255              } else
8256                  goto onError;
8257          }
8258  
8259          /* Apply mapping */
8260          if (item == Py_None)
8261              goto Undefined;
8262          if (PyLong_Check(item)) {
8263              long value = PyLong_AS_LONG(item);
8264              if (value == 0xFFFE)
8265                  goto Undefined;
8266              if (value < 0 || value > MAX_UNICODE) {
8267                  PyErr_Format(PyExc_TypeError,
8268                               "character mapping must be in range(0x%x)",
8269                               (unsigned long)MAX_UNICODE + 1);
8270                  goto onError;
8271              }
8272  
8273              if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8274                  goto onError;
8275          }
8276          else if (PyUnicode_Check(item)) {
8277              if (PyUnicode_READY(item) == -1)
8278                  goto onError;
8279              if (PyUnicode_GET_LENGTH(item) == 1) {
8280                  Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8281                  if (value == 0xFFFE)
8282                      goto Undefined;
8283                  if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8284                      goto onError;
8285              }
8286              else {
8287                  writer->overallocate = 1;
8288                  if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8289                      goto onError;
8290              }
8291          }
8292          else {
8293              /* wrong return value */
8294              PyErr_SetString(PyExc_TypeError,
8295                              "character mapping must return integer, None or str");
8296              goto onError;
8297          }
8298          Py_CLEAR(item);
8299          ++s;
8300          continue;
8301  
8302  Undefined:
8303          /* undefined mapping */
8304          Py_CLEAR(item);
8305          startinpos = s-starts;
8306          endinpos = startinpos+1;
8307          if (unicode_decode_call_errorhandler_writer(
8308                  errors, &errorHandler,
8309                  "charmap", "character maps to <undefined>",
8310                  &starts, &e, &startinpos, &endinpos, &exc, &s,
8311                  writer)) {
8312              goto onError;
8313          }
8314      }
8315      Py_XDECREF(errorHandler);
8316      Py_XDECREF(exc);
8317      return 0;
8318  
8319  onError:
8320      Py_XDECREF(item);
8321      Py_XDECREF(errorHandler);
8322      Py_XDECREF(exc);
8323      return -1;
8324  }
8325  
8326  PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8327  PyUnicode_DecodeCharmap(const char *s,
8328                          Py_ssize_t size,
8329                          PyObject *mapping,
8330                          const char *errors)
8331  {
8332      _PyUnicodeWriter writer;
8333  
8334      /* Default to Latin-1 */
8335      if (mapping == NULL)
8336          return PyUnicode_DecodeLatin1(s, size, errors);
8337  
8338      if (size == 0)
8339          _Py_RETURN_UNICODE_EMPTY();
8340      _PyUnicodeWriter_Init(&writer);
8341      writer.min_length = size;
8342      if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8343          goto onError;
8344  
8345      if (PyUnicode_CheckExact(mapping)) {
8346          if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8347              goto onError;
8348      }
8349      else {
8350          if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8351              goto onError;
8352      }
8353      return _PyUnicodeWriter_Finish(&writer);
8354  
8355    onError:
8356      _PyUnicodeWriter_Dealloc(&writer);
8357      return NULL;
8358  }
8359  
8360  /* Charmap encoding: the lookup table */
8361  
8362  /*[clinic input]
8363  class EncodingMap "struct encoding_map *" "&EncodingMapType"
8364  [clinic start generated code]*/
8365  /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8366  
/* Three-level lookup table from a code point (<= 0xFFFF) to a byte value.
   It is filled in by PyUnicode_BuildEncodingMap() and queried by
   encoding_map_lookup(). */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by (c >> 11).  0xFF marks an unused slot; any other value is
       the number of a level-2 block. */
    unsigned char level1[32];
    /* Number of 16-byte level-2 blocks and of 128-byte level-3 blocks
       stored in level23. */
    int count2, count3;
    /* Variable-length tail, allocated larger than declared: count2
       level-2 blocks (0xFF = unused entry) followed by count3 level-3
       blocks (0 = unmapped character). */
    unsigned char level23[1];
};
8373  
8374  /*[clinic input]
8375  EncodingMap.size
8376  
8377  Return the size (in bytes) of this object.
8378  [clinic start generated code]*/
8379  
8380  static PyObject *
EncodingMap_size_impl(struct encoding_map * self)8381  EncodingMap_size_impl(struct encoding_map *self)
8382  /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8383  {
8384      return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8385                             128*self->count3);
8386  }
8387  
/* Method table referenced by EncodingMapType.tp_methods.
   ENCODINGMAP_SIZE_METHODDEF is defined outside this section; presumably
   it is the Argument Clinic entry exposing EncodingMap_size_impl() as the
   size() method -- confirm against the generated clinic header. */
static PyMethodDef encoding_map_methods[] = {
    ENCODINGMAP_SIZE_METHODDEF
    {NULL, NULL}  /* sentinel */
};
8392  
/* Type object for struct encoding_map.  Only the name, basic size, flags
   and method table are set here.  Instances are created directly by
   PyUnicode_BuildEncodingMap() with PyObject_Malloc() and
   _PyObject_Init(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "EncodingMap",
    .tp_basicsize = sizeof(struct encoding_map),
    /* methods */
    .tp_flags = Py_TPFLAGS_DEFAULT,
    .tp_methods = encoding_map_methods,
};
8401  
8402  PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8403  PyUnicode_BuildEncodingMap(PyObject* string)
8404  {
8405      PyObject *result;
8406      struct encoding_map *mresult;
8407      int i;
8408      int need_dict = 0;
8409      unsigned char level1[32];
8410      unsigned char level2[512];
8411      unsigned char *mlevel1, *mlevel2, *mlevel3;
8412      int count2 = 0, count3 = 0;
8413      int kind;
8414      const void *data;
8415      Py_ssize_t length;
8416      Py_UCS4 ch;
8417  
8418      if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8419          PyErr_BadArgument();
8420          return NULL;
8421      }
8422      kind = PyUnicode_KIND(string);
8423      data = PyUnicode_DATA(string);
8424      length = PyUnicode_GET_LENGTH(string);
8425      length = Py_MIN(length, 256);
8426      memset(level1, 0xFF, sizeof level1);
8427      memset(level2, 0xFF, sizeof level2);
8428  
8429      /* If there isn't a one-to-one mapping of NULL to \0,
8430         or if there are non-BMP characters, we need to use
8431         a mapping dictionary. */
8432      if (PyUnicode_READ(kind, data, 0) != 0)
8433          need_dict = 1;
8434      for (i = 1; i < length; i++) {
8435          int l1, l2;
8436          ch = PyUnicode_READ(kind, data, i);
8437          if (ch == 0 || ch > 0xFFFF) {
8438              need_dict = 1;
8439              break;
8440          }
8441          if (ch == 0xFFFE)
8442              /* unmapped character */
8443              continue;
8444          l1 = ch >> 11;
8445          l2 = ch >> 7;
8446          if (level1[l1] == 0xFF)
8447              level1[l1] = count2++;
8448          if (level2[l2] == 0xFF)
8449              level2[l2] = count3++;
8450      }
8451  
8452      if (count2 >= 0xFF || count3 >= 0xFF)
8453          need_dict = 1;
8454  
8455      if (need_dict) {
8456          PyObject *result = PyDict_New();
8457          PyObject *key, *value;
8458          if (!result)
8459              return NULL;
8460          for (i = 0; i < length; i++) {
8461              key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8462              value = PyLong_FromLong(i);
8463              if (!key || !value)
8464                  goto failed1;
8465              if (PyDict_SetItem(result, key, value) == -1)
8466                  goto failed1;
8467              Py_DECREF(key);
8468              Py_DECREF(value);
8469          }
8470          return result;
8471        failed1:
8472          Py_XDECREF(key);
8473          Py_XDECREF(value);
8474          Py_DECREF(result);
8475          return NULL;
8476      }
8477  
8478      /* Create a three-level trie */
8479      result = PyObject_Malloc(sizeof(struct encoding_map) +
8480                               16*count2 + 128*count3 - 1);
8481      if (!result) {
8482          return PyErr_NoMemory();
8483      }
8484  
8485      _PyObject_Init(result, &EncodingMapType);
8486      mresult = (struct encoding_map*)result;
8487      mresult->count2 = count2;
8488      mresult->count3 = count3;
8489      mlevel1 = mresult->level1;
8490      mlevel2 = mresult->level23;
8491      mlevel3 = mresult->level23 + 16*count2;
8492      memcpy(mlevel1, level1, 32);
8493      memset(mlevel2, 0xFF, 16*count2);
8494      memset(mlevel3, 0, 128*count3);
8495      count3 = 0;
8496      for (i = 1; i < length; i++) {
8497          int o1, o2, o3, i2, i3;
8498          Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8499          if (ch == 0xFFFE)
8500              /* unmapped character */
8501              continue;
8502          o1 = ch>>11;
8503          o2 = (ch>>7) & 0xF;
8504          i2 = 16*mlevel1[o1] + o2;
8505          if (mlevel2[i2] == 0xFF)
8506              mlevel2[i2] = count3++;
8507          o3 = ch & 0x7F;
8508          i3 = 128*mlevel2[i2] + o3;
8509          mlevel3[i3] = i;
8510      }
8511      return result;
8512  }
8513  
8514  static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8515  encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8516  {
8517      struct encoding_map *map = (struct encoding_map*)mapping;
8518      int l1 = c>>11;
8519      int l2 = (c>>7) & 0xF;
8520      int l3 = c & 0x7F;
8521      int i;
8522  
8523      if (c > 0xFFFF)
8524          return -1;
8525      if (c == 0)
8526          return 0;
8527      /* level 1*/
8528      i = map->level1[l1];
8529      if (i == 0xFF) {
8530          return -1;
8531      }
8532      /* level 2*/
8533      i = map->level23[16*i+l2];
8534      if (i == 0xFF) {
8535          return -1;
8536      }
8537      /* level 3 */
8538      i = map->level23[16*map->count2 + 128*i + l3];
8539      if (i == 0) {
8540          return -1;
8541      }
8542      return i;
8543  }
8544  
8545  /* Lookup the character ch in the mapping. If the character
8546     can't be found, Py_None is returned (or NULL, if another
8547     error occurred). */
8548  static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8549  charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8550  {
8551      PyObject *w = PyLong_FromLong((long)c);
8552      PyObject *x;
8553  
8554      if (w == NULL)
8555          return NULL;
8556      x = PyObject_GetItem(mapping, w);
8557      Py_DECREF(w);
8558      if (x == NULL) {
8559          if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8560              /* No mapping found means: mapping is undefined. */
8561              PyErr_Clear();
8562              Py_RETURN_NONE;
8563          } else
8564              return NULL;
8565      }
8566      else if (x == Py_None)
8567          return x;
8568      else if (PyLong_Check(x)) {
8569          long value = PyLong_AS_LONG(x);
8570          if (value < 0 || value > 255) {
8571              PyErr_SetString(PyExc_TypeError,
8572                              "character mapping must be in range(256)");
8573              Py_DECREF(x);
8574              return NULL;
8575          }
8576          return x;
8577      }
8578      else if (PyBytes_Check(x))
8579          return x;
8580      else {
8581          /* wrong return value */
8582          PyErr_Format(PyExc_TypeError,
8583                       "character mapping must return integer, bytes or None, not %.400s",
8584                       Py_TYPE(x)->tp_name);
8585          Py_DECREF(x);
8586          return NULL;
8587      }
8588  }
8589  
8590  static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8591  charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8592  {
8593      Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8594      /* exponentially overallocate to minimize reallocations */
8595      if (requiredsize < 2*outsize)
8596          requiredsize = 2*outsize;
8597      if (_PyBytes_Resize(outobj, requiredsize))
8598          return -1;
8599      return 0;
8600  }
8601  
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
8605  /* lookup the character, put the result in the output string and adjust
8606     various state variables. Resize the output bytes object if not enough
8607     space is available. Return a new reference to the object that
8608     was put in the output buffer, or Py_None, if the mapping was undefined
8609     (in which case no character was written) or NULL, if a
8610     reallocation error occurred. The caller must decref the result */
8611  static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8612  charmapencode_output(Py_UCS4 c, PyObject *mapping,
8613                       PyObject **outobj, Py_ssize_t *outpos)
8614  {
8615      PyObject *rep;
8616      char *outstart;
8617      Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8618  
8619      if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8620          int res = encoding_map_lookup(c, mapping);
8621          Py_ssize_t requiredsize = *outpos+1;
8622          if (res == -1)
8623              return enc_FAILED;
8624          if (outsize<requiredsize)
8625              if (charmapencode_resize(outobj, outpos, requiredsize))
8626                  return enc_EXCEPTION;
8627          outstart = PyBytes_AS_STRING(*outobj);
8628          outstart[(*outpos)++] = (char)res;
8629          return enc_SUCCESS;
8630      }
8631  
8632      rep = charmapencode_lookup(c, mapping);
8633      if (rep==NULL)
8634          return enc_EXCEPTION;
8635      else if (rep==Py_None) {
8636          Py_DECREF(rep);
8637          return enc_FAILED;
8638      } else {
8639          if (PyLong_Check(rep)) {
8640              Py_ssize_t requiredsize = *outpos+1;
8641              if (outsize<requiredsize)
8642                  if (charmapencode_resize(outobj, outpos, requiredsize)) {
8643                      Py_DECREF(rep);
8644                      return enc_EXCEPTION;
8645                  }
8646              outstart = PyBytes_AS_STRING(*outobj);
8647              outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8648          }
8649          else {
8650              const char *repchars = PyBytes_AS_STRING(rep);
8651              Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8652              Py_ssize_t requiredsize = *outpos+repsize;
8653              if (outsize<requiredsize)
8654                  if (charmapencode_resize(outobj, outpos, requiredsize)) {
8655                      Py_DECREF(rep);
8656                      return enc_EXCEPTION;
8657                  }
8658              outstart = PyBytes_AS_STRING(*outobj);
8659              memcpy(outstart + *outpos, repchars, repsize);
8660              *outpos += repsize;
8661          }
8662      }
8663      Py_DECREF(rep);
8664      return enc_SUCCESS;
8665  }
8666  
8667  /* handle an error in PyUnicode_EncodeCharmap
8668     Return 0 on success, -1 on error */
8669  static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8670  charmap_encoding_error(
8671      PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8672      PyObject **exceptionObject,
8673      _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8674      PyObject **res, Py_ssize_t *respos)
8675  {
8676      PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8677      Py_ssize_t size, repsize;
8678      Py_ssize_t newpos;
8679      enum PyUnicode_Kind kind;
8680      const void *data;
8681      Py_ssize_t index;
8682      /* startpos for collecting unencodable chars */
8683      Py_ssize_t collstartpos = *inpos;
8684      Py_ssize_t collendpos = *inpos+1;
8685      Py_ssize_t collpos;
8686      const char *encoding = "charmap";
8687      const char *reason = "character maps to <undefined>";
8688      charmapencode_result x;
8689      Py_UCS4 ch;
8690      int val;
8691  
8692      if (PyUnicode_READY(unicode) == -1)
8693          return -1;
8694      size = PyUnicode_GET_LENGTH(unicode);
8695      /* find all unencodable characters */
8696      while (collendpos < size) {
8697          PyObject *rep;
8698          if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8699              ch = PyUnicode_READ_CHAR(unicode, collendpos);
8700              val = encoding_map_lookup(ch, mapping);
8701              if (val != -1)
8702                  break;
8703              ++collendpos;
8704              continue;
8705          }
8706  
8707          ch = PyUnicode_READ_CHAR(unicode, collendpos);
8708          rep = charmapencode_lookup(ch, mapping);
8709          if (rep==NULL)
8710              return -1;
8711          else if (rep!=Py_None) {
8712              Py_DECREF(rep);
8713              break;
8714          }
8715          Py_DECREF(rep);
8716          ++collendpos;
8717      }
8718      /* cache callback name lookup
8719       * (if not done yet, i.e. it's the first error) */
8720      if (*error_handler == _Py_ERROR_UNKNOWN)
8721          *error_handler = _Py_GetErrorHandler(errors);
8722  
8723      switch (*error_handler) {
8724      case _Py_ERROR_STRICT:
8725          raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8726          return -1;
8727  
8728      case _Py_ERROR_REPLACE:
8729          for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8730              x = charmapencode_output('?', mapping, res, respos);
8731              if (x==enc_EXCEPTION) {
8732                  return -1;
8733              }
8734              else if (x==enc_FAILED) {
8735                  raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8736                  return -1;
8737              }
8738          }
8739          /* fall through */
8740      case _Py_ERROR_IGNORE:
8741          *inpos = collendpos;
8742          break;
8743  
8744      case _Py_ERROR_XMLCHARREFREPLACE:
8745          /* generate replacement (temporarily (mis)uses p) */
8746          for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8747              char buffer[2+29+1+1];
8748              char *cp;
8749              sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8750              for (cp = buffer; *cp; ++cp) {
8751                  x = charmapencode_output(*cp, mapping, res, respos);
8752                  if (x==enc_EXCEPTION)
8753                      return -1;
8754                  else if (x==enc_FAILED) {
8755                      raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8756                      return -1;
8757                  }
8758              }
8759          }
8760          *inpos = collendpos;
8761          break;
8762  
8763      default:
8764          repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8765                                                        encoding, reason, unicode, exceptionObject,
8766                                                        collstartpos, collendpos, &newpos);
8767          if (repunicode == NULL)
8768              return -1;
8769          if (PyBytes_Check(repunicode)) {
8770              /* Directly copy bytes result to output. */
8771              Py_ssize_t outsize = PyBytes_Size(*res);
8772              Py_ssize_t requiredsize;
8773              repsize = PyBytes_Size(repunicode);
8774              requiredsize = *respos + repsize;
8775              if (requiredsize > outsize)
8776                  /* Make room for all additional bytes. */
8777                  if (charmapencode_resize(res, respos, requiredsize)) {
8778                      Py_DECREF(repunicode);
8779                      return -1;
8780                  }
8781              memcpy(PyBytes_AsString(*res) + *respos,
8782                     PyBytes_AsString(repunicode),  repsize);
8783              *respos += repsize;
8784              *inpos = newpos;
8785              Py_DECREF(repunicode);
8786              break;
8787          }
8788          /* generate replacement  */
8789          if (PyUnicode_READY(repunicode) == -1) {
8790              Py_DECREF(repunicode);
8791              return -1;
8792          }
8793          repsize = PyUnicode_GET_LENGTH(repunicode);
8794          data = PyUnicode_DATA(repunicode);
8795          kind = PyUnicode_KIND(repunicode);
8796          for (index = 0; index < repsize; index++) {
8797              Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8798              x = charmapencode_output(repch, mapping, res, respos);
8799              if (x==enc_EXCEPTION) {
8800                  Py_DECREF(repunicode);
8801                  return -1;
8802              }
8803              else if (x==enc_FAILED) {
8804                  Py_DECREF(repunicode);
8805                  raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8806                  return -1;
8807              }
8808          }
8809          *inpos = newpos;
8810          Py_DECREF(repunicode);
8811      }
8812      return 0;
8813  }
8814  
8815  PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8816  _PyUnicode_EncodeCharmap(PyObject *unicode,
8817                           PyObject *mapping,
8818                           const char *errors)
8819  {
8820      /* output object */
8821      PyObject *res = NULL;
8822      /* current input position */
8823      Py_ssize_t inpos = 0;
8824      Py_ssize_t size;
8825      /* current output position */
8826      Py_ssize_t respos = 0;
8827      PyObject *error_handler_obj = NULL;
8828      PyObject *exc = NULL;
8829      _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8830      const void *data;
8831      int kind;
8832  
8833      if (PyUnicode_READY(unicode) == -1)
8834          return NULL;
8835      size = PyUnicode_GET_LENGTH(unicode);
8836      data = PyUnicode_DATA(unicode);
8837      kind = PyUnicode_KIND(unicode);
8838  
8839      /* Default to Latin-1 */
8840      if (mapping == NULL)
8841          return unicode_encode_ucs1(unicode, errors, 256);
8842  
8843      /* allocate enough for a simple encoding without
8844         replacements, if we need more, we'll resize */
8845      res = PyBytes_FromStringAndSize(NULL, size);
8846      if (res == NULL)
8847          goto onError;
8848      if (size == 0)
8849          return res;
8850  
8851      while (inpos<size) {
8852          Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8853          /* try to encode it */
8854          charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8855          if (x==enc_EXCEPTION) /* error */
8856              goto onError;
8857          if (x==enc_FAILED) { /* unencodable character */
8858              if (charmap_encoding_error(unicode, &inpos, mapping,
8859                                         &exc,
8860                                         &error_handler, &error_handler_obj, errors,
8861                                         &res, &respos)) {
8862                  goto onError;
8863              }
8864          }
8865          else
8866              /* done with this character => adjust input position */
8867              ++inpos;
8868      }
8869  
8870      /* Resize if we allocated to much */
8871      if (respos<PyBytes_GET_SIZE(res))
8872          if (_PyBytes_Resize(&res, respos) < 0)
8873              goto onError;
8874  
8875      Py_XDECREF(exc);
8876      Py_XDECREF(error_handler_obj);
8877      return res;
8878  
8879    onError:
8880      Py_XDECREF(res);
8881      Py_XDECREF(exc);
8882      Py_XDECREF(error_handler_obj);
8883      return NULL;
8884  }
8885  
8886  PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8887  PyUnicode_AsCharmapString(PyObject *unicode,
8888                            PyObject *mapping)
8889  {
8890      if (!PyUnicode_Check(unicode) || mapping == NULL) {
8891          PyErr_BadArgument();
8892          return NULL;
8893      }
8894      return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8895  }
8896  
8897  /* create or adjust a UnicodeTranslateError */
8898  static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8899  make_translate_exception(PyObject **exceptionObject,
8900                           PyObject *unicode,
8901                           Py_ssize_t startpos, Py_ssize_t endpos,
8902                           const char *reason)
8903  {
8904      if (*exceptionObject == NULL) {
8905          *exceptionObject = _PyUnicodeTranslateError_Create(
8906              unicode, startpos, endpos, reason);
8907      }
8908      else {
8909          if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8910              goto onError;
8911          if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8912              goto onError;
8913          if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8914              goto onError;
8915          return;
8916        onError:
8917          Py_CLEAR(*exceptionObject);
8918      }
8919  }
8920  
8921  /* error handling callback helper:
8922     build arguments, call the callback and check the arguments,
8923     put the result into newpos and return the replacement string, which
8924     has to be freed by the caller */
8925  static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8926  unicode_translate_call_errorhandler(const char *errors,
8927                                      PyObject **errorHandler,
8928                                      const char *reason,
8929                                      PyObject *unicode, PyObject **exceptionObject,
8930                                      Py_ssize_t startpos, Py_ssize_t endpos,
8931                                      Py_ssize_t *newpos)
8932  {
8933      static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8934  
8935      Py_ssize_t i_newpos;
8936      PyObject *restuple;
8937      PyObject *resunicode;
8938  
8939      if (*errorHandler == NULL) {
8940          *errorHandler = PyCodec_LookupError(errors);
8941          if (*errorHandler == NULL)
8942              return NULL;
8943      }
8944  
8945      make_translate_exception(exceptionObject,
8946                               unicode, startpos, endpos, reason);
8947      if (*exceptionObject == NULL)
8948          return NULL;
8949  
8950      restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8951      if (restuple == NULL)
8952          return NULL;
8953      if (!PyTuple_Check(restuple)) {
8954          PyErr_SetString(PyExc_TypeError, &argparse[3]);
8955          Py_DECREF(restuple);
8956          return NULL;
8957      }
8958      if (!PyArg_ParseTuple(restuple, argparse,
8959                            &resunicode, &i_newpos)) {
8960          Py_DECREF(restuple);
8961          return NULL;
8962      }
8963      if (i_newpos<0)
8964          *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8965      else
8966          *newpos = i_newpos;
8967      if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8968          PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8969          Py_DECREF(restuple);
8970          return NULL;
8971      }
8972      Py_INCREF(resunicode);
8973      Py_DECREF(restuple);
8974      return resunicode;
8975  }
8976  
8977  /* Lookup the character ch in the mapping and put the result in result,
8978     which must be decrefed by the caller.
8979     Return 0 on success, -1 on error */
8980  static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8981  charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8982  {
8983      PyObject *w = PyLong_FromLong((long)c);
8984      PyObject *x;
8985  
8986      if (w == NULL)
8987          return -1;
8988      x = PyObject_GetItem(mapping, w);
8989      Py_DECREF(w);
8990      if (x == NULL) {
8991          if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8992              /* No mapping found means: use 1:1 mapping. */
8993              PyErr_Clear();
8994              *result = NULL;
8995              return 0;
8996          } else
8997              return -1;
8998      }
8999      else if (x == Py_None) {
9000          *result = x;
9001          return 0;
9002      }
9003      else if (PyLong_Check(x)) {
9004          long value = PyLong_AS_LONG(x);
9005          if (value < 0 || value > MAX_UNICODE) {
9006              PyErr_Format(PyExc_ValueError,
9007                           "character mapping must be in range(0x%x)",
9008                           MAX_UNICODE+1);
9009              Py_DECREF(x);
9010              return -1;
9011          }
9012          *result = x;
9013          return 0;
9014      }
9015      else if (PyUnicode_Check(x)) {
9016          *result = x;
9017          return 0;
9018      }
9019      else {
9020          /* wrong return value */
9021          PyErr_SetString(PyExc_TypeError,
9022                          "character mapping must return integer, None or str");
9023          Py_DECREF(x);
9024          return -1;
9025      }
9026  }
9027  
9028  /* lookup the character, write the result into the writer.
9029     Return 1 if the result was written into the writer, return 0 if the mapping
9030     was undefined, raise an exception return -1 on error. */
9031  static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)9032  charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9033                          _PyUnicodeWriter *writer)
9034  {
9035      PyObject *item;
9036  
9037      if (charmaptranslate_lookup(ch, mapping, &item))
9038          return -1;
9039  
9040      if (item == NULL) {
9041          /* not found => default to 1:1 mapping */
9042          if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9043              return -1;
9044          }
9045          return 1;
9046      }
9047  
9048      if (item == Py_None) {
9049          Py_DECREF(item);
9050          return 0;
9051      }
9052  
9053      if (PyLong_Check(item)) {
9054          long ch = (Py_UCS4)PyLong_AS_LONG(item);
9055          /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9056             used it */
9057          if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9058              Py_DECREF(item);
9059              return -1;
9060          }
9061          Py_DECREF(item);
9062          return 1;
9063      }
9064  
9065      if (!PyUnicode_Check(item)) {
9066          Py_DECREF(item);
9067          return -1;
9068      }
9069  
9070      if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9071          Py_DECREF(item);
9072          return -1;
9073      }
9074  
9075      Py_DECREF(item);
9076      return 1;
9077  }
9078  
9079  static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9080  unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9081                                Py_UCS1 *translate)
9082  {
9083      PyObject *item = NULL;
9084      int ret = 0;
9085  
9086      if (charmaptranslate_lookup(ch, mapping, &item)) {
9087          return -1;
9088      }
9089  
9090      if (item == Py_None) {
9091          /* deletion */
9092          translate[ch] = 0xfe;
9093      }
9094      else if (item == NULL) {
9095          /* not found => default to 1:1 mapping */
9096          translate[ch] = ch;
9097          return 1;
9098      }
9099      else if (PyLong_Check(item)) {
9100          long replace = PyLong_AS_LONG(item);
9101          /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9102             used it */
9103          if (127 < replace) {
9104              /* invalid character or character outside ASCII:
9105                 skip the fast translate */
9106              goto exit;
9107          }
9108          translate[ch] = (Py_UCS1)replace;
9109      }
9110      else if (PyUnicode_Check(item)) {
9111          Py_UCS4 replace;
9112  
9113          if (PyUnicode_READY(item) == -1) {
9114              Py_DECREF(item);
9115              return -1;
9116          }
9117          if (PyUnicode_GET_LENGTH(item) != 1)
9118              goto exit;
9119  
9120          replace = PyUnicode_READ_CHAR(item, 0);
9121          if (replace > 127)
9122              goto exit;
9123          translate[ch] = (Py_UCS1)replace;
9124      }
9125      else {
9126          /* not None, NULL, long or unicode */
9127          goto exit;
9128      }
9129      ret = 1;
9130  
9131    exit:
9132      Py_DECREF(item);
9133      return ret;
9134  }
9135  
9136  /* Fast path for ascii => ascii translation. Return 1 if the whole string
9137     was translated into writer, return 0 if the input string was partially
9138     translated into writer, raise an exception and return -1 on error. */
9139  static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9140  unicode_fast_translate(PyObject *input, PyObject *mapping,
9141                         _PyUnicodeWriter *writer, int ignore,
9142                         Py_ssize_t *input_pos)
9143  {
9144      Py_UCS1 ascii_table[128], ch, ch2;
9145      Py_ssize_t len;
9146      const Py_UCS1 *in, *end;
9147      Py_UCS1 *out;
9148      int res = 0;
9149  
9150      len = PyUnicode_GET_LENGTH(input);
9151  
9152      memset(ascii_table, 0xff, 128);
9153  
9154      in = PyUnicode_1BYTE_DATA(input);
9155      end = in + len;
9156  
9157      assert(PyUnicode_IS_ASCII(writer->buffer));
9158      assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9159      out = PyUnicode_1BYTE_DATA(writer->buffer);
9160  
9161      for (; in < end; in++) {
9162          ch = *in;
9163          ch2 = ascii_table[ch];
9164          if (ch2 == 0xff) {
9165              int translate = unicode_fast_translate_lookup(mapping, ch,
9166                                                            ascii_table);
9167              if (translate < 0)
9168                  return -1;
9169              if (translate == 0)
9170                  goto exit;
9171              ch2 = ascii_table[ch];
9172          }
9173          if (ch2 == 0xfe) {
9174              if (ignore)
9175                  continue;
9176              goto exit;
9177          }
9178          assert(ch2 < 128);
9179          *out = ch2;
9180          out++;
9181      }
9182      res = 1;
9183  
9184  exit:
9185      writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9186      *input_pos = in - PyUnicode_1BYTE_DATA(input);
9187      return res;
9188  }
9189  
9190  static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9191  _PyUnicode_TranslateCharmap(PyObject *input,
9192                              PyObject *mapping,
9193                              const char *errors)
9194  {
9195      /* input object */
9196      const void *data;
9197      Py_ssize_t size, i;
9198      int kind;
9199      /* output buffer */
9200      _PyUnicodeWriter writer;
9201      /* error handler */
9202      const char *reason = "character maps to <undefined>";
9203      PyObject *errorHandler = NULL;
9204      PyObject *exc = NULL;
9205      int ignore;
9206      int res;
9207  
9208      if (mapping == NULL) {
9209          PyErr_BadArgument();
9210          return NULL;
9211      }
9212  
9213      if (PyUnicode_READY(input) == -1)
9214          return NULL;
9215      data = PyUnicode_DATA(input);
9216      kind = PyUnicode_KIND(input);
9217      size = PyUnicode_GET_LENGTH(input);
9218  
9219      if (size == 0)
9220          return PyUnicode_FromObject(input);
9221  
9222      /* allocate enough for a simple 1:1 translation without
9223         replacements, if we need more, we'll resize */
9224      _PyUnicodeWriter_Init(&writer);
9225      if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9226          goto onError;
9227  
9228      ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9229  
9230      if (PyUnicode_READY(input) == -1)
9231          return NULL;
9232      if (PyUnicode_IS_ASCII(input)) {
9233          res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9234          if (res < 0) {
9235              _PyUnicodeWriter_Dealloc(&writer);
9236              return NULL;
9237          }
9238          if (res == 1)
9239              return _PyUnicodeWriter_Finish(&writer);
9240      }
9241      else {
9242          i = 0;
9243      }
9244  
9245      while (i<size) {
9246          /* try to encode it */
9247          int translate;
9248          PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9249          Py_ssize_t newpos;
9250          /* startpos for collecting untranslatable chars */
9251          Py_ssize_t collstart;
9252          Py_ssize_t collend;
9253          Py_UCS4 ch;
9254  
9255          ch = PyUnicode_READ(kind, data, i);
9256          translate = charmaptranslate_output(ch, mapping, &writer);
9257          if (translate < 0)
9258              goto onError;
9259  
9260          if (translate != 0) {
9261              /* it worked => adjust input pointer */
9262              ++i;
9263              continue;
9264          }
9265  
9266          /* untranslatable character */
9267          collstart = i;
9268          collend = i+1;
9269  
9270          /* find all untranslatable characters */
9271          while (collend < size) {
9272              PyObject *x;
9273              ch = PyUnicode_READ(kind, data, collend);
9274              if (charmaptranslate_lookup(ch, mapping, &x))
9275                  goto onError;
9276              Py_XDECREF(x);
9277              if (x != Py_None)
9278                  break;
9279              ++collend;
9280          }
9281  
9282          if (ignore) {
9283              i = collend;
9284          }
9285          else {
9286              repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9287                                                               reason, input, &exc,
9288                                                               collstart, collend, &newpos);
9289              if (repunicode == NULL)
9290                  goto onError;
9291              if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9292                  Py_DECREF(repunicode);
9293                  goto onError;
9294              }
9295              Py_DECREF(repunicode);
9296              i = newpos;
9297          }
9298      }
9299      Py_XDECREF(exc);
9300      Py_XDECREF(errorHandler);
9301      return _PyUnicodeWriter_Finish(&writer);
9302  
9303    onError:
9304      _PyUnicodeWriter_Dealloc(&writer);
9305      Py_XDECREF(exc);
9306      Py_XDECREF(errorHandler);
9307      return NULL;
9308  }
9309  
9310  PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9311  PyUnicode_Translate(PyObject *str,
9312                      PyObject *mapping,
9313                      const char *errors)
9314  {
9315      if (ensure_unicode(str) < 0)
9316          return NULL;
9317      return _PyUnicode_TranslateCharmap(str, mapping, errors);
9318  }
9319  
9320  PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9321  _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9322  {
9323      if (!PyUnicode_Check(unicode)) {
9324          PyErr_BadInternalCall();
9325          return NULL;
9326      }
9327      if (PyUnicode_READY(unicode) == -1)
9328          return NULL;
9329      if (PyUnicode_IS_ASCII(unicode)) {
9330          /* If the string is already ASCII, just return the same string */
9331          Py_INCREF(unicode);
9332          return unicode;
9333      }
9334  
9335      Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9336      PyObject *result = PyUnicode_New(len, 127);
9337      if (result == NULL) {
9338          return NULL;
9339      }
9340  
9341      Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9342      int kind = PyUnicode_KIND(unicode);
9343      const void *data = PyUnicode_DATA(unicode);
9344      Py_ssize_t i;
9345      for (i = 0; i < len; ++i) {
9346          Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9347          if (ch < 127) {
9348              out[i] = ch;
9349          }
9350          else if (Py_UNICODE_ISSPACE(ch)) {
9351              out[i] = ' ';
9352          }
9353          else {
9354              int decimal = Py_UNICODE_TODECIMAL(ch);
9355              if (decimal < 0) {
9356                  out[i] = '?';
9357                  out[i+1] = '\0';
9358                  _PyUnicode_LENGTH(result) = i + 1;
9359                  break;
9360              }
9361              out[i] = '0' + decimal;
9362          }
9363      }
9364  
9365      assert(_PyUnicode_CheckConsistency(result, 1));
9366      return result;
9367  }
9368  
9369  /* --- Helpers ------------------------------------------------------------ */
9370  
/* helper macro to fixup start/end slice values:
   `end` is clamped to `len`; negative `start`/`end` count from the end of
   the sequence and are clamped to 0.  `start` and `end` must be modifiable
   lvalues; every argument may be evaluated more than once.
   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. in an unbraced if/else); callers must terminate
   it with a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9385  
9386  static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9387  any_find_slice(PyObject* s1, PyObject* s2,
9388                 Py_ssize_t start,
9389                 Py_ssize_t end,
9390                 int direction)
9391  {
9392      int kind1, kind2;
9393      const void *buf1, *buf2;
9394      Py_ssize_t len1, len2, result;
9395  
9396      kind1 = PyUnicode_KIND(s1);
9397      kind2 = PyUnicode_KIND(s2);
9398      if (kind1 < kind2)
9399          return -1;
9400  
9401      len1 = PyUnicode_GET_LENGTH(s1);
9402      len2 = PyUnicode_GET_LENGTH(s2);
9403      ADJUST_INDICES(start, end, len1);
9404      if (end - start < len2)
9405          return -1;
9406  
9407      buf1 = PyUnicode_DATA(s1);
9408      buf2 = PyUnicode_DATA(s2);
9409      if (len2 == 1) {
9410          Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9411          result = findchar((const char *)buf1 + kind1*start,
9412                            kind1, end - start, ch, direction);
9413          if (result == -1)
9414              return -1;
9415          else
9416              return start + result;
9417      }
9418  
9419      if (kind2 != kind1) {
9420          buf2 = unicode_askind(kind2, buf2, len2, kind1);
9421          if (!buf2)
9422              return -2;
9423      }
9424  
9425      if (direction > 0) {
9426          switch (kind1) {
9427          case PyUnicode_1BYTE_KIND:
9428              if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9429                  result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9430              else
9431                  result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9432              break;
9433          case PyUnicode_2BYTE_KIND:
9434              result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9435              break;
9436          case PyUnicode_4BYTE_KIND:
9437              result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9438              break;
9439          default:
9440              Py_UNREACHABLE();
9441          }
9442      }
9443      else {
9444          switch (kind1) {
9445          case PyUnicode_1BYTE_KIND:
9446              if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9447                  result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9448              else
9449                  result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9450              break;
9451          case PyUnicode_2BYTE_KIND:
9452              result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9453              break;
9454          case PyUnicode_4BYTE_KIND:
9455              result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9456              break;
9457          default:
9458              Py_UNREACHABLE();
9459          }
9460      }
9461  
9462      assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9463      if (kind2 != kind1)
9464          PyMem_Free((void *)buf2);
9465  
9466      return result;
9467  }
9468  
9469  /* _PyUnicode_InsertThousandsGrouping() helper functions */
9470  #include "stringlib/localeutil.h"
9471  
9472  /**
9473   * InsertThousandsGrouping:
9474   * @writer: Unicode writer.
9475   * @n_buffer: Number of characters in @buffer.
9476   * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9477   * @d_pos: Start of digits string.
9478   * @n_digits: The number of digits in the string, in which we want
9479   *            to put the grouping chars.
9480   * @min_width: The minimum width of the digits in the output string.
9481   *             Output will be zero-padded on the left to fill.
9482   * @grouping: see definition in localeconv().
9483   * @thousands_sep: see definition in localeconv().
9484   *
9485   * There are 2 modes: counting and filling. If @writer is NULL,
9486   *  we are in counting mode, else filling mode.
9487   * If counting, the required buffer size is returned.
9488   * If filling, we know the buffer will be large enough, so we don't
9489   *  need to pass in the buffer size.
9490   * Inserts thousand grouping characters (as defined by grouping and
9491   *  thousands_sep) into @writer.
9492   *
9493   * Return value: -1 on error, number of characters otherwise.
9494   **/
9495  Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9496  _PyUnicode_InsertThousandsGrouping(
9497      _PyUnicodeWriter *writer,
9498      Py_ssize_t n_buffer,
9499      PyObject *digits,
9500      Py_ssize_t d_pos,
9501      Py_ssize_t n_digits,
9502      Py_ssize_t min_width,
9503      const char *grouping,
9504      PyObject *thousands_sep,
9505      Py_UCS4 *maxchar)
9506  {
9507      min_width = Py_MAX(0, min_width);
9508      if (writer) {
9509          assert(digits != NULL);
9510          assert(maxchar == NULL);
9511      }
9512      else {
9513          assert(digits == NULL);
9514          assert(maxchar != NULL);
9515      }
9516      assert(0 <= d_pos);
9517      assert(0 <= n_digits);
9518      assert(grouping != NULL);
9519  
9520      if (digits != NULL) {
9521          if (PyUnicode_READY(digits) == -1) {
9522              return -1;
9523          }
9524      }
9525      if (PyUnicode_READY(thousands_sep) == -1) {
9526          return -1;
9527      }
9528  
9529      Py_ssize_t count = 0;
9530      Py_ssize_t n_zeros;
9531      int loop_broken = 0;
9532      int use_separator = 0; /* First time through, don't append the
9533                                separator. They only go between
9534                                groups. */
9535      Py_ssize_t buffer_pos;
9536      Py_ssize_t digits_pos;
9537      Py_ssize_t len;
9538      Py_ssize_t n_chars;
9539      Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9540                                          be looked at */
9541      /* A generator that returns all of the grouping widths, until it
9542         returns 0. */
9543      GroupGenerator groupgen;
9544      GroupGenerator_init(&groupgen, grouping);
9545      const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9546  
9547      /* if digits are not grouped, thousands separator
9548         should be an empty string */
9549      assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9550  
9551      digits_pos = d_pos + n_digits;
9552      if (writer) {
9553          buffer_pos = writer->pos + n_buffer;
9554          assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9555          assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9556      }
9557      else {
9558          buffer_pos = n_buffer;
9559      }
9560  
9561      if (!writer) {
9562          *maxchar = 127;
9563      }
9564  
9565      while ((len = GroupGenerator_next(&groupgen)) > 0) {
9566          len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9567          n_zeros = Py_MAX(0, len - remaining);
9568          n_chars = Py_MAX(0, Py_MIN(remaining, len));
9569  
9570          /* Use n_zero zero's and n_chars chars */
9571  
9572          /* Count only, don't do anything. */
9573          count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9574  
9575          /* Copy into the writer. */
9576          InsertThousandsGrouping_fill(writer, &buffer_pos,
9577                                       digits, &digits_pos,
9578                                       n_chars, n_zeros,
9579                                       use_separator ? thousands_sep : NULL,
9580                                       thousands_sep_len, maxchar);
9581  
9582          /* Use a separator next time. */
9583          use_separator = 1;
9584  
9585          remaining -= n_chars;
9586          min_width -= len;
9587  
9588          if (remaining <= 0 && min_width <= 0) {
9589              loop_broken = 1;
9590              break;
9591          }
9592          min_width -= thousands_sep_len;
9593      }
9594      if (!loop_broken) {
9595          /* We left the loop without using a break statement. */
9596  
9597          len = Py_MAX(Py_MAX(remaining, min_width), 1);
9598          n_zeros = Py_MAX(0, len - remaining);
9599          n_chars = Py_MAX(0, Py_MIN(remaining, len));
9600  
9601          /* Use n_zero zero's and n_chars chars */
9602          count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9603  
9604          /* Copy into the writer. */
9605          InsertThousandsGrouping_fill(writer, &buffer_pos,
9606                                       digits, &digits_pos,
9607                                       n_chars, n_zeros,
9608                                       use_separator ? thousands_sep : NULL,
9609                                       thousands_sep_len, maxchar);
9610      }
9611      return count;
9612  }
9613  
9614  
9615  Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9616  PyUnicode_Count(PyObject *str,
9617                  PyObject *substr,
9618                  Py_ssize_t start,
9619                  Py_ssize_t end)
9620  {
9621      Py_ssize_t result;
9622      int kind1, kind2;
9623      const void *buf1 = NULL, *buf2 = NULL;
9624      Py_ssize_t len1, len2;
9625  
9626      if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9627          return -1;
9628  
9629      kind1 = PyUnicode_KIND(str);
9630      kind2 = PyUnicode_KIND(substr);
9631      if (kind1 < kind2)
9632          return 0;
9633  
9634      len1 = PyUnicode_GET_LENGTH(str);
9635      len2 = PyUnicode_GET_LENGTH(substr);
9636      ADJUST_INDICES(start, end, len1);
9637      if (end - start < len2)
9638          return 0;
9639  
9640      buf1 = PyUnicode_DATA(str);
9641      buf2 = PyUnicode_DATA(substr);
9642      if (kind2 != kind1) {
9643          buf2 = unicode_askind(kind2, buf2, len2, kind1);
9644          if (!buf2)
9645              goto onError;
9646      }
9647  
9648      switch (kind1) {
9649      case PyUnicode_1BYTE_KIND:
9650          if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9651              result = asciilib_count(
9652                  ((const Py_UCS1*)buf1) + start, end - start,
9653                  buf2, len2, PY_SSIZE_T_MAX
9654                  );
9655          else
9656              result = ucs1lib_count(
9657                  ((const Py_UCS1*)buf1) + start, end - start,
9658                  buf2, len2, PY_SSIZE_T_MAX
9659                  );
9660          break;
9661      case PyUnicode_2BYTE_KIND:
9662          result = ucs2lib_count(
9663              ((const Py_UCS2*)buf1) + start, end - start,
9664              buf2, len2, PY_SSIZE_T_MAX
9665              );
9666          break;
9667      case PyUnicode_4BYTE_KIND:
9668          result = ucs4lib_count(
9669              ((const Py_UCS4*)buf1) + start, end - start,
9670              buf2, len2, PY_SSIZE_T_MAX
9671              );
9672          break;
9673      default:
9674          Py_UNREACHABLE();
9675      }
9676  
9677      assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9678      if (kind2 != kind1)
9679          PyMem_Free((void *)buf2);
9680  
9681      return result;
9682    onError:
9683      assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9684      if (kind2 != kind1)
9685          PyMem_Free((void *)buf2);
9686      return -1;
9687  }
9688  
9689  Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9690  PyUnicode_Find(PyObject *str,
9691                 PyObject *substr,
9692                 Py_ssize_t start,
9693                 Py_ssize_t end,
9694                 int direction)
9695  {
9696      if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9697          return -2;
9698  
9699      return any_find_slice(str, substr, start, end, direction);
9700  }
9701  
9702  Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9703  PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9704                     Py_ssize_t start, Py_ssize_t end,
9705                     int direction)
9706  {
9707      int kind;
9708      Py_ssize_t len, result;
9709      if (PyUnicode_READY(str) == -1)
9710          return -2;
9711      len = PyUnicode_GET_LENGTH(str);
9712      ADJUST_INDICES(start, end, len);
9713      if (end - start < 1)
9714          return -1;
9715      kind = PyUnicode_KIND(str);
9716      result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9717                        kind, end-start, ch, direction);
9718      if (result == -1)
9719          return -1;
9720      else
9721          return start + result;
9722  }
9723  
9724  static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9725  tailmatch(PyObject *self,
9726            PyObject *substring,
9727            Py_ssize_t start,
9728            Py_ssize_t end,
9729            int direction)
9730  {
9731      int kind_self;
9732      int kind_sub;
9733      const void *data_self;
9734      const void *data_sub;
9735      Py_ssize_t offset;
9736      Py_ssize_t i;
9737      Py_ssize_t end_sub;
9738  
9739      if (PyUnicode_READY(self) == -1 ||
9740          PyUnicode_READY(substring) == -1)
9741          return -1;
9742  
9743      ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9744      end -= PyUnicode_GET_LENGTH(substring);
9745      if (end < start)
9746          return 0;
9747  
9748      if (PyUnicode_GET_LENGTH(substring) == 0)
9749          return 1;
9750  
9751      kind_self = PyUnicode_KIND(self);
9752      data_self = PyUnicode_DATA(self);
9753      kind_sub = PyUnicode_KIND(substring);
9754      data_sub = PyUnicode_DATA(substring);
9755      end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9756  
9757      if (direction > 0)
9758          offset = end;
9759      else
9760          offset = start;
9761  
9762      if (PyUnicode_READ(kind_self, data_self, offset) ==
9763          PyUnicode_READ(kind_sub, data_sub, 0) &&
9764          PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9765          PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9766          /* If both are of the same kind, memcmp is sufficient */
9767          if (kind_self == kind_sub) {
9768              return ! memcmp((char *)data_self +
9769                                  (offset * PyUnicode_KIND(substring)),
9770                              data_sub,
9771                              PyUnicode_GET_LENGTH(substring) *
9772                                  PyUnicode_KIND(substring));
9773          }
9774          /* otherwise we have to compare each character by first accessing it */
9775          else {
9776              /* We do not need to compare 0 and len(substring)-1 because
9777                 the if statement above ensured already that they are equal
9778                 when we end up here. */
9779              for (i = 1; i < end_sub; ++i) {
9780                  if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9781                      PyUnicode_READ(kind_sub, data_sub, i))
9782                      return 0;
9783              }
9784              return 1;
9785          }
9786      }
9787  
9788      return 0;
9789  }
9790  
9791  Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9792  PyUnicode_Tailmatch(PyObject *str,
9793                      PyObject *substr,
9794                      Py_ssize_t start,
9795                      Py_ssize_t end,
9796                      int direction)
9797  {
9798      if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9799          return -1;
9800  
9801      return tailmatch(str, substr, start, end, direction);
9802  }
9803  
9804  static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9805  ascii_upper_or_lower(PyObject *self, int lower)
9806  {
9807      Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9808      const char *data = PyUnicode_DATA(self);
9809      char *resdata;
9810      PyObject *res;
9811  
9812      res = PyUnicode_New(len, 127);
9813      if (res == NULL)
9814          return NULL;
9815      resdata = PyUnicode_DATA(res);
9816      if (lower)
9817          _Py_bytes_lower(resdata, data, len);
9818      else
9819          _Py_bytes_upper(resdata, data, len);
9820      return res;
9821  }
9822  
9823  static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)9824  handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9825  {
9826      Py_ssize_t j;
9827      int final_sigma;
9828      Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9829      /* U+03A3 is in the Final_Sigma context when, it is found like this:
9830  
9831       \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9832  
9833      where ! is a negation and \p{xxx} is a character with property xxx.
9834      */
9835      for (j = i - 1; j >= 0; j--) {
9836          c = PyUnicode_READ(kind, data, j);
9837          if (!_PyUnicode_IsCaseIgnorable(c))
9838              break;
9839      }
9840      final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9841      if (final_sigma) {
9842          for (j = i + 1; j < length; j++) {
9843              c = PyUnicode_READ(kind, data, j);
9844              if (!_PyUnicode_IsCaseIgnorable(c))
9845                  break;
9846          }
9847          final_sigma = j == length || !_PyUnicode_IsCased(c);
9848      }
9849      return (final_sigma) ? 0x3C2 : 0x3C3;
9850  }
9851  
9852  static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9853  lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9854             Py_UCS4 c, Py_UCS4 *mapped)
9855  {
9856      /* Obscure special case. */
9857      if (c == 0x3A3) {
9858          mapped[0] = handle_capital_sigma(kind, data, length, i);
9859          return 1;
9860      }
9861      return _PyUnicode_ToLowerFull(c, mapped);
9862  }
9863  
9864  static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9865  do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9866  {
9867      Py_ssize_t i, k = 0;
9868      int n_res, j;
9869      Py_UCS4 c, mapped[3];
9870  
9871      c = PyUnicode_READ(kind, data, 0);
9872      n_res = _PyUnicode_ToTitleFull(c, mapped);
9873      for (j = 0; j < n_res; j++) {
9874          *maxchar = Py_MAX(*maxchar, mapped[j]);
9875          res[k++] = mapped[j];
9876      }
9877      for (i = 1; i < length; i++) {
9878          c = PyUnicode_READ(kind, data, i);
9879          n_res = lower_ucs4(kind, data, length, i, c, mapped);
9880          for (j = 0; j < n_res; j++) {
9881              *maxchar = Py_MAX(*maxchar, mapped[j]);
9882              res[k++] = mapped[j];
9883          }
9884      }
9885      return k;
9886  }
9887  
9888  static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9889  do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9890      Py_ssize_t i, k = 0;
9891  
9892      for (i = 0; i < length; i++) {
9893          Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9894          int n_res, j;
9895          if (Py_UNICODE_ISUPPER(c)) {
9896              n_res = lower_ucs4(kind, data, length, i, c, mapped);
9897          }
9898          else if (Py_UNICODE_ISLOWER(c)) {
9899              n_res = _PyUnicode_ToUpperFull(c, mapped);
9900          }
9901          else {
9902              n_res = 1;
9903              mapped[0] = c;
9904          }
9905          for (j = 0; j < n_res; j++) {
9906              *maxchar = Py_MAX(*maxchar, mapped[j]);
9907              res[k++] = mapped[j];
9908          }
9909      }
9910      return k;
9911  }
9912  
9913  static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9914  do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9915                    Py_UCS4 *maxchar, int lower)
9916  {
9917      Py_ssize_t i, k = 0;
9918  
9919      for (i = 0; i < length; i++) {
9920          Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9921          int n_res, j;
9922          if (lower)
9923              n_res = lower_ucs4(kind, data, length, i, c, mapped);
9924          else
9925              n_res = _PyUnicode_ToUpperFull(c, mapped);
9926          for (j = 0; j < n_res; j++) {
9927              *maxchar = Py_MAX(*maxchar, mapped[j]);
9928              res[k++] = mapped[j];
9929          }
9930      }
9931      return k;
9932  }
9933  
9934  static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9935  do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9936  {
9937      return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9938  }
9939  
9940  static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9941  do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9942  {
9943      return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9944  }
9945  
9946  static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9947  do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9948  {
9949      Py_ssize_t i, k = 0;
9950  
9951      for (i = 0; i < length; i++) {
9952          Py_UCS4 c = PyUnicode_READ(kind, data, i);
9953          Py_UCS4 mapped[3];
9954          int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9955          for (j = 0; j < n_res; j++) {
9956              *maxchar = Py_MAX(*maxchar, mapped[j]);
9957              res[k++] = mapped[j];
9958          }
9959      }
9960      return k;
9961  }
9962  
9963  static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9964  do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9965  {
9966      Py_ssize_t i, k = 0;
9967      int previous_is_cased;
9968  
9969      previous_is_cased = 0;
9970      for (i = 0; i < length; i++) {
9971          const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9972          Py_UCS4 mapped[3];
9973          int n_res, j;
9974  
9975          if (previous_is_cased)
9976              n_res = lower_ucs4(kind, data, length, i, c, mapped);
9977          else
9978              n_res = _PyUnicode_ToTitleFull(c, mapped);
9979  
9980          for (j = 0; j < n_res; j++) {
9981              *maxchar = Py_MAX(*maxchar, mapped[j]);
9982              res[k++] = mapped[j];
9983          }
9984  
9985          previous_is_cased = _PyUnicode_IsCased(c);
9986      }
9987      return k;
9988  }
9989  
9990  static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9991  case_operation(PyObject *self,
9992                 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9993  {
9994      PyObject *res = NULL;
9995      Py_ssize_t length, newlength = 0;
9996      int kind, outkind;
9997      const void *data;
9998      void *outdata;
9999      Py_UCS4 maxchar = 0, *tmp, *tmpend;
10000  
10001      assert(PyUnicode_IS_READY(self));
10002  
10003      kind = PyUnicode_KIND(self);
10004      data = PyUnicode_DATA(self);
10005      length = PyUnicode_GET_LENGTH(self);
10006      if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10007          PyErr_SetString(PyExc_OverflowError, "string is too long");
10008          return NULL;
10009      }
10010      tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10011      if (tmp == NULL)
10012          return PyErr_NoMemory();
10013      newlength = perform(kind, data, length, tmp, &maxchar);
10014      res = PyUnicode_New(newlength, maxchar);
10015      if (res == NULL)
10016          goto leave;
10017      tmpend = tmp + newlength;
10018      outdata = PyUnicode_DATA(res);
10019      outkind = PyUnicode_KIND(res);
10020      switch (outkind) {
10021      case PyUnicode_1BYTE_KIND:
10022          _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10023          break;
10024      case PyUnicode_2BYTE_KIND:
10025          _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10026          break;
10027      case PyUnicode_4BYTE_KIND:
10028          memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10029          break;
10030      default:
10031          Py_UNREACHABLE();
10032      }
10033    leave:
10034      PyMem_Free(tmp);
10035      return res;
10036  }
10037  
10038  PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)10039  PyUnicode_Join(PyObject *separator, PyObject *seq)
10040  {
10041      PyObject *res;
10042      PyObject *fseq;
10043      Py_ssize_t seqlen;
10044      PyObject **items;
10045  
10046      fseq = PySequence_Fast(seq, "can only join an iterable");
10047      if (fseq == NULL) {
10048          return NULL;
10049      }
10050  
10051      /* NOTE: the following code can't call back into Python code,
10052       * so we are sure that fseq won't be mutated.
10053       */
10054  
10055      items = PySequence_Fast_ITEMS(fseq);
10056      seqlen = PySequence_Fast_GET_SIZE(fseq);
10057      res = _PyUnicode_JoinArray(separator, items, seqlen);
10058      Py_DECREF(fseq);
10059      return res;
10060  }
10061  
10062  PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)10063  _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10064  {
10065      PyObject *res = NULL; /* the result */
10066      PyObject *sep = NULL;
10067      Py_ssize_t seplen;
10068      PyObject *item;
10069      Py_ssize_t sz, i, res_offset;
10070      Py_UCS4 maxchar;
10071      Py_UCS4 item_maxchar;
10072      int use_memcpy;
10073      unsigned char *res_data = NULL, *sep_data = NULL;
10074      PyObject *last_obj;
10075      unsigned int kind = 0;
10076  
10077      /* If empty sequence, return u"". */
10078      if (seqlen == 0) {
10079          _Py_RETURN_UNICODE_EMPTY();
10080      }
10081  
10082      /* If singleton sequence with an exact Unicode, return that. */
10083      last_obj = NULL;
10084      if (seqlen == 1) {
10085          if (PyUnicode_CheckExact(items[0])) {
10086              res = items[0];
10087              Py_INCREF(res);
10088              return res;
10089          }
10090          seplen = 0;
10091          maxchar = 0;
10092      }
10093      else {
10094          /* Set up sep and seplen */
10095          if (separator == NULL) {
10096              /* fall back to a blank space separator */
10097              sep = PyUnicode_FromOrdinal(' ');
10098              if (!sep)
10099                  goto onError;
10100              seplen = 1;
10101              maxchar = 32;
10102          }
10103          else {
10104              if (!PyUnicode_Check(separator)) {
10105                  PyErr_Format(PyExc_TypeError,
10106                               "separator: expected str instance,"
10107                               " %.80s found",
10108                               Py_TYPE(separator)->tp_name);
10109                  goto onError;
10110              }
10111              if (PyUnicode_READY(separator))
10112                  goto onError;
10113              sep = separator;
10114              seplen = PyUnicode_GET_LENGTH(separator);
10115              maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10116              /* inc refcount to keep this code path symmetric with the
10117                 above case of a blank separator */
10118              Py_INCREF(sep);
10119          }
10120          last_obj = sep;
10121      }
10122  
10123      /* There are at least two things to join, or else we have a subclass
10124       * of str in the sequence.
10125       * Do a pre-pass to figure out the total amount of space we'll
10126       * need (sz), and see whether all argument are strings.
10127       */
10128      sz = 0;
10129  #ifdef Py_DEBUG
10130      use_memcpy = 0;
10131  #else
10132      use_memcpy = 1;
10133  #endif
10134      for (i = 0; i < seqlen; i++) {
10135          size_t add_sz;
10136          item = items[i];
10137          if (!PyUnicode_Check(item)) {
10138              PyErr_Format(PyExc_TypeError,
10139                           "sequence item %zd: expected str instance,"
10140                           " %.80s found",
10141                           i, Py_TYPE(item)->tp_name);
10142              goto onError;
10143          }
10144          if (PyUnicode_READY(item) == -1)
10145              goto onError;
10146          add_sz = PyUnicode_GET_LENGTH(item);
10147          item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10148          maxchar = Py_MAX(maxchar, item_maxchar);
10149          if (i != 0) {
10150              add_sz += seplen;
10151          }
10152          if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10153              PyErr_SetString(PyExc_OverflowError,
10154                              "join() result is too long for a Python string");
10155              goto onError;
10156          }
10157          sz += add_sz;
10158          if (use_memcpy && last_obj != NULL) {
10159              if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10160                  use_memcpy = 0;
10161          }
10162          last_obj = item;
10163      }
10164  
10165      res = PyUnicode_New(sz, maxchar);
10166      if (res == NULL)
10167          goto onError;
10168  
10169      /* Catenate everything. */
10170  #ifdef Py_DEBUG
10171      use_memcpy = 0;
10172  #else
10173      if (use_memcpy) {
10174          res_data = PyUnicode_1BYTE_DATA(res);
10175          kind = PyUnicode_KIND(res);
10176          if (seplen != 0)
10177              sep_data = PyUnicode_1BYTE_DATA(sep);
10178      }
10179  #endif
10180      if (use_memcpy) {
10181          for (i = 0; i < seqlen; ++i) {
10182              Py_ssize_t itemlen;
10183              item = items[i];
10184  
10185              /* Copy item, and maybe the separator. */
10186              if (i && seplen != 0) {
10187                  memcpy(res_data,
10188                            sep_data,
10189                            kind * seplen);
10190                  res_data += kind * seplen;
10191              }
10192  
10193              itemlen = PyUnicode_GET_LENGTH(item);
10194              if (itemlen != 0) {
10195                  memcpy(res_data,
10196                            PyUnicode_DATA(item),
10197                            kind * itemlen);
10198                  res_data += kind * itemlen;
10199              }
10200          }
10201          assert(res_data == PyUnicode_1BYTE_DATA(res)
10202                             + kind * PyUnicode_GET_LENGTH(res));
10203      }
10204      else {
10205          for (i = 0, res_offset = 0; i < seqlen; ++i) {
10206              Py_ssize_t itemlen;
10207              item = items[i];
10208  
10209              /* Copy item, and maybe the separator. */
10210              if (i && seplen != 0) {
10211                  _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10212                  res_offset += seplen;
10213              }
10214  
10215              itemlen = PyUnicode_GET_LENGTH(item);
10216              if (itemlen != 0) {
10217                  _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10218                  res_offset += itemlen;
10219              }
10220          }
10221          assert(res_offset == PyUnicode_GET_LENGTH(res));
10222      }
10223  
10224      Py_XDECREF(sep);
10225      assert(_PyUnicode_CheckConsistency(res, 1));
10226      return res;
10227  
10228    onError:
10229      Py_XDECREF(sep);
10230      Py_XDECREF(res);
10231      return NULL;
10232  }
10233  
10234  void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10235  _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10236                      Py_UCS4 fill_char)
10237  {
10238      const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10239      void *data = PyUnicode_DATA(unicode);
10240      assert(PyUnicode_IS_READY(unicode));
10241      assert(unicode_modifiable(unicode));
10242      assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10243      assert(start >= 0);
10244      assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10245      unicode_fill(kind, data, fill_char, start, length);
10246  }
10247  
10248  Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10249  PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10250                 Py_UCS4 fill_char)
10251  {
10252      Py_ssize_t maxlen;
10253  
10254      if (!PyUnicode_Check(unicode)) {
10255          PyErr_BadInternalCall();
10256          return -1;
10257      }
10258      if (PyUnicode_READY(unicode) == -1)
10259          return -1;
10260      if (unicode_check_modifiable(unicode))
10261          return -1;
10262  
10263      if (start < 0) {
10264          PyErr_SetString(PyExc_IndexError, "string index out of range");
10265          return -1;
10266      }
10267      if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10268          PyErr_SetString(PyExc_ValueError,
10269                           "fill character is bigger than "
10270                           "the string maximum character");
10271          return -1;
10272      }
10273  
10274      maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10275      length = Py_MIN(maxlen, length);
10276      if (length <= 0)
10277          return 0;
10278  
10279      _PyUnicode_FastFill(unicode, start, length, fill_char);
10280      return length;
10281  }
10282  
10283  static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10284  pad(PyObject *self,
10285      Py_ssize_t left,
10286      Py_ssize_t right,
10287      Py_UCS4 fill)
10288  {
10289      PyObject *u;
10290      Py_UCS4 maxchar;
10291      int kind;
10292      void *data;
10293  
10294      if (left < 0)
10295          left = 0;
10296      if (right < 0)
10297          right = 0;
10298  
10299      if (left == 0 && right == 0)
10300          return unicode_result_unchanged(self);
10301  
10302      if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10303          right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10304          PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10305          return NULL;
10306      }
10307      maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10308      maxchar = Py_MAX(maxchar, fill);
10309      u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10310      if (!u)
10311          return NULL;
10312  
10313      kind = PyUnicode_KIND(u);
10314      data = PyUnicode_DATA(u);
10315      if (left)
10316          unicode_fill(kind, data, fill, 0, left);
10317      if (right)
10318          unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10319      _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10320      assert(_PyUnicode_CheckConsistency(u, 1));
10321      return u;
10322  }
10323  
10324  PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10325  PyUnicode_Splitlines(PyObject *string, int keepends)
10326  {
10327      PyObject *list;
10328  
10329      if (ensure_unicode(string) < 0)
10330          return NULL;
10331  
10332      switch (PyUnicode_KIND(string)) {
10333      case PyUnicode_1BYTE_KIND:
10334          if (PyUnicode_IS_ASCII(string))
10335              list = asciilib_splitlines(
10336                  string, PyUnicode_1BYTE_DATA(string),
10337                  PyUnicode_GET_LENGTH(string), keepends);
10338          else
10339              list = ucs1lib_splitlines(
10340                  string, PyUnicode_1BYTE_DATA(string),
10341                  PyUnicode_GET_LENGTH(string), keepends);
10342          break;
10343      case PyUnicode_2BYTE_KIND:
10344          list = ucs2lib_splitlines(
10345              string, PyUnicode_2BYTE_DATA(string),
10346              PyUnicode_GET_LENGTH(string), keepends);
10347          break;
10348      case PyUnicode_4BYTE_KIND:
10349          list = ucs4lib_splitlines(
10350              string, PyUnicode_4BYTE_DATA(string),
10351              PyUnicode_GET_LENGTH(string), keepends);
10352          break;
10353      default:
10354          Py_UNREACHABLE();
10355      }
10356      return list;
10357  }
10358  
10359  static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10360  split(PyObject *self,
10361        PyObject *substring,
10362        Py_ssize_t maxcount)
10363  {
10364      int kind1, kind2;
10365      const void *buf1, *buf2;
10366      Py_ssize_t len1, len2;
10367      PyObject* out;
10368  
10369      if (maxcount < 0)
10370          maxcount = PY_SSIZE_T_MAX;
10371  
10372      if (PyUnicode_READY(self) == -1)
10373          return NULL;
10374  
10375      if (substring == NULL)
10376          switch (PyUnicode_KIND(self)) {
10377          case PyUnicode_1BYTE_KIND:
10378              if (PyUnicode_IS_ASCII(self))
10379                  return asciilib_split_whitespace(
10380                      self,  PyUnicode_1BYTE_DATA(self),
10381                      PyUnicode_GET_LENGTH(self), maxcount
10382                      );
10383              else
10384                  return ucs1lib_split_whitespace(
10385                      self,  PyUnicode_1BYTE_DATA(self),
10386                      PyUnicode_GET_LENGTH(self), maxcount
10387                      );
10388          case PyUnicode_2BYTE_KIND:
10389              return ucs2lib_split_whitespace(
10390                  self,  PyUnicode_2BYTE_DATA(self),
10391                  PyUnicode_GET_LENGTH(self), maxcount
10392                  );
10393          case PyUnicode_4BYTE_KIND:
10394              return ucs4lib_split_whitespace(
10395                  self,  PyUnicode_4BYTE_DATA(self),
10396                  PyUnicode_GET_LENGTH(self), maxcount
10397                  );
10398          default:
10399              Py_UNREACHABLE();
10400          }
10401  
10402      if (PyUnicode_READY(substring) == -1)
10403          return NULL;
10404  
10405      kind1 = PyUnicode_KIND(self);
10406      kind2 = PyUnicode_KIND(substring);
10407      len1 = PyUnicode_GET_LENGTH(self);
10408      len2 = PyUnicode_GET_LENGTH(substring);
10409      if (kind1 < kind2 || len1 < len2) {
10410          out = PyList_New(1);
10411          if (out == NULL)
10412              return NULL;
10413          Py_INCREF(self);
10414          PyList_SET_ITEM(out, 0, self);
10415          return out;
10416      }
10417      buf1 = PyUnicode_DATA(self);
10418      buf2 = PyUnicode_DATA(substring);
10419      if (kind2 != kind1) {
10420          buf2 = unicode_askind(kind2, buf2, len2, kind1);
10421          if (!buf2)
10422              return NULL;
10423      }
10424  
10425      switch (kind1) {
10426      case PyUnicode_1BYTE_KIND:
10427          if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10428              out = asciilib_split(
10429                  self,  buf1, len1, buf2, len2, maxcount);
10430          else
10431              out = ucs1lib_split(
10432                  self,  buf1, len1, buf2, len2, maxcount);
10433          break;
10434      case PyUnicode_2BYTE_KIND:
10435          out = ucs2lib_split(
10436              self,  buf1, len1, buf2, len2, maxcount);
10437          break;
10438      case PyUnicode_4BYTE_KIND:
10439          out = ucs4lib_split(
10440              self,  buf1, len1, buf2, len2, maxcount);
10441          break;
10442      default:
10443          out = NULL;
10444      }
10445      assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10446      if (kind2 != kind1)
10447          PyMem_Free((void *)buf2);
10448      return out;
10449  }
10450  
10451  static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10452  rsplit(PyObject *self,
10453         PyObject *substring,
10454         Py_ssize_t maxcount)
10455  {
10456      int kind1, kind2;
10457      const void *buf1, *buf2;
10458      Py_ssize_t len1, len2;
10459      PyObject* out;
10460  
10461      if (maxcount < 0)
10462          maxcount = PY_SSIZE_T_MAX;
10463  
10464      if (PyUnicode_READY(self) == -1)
10465          return NULL;
10466  
10467      if (substring == NULL)
10468          switch (PyUnicode_KIND(self)) {
10469          case PyUnicode_1BYTE_KIND:
10470              if (PyUnicode_IS_ASCII(self))
10471                  return asciilib_rsplit_whitespace(
10472                      self,  PyUnicode_1BYTE_DATA(self),
10473                      PyUnicode_GET_LENGTH(self), maxcount
10474                      );
10475              else
10476                  return ucs1lib_rsplit_whitespace(
10477                      self,  PyUnicode_1BYTE_DATA(self),
10478                      PyUnicode_GET_LENGTH(self), maxcount
10479                      );
10480          case PyUnicode_2BYTE_KIND:
10481              return ucs2lib_rsplit_whitespace(
10482                  self,  PyUnicode_2BYTE_DATA(self),
10483                  PyUnicode_GET_LENGTH(self), maxcount
10484                  );
10485          case PyUnicode_4BYTE_KIND:
10486              return ucs4lib_rsplit_whitespace(
10487                  self,  PyUnicode_4BYTE_DATA(self),
10488                  PyUnicode_GET_LENGTH(self), maxcount
10489                  );
10490          default:
10491              Py_UNREACHABLE();
10492          }
10493  
10494      if (PyUnicode_READY(substring) == -1)
10495          return NULL;
10496  
10497      kind1 = PyUnicode_KIND(self);
10498      kind2 = PyUnicode_KIND(substring);
10499      len1 = PyUnicode_GET_LENGTH(self);
10500      len2 = PyUnicode_GET_LENGTH(substring);
10501      if (kind1 < kind2 || len1 < len2) {
10502          out = PyList_New(1);
10503          if (out == NULL)
10504              return NULL;
10505          Py_INCREF(self);
10506          PyList_SET_ITEM(out, 0, self);
10507          return out;
10508      }
10509      buf1 = PyUnicode_DATA(self);
10510      buf2 = PyUnicode_DATA(substring);
10511      if (kind2 != kind1) {
10512          buf2 = unicode_askind(kind2, buf2, len2, kind1);
10513          if (!buf2)
10514              return NULL;
10515      }
10516  
10517      switch (kind1) {
10518      case PyUnicode_1BYTE_KIND:
10519          if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10520              out = asciilib_rsplit(
10521                  self,  buf1, len1, buf2, len2, maxcount);
10522          else
10523              out = ucs1lib_rsplit(
10524                  self,  buf1, len1, buf2, len2, maxcount);
10525          break;
10526      case PyUnicode_2BYTE_KIND:
10527          out = ucs2lib_rsplit(
10528              self,  buf1, len1, buf2, len2, maxcount);
10529          break;
10530      case PyUnicode_4BYTE_KIND:
10531          out = ucs4lib_rsplit(
10532              self,  buf1, len1, buf2, len2, maxcount);
10533          break;
10534      default:
10535          out = NULL;
10536      }
10537      assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10538      if (kind2 != kind1)
10539          PyMem_Free((void *)buf2);
10540      return out;
10541  }
10542  
10543  static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10544  anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10545              PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10546  {
10547      switch (kind) {
10548      case PyUnicode_1BYTE_KIND:
10549          if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10550              return asciilib_find(buf1, len1, buf2, len2, offset);
10551          else
10552              return ucs1lib_find(buf1, len1, buf2, len2, offset);
10553      case PyUnicode_2BYTE_KIND:
10554          return ucs2lib_find(buf1, len1, buf2, len2, offset);
10555      case PyUnicode_4BYTE_KIND:
10556          return ucs4lib_find(buf1, len1, buf2, len2, offset);
10557      }
10558      Py_UNREACHABLE();
10559  }
10560  
10561  static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10562  anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10563               PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10564  {
10565      switch (kind) {
10566      case PyUnicode_1BYTE_KIND:
10567          if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10568              return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10569          else
10570              return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10571      case PyUnicode_2BYTE_KIND:
10572          return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10573      case PyUnicode_4BYTE_KIND:
10574          return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10575      }
10576      Py_UNREACHABLE();
10577  }
10578  
10579  static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10580  replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10581                        Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10582  {
10583      int kind = PyUnicode_KIND(u);
10584      void *data = PyUnicode_DATA(u);
10585      Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10586      if (kind == PyUnicode_1BYTE_KIND) {
10587          ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10588                                        (Py_UCS1 *)data + len,
10589                                        u1, u2, maxcount);
10590      }
10591      else if (kind == PyUnicode_2BYTE_KIND) {
10592          ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10593                                        (Py_UCS2 *)data + len,
10594                                        u1, u2, maxcount);
10595      }
10596      else {
10597          assert(kind == PyUnicode_4BYTE_KIND);
10598          ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10599                                        (Py_UCS4 *)data + len,
10600                                        u1, u2, maxcount);
10601      }
10602  }
10603  
10604  static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10605  replace(PyObject *self, PyObject *str1,
10606          PyObject *str2, Py_ssize_t maxcount)
10607  {
10608      PyObject *u;
10609      const char *sbuf = PyUnicode_DATA(self);
10610      const void *buf1 = PyUnicode_DATA(str1);
10611      const void *buf2 = PyUnicode_DATA(str2);
10612      int srelease = 0, release1 = 0, release2 = 0;
10613      int skind = PyUnicode_KIND(self);
10614      int kind1 = PyUnicode_KIND(str1);
10615      int kind2 = PyUnicode_KIND(str2);
10616      Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10617      Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10618      Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10619      int mayshrink;
10620      Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10621  
10622      if (slen < len1)
10623          goto nothing;
10624  
10625      if (maxcount < 0)
10626          maxcount = PY_SSIZE_T_MAX;
10627      else if (maxcount == 0)
10628          goto nothing;
10629  
10630      if (str1 == str2)
10631          goto nothing;
10632  
10633      maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10634      maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10635      if (maxchar < maxchar_str1)
10636          /* substring too wide to be present */
10637          goto nothing;
10638      maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10639      /* Replacing str1 with str2 may cause a maxchar reduction in the
10640         result string. */
10641      mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10642      maxchar = Py_MAX(maxchar, maxchar_str2);
10643  
10644      if (len1 == len2) {
10645          /* same length */
10646          if (len1 == 0)
10647              goto nothing;
10648          if (len1 == 1) {
10649              /* replace characters */
10650              Py_UCS4 u1, u2;
10651              Py_ssize_t pos;
10652  
10653              u1 = PyUnicode_READ(kind1, buf1, 0);
10654              pos = findchar(sbuf, skind, slen, u1, 1);
10655              if (pos < 0)
10656                  goto nothing;
10657              u2 = PyUnicode_READ(kind2, buf2, 0);
10658              u = PyUnicode_New(slen, maxchar);
10659              if (!u)
10660                  goto error;
10661  
10662              _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10663              replace_1char_inplace(u, pos, u1, u2, maxcount);
10664          }
10665          else {
10666              int rkind = skind;
10667              char *res;
10668              Py_ssize_t i;
10669  
10670              if (kind1 < rkind) {
10671                  /* widen substring */
10672                  buf1 = unicode_askind(kind1, buf1, len1, rkind);
10673                  if (!buf1) goto error;
10674                  release1 = 1;
10675              }
10676              i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10677              if (i < 0)
10678                  goto nothing;
10679              if (rkind > kind2) {
10680                  /* widen replacement */
10681                  buf2 = unicode_askind(kind2, buf2, len2, rkind);
10682                  if (!buf2) goto error;
10683                  release2 = 1;
10684              }
10685              else if (rkind < kind2) {
10686                  /* widen self and buf1 */
10687                  rkind = kind2;
10688                  if (release1) {
10689                      assert(buf1 != PyUnicode_DATA(str1));
10690                      PyMem_Free((void *)buf1);
10691                      buf1 = PyUnicode_DATA(str1);
10692                      release1 = 0;
10693                  }
10694                  sbuf = unicode_askind(skind, sbuf, slen, rkind);
10695                  if (!sbuf) goto error;
10696                  srelease = 1;
10697                  buf1 = unicode_askind(kind1, buf1, len1, rkind);
10698                  if (!buf1) goto error;
10699                  release1 = 1;
10700              }
10701              u = PyUnicode_New(slen, maxchar);
10702              if (!u)
10703                  goto error;
10704              assert(PyUnicode_KIND(u) == rkind);
10705              res = PyUnicode_DATA(u);
10706  
10707              memcpy(res, sbuf, rkind * slen);
10708              /* change everything in-place, starting with this one */
10709              memcpy(res + rkind * i,
10710                     buf2,
10711                     rkind * len2);
10712              i += len1;
10713  
10714              while ( --maxcount > 0) {
10715                  i = anylib_find(rkind, self,
10716                                  sbuf+rkind*i, slen-i,
10717                                  str1, buf1, len1, i);
10718                  if (i == -1)
10719                      break;
10720                  memcpy(res + rkind * i,
10721                         buf2,
10722                         rkind * len2);
10723                  i += len1;
10724              }
10725          }
10726      }
10727      else {
10728          Py_ssize_t n, i, j, ires;
10729          Py_ssize_t new_size;
10730          int rkind = skind;
10731          char *res;
10732  
10733          if (kind1 < rkind) {
10734              /* widen substring */
10735              buf1 = unicode_askind(kind1, buf1, len1, rkind);
10736              if (!buf1) goto error;
10737              release1 = 1;
10738          }
10739          n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10740          if (n == 0)
10741              goto nothing;
10742          if (kind2 < rkind) {
10743              /* widen replacement */
10744              buf2 = unicode_askind(kind2, buf2, len2, rkind);
10745              if (!buf2) goto error;
10746              release2 = 1;
10747          }
10748          else if (kind2 > rkind) {
10749              /* widen self and buf1 */
10750              rkind = kind2;
10751              sbuf = unicode_askind(skind, sbuf, slen, rkind);
10752              if (!sbuf) goto error;
10753              srelease = 1;
10754              if (release1) {
10755                  assert(buf1 != PyUnicode_DATA(str1));
10756                  PyMem_Free((void *)buf1);
10757                  buf1 = PyUnicode_DATA(str1);
10758                  release1 = 0;
10759              }
10760              buf1 = unicode_askind(kind1, buf1, len1, rkind);
10761              if (!buf1) goto error;
10762              release1 = 1;
10763          }
10764          /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10765             PyUnicode_GET_LENGTH(str1)); */
10766          if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10767                  PyErr_SetString(PyExc_OverflowError,
10768                                  "replace string is too long");
10769                  goto error;
10770          }
10771          new_size = slen + n * (len2 - len1);
10772          if (new_size == 0) {
10773              u = unicode_new_empty();
10774              goto done;
10775          }
10776          if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10777              PyErr_SetString(PyExc_OverflowError,
10778                              "replace string is too long");
10779              goto error;
10780          }
10781          u = PyUnicode_New(new_size, maxchar);
10782          if (!u)
10783              goto error;
10784          assert(PyUnicode_KIND(u) == rkind);
10785          res = PyUnicode_DATA(u);
10786          ires = i = 0;
10787          if (len1 > 0) {
10788              while (n-- > 0) {
10789                  /* look for next match */
10790                  j = anylib_find(rkind, self,
10791                                  sbuf + rkind * i, slen-i,
10792                                  str1, buf1, len1, i);
10793                  if (j == -1)
10794                      break;
10795                  else if (j > i) {
10796                      /* copy unchanged part [i:j] */
10797                      memcpy(res + rkind * ires,
10798                             sbuf + rkind * i,
10799                             rkind * (j-i));
10800                      ires += j - i;
10801                  }
10802                  /* copy substitution string */
10803                  if (len2 > 0) {
10804                      memcpy(res + rkind * ires,
10805                             buf2,
10806                             rkind * len2);
10807                      ires += len2;
10808                  }
10809                  i = j + len1;
10810              }
10811              if (i < slen)
10812                  /* copy tail [i:] */
10813                  memcpy(res + rkind * ires,
10814                         sbuf + rkind * i,
10815                         rkind * (slen-i));
10816          }
10817          else {
10818              /* interleave */
10819              while (n > 0) {
10820                  memcpy(res + rkind * ires,
10821                         buf2,
10822                         rkind * len2);
10823                  ires += len2;
10824                  if (--n <= 0)
10825                      break;
10826                  memcpy(res + rkind * ires,
10827                         sbuf + rkind * i,
10828                         rkind);
10829                  ires++;
10830                  i++;
10831              }
10832              memcpy(res + rkind * ires,
10833                     sbuf + rkind * i,
10834                     rkind * (slen-i));
10835          }
10836      }
10837  
10838      if (mayshrink) {
10839          unicode_adjust_maxchar(&u);
10840          if (u == NULL)
10841              goto error;
10842      }
10843  
10844    done:
10845      assert(srelease == (sbuf != PyUnicode_DATA(self)));
10846      assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10847      assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10848      if (srelease)
10849          PyMem_Free((void *)sbuf);
10850      if (release1)
10851          PyMem_Free((void *)buf1);
10852      if (release2)
10853          PyMem_Free((void *)buf2);
10854      assert(_PyUnicode_CheckConsistency(u, 1));
10855      return u;
10856  
10857    nothing:
10858      /* nothing to replace; return original string (when possible) */
10859      assert(srelease == (sbuf != PyUnicode_DATA(self)));
10860      assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10861      assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10862      if (srelease)
10863          PyMem_Free((void *)sbuf);
10864      if (release1)
10865          PyMem_Free((void *)buf1);
10866      if (release2)
10867          PyMem_Free((void *)buf2);
10868      return unicode_result_unchanged(self);
10869  
10870    error:
10871      assert(srelease == (sbuf != PyUnicode_DATA(self)));
10872      assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10873      assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10874      if (srelease)
10875          PyMem_Free((void *)sbuf);
10876      if (release1)
10877          PyMem_Free((void *)buf1);
10878      if (release2)
10879          PyMem_Free((void *)buf2);
10880      return NULL;
10881  }
10882  
10883  /* --- Unicode Object Methods --------------------------------------------- */
10884  
10885  /*[clinic input]
10886  str.title as unicode_title
10887  
10888  Return a version of the string where each word is titlecased.
10889  
10890  More specifically, words start with uppercased characters and all remaining
10891  cased characters have lower case.
10892  [clinic start generated code]*/
10893  
10894  static PyObject *
unicode_title_impl(PyObject * self)10895  unicode_title_impl(PyObject *self)
10896  /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10897  {
10898      if (PyUnicode_READY(self) == -1)
10899          return NULL;
10900      return case_operation(self, do_title);
10901  }
10902  
10903  /*[clinic input]
10904  str.capitalize as unicode_capitalize
10905  
10906  Return a capitalized version of the string.
10907  
10908  More specifically, make the first character have upper case and the rest lower
10909  case.
10910  [clinic start generated code]*/
10911  
10912  static PyObject *
unicode_capitalize_impl(PyObject * self)10913  unicode_capitalize_impl(PyObject *self)
10914  /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10915  {
10916      if (PyUnicode_READY(self) == -1)
10917          return NULL;
10918      if (PyUnicode_GET_LENGTH(self) == 0)
10919          return unicode_result_unchanged(self);
10920      return case_operation(self, do_capitalize);
10921  }
10922  
10923  /*[clinic input]
10924  str.casefold as unicode_casefold
10925  
10926  Return a version of the string suitable for caseless comparisons.
10927  [clinic start generated code]*/
10928  
10929  static PyObject *
unicode_casefold_impl(PyObject * self)10930  unicode_casefold_impl(PyObject *self)
10931  /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10932  {
10933      if (PyUnicode_READY(self) == -1)
10934          return NULL;
10935      if (PyUnicode_IS_ASCII(self))
10936          return ascii_upper_or_lower(self, 1);
10937      return case_operation(self, do_casefold);
10938  }
10939  
10940  
10941  /* Argument converter. Accepts a single Unicode character. */
10942  
10943  static int
convert_uc(PyObject * obj,void * addr)10944  convert_uc(PyObject *obj, void *addr)
10945  {
10946      Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10947  
10948      if (!PyUnicode_Check(obj)) {
10949          PyErr_Format(PyExc_TypeError,
10950                       "The fill character must be a unicode character, "
10951                       "not %.100s", Py_TYPE(obj)->tp_name);
10952          return 0;
10953      }
10954      if (PyUnicode_READY(obj) < 0)
10955          return 0;
10956      if (PyUnicode_GET_LENGTH(obj) != 1) {
10957          PyErr_SetString(PyExc_TypeError,
10958                          "The fill character must be exactly one character long");
10959          return 0;
10960      }
10961      *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10962      return 1;
10963  }
10964  
10965  /*[clinic input]
10966  str.center as unicode_center
10967  
10968      width: Py_ssize_t
10969      fillchar: Py_UCS4 = ' '
10970      /
10971  
10972  Return a centered string of length width.
10973  
10974  Padding is done using the specified fill character (default is a space).
10975  [clinic start generated code]*/
10976  
10977  static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10978  unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10979  /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10980  {
10981      Py_ssize_t marg, left;
10982  
10983      if (PyUnicode_READY(self) == -1)
10984          return NULL;
10985  
10986      if (PyUnicode_GET_LENGTH(self) >= width)
10987          return unicode_result_unchanged(self);
10988  
10989      marg = width - PyUnicode_GET_LENGTH(self);
10990      left = marg / 2 + (marg & width & 1);
10991  
10992      return pad(self, left, marg - left, fillchar);
10993  }
10994  
10995  /* This function assumes that str1 and str2 are readied by the caller. */
10996  
10997  static int
unicode_compare(PyObject * str1,PyObject * str2)10998  unicode_compare(PyObject *str1, PyObject *str2)
10999  {
11000  #define COMPARE(TYPE1, TYPE2) \
11001      do { \
11002          TYPE1* p1 = (TYPE1 *)data1; \
11003          TYPE2* p2 = (TYPE2 *)data2; \
11004          TYPE1* end = p1 + len; \
11005          Py_UCS4 c1, c2; \
11006          for (; p1 != end; p1++, p2++) { \
11007              c1 = *p1; \
11008              c2 = *p2; \
11009              if (c1 != c2) \
11010                  return (c1 < c2) ? -1 : 1; \
11011          } \
11012      } \
11013      while (0)
11014  
11015      int kind1, kind2;
11016      const void *data1, *data2;
11017      Py_ssize_t len1, len2, len;
11018  
11019      kind1 = PyUnicode_KIND(str1);
11020      kind2 = PyUnicode_KIND(str2);
11021      data1 = PyUnicode_DATA(str1);
11022      data2 = PyUnicode_DATA(str2);
11023      len1 = PyUnicode_GET_LENGTH(str1);
11024      len2 = PyUnicode_GET_LENGTH(str2);
11025      len = Py_MIN(len1, len2);
11026  
11027      switch(kind1) {
11028      case PyUnicode_1BYTE_KIND:
11029      {
11030          switch(kind2) {
11031          case PyUnicode_1BYTE_KIND:
11032          {
11033              int cmp = memcmp(data1, data2, len);
11034              /* normalize result of memcmp() into the range [-1; 1] */
11035              if (cmp < 0)
11036                  return -1;
11037              if (cmp > 0)
11038                  return 1;
11039              break;
11040          }
11041          case PyUnicode_2BYTE_KIND:
11042              COMPARE(Py_UCS1, Py_UCS2);
11043              break;
11044          case PyUnicode_4BYTE_KIND:
11045              COMPARE(Py_UCS1, Py_UCS4);
11046              break;
11047          default:
11048              Py_UNREACHABLE();
11049          }
11050          break;
11051      }
11052      case PyUnicode_2BYTE_KIND:
11053      {
11054          switch(kind2) {
11055          case PyUnicode_1BYTE_KIND:
11056              COMPARE(Py_UCS2, Py_UCS1);
11057              break;
11058          case PyUnicode_2BYTE_KIND:
11059          {
11060              COMPARE(Py_UCS2, Py_UCS2);
11061              break;
11062          }
11063          case PyUnicode_4BYTE_KIND:
11064              COMPARE(Py_UCS2, Py_UCS4);
11065              break;
11066          default:
11067              Py_UNREACHABLE();
11068          }
11069          break;
11070      }
11071      case PyUnicode_4BYTE_KIND:
11072      {
11073          switch(kind2) {
11074          case PyUnicode_1BYTE_KIND:
11075              COMPARE(Py_UCS4, Py_UCS1);
11076              break;
11077          case PyUnicode_2BYTE_KIND:
11078              COMPARE(Py_UCS4, Py_UCS2);
11079              break;
11080          case PyUnicode_4BYTE_KIND:
11081          {
11082  #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11083              int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11084              /* normalize result of wmemcmp() into the range [-1; 1] */
11085              if (cmp < 0)
11086                  return -1;
11087              if (cmp > 0)
11088                  return 1;
11089  #else
11090              COMPARE(Py_UCS4, Py_UCS4);
11091  #endif
11092              break;
11093          }
11094          default:
11095              Py_UNREACHABLE();
11096          }
11097          break;
11098      }
11099      default:
11100          Py_UNREACHABLE();
11101      }
11102  
11103      if (len1 == len2)
11104          return 0;
11105      if (len1 < len2)
11106          return -1;
11107      else
11108          return 1;
11109  
11110  #undef COMPARE
11111  }
11112  
11113  static int
unicode_compare_eq(PyObject * str1,PyObject * str2)11114  unicode_compare_eq(PyObject *str1, PyObject *str2)
11115  {
11116      int kind;
11117      const void *data1, *data2;
11118      Py_ssize_t len;
11119      int cmp;
11120  
11121      len = PyUnicode_GET_LENGTH(str1);
11122      if (PyUnicode_GET_LENGTH(str2) != len)
11123          return 0;
11124      kind = PyUnicode_KIND(str1);
11125      if (PyUnicode_KIND(str2) != kind)
11126          return 0;
11127      data1 = PyUnicode_DATA(str1);
11128      data2 = PyUnicode_DATA(str2);
11129  
11130      cmp = memcmp(data1, data2, len * kind);
11131      return (cmp == 0);
11132  }
11133  
11134  int
_PyUnicode_Equal(PyObject * str1,PyObject * str2)11135  _PyUnicode_Equal(PyObject *str1, PyObject *str2)
11136  {
11137      assert(PyUnicode_Check(str1));
11138      assert(PyUnicode_Check(str2));
11139      if (str1 == str2) {
11140          return 1;
11141      }
11142      if (PyUnicode_READY(str1) || PyUnicode_READY(str2)) {
11143          return -1;
11144      }
11145      return unicode_compare_eq(str1, str2);
11146  }
11147  
11148  
11149  int
PyUnicode_Compare(PyObject * left,PyObject * right)11150  PyUnicode_Compare(PyObject *left, PyObject *right)
11151  {
11152      if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11153          if (PyUnicode_READY(left) == -1 ||
11154              PyUnicode_READY(right) == -1)
11155              return -1;
11156  
11157          /* a string is equal to itself */
11158          if (left == right)
11159              return 0;
11160  
11161          return unicode_compare(left, right);
11162      }
11163      PyErr_Format(PyExc_TypeError,
11164                   "Can't compare %.100s and %.100s",
11165                   Py_TYPE(left)->tp_name,
11166                   Py_TYPE(right)->tp_name);
11167      return -1;
11168  }
11169  
11170  int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11171  PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11172  {
11173      Py_ssize_t i;
11174      int kind;
11175      Py_UCS4 chr;
11176      const unsigned char *ustr = (const unsigned char *)str;
11177  
11178      assert(_PyUnicode_CHECK(uni));
11179      if (!PyUnicode_IS_READY(uni)) {
11180          const wchar_t *ws = _PyUnicode_WSTR(uni);
11181          /* Compare Unicode string and source character set string */
11182          for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11183              if (chr != ustr[i])
11184                  return (chr < ustr[i]) ? -1 : 1;
11185          }
11186          /* This check keeps Python strings that end in '\0' from comparing equal
11187           to C strings identical up to that point. */
11188          if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11189              return 1; /* uni is longer */
11190          if (ustr[i])
11191              return -1; /* str is longer */
11192          return 0;
11193      }
11194      kind = PyUnicode_KIND(uni);
11195      if (kind == PyUnicode_1BYTE_KIND) {
11196          const void *data = PyUnicode_1BYTE_DATA(uni);
11197          size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11198          size_t len, len2 = strlen(str);
11199          int cmp;
11200  
11201          len = Py_MIN(len1, len2);
11202          cmp = memcmp(data, str, len);
11203          if (cmp != 0) {
11204              if (cmp < 0)
11205                  return -1;
11206              else
11207                  return 1;
11208          }
11209          if (len1 > len2)
11210              return 1; /* uni is longer */
11211          if (len1 < len2)
11212              return -1; /* str is longer */
11213          return 0;
11214      }
11215      else {
11216          const void *data = PyUnicode_DATA(uni);
11217          /* Compare Unicode string and source character set string */
11218          for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11219              if (chr != (unsigned char)str[i])
11220                  return (chr < (unsigned char)(str[i])) ? -1 : 1;
11221          /* This check keeps Python strings that end in '\0' from comparing equal
11222           to C strings identical up to that point. */
11223          if (PyUnicode_GET_LENGTH(uni) != i || chr)
11224              return 1; /* uni is longer */
11225          if (str[i])
11226              return -1; /* str is longer */
11227          return 0;
11228      }
11229  }
11230  
11231  static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11232  non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11233  {
11234      size_t i, len;
11235      const wchar_t *p;
11236      len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11237      if (strlen(str) != len)
11238          return 0;
11239      p = _PyUnicode_WSTR(unicode);
11240      assert(p);
11241      for (i = 0; i < len; i++) {
11242          unsigned char c = (unsigned char)str[i];
11243          if (c >= 128 || p[i] != (wchar_t)c)
11244              return 0;
11245      }
11246      return 1;
11247  }
11248  
11249  int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11250  _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11251  {
11252      size_t len;
11253      assert(_PyUnicode_CHECK(unicode));
11254      assert(str);
11255  #ifndef NDEBUG
11256      for (const char *p = str; *p; p++) {
11257          assert((unsigned char)*p < 128);
11258      }
11259  #endif
11260      if (PyUnicode_READY(unicode) == -1) {
11261          /* Memory error or bad data */
11262          PyErr_Clear();
11263          return non_ready_unicode_equal_to_ascii_string(unicode, str);
11264      }
11265      if (!PyUnicode_IS_ASCII(unicode))
11266          return 0;
11267      len = (size_t)PyUnicode_GET_LENGTH(unicode);
11268      return strlen(str) == len &&
11269             memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11270  }
11271  
11272  int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11273  _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11274  {
11275      PyObject *right_uni;
11276  
11277      assert(_PyUnicode_CHECK(left));
11278      assert(right->string);
11279  #ifndef NDEBUG
11280      for (const char *p = right->string; *p; p++) {
11281          assert((unsigned char)*p < 128);
11282      }
11283  #endif
11284  
11285      if (PyUnicode_READY(left) == -1) {
11286          /* memory error or bad data */
11287          PyErr_Clear();
11288          return non_ready_unicode_equal_to_ascii_string(left, right->string);
11289      }
11290  
11291      if (!PyUnicode_IS_ASCII(left))
11292          return 0;
11293  
11294      right_uni = _PyUnicode_FromId(right);       /* borrowed */
11295      if (right_uni == NULL) {
11296          /* memory error or bad data */
11297          PyErr_Clear();
11298          return _PyUnicode_EqualToASCIIString(left, right->string);
11299      }
11300  
11301      if (left == right_uni)
11302          return 1;
11303  
11304      if (PyUnicode_CHECK_INTERNED(left))
11305          return 0;
11306  
11307      assert(_PyUnicode_HASH(right_uni) != -1);
11308      Py_hash_t hash = _PyUnicode_HASH(left);
11309      if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
11310          return 0;
11311      }
11312  
11313      return unicode_compare_eq(left, right_uni);
11314  }
11315  
11316  PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11317  PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11318  {
11319      int result;
11320  
11321      if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11322          Py_RETURN_NOTIMPLEMENTED;
11323  
11324      if (PyUnicode_READY(left) == -1 ||
11325          PyUnicode_READY(right) == -1)
11326          return NULL;
11327  
11328      if (left == right) {
11329          switch (op) {
11330          case Py_EQ:
11331          case Py_LE:
11332          case Py_GE:
11333              /* a string is equal to itself */
11334              Py_RETURN_TRUE;
11335          case Py_NE:
11336          case Py_LT:
11337          case Py_GT:
11338              Py_RETURN_FALSE;
11339          default:
11340              PyErr_BadArgument();
11341              return NULL;
11342          }
11343      }
11344      else if (op == Py_EQ || op == Py_NE) {
11345          result = unicode_compare_eq(left, right);
11346          result ^= (op == Py_NE);
11347          return PyBool_FromLong(result);
11348      }
11349      else {
11350          result = unicode_compare(left, right);
11351          Py_RETURN_RICHCOMPARE(result, 0, op);
11352      }
11353  }
11354  
11355  int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11356  _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11357  {
11358      return unicode_eq(aa, bb);
11359  }
11360  
11361  int
PyUnicode_Contains(PyObject * str,PyObject * substr)11362  PyUnicode_Contains(PyObject *str, PyObject *substr)
11363  {
11364      int kind1, kind2;
11365      const void *buf1, *buf2;
11366      Py_ssize_t len1, len2;
11367      int result;
11368  
11369      if (!PyUnicode_Check(substr)) {
11370          PyErr_Format(PyExc_TypeError,
11371                       "'in <string>' requires string as left operand, not %.100s",
11372                       Py_TYPE(substr)->tp_name);
11373          return -1;
11374      }
11375      if (PyUnicode_READY(substr) == -1)
11376          return -1;
11377      if (ensure_unicode(str) < 0)
11378          return -1;
11379  
11380      kind1 = PyUnicode_KIND(str);
11381      kind2 = PyUnicode_KIND(substr);
11382      if (kind1 < kind2)
11383          return 0;
11384      len1 = PyUnicode_GET_LENGTH(str);
11385      len2 = PyUnicode_GET_LENGTH(substr);
11386      if (len1 < len2)
11387          return 0;
11388      buf1 = PyUnicode_DATA(str);
11389      buf2 = PyUnicode_DATA(substr);
11390      if (len2 == 1) {
11391          Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11392          result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11393          return result;
11394      }
11395      if (kind2 != kind1) {
11396          buf2 = unicode_askind(kind2, buf2, len2, kind1);
11397          if (!buf2)
11398              return -1;
11399      }
11400  
11401      switch (kind1) {
11402      case PyUnicode_1BYTE_KIND:
11403          result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11404          break;
11405      case PyUnicode_2BYTE_KIND:
11406          result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11407          break;
11408      case PyUnicode_4BYTE_KIND:
11409          result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11410          break;
11411      default:
11412          Py_UNREACHABLE();
11413      }
11414  
11415      assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11416      if (kind2 != kind1)
11417          PyMem_Free((void *)buf2);
11418  
11419      return result;
11420  }
11421  
11422  /* Concat to string or Unicode object giving a new Unicode object. */
11423  
11424  PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11425  PyUnicode_Concat(PyObject *left, PyObject *right)
11426  {
11427      PyObject *result;
11428      Py_UCS4 maxchar, maxchar2;
11429      Py_ssize_t left_len, right_len, new_len;
11430  
11431      if (ensure_unicode(left) < 0)
11432          return NULL;
11433  
11434      if (!PyUnicode_Check(right)) {
11435          PyErr_Format(PyExc_TypeError,
11436                       "can only concatenate str (not \"%.200s\") to str",
11437                       Py_TYPE(right)->tp_name);
11438          return NULL;
11439      }
11440      if (PyUnicode_READY(right) < 0)
11441          return NULL;
11442  
11443      /* Shortcuts */
11444      PyObject *empty = unicode_get_empty();  // Borrowed reference
11445      if (left == empty) {
11446          return PyUnicode_FromObject(right);
11447      }
11448      if (right == empty) {
11449          return PyUnicode_FromObject(left);
11450      }
11451  
11452      left_len = PyUnicode_GET_LENGTH(left);
11453      right_len = PyUnicode_GET_LENGTH(right);
11454      if (left_len > PY_SSIZE_T_MAX - right_len) {
11455          PyErr_SetString(PyExc_OverflowError,
11456                          "strings are too large to concat");
11457          return NULL;
11458      }
11459      new_len = left_len + right_len;
11460  
11461      maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11462      maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11463      maxchar = Py_MAX(maxchar, maxchar2);
11464  
11465      /* Concat the two Unicode strings */
11466      result = PyUnicode_New(new_len, maxchar);
11467      if (result == NULL)
11468          return NULL;
11469      _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11470      _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11471      assert(_PyUnicode_CheckConsistency(result, 1));
11472      return result;
11473  }
11474  
11475  void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11476  PyUnicode_Append(PyObject **p_left, PyObject *right)
11477  {
11478      PyObject *left, *res;
11479      Py_UCS4 maxchar, maxchar2;
11480      Py_ssize_t left_len, right_len, new_len;
11481  
11482      if (p_left == NULL) {
11483          if (!PyErr_Occurred())
11484              PyErr_BadInternalCall();
11485          return;
11486      }
11487      left = *p_left;
11488      if (right == NULL || left == NULL
11489          || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11490          if (!PyErr_Occurred())
11491              PyErr_BadInternalCall();
11492          goto error;
11493      }
11494  
11495      if (PyUnicode_READY(left) == -1)
11496          goto error;
11497      if (PyUnicode_READY(right) == -1)
11498          goto error;
11499  
11500      /* Shortcuts */
11501      PyObject *empty = unicode_get_empty();  // Borrowed reference
11502      if (left == empty) {
11503          Py_DECREF(left);
11504          Py_INCREF(right);
11505          *p_left = right;
11506          return;
11507      }
11508      if (right == empty) {
11509          return;
11510      }
11511  
11512      left_len = PyUnicode_GET_LENGTH(left);
11513      right_len = PyUnicode_GET_LENGTH(right);
11514      if (left_len > PY_SSIZE_T_MAX - right_len) {
11515          PyErr_SetString(PyExc_OverflowError,
11516                          "strings are too large to concat");
11517          goto error;
11518      }
11519      new_len = left_len + right_len;
11520  
11521      if (unicode_modifiable(left)
11522          && PyUnicode_CheckExact(right)
11523          && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11524          /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11525             to change the structure size, but characters are stored just after
11526             the structure, and so it requires to move all characters which is
11527             not so different than duplicating the string. */
11528          && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11529      {
11530          /* append inplace */
11531          if (unicode_resize(p_left, new_len) != 0)
11532              goto error;
11533  
11534          /* copy 'right' into the newly allocated area of 'left' */
11535          _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11536      }
11537      else {
11538          maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11539          maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11540          maxchar = Py_MAX(maxchar, maxchar2);
11541  
11542          /* Concat the two Unicode strings */
11543          res = PyUnicode_New(new_len, maxchar);
11544          if (res == NULL)
11545              goto error;
11546          _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11547          _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11548          Py_DECREF(left);
11549          *p_left = res;
11550      }
11551      assert(_PyUnicode_CheckConsistency(*p_left, 1));
11552      return;
11553  
11554  error:
11555      Py_CLEAR(*p_left);
11556  }
11557  
11558  void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11559  PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11560  {
11561      PyUnicode_Append(pleft, right);
11562      Py_XDECREF(right);
11563  }
11564  
11565  /*
11566  Wraps stringlib_parse_args_finds() and additionally ensures that the
11567  first argument is a unicode object.
11568  */
11569  
11570  static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11571  parse_args_finds_unicode(const char * function_name, PyObject *args,
11572                           PyObject **substring,
11573                           Py_ssize_t *start, Py_ssize_t *end)
11574  {
11575      if(stringlib_parse_args_finds(function_name, args, substring,
11576                                    start, end)) {
11577          if (ensure_unicode(*substring) < 0)
11578              return 0;
11579          return 1;
11580      }
11581      return 0;
11582  }
11583  
11584  PyDoc_STRVAR(count__doc__,
11585               "S.count(sub[, start[, end]]) -> int\n\
11586  \n\
11587  Return the number of non-overlapping occurrences of substring sub in\n\
11588  string S[start:end].  Optional arguments start and end are\n\
11589  interpreted as in slice notation.");
11590  
11591  static PyObject *
unicode_count(PyObject * self,PyObject * args)11592  unicode_count(PyObject *self, PyObject *args)
11593  {
11594      PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11595      Py_ssize_t start = 0;
11596      Py_ssize_t end = PY_SSIZE_T_MAX;
11597      PyObject *result;
11598      int kind1, kind2;
11599      const void *buf1, *buf2;
11600      Py_ssize_t len1, len2, iresult;
11601  
11602      if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11603          return NULL;
11604  
11605      kind1 = PyUnicode_KIND(self);
11606      kind2 = PyUnicode_KIND(substring);
11607      if (kind1 < kind2)
11608          return PyLong_FromLong(0);
11609  
11610      len1 = PyUnicode_GET_LENGTH(self);
11611      len2 = PyUnicode_GET_LENGTH(substring);
11612      ADJUST_INDICES(start, end, len1);
11613      if (end - start < len2)
11614          return PyLong_FromLong(0);
11615  
11616      buf1 = PyUnicode_DATA(self);
11617      buf2 = PyUnicode_DATA(substring);
11618      if (kind2 != kind1) {
11619          buf2 = unicode_askind(kind2, buf2, len2, kind1);
11620          if (!buf2)
11621              return NULL;
11622      }
11623      switch (kind1) {
11624      case PyUnicode_1BYTE_KIND:
11625          iresult = ucs1lib_count(
11626              ((const Py_UCS1*)buf1) + start, end - start,
11627              buf2, len2, PY_SSIZE_T_MAX
11628              );
11629          break;
11630      case PyUnicode_2BYTE_KIND:
11631          iresult = ucs2lib_count(
11632              ((const Py_UCS2*)buf1) + start, end - start,
11633              buf2, len2, PY_SSIZE_T_MAX
11634              );
11635          break;
11636      case PyUnicode_4BYTE_KIND:
11637          iresult = ucs4lib_count(
11638              ((const Py_UCS4*)buf1) + start, end - start,
11639              buf2, len2, PY_SSIZE_T_MAX
11640              );
11641          break;
11642      default:
11643          Py_UNREACHABLE();
11644      }
11645  
11646      result = PyLong_FromSsize_t(iresult);
11647  
11648      assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11649      if (kind2 != kind1)
11650          PyMem_Free((void *)buf2);
11651  
11652      return result;
11653  }
11654  
11655  /*[clinic input]
11656  str.encode as unicode_encode
11657  
11658      encoding: str(c_default="NULL") = 'utf-8'
11659          The encoding in which to encode the string.
11660      errors: str(c_default="NULL") = 'strict'
11661          The error handling scheme to use for encoding errors.
11662          The default is 'strict' meaning that encoding errors raise a
11663          UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11664          'xmlcharrefreplace' as well as any other name registered with
11665          codecs.register_error that can handle UnicodeEncodeErrors.
11666  
11667  Encode the string using the codec registered for encoding.
11668  [clinic start generated code]*/
11669  
11670  static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11671  unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11672  /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11673  {
11674      return PyUnicode_AsEncodedString(self, encoding, errors);
11675  }
11676  
11677  /*[clinic input]
11678  str.expandtabs as unicode_expandtabs
11679  
11680      tabsize: int = 8
11681  
11682  Return a copy where all tab characters are expanded using spaces.
11683  
11684  If tabsize is not given, a tab size of 8 characters is assumed.
11685  [clinic start generated code]*/
11686  
11687  static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11688  unicode_expandtabs_impl(PyObject *self, int tabsize)
11689  /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11690  {
11691      Py_ssize_t i, j, line_pos, src_len, incr;
11692      Py_UCS4 ch;
11693      PyObject *u;
11694      const void *src_data;
11695      void *dest_data;
11696      int kind;
11697      int found;
11698  
11699      if (PyUnicode_READY(self) == -1)
11700          return NULL;
11701  
11702      /* First pass: determine size of output string */
11703      src_len = PyUnicode_GET_LENGTH(self);
11704      i = j = line_pos = 0;
11705      kind = PyUnicode_KIND(self);
11706      src_data = PyUnicode_DATA(self);
11707      found = 0;
11708      for (; i < src_len; i++) {
11709          ch = PyUnicode_READ(kind, src_data, i);
11710          if (ch == '\t') {
11711              found = 1;
11712              if (tabsize > 0) {
11713                  incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11714                  if (j > PY_SSIZE_T_MAX - incr)
11715                      goto overflow;
11716                  line_pos += incr;
11717                  j += incr;
11718              }
11719          }
11720          else {
11721              if (j > PY_SSIZE_T_MAX - 1)
11722                  goto overflow;
11723              line_pos++;
11724              j++;
11725              if (ch == '\n' || ch == '\r')
11726                  line_pos = 0;
11727          }
11728      }
11729      if (!found)
11730          return unicode_result_unchanged(self);
11731  
11732      /* Second pass: create output string and fill it */
11733      u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11734      if (!u)
11735          return NULL;
11736      dest_data = PyUnicode_DATA(u);
11737  
11738      i = j = line_pos = 0;
11739  
11740      for (; i < src_len; i++) {
11741          ch = PyUnicode_READ(kind, src_data, i);
11742          if (ch == '\t') {
11743              if (tabsize > 0) {
11744                  incr = tabsize - (line_pos % tabsize);
11745                  line_pos += incr;
11746                  unicode_fill(kind, dest_data, ' ', j, incr);
11747                  j += incr;
11748              }
11749          }
11750          else {
11751              line_pos++;
11752              PyUnicode_WRITE(kind, dest_data, j, ch);
11753              j++;
11754              if (ch == '\n' || ch == '\r')
11755                  line_pos = 0;
11756          }
11757      }
11758      assert (j == PyUnicode_GET_LENGTH(u));
11759      return unicode_result(u);
11760  
11761    overflow:
11762      PyErr_SetString(PyExc_OverflowError, "new string is too long");
11763      return NULL;
11764  }
11765  
11766  PyDoc_STRVAR(find__doc__,
11767               "S.find(sub[, start[, end]]) -> int\n\
11768  \n\
11769  Return the lowest index in S where substring sub is found,\n\
11770  such that sub is contained within S[start:end].  Optional\n\
11771  arguments start and end are interpreted as in slice notation.\n\
11772  \n\
11773  Return -1 on failure.");
11774  
11775  static PyObject *
unicode_find(PyObject * self,PyObject * args)11776  unicode_find(PyObject *self, PyObject *args)
11777  {
11778      /* initialize variables to prevent gcc warning */
11779      PyObject *substring = NULL;
11780      Py_ssize_t start = 0;
11781      Py_ssize_t end = 0;
11782      Py_ssize_t result;
11783  
11784      if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11785          return NULL;
11786  
11787      if (PyUnicode_READY(self) == -1)
11788          return NULL;
11789  
11790      result = any_find_slice(self, substring, start, end, 1);
11791  
11792      if (result == -2)
11793          return NULL;
11794  
11795      return PyLong_FromSsize_t(result);
11796  }
11797  
11798  static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11799  unicode_getitem(PyObject *self, Py_ssize_t index)
11800  {
11801      const void *data;
11802      enum PyUnicode_Kind kind;
11803      Py_UCS4 ch;
11804  
11805      if (!PyUnicode_Check(self)) {
11806          PyErr_BadArgument();
11807          return NULL;
11808      }
11809      if (PyUnicode_READY(self) == -1) {
11810          return NULL;
11811      }
11812      if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11813          PyErr_SetString(PyExc_IndexError, "string index out of range");
11814          return NULL;
11815      }
11816      kind = PyUnicode_KIND(self);
11817      data = PyUnicode_DATA(self);
11818      ch = PyUnicode_READ(kind, data, index);
11819      return unicode_char(ch);
11820  }
11821  
11822  /* Believe it or not, this produces the same value for ASCII strings
11823     as bytes_hash(). */
11824  static Py_hash_t
unicode_hash(PyObject * self)11825  unicode_hash(PyObject *self)
11826  {
11827      Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11828  
11829  #ifdef Py_DEBUG
11830      assert(_Py_HashSecret_Initialized);
11831  #endif
11832      if (_PyUnicode_HASH(self) != -1)
11833          return _PyUnicode_HASH(self);
11834      if (PyUnicode_READY(self) == -1)
11835          return -1;
11836  
11837      x = _Py_HashBytes(PyUnicode_DATA(self),
11838                        PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11839      _PyUnicode_HASH(self) = x;
11840      return x;
11841  }
11842  
11843  PyDoc_STRVAR(index__doc__,
11844               "S.index(sub[, start[, end]]) -> int\n\
11845  \n\
11846  Return the lowest index in S where substring sub is found,\n\
11847  such that sub is contained within S[start:end].  Optional\n\
11848  arguments start and end are interpreted as in slice notation.\n\
11849  \n\
11850  Raises ValueError when the substring is not found.");
11851  
11852  static PyObject *
unicode_index(PyObject * self,PyObject * args)11853  unicode_index(PyObject *self, PyObject *args)
11854  {
11855      /* initialize variables to prevent gcc warning */
11856      Py_ssize_t result;
11857      PyObject *substring = NULL;
11858      Py_ssize_t start = 0;
11859      Py_ssize_t end = 0;
11860  
11861      if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11862          return NULL;
11863  
11864      if (PyUnicode_READY(self) == -1)
11865          return NULL;
11866  
11867      result = any_find_slice(self, substring, start, end, 1);
11868  
11869      if (result == -2)
11870          return NULL;
11871  
11872      if (result < 0) {
11873          PyErr_SetString(PyExc_ValueError, "substring not found");
11874          return NULL;
11875      }
11876  
11877      return PyLong_FromSsize_t(result);
11878  }
11879  
11880  /*[clinic input]
11881  str.isascii as unicode_isascii
11882  
11883  Return True if all characters in the string are ASCII, False otherwise.
11884  
11885  ASCII characters have code points in the range U+0000-U+007F.
11886  Empty string is ASCII too.
11887  [clinic start generated code]*/
11888  
11889  static PyObject *
unicode_isascii_impl(PyObject * self)11890  unicode_isascii_impl(PyObject *self)
11891  /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11892  {
11893      if (PyUnicode_READY(self) == -1) {
11894          return NULL;
11895      }
11896      return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11897  }
11898  
11899  /*[clinic input]
11900  str.islower as unicode_islower
11901  
11902  Return True if the string is a lowercase string, False otherwise.
11903  
11904  A string is lowercase if all cased characters in the string are lowercase and
11905  there is at least one cased character in the string.
11906  [clinic start generated code]*/
11907  
11908  static PyObject *
unicode_islower_impl(PyObject * self)11909  unicode_islower_impl(PyObject *self)
11910  /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11911  {
11912      Py_ssize_t i, length;
11913      int kind;
11914      const void *data;
11915      int cased;
11916  
11917      if (PyUnicode_READY(self) == -1)
11918          return NULL;
11919      length = PyUnicode_GET_LENGTH(self);
11920      kind = PyUnicode_KIND(self);
11921      data = PyUnicode_DATA(self);
11922  
11923      /* Shortcut for single character strings */
11924      if (length == 1)
11925          return PyBool_FromLong(
11926              Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11927  
11928      /* Special case for empty strings */
11929      if (length == 0)
11930          Py_RETURN_FALSE;
11931  
11932      cased = 0;
11933      for (i = 0; i < length; i++) {
11934          const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11935  
11936          if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11937              Py_RETURN_FALSE;
11938          else if (!cased && Py_UNICODE_ISLOWER(ch))
11939              cased = 1;
11940      }
11941      return PyBool_FromLong(cased);
11942  }
11943  
11944  /*[clinic input]
11945  str.isupper as unicode_isupper
11946  
11947  Return True if the string is an uppercase string, False otherwise.
11948  
11949  A string is uppercase if all cased characters in the string are uppercase and
11950  there is at least one cased character in the string.
11951  [clinic start generated code]*/
11952  
11953  static PyObject *
unicode_isupper_impl(PyObject * self)11954  unicode_isupper_impl(PyObject *self)
11955  /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11956  {
11957      Py_ssize_t i, length;
11958      int kind;
11959      const void *data;
11960      int cased;
11961  
11962      if (PyUnicode_READY(self) == -1)
11963          return NULL;
11964      length = PyUnicode_GET_LENGTH(self);
11965      kind = PyUnicode_KIND(self);
11966      data = PyUnicode_DATA(self);
11967  
11968      /* Shortcut for single character strings */
11969      if (length == 1)
11970          return PyBool_FromLong(
11971              Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11972  
11973      /* Special case for empty strings */
11974      if (length == 0)
11975          Py_RETURN_FALSE;
11976  
11977      cased = 0;
11978      for (i = 0; i < length; i++) {
11979          const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11980  
11981          if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11982              Py_RETURN_FALSE;
11983          else if (!cased && Py_UNICODE_ISUPPER(ch))
11984              cased = 1;
11985      }
11986      return PyBool_FromLong(cased);
11987  }
11988  
11989  /*[clinic input]
11990  str.istitle as unicode_istitle
11991  
11992  Return True if the string is a title-cased string, False otherwise.
11993  
11994  In a title-cased string, upper- and title-case characters may only
11995  follow uncased characters and lowercase characters only cased ones.
11996  [clinic start generated code]*/
11997  
11998  static PyObject *
unicode_istitle_impl(PyObject * self)11999  unicode_istitle_impl(PyObject *self)
12000  /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12001  {
12002      Py_ssize_t i, length;
12003      int kind;
12004      const void *data;
12005      int cased, previous_is_cased;
12006  
12007      if (PyUnicode_READY(self) == -1)
12008          return NULL;
12009      length = PyUnicode_GET_LENGTH(self);
12010      kind = PyUnicode_KIND(self);
12011      data = PyUnicode_DATA(self);
12012  
12013      /* Shortcut for single character strings */
12014      if (length == 1) {
12015          Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12016          return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12017                                 (Py_UNICODE_ISUPPER(ch) != 0));
12018      }
12019  
12020      /* Special case for empty strings */
12021      if (length == 0)
12022          Py_RETURN_FALSE;
12023  
12024      cased = 0;
12025      previous_is_cased = 0;
12026      for (i = 0; i < length; i++) {
12027          const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12028  
12029          if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12030              if (previous_is_cased)
12031                  Py_RETURN_FALSE;
12032              previous_is_cased = 1;
12033              cased = 1;
12034          }
12035          else if (Py_UNICODE_ISLOWER(ch)) {
12036              if (!previous_is_cased)
12037                  Py_RETURN_FALSE;
12038              previous_is_cased = 1;
12039              cased = 1;
12040          }
12041          else
12042              previous_is_cased = 0;
12043      }
12044      return PyBool_FromLong(cased);
12045  }
12046  
12047  /*[clinic input]
12048  str.isspace as unicode_isspace
12049  
12050  Return True if the string is a whitespace string, False otherwise.
12051  
12052  A string is whitespace if all characters in the string are whitespace and there
12053  is at least one character in the string.
12054  [clinic start generated code]*/
12055  
12056  static PyObject *
unicode_isspace_impl(PyObject * self)12057  unicode_isspace_impl(PyObject *self)
12058  /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12059  {
12060      Py_ssize_t i, length;
12061      int kind;
12062      const void *data;
12063  
12064      if (PyUnicode_READY(self) == -1)
12065          return NULL;
12066      length = PyUnicode_GET_LENGTH(self);
12067      kind = PyUnicode_KIND(self);
12068      data = PyUnicode_DATA(self);
12069  
12070      /* Shortcut for single character strings */
12071      if (length == 1)
12072          return PyBool_FromLong(
12073              Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12074  
12075      /* Special case for empty strings */
12076      if (length == 0)
12077          Py_RETURN_FALSE;
12078  
12079      for (i = 0; i < length; i++) {
12080          const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12081          if (!Py_UNICODE_ISSPACE(ch))
12082              Py_RETURN_FALSE;
12083      }
12084      Py_RETURN_TRUE;
12085  }
12086  
12087  /*[clinic input]
12088  str.isalpha as unicode_isalpha
12089  
12090  Return True if the string is an alphabetic string, False otherwise.
12091  
12092  A string is alphabetic if all characters in the string are alphabetic and there
12093  is at least one character in the string.
12094  [clinic start generated code]*/
12095  
12096  static PyObject *
unicode_isalpha_impl(PyObject * self)12097  unicode_isalpha_impl(PyObject *self)
12098  /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12099  {
12100      Py_ssize_t i, length;
12101      int kind;
12102      const void *data;
12103  
12104      if (PyUnicode_READY(self) == -1)
12105          return NULL;
12106      length = PyUnicode_GET_LENGTH(self);
12107      kind = PyUnicode_KIND(self);
12108      data = PyUnicode_DATA(self);
12109  
12110      /* Shortcut for single character strings */
12111      if (length == 1)
12112          return PyBool_FromLong(
12113              Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12114  
12115      /* Special case for empty strings */
12116      if (length == 0)
12117          Py_RETURN_FALSE;
12118  
12119      for (i = 0; i < length; i++) {
12120          if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12121              Py_RETURN_FALSE;
12122      }
12123      Py_RETURN_TRUE;
12124  }
12125  
12126  /*[clinic input]
12127  str.isalnum as unicode_isalnum
12128  
12129  Return True if the string is an alpha-numeric string, False otherwise.
12130  
12131  A string is alpha-numeric if all characters in the string are alpha-numeric and
12132  there is at least one character in the string.
12133  [clinic start generated code]*/
12134  
12135  static PyObject *
unicode_isalnum_impl(PyObject * self)12136  unicode_isalnum_impl(PyObject *self)
12137  /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12138  {
12139      int kind;
12140      const void *data;
12141      Py_ssize_t len, i;
12142  
12143      if (PyUnicode_READY(self) == -1)
12144          return NULL;
12145  
12146      kind = PyUnicode_KIND(self);
12147      data = PyUnicode_DATA(self);
12148      len = PyUnicode_GET_LENGTH(self);
12149  
12150      /* Shortcut for single character strings */
12151      if (len == 1) {
12152          const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12153          return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12154      }
12155  
12156      /* Special case for empty strings */
12157      if (len == 0)
12158          Py_RETURN_FALSE;
12159  
12160      for (i = 0; i < len; i++) {
12161          const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12162          if (!Py_UNICODE_ISALNUM(ch))
12163              Py_RETURN_FALSE;
12164      }
12165      Py_RETURN_TRUE;
12166  }
12167  
12168  /*[clinic input]
12169  str.isdecimal as unicode_isdecimal
12170  
12171  Return True if the string is a decimal string, False otherwise.
12172  
12173  A string is a decimal string if all characters in the string are decimal and
12174  there is at least one character in the string.
12175  [clinic start generated code]*/
12176  
12177  static PyObject *
unicode_isdecimal_impl(PyObject * self)12178  unicode_isdecimal_impl(PyObject *self)
12179  /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12180  {
12181      Py_ssize_t i, length;
12182      int kind;
12183      const void *data;
12184  
12185      if (PyUnicode_READY(self) == -1)
12186          return NULL;
12187      length = PyUnicode_GET_LENGTH(self);
12188      kind = PyUnicode_KIND(self);
12189      data = PyUnicode_DATA(self);
12190  
12191      /* Shortcut for single character strings */
12192      if (length == 1)
12193          return PyBool_FromLong(
12194              Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12195  
12196      /* Special case for empty strings */
12197      if (length == 0)
12198          Py_RETURN_FALSE;
12199  
12200      for (i = 0; i < length; i++) {
12201          if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12202              Py_RETURN_FALSE;
12203      }
12204      Py_RETURN_TRUE;
12205  }
12206  
12207  /*[clinic input]
12208  str.isdigit as unicode_isdigit
12209  
12210  Return True if the string is a digit string, False otherwise.
12211  
12212  A string is a digit string if all characters in the string are digits and there
12213  is at least one character in the string.
12214  [clinic start generated code]*/
12215  
12216  static PyObject *
unicode_isdigit_impl(PyObject * self)12217  unicode_isdigit_impl(PyObject *self)
12218  /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12219  {
12220      Py_ssize_t i, length;
12221      int kind;
12222      const void *data;
12223  
12224      if (PyUnicode_READY(self) == -1)
12225          return NULL;
12226      length = PyUnicode_GET_LENGTH(self);
12227      kind = PyUnicode_KIND(self);
12228      data = PyUnicode_DATA(self);
12229  
12230      /* Shortcut for single character strings */
12231      if (length == 1) {
12232          const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12233          return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12234      }
12235  
12236      /* Special case for empty strings */
12237      if (length == 0)
12238          Py_RETURN_FALSE;
12239  
12240      for (i = 0; i < length; i++) {
12241          if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12242              Py_RETURN_FALSE;
12243      }
12244      Py_RETURN_TRUE;
12245  }
12246  
12247  /*[clinic input]
12248  str.isnumeric as unicode_isnumeric
12249  
12250  Return True if the string is a numeric string, False otherwise.
12251  
12252  A string is numeric if all characters in the string are numeric and there is at
12253  least one character in the string.
12254  [clinic start generated code]*/
12255  
12256  static PyObject *
unicode_isnumeric_impl(PyObject * self)12257  unicode_isnumeric_impl(PyObject *self)
12258  /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12259  {
12260      Py_ssize_t i, length;
12261      int kind;
12262      const void *data;
12263  
12264      if (PyUnicode_READY(self) == -1)
12265          return NULL;
12266      length = PyUnicode_GET_LENGTH(self);
12267      kind = PyUnicode_KIND(self);
12268      data = PyUnicode_DATA(self);
12269  
12270      /* Shortcut for single character strings */
12271      if (length == 1)
12272          return PyBool_FromLong(
12273              Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12274  
12275      /* Special case for empty strings */
12276      if (length == 0)
12277          Py_RETURN_FALSE;
12278  
12279      for (i = 0; i < length; i++) {
12280          if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12281              Py_RETURN_FALSE;
12282      }
12283      Py_RETURN_TRUE;
12284  }
12285  
12286  Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)12287  _PyUnicode_ScanIdentifier(PyObject *self)
12288  {
12289      Py_ssize_t i;
12290      if (PyUnicode_READY(self) == -1)
12291          return -1;
12292  
12293      Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12294      if (len == 0) {
12295          /* an empty string is not a valid identifier */
12296          return 0;
12297      }
12298  
12299      int kind = PyUnicode_KIND(self);
12300      const void *data = PyUnicode_DATA(self);
12301      Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12302      /* PEP 3131 says that the first character must be in
12303         XID_Start and subsequent characters in XID_Continue,
12304         and for the ASCII range, the 2.x rules apply (i.e
12305         start with letters and underscore, continue with
12306         letters, digits, underscore). However, given the current
12307         definition of XID_Start and XID_Continue, it is sufficient
12308         to check just for these, except that _ must be allowed
12309         as starting an identifier.  */
12310      if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12311          return 0;
12312      }
12313  
12314      for (i = 1; i < len; i++) {
12315          ch = PyUnicode_READ(kind, data, i);
12316          if (!_PyUnicode_IsXidContinue(ch)) {
12317              return i;
12318          }
12319      }
12320      return i;
12321  }
12322  
/* Return 1 if `self` is a valid identifier and 0 otherwise.

   A ready string is checked through _PyUnicode_ScanIdentifier(): the
   string qualifies when it is non-empty and the scan consumed all of it.
   A string that is not ready is checked directly on its wchar_t buffer,
   applying the same XID_Start / XID_Continue (plus leading underscore)
   rules to each code point. */
int
PyUnicode_IsIdentifier(PyObject *self)
{
    if (PyUnicode_IS_READY(self)) {
        Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
        /* an empty string is not a valid identifier */
        return len && i == len;
    }
    else {
/* NOTE(review): the _Py_COMP_DIAG_* macros presumably silence compiler
   deprecation warnings for the legacy calls used in this branch
   (PyUnicode_GET_SIZE, _PyUnicode_WSTR) -- confirm against their
   definition. */
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
        Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
        if (len == 0) {
            /* an empty string is not a valid identifier */
            return 0;
        }

        /* Read the first code point; `i` always indexes the next unread
           wchar_t unit. */
        const wchar_t *wstr = _PyUnicode_WSTR(self);
        Py_UCS4 ch = wstr[i++];
#if SIZEOF_WCHAR_T == 2
        /* With a 2-byte wchar_t, a high surrogate followed by a low
           surrogate is combined into a single code point. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
            && i < len
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
            i++;
        }
#endif
        /* The first code point must be in XID_Start or be an underscore. */
        if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
            return 0;
        }

        /* Every remaining code point must be in XID_Continue. */
        while (i < len) {
            ch = wstr[i++];
#if SIZEOF_WCHAR_T == 2
            /* Same surrogate-pair joining as for the first code point. */
            if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
                && i < len
                && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
                i++;
            }
#endif
            if (!_PyUnicode_IsXidContinue(ch)) {
                return 0;
            }
        }
        return 1;
_Py_COMP_DIAG_POP
    }
}
12375  
12376  /*[clinic input]
12377  str.isidentifier as unicode_isidentifier
12378  
12379  Return True if the string is a valid Python identifier, False otherwise.
12380  
12381  Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12382  such as "def" or "class".
12383  [clinic start generated code]*/
12384  
12385  static PyObject *
unicode_isidentifier_impl(PyObject * self)12386  unicode_isidentifier_impl(PyObject *self)
12387  /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12388  {
12389      return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12390  }
12391  
12392  /*[clinic input]
12393  str.isprintable as unicode_isprintable
12394  
12395  Return True if the string is printable, False otherwise.
12396  
12397  A string is printable if all of its characters are considered printable in
12398  repr() or if it is empty.
12399  [clinic start generated code]*/
12400  
12401  static PyObject *
unicode_isprintable_impl(PyObject * self)12402  unicode_isprintable_impl(PyObject *self)
12403  /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12404  {
12405      Py_ssize_t i, length;
12406      int kind;
12407      const void *data;
12408  
12409      if (PyUnicode_READY(self) == -1)
12410          return NULL;
12411      length = PyUnicode_GET_LENGTH(self);
12412      kind = PyUnicode_KIND(self);
12413      data = PyUnicode_DATA(self);
12414  
12415      /* Shortcut for single character strings */
12416      if (length == 1)
12417          return PyBool_FromLong(
12418              Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12419  
12420      for (i = 0; i < length; i++) {
12421          if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12422              Py_RETURN_FALSE;
12423          }
12424      }
12425      Py_RETURN_TRUE;
12426  }
12427  
12428  /*[clinic input]
12429  str.join as unicode_join
12430  
12431      iterable: object
12432      /
12433  
12434  Concatenate any number of strings.
12435  
12436  The string whose method is called is inserted in between each given string.
12437  The result is returned as a new string.
12438  
12439  Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12440  [clinic start generated code]*/
12441  
12442  static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12443  unicode_join(PyObject *self, PyObject *iterable)
12444  /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12445  {
12446      return PyUnicode_Join(self, iterable);
12447  }
12448  
12449  static Py_ssize_t
unicode_length(PyObject * self)12450  unicode_length(PyObject *self)
12451  {
12452      if (PyUnicode_READY(self) == -1)
12453          return -1;
12454      return PyUnicode_GET_LENGTH(self);
12455  }
12456  
12457  /*[clinic input]
12458  str.ljust as unicode_ljust
12459  
12460      width: Py_ssize_t
12461      fillchar: Py_UCS4 = ' '
12462      /
12463  
12464  Return a left-justified string of length width.
12465  
12466  Padding is done using the specified fill character (default is a space).
12467  [clinic start generated code]*/
12468  
12469  static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12470  unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12471  /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12472  {
12473      if (PyUnicode_READY(self) == -1)
12474          return NULL;
12475  
12476      if (PyUnicode_GET_LENGTH(self) >= width)
12477          return unicode_result_unchanged(self);
12478  
12479      return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12480  }
12481  
12482  /*[clinic input]
12483  str.lower as unicode_lower
12484  
12485  Return a copy of the string converted to lowercase.
12486  [clinic start generated code]*/
12487  
12488  static PyObject *
unicode_lower_impl(PyObject * self)12489  unicode_lower_impl(PyObject *self)
12490  /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12491  {
12492      if (PyUnicode_READY(self) == -1)
12493          return NULL;
12494      if (PyUnicode_IS_ASCII(self))
12495          return ascii_upper_or_lower(self, 1);
12496      return case_operation(self, do_lower);
12497  }
12498  
/* Selectors for which end(s) of a string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  Both the strings and the
   table itself are const, so the table cannot be modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the strip method for selector i (one of the values above);
   no bounds checking is performed. */
#define STRIPNAME(i) (stripfuncnames[i])
12507  
12508  /* externally visible for str.strip(unicode) */
12509  PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12510  _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12511  {
12512      const void *data;
12513      int kind;
12514      Py_ssize_t i, j, len;
12515      BLOOM_MASK sepmask;
12516      Py_ssize_t seplen;
12517  
12518      if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12519          return NULL;
12520  
12521      kind = PyUnicode_KIND(self);
12522      data = PyUnicode_DATA(self);
12523      len = PyUnicode_GET_LENGTH(self);
12524      seplen = PyUnicode_GET_LENGTH(sepobj);
12525      sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12526                                PyUnicode_DATA(sepobj),
12527                                seplen);
12528  
12529      i = 0;
12530      if (striptype != RIGHTSTRIP) {
12531          while (i < len) {
12532              Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12533              if (!BLOOM(sepmask, ch))
12534                  break;
12535              if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12536                  break;
12537              i++;
12538          }
12539      }
12540  
12541      j = len;
12542      if (striptype != LEFTSTRIP) {
12543          j--;
12544          while (j >= i) {
12545              Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12546              if (!BLOOM(sepmask, ch))
12547                  break;
12548              if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12549                  break;
12550              j--;
12551          }
12552  
12553          j++;
12554      }
12555  
12556      return PyUnicode_Substring(self, i, j);
12557  }
12558  
12559  PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12560  PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12561  {
12562      const unsigned char *data;
12563      int kind;
12564      Py_ssize_t length;
12565  
12566      if (PyUnicode_READY(self) == -1)
12567          return NULL;
12568  
12569      length = PyUnicode_GET_LENGTH(self);
12570      end = Py_MIN(end, length);
12571  
12572      if (start == 0 && end == length)
12573          return unicode_result_unchanged(self);
12574  
12575      if (start < 0 || end < 0) {
12576          PyErr_SetString(PyExc_IndexError, "string index out of range");
12577          return NULL;
12578      }
12579      if (start >= length || end < start)
12580          _Py_RETURN_UNICODE_EMPTY();
12581  
12582      length = end - start;
12583      if (PyUnicode_IS_ASCII(self)) {
12584          data = PyUnicode_1BYTE_DATA(self);
12585          return _PyUnicode_FromASCII((const char*)(data + start), length);
12586      }
12587      else {
12588          kind = PyUnicode_KIND(self);
12589          data = PyUnicode_1BYTE_DATA(self);
12590          return PyUnicode_FromKindAndData(kind,
12591                                           data + kind * start,
12592                                           length);
12593      }
12594  }
12595  
12596  static PyObject *
do_strip(PyObject * self,int striptype)12597  do_strip(PyObject *self, int striptype)
12598  {
12599      Py_ssize_t len, i, j;
12600  
12601      if (PyUnicode_READY(self) == -1)
12602          return NULL;
12603  
12604      len = PyUnicode_GET_LENGTH(self);
12605  
12606      if (PyUnicode_IS_ASCII(self)) {
12607          const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12608  
12609          i = 0;
12610          if (striptype != RIGHTSTRIP) {
12611              while (i < len) {
12612                  Py_UCS1 ch = data[i];
12613                  if (!_Py_ascii_whitespace[ch])
12614                      break;
12615                  i++;
12616              }
12617          }
12618  
12619          j = len;
12620          if (striptype != LEFTSTRIP) {
12621              j--;
12622              while (j >= i) {
12623                  Py_UCS1 ch = data[j];
12624                  if (!_Py_ascii_whitespace[ch])
12625                      break;
12626                  j--;
12627              }
12628              j++;
12629          }
12630      }
12631      else {
12632          int kind = PyUnicode_KIND(self);
12633          const void *data = PyUnicode_DATA(self);
12634  
12635          i = 0;
12636          if (striptype != RIGHTSTRIP) {
12637              while (i < len) {
12638                  Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12639                  if (!Py_UNICODE_ISSPACE(ch))
12640                      break;
12641                  i++;
12642              }
12643          }
12644  
12645          j = len;
12646          if (striptype != LEFTSTRIP) {
12647              j--;
12648              while (j >= i) {
12649                  Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12650                  if (!Py_UNICODE_ISSPACE(ch))
12651                      break;
12652                  j--;
12653              }
12654              j++;
12655          }
12656      }
12657  
12658      return PyUnicode_Substring(self, i, j);
12659  }
12660  
12661  
12662  static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12663  do_argstrip(PyObject *self, int striptype, PyObject *sep)
12664  {
12665      if (sep != Py_None) {
12666          if (PyUnicode_Check(sep))
12667              return _PyUnicode_XStrip(self, striptype, sep);
12668          else {
12669              PyErr_Format(PyExc_TypeError,
12670                           "%s arg must be None or str",
12671                           STRIPNAME(striptype));
12672              return NULL;
12673          }
12674      }
12675  
12676      return do_strip(self, striptype);
12677  }
12678  
12679  
12680  /*[clinic input]
12681  str.strip as unicode_strip
12682  
12683      chars: object = None
12684      /
12685  
12686  Return a copy of the string with leading and trailing whitespace removed.
12687  
12688  If chars is given and not None, remove characters in chars instead.
12689  [clinic start generated code]*/
12690  
12691  static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12692  unicode_strip_impl(PyObject *self, PyObject *chars)
12693  /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12694  {
12695      return do_argstrip(self, BOTHSTRIP, chars);
12696  }
12697  
12698  
12699  /*[clinic input]
12700  str.lstrip as unicode_lstrip
12701  
12702      chars: object = None
12703      /
12704  
12705  Return a copy of the string with leading whitespace removed.
12706  
12707  If chars is given and not None, remove characters in chars instead.
12708  [clinic start generated code]*/
12709  
12710  static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12711  unicode_lstrip_impl(PyObject *self, PyObject *chars)
12712  /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12713  {
12714      return do_argstrip(self, LEFTSTRIP, chars);
12715  }
12716  
12717  
12718  /*[clinic input]
12719  str.rstrip as unicode_rstrip
12720  
12721      chars: object = None
12722      /
12723  
12724  Return a copy of the string with trailing whitespace removed.
12725  
12726  If chars is given and not None, remove characters in chars instead.
12727  [clinic start generated code]*/
12728  
12729  static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12730  unicode_rstrip_impl(PyObject *self, PyObject *chars)
12731  /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12732  {
12733      return do_argstrip(self, RIGHTSTRIP, chars);
12734  }
12735  
12736  
12737  static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12738  unicode_repeat(PyObject *str, Py_ssize_t len)
12739  {
12740      PyObject *u;
12741      Py_ssize_t nchars, n;
12742  
12743      if (len < 1)
12744          _Py_RETURN_UNICODE_EMPTY();
12745  
12746      /* no repeat, return original string */
12747      if (len == 1)
12748          return unicode_result_unchanged(str);
12749  
12750      if (PyUnicode_READY(str) == -1)
12751          return NULL;
12752  
12753      if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12754          PyErr_SetString(PyExc_OverflowError,
12755                          "repeated string is too long");
12756          return NULL;
12757      }
12758      nchars = len * PyUnicode_GET_LENGTH(str);
12759  
12760      u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12761      if (!u)
12762          return NULL;
12763      assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12764  
12765      if (PyUnicode_GET_LENGTH(str) == 1) {
12766          int kind = PyUnicode_KIND(str);
12767          Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12768          if (kind == PyUnicode_1BYTE_KIND) {
12769              void *to = PyUnicode_DATA(u);
12770              memset(to, (unsigned char)fill_char, len);
12771          }
12772          else if (kind == PyUnicode_2BYTE_KIND) {
12773              Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12774              for (n = 0; n < len; ++n)
12775                  ucs2[n] = fill_char;
12776          } else {
12777              Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12778              assert(kind == PyUnicode_4BYTE_KIND);
12779              for (n = 0; n < len; ++n)
12780                  ucs4[n] = fill_char;
12781          }
12782      }
12783      else {
12784          Py_ssize_t char_size = PyUnicode_KIND(str);
12785          char *to = (char *) PyUnicode_DATA(u);
12786          _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12787              PyUnicode_GET_LENGTH(str) * char_size);
12788      }
12789  
12790      assert(_PyUnicode_CheckConsistency(u, 1));
12791      return u;
12792  }
12793  
12794  PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12795  PyUnicode_Replace(PyObject *str,
12796                    PyObject *substr,
12797                    PyObject *replstr,
12798                    Py_ssize_t maxcount)
12799  {
12800      if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12801              ensure_unicode(replstr) < 0)
12802          return NULL;
12803      return replace(str, substr, replstr, maxcount);
12804  }
12805  
12806  /*[clinic input]
12807  str.replace as unicode_replace
12808  
12809      old: unicode
12810      new: unicode
12811      count: Py_ssize_t = -1
12812          Maximum number of occurrences to replace.
12813          -1 (the default value) means replace all occurrences.
12814      /
12815  
12816  Return a copy with all occurrences of substring old replaced by new.
12817  
12818  If the optional argument count is given, only the first count occurrences are
12819  replaced.
12820  [clinic start generated code]*/
12821  
12822  static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12823  unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12824                       Py_ssize_t count)
12825  /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12826  {
12827      if (PyUnicode_READY(self) == -1)
12828          return NULL;
12829      return replace(self, old, new, count);
12830  }
12831  
12832  /*[clinic input]
12833  str.removeprefix as unicode_removeprefix
12834  
12835      prefix: unicode
12836      /
12837  
12838  Return a str with the given prefix string removed if present.
12839  
12840  If the string starts with the prefix string, return string[len(prefix):].
12841  Otherwise, return a copy of the original string.
12842  [clinic start generated code]*/
12843  
12844  static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)12845  unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12846  /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12847  {
12848      int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12849      if (match == -1) {
12850          return NULL;
12851      }
12852      if (match) {
12853          return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12854                                     PyUnicode_GET_LENGTH(self));
12855      }
12856      return unicode_result_unchanged(self);
12857  }
12858  
12859  /*[clinic input]
12860  str.removesuffix as unicode_removesuffix
12861  
12862      suffix: unicode
12863      /
12864  
12865  Return a str with the given suffix string removed if present.
12866  
12867  If the string ends with the suffix string and that suffix is not empty,
12868  return string[:-len(suffix)]. Otherwise, return a copy of the original
12869  string.
12870  [clinic start generated code]*/
12871  
12872  static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)12873  unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12874  /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12875  {
12876      int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12877      if (match == -1) {
12878          return NULL;
12879      }
12880      if (match) {
12881          return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12882                                              - PyUnicode_GET_LENGTH(suffix));
12883      }
12884      return unicode_result_unchanged(self);
12885  }
12886  
12887  static PyObject *
unicode_repr(PyObject * unicode)12888  unicode_repr(PyObject *unicode)
12889  {
12890      PyObject *repr;
12891      Py_ssize_t isize;
12892      Py_ssize_t osize, squote, dquote, i, o;
12893      Py_UCS4 max, quote;
12894      int ikind, okind, unchanged;
12895      const void *idata;
12896      void *odata;
12897  
12898      if (PyUnicode_READY(unicode) == -1)
12899          return NULL;
12900  
12901      isize = PyUnicode_GET_LENGTH(unicode);
12902      idata = PyUnicode_DATA(unicode);
12903  
12904      /* Compute length of output, quote characters, and
12905         maximum character */
12906      osize = 0;
12907      max = 127;
12908      squote = dquote = 0;
12909      ikind = PyUnicode_KIND(unicode);
12910      for (i = 0; i < isize; i++) {
12911          Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12912          Py_ssize_t incr = 1;
12913          switch (ch) {
12914          case '\'': squote++; break;
12915          case '"':  dquote++; break;
12916          case '\\': case '\t': case '\r': case '\n':
12917              incr = 2;
12918              break;
12919          default:
12920              /* Fast-path ASCII */
12921              if (ch < ' ' || ch == 0x7f)
12922                  incr = 4; /* \xHH */
12923              else if (ch < 0x7f)
12924                  ;
12925              else if (Py_UNICODE_ISPRINTABLE(ch))
12926                  max = ch > max ? ch : max;
12927              else if (ch < 0x100)
12928                  incr = 4; /* \xHH */
12929              else if (ch < 0x10000)
12930                  incr = 6; /* \uHHHH */
12931              else
12932                  incr = 10; /* \uHHHHHHHH */
12933          }
12934          if (osize > PY_SSIZE_T_MAX - incr) {
12935              PyErr_SetString(PyExc_OverflowError,
12936                              "string is too long to generate repr");
12937              return NULL;
12938          }
12939          osize += incr;
12940      }
12941  
12942      quote = '\'';
12943      unchanged = (osize == isize);
12944      if (squote) {
12945          unchanged = 0;
12946          if (dquote)
12947              /* Both squote and dquote present. Use squote,
12948                 and escape them */
12949              osize += squote;
12950          else
12951              quote = '"';
12952      }
12953      osize += 2;   /* quotes */
12954  
12955      repr = PyUnicode_New(osize, max);
12956      if (repr == NULL)
12957          return NULL;
12958      okind = PyUnicode_KIND(repr);
12959      odata = PyUnicode_DATA(repr);
12960  
12961      PyUnicode_WRITE(okind, odata, 0, quote);
12962      PyUnicode_WRITE(okind, odata, osize-1, quote);
12963      if (unchanged) {
12964          _PyUnicode_FastCopyCharacters(repr, 1,
12965                                        unicode, 0,
12966                                        isize);
12967      }
12968      else {
12969          for (i = 0, o = 1; i < isize; i++) {
12970              Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12971  
12972              /* Escape quotes and backslashes */
12973              if ((ch == quote) || (ch == '\\')) {
12974                  PyUnicode_WRITE(okind, odata, o++, '\\');
12975                  PyUnicode_WRITE(okind, odata, o++, ch);
12976                  continue;
12977              }
12978  
12979              /* Map special whitespace to '\t', \n', '\r' */
12980              if (ch == '\t') {
12981                  PyUnicode_WRITE(okind, odata, o++, '\\');
12982                  PyUnicode_WRITE(okind, odata, o++, 't');
12983              }
12984              else if (ch == '\n') {
12985                  PyUnicode_WRITE(okind, odata, o++, '\\');
12986                  PyUnicode_WRITE(okind, odata, o++, 'n');
12987              }
12988              else if (ch == '\r') {
12989                  PyUnicode_WRITE(okind, odata, o++, '\\');
12990                  PyUnicode_WRITE(okind, odata, o++, 'r');
12991              }
12992  
12993              /* Map non-printable US ASCII to '\xhh' */
12994              else if (ch < ' ' || ch == 0x7F) {
12995                  PyUnicode_WRITE(okind, odata, o++, '\\');
12996                  PyUnicode_WRITE(okind, odata, o++, 'x');
12997                  PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12998                  PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12999              }
13000  
13001              /* Copy ASCII characters as-is */
13002              else if (ch < 0x7F) {
13003                  PyUnicode_WRITE(okind, odata, o++, ch);
13004              }
13005  
13006              /* Non-ASCII characters */
13007              else {
13008                  /* Map Unicode whitespace and control characters
13009                     (categories Z* and C* except ASCII space)
13010                  */
13011                  if (!Py_UNICODE_ISPRINTABLE(ch)) {
13012                      PyUnicode_WRITE(okind, odata, o++, '\\');
13013                      /* Map 8-bit characters to '\xhh' */
13014                      if (ch <= 0xff) {
13015                          PyUnicode_WRITE(okind, odata, o++, 'x');
13016                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13017                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13018                      }
13019                      /* Map 16-bit characters to '\uxxxx' */
13020                      else if (ch <= 0xffff) {
13021                          PyUnicode_WRITE(okind, odata, o++, 'u');
13022                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13023                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13024                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13025                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13026                      }
13027                      /* Map 21-bit characters to '\U00xxxxxx' */
13028                      else {
13029                          PyUnicode_WRITE(okind, odata, o++, 'U');
13030                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13031                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13032                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13033                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13034                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13035                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13036                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13037                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13038                      }
13039                  }
13040                  /* Copy characters as-is */
13041                  else {
13042                      PyUnicode_WRITE(okind, odata, o++, ch);
13043                  }
13044              }
13045          }
13046      }
13047      /* Closing quote already added at the beginning */
13048      assert(_PyUnicode_CheckConsistency(repr, 1));
13049      return repr;
13050  }
13051  
13052  PyDoc_STRVAR(rfind__doc__,
13053               "S.rfind(sub[, start[, end]]) -> int\n\
13054  \n\
13055  Return the highest index in S where substring sub is found,\n\
13056  such that sub is contained within S[start:end].  Optional\n\
13057  arguments start and end are interpreted as in slice notation.\n\
13058  \n\
13059  Return -1 on failure.");
13060  
13061  static PyObject *
unicode_rfind(PyObject * self,PyObject * args)13062  unicode_rfind(PyObject *self, PyObject *args)
13063  {
13064      /* initialize variables to prevent gcc warning */
13065      PyObject *substring = NULL;
13066      Py_ssize_t start = 0;
13067      Py_ssize_t end = 0;
13068      Py_ssize_t result;
13069  
13070      if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13071          return NULL;
13072  
13073      if (PyUnicode_READY(self) == -1)
13074          return NULL;
13075  
13076      result = any_find_slice(self, substring, start, end, -1);
13077  
13078      if (result == -2)
13079          return NULL;
13080  
13081      return PyLong_FromSsize_t(result);
13082  }
13083  
13084  PyDoc_STRVAR(rindex__doc__,
13085               "S.rindex(sub[, start[, end]]) -> int\n\
13086  \n\
13087  Return the highest index in S where substring sub is found,\n\
13088  such that sub is contained within S[start:end].  Optional\n\
13089  arguments start and end are interpreted as in slice notation.\n\
13090  \n\
13091  Raises ValueError when the substring is not found.");
13092  
13093  static PyObject *
unicode_rindex(PyObject * self,PyObject * args)13094  unicode_rindex(PyObject *self, PyObject *args)
13095  {
13096      /* initialize variables to prevent gcc warning */
13097      PyObject *substring = NULL;
13098      Py_ssize_t start = 0;
13099      Py_ssize_t end = 0;
13100      Py_ssize_t result;
13101  
13102      if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13103          return NULL;
13104  
13105      if (PyUnicode_READY(self) == -1)
13106          return NULL;
13107  
13108      result = any_find_slice(self, substring, start, end, -1);
13109  
13110      if (result == -2)
13111          return NULL;
13112  
13113      if (result < 0) {
13114          PyErr_SetString(PyExc_ValueError, "substring not found");
13115          return NULL;
13116      }
13117  
13118      return PyLong_FromSsize_t(result);
13119  }
13120  
13121  /*[clinic input]
13122  str.rjust as unicode_rjust
13123  
13124      width: Py_ssize_t
13125      fillchar: Py_UCS4 = ' '
13126      /
13127  
13128  Return a right-justified string of length width.
13129  
13130  Padding is done using the specified fill character (default is a space).
13131  [clinic start generated code]*/
13132  
13133  static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13134  unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13135  /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13136  {
13137      if (PyUnicode_READY(self) == -1)
13138          return NULL;
13139  
13140      if (PyUnicode_GET_LENGTH(self) >= width)
13141          return unicode_result_unchanged(self);
13142  
13143      return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13144  }
13145  
13146  PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13147  PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13148  {
13149      if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13150          return NULL;
13151  
13152      return split(s, sep, maxsplit);
13153  }
13154  
13155  /*[clinic input]
13156  str.split as unicode_split
13157  
13158      sep: object = None
13159          The separator used to split the string.
13160  
13161          When set to None (the default value), will split on any whitespace
13162          character (including \\n \\r \\t \\f and spaces) and will discard
13163          empty strings from the result.
13164      maxsplit: Py_ssize_t = -1
13165          Maximum number of splits (starting from the left).
13166          -1 (the default value) means no limit.
13167  
13168  Return a list of the substrings in the string, using sep as the separator string.
13169  
13170  Note, str.split() is mainly useful for data that has been intentionally
13171  delimited.  With natural text that includes punctuation, consider using
13172  the regular expression module.
13173  
13174  [clinic start generated code]*/
13175  
13176  static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13177  unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13178  /*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/
13179  {
13180      if (sep == Py_None)
13181          return split(self, NULL, maxsplit);
13182      if (PyUnicode_Check(sep))
13183          return split(self, sep, maxsplit);
13184  
13185      PyErr_Format(PyExc_TypeError,
13186                   "must be str or None, not %.100s",
13187                   Py_TYPE(sep)->tp_name);
13188      return NULL;
13189  }
13190  
13191  PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13192  PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13193  {
13194      PyObject* out;
13195      int kind1, kind2;
13196      const void *buf1, *buf2;
13197      Py_ssize_t len1, len2;
13198  
13199      if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13200          return NULL;
13201  
13202      kind1 = PyUnicode_KIND(str_obj);
13203      kind2 = PyUnicode_KIND(sep_obj);
13204      len1 = PyUnicode_GET_LENGTH(str_obj);
13205      len2 = PyUnicode_GET_LENGTH(sep_obj);
13206      if (kind1 < kind2 || len1 < len2) {
13207          PyObject *empty = unicode_get_empty();  // Borrowed reference
13208          return PyTuple_Pack(3, str_obj, empty, empty);
13209      }
13210      buf1 = PyUnicode_DATA(str_obj);
13211      buf2 = PyUnicode_DATA(sep_obj);
13212      if (kind2 != kind1) {
13213          buf2 = unicode_askind(kind2, buf2, len2, kind1);
13214          if (!buf2)
13215              return NULL;
13216      }
13217  
13218      switch (kind1) {
13219      case PyUnicode_1BYTE_KIND:
13220          if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13221              out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13222          else
13223              out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13224          break;
13225      case PyUnicode_2BYTE_KIND:
13226          out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13227          break;
13228      case PyUnicode_4BYTE_KIND:
13229          out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13230          break;
13231      default:
13232          Py_UNREACHABLE();
13233      }
13234  
13235      assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13236      if (kind2 != kind1)
13237          PyMem_Free((void *)buf2);
13238  
13239      return out;
13240  }
13241  
13242  
13243  PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13244  PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13245  {
13246      PyObject* out;
13247      int kind1, kind2;
13248      const void *buf1, *buf2;
13249      Py_ssize_t len1, len2;
13250  
13251      if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13252          return NULL;
13253  
13254      kind1 = PyUnicode_KIND(str_obj);
13255      kind2 = PyUnicode_KIND(sep_obj);
13256      len1 = PyUnicode_GET_LENGTH(str_obj);
13257      len2 = PyUnicode_GET_LENGTH(sep_obj);
13258      if (kind1 < kind2 || len1 < len2) {
13259          PyObject *empty = unicode_get_empty();  // Borrowed reference
13260          return PyTuple_Pack(3, empty, empty, str_obj);
13261      }
13262      buf1 = PyUnicode_DATA(str_obj);
13263      buf2 = PyUnicode_DATA(sep_obj);
13264      if (kind2 != kind1) {
13265          buf2 = unicode_askind(kind2, buf2, len2, kind1);
13266          if (!buf2)
13267              return NULL;
13268      }
13269  
13270      switch (kind1) {
13271      case PyUnicode_1BYTE_KIND:
13272          if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13273              out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13274          else
13275              out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13276          break;
13277      case PyUnicode_2BYTE_KIND:
13278          out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13279          break;
13280      case PyUnicode_4BYTE_KIND:
13281          out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13282          break;
13283      default:
13284          Py_UNREACHABLE();
13285      }
13286  
13287      assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13288      if (kind2 != kind1)
13289          PyMem_Free((void *)buf2);
13290  
13291      return out;
13292  }
13293  
13294  /*[clinic input]
13295  str.partition as unicode_partition
13296  
13297      sep: object
13298      /
13299  
13300  Partition the string into three parts using the given separator.
13301  
13302  This will search for the separator in the string.  If the separator is found,
13303  returns a 3-tuple containing the part before the separator, the separator
13304  itself, and the part after it.
13305  
13306  If the separator is not found, returns a 3-tuple containing the original string
13307  and two empty strings.
13308  [clinic start generated code]*/
13309  
13310  static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13311  unicode_partition(PyObject *self, PyObject *sep)
13312  /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13313  {
13314      return PyUnicode_Partition(self, sep);
13315  }
13316  
13317  /*[clinic input]
13318  str.rpartition as unicode_rpartition = str.partition
13319  
13320  Partition the string into three parts using the given separator.
13321  
13322  This will search for the separator in the string, starting at the end. If
13323  the separator is found, returns a 3-tuple containing the part before the
13324  separator, the separator itself, and the part after it.
13325  
13326  If the separator is not found, returns a 3-tuple containing two empty strings
13327  and the original string.
13328  [clinic start generated code]*/
13329  
13330  static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13331  unicode_rpartition(PyObject *self, PyObject *sep)
13332  /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13333  {
13334      return PyUnicode_RPartition(self, sep);
13335  }
13336  
13337  PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13338  PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13339  {
13340      if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13341          return NULL;
13342  
13343      return rsplit(s, sep, maxsplit);
13344  }
13345  
13346  /*[clinic input]
13347  str.rsplit as unicode_rsplit = str.split
13348  
13349  Return a list of the substrings in the string, using sep as the separator string.
13350  
13351  Splitting starts at the end of the string and works to the front.
13352  [clinic start generated code]*/
13353  
13354  static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13355  unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13356  /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
13357  {
13358      if (sep == Py_None)
13359          return rsplit(self, NULL, maxsplit);
13360      if (PyUnicode_Check(sep))
13361          return rsplit(self, sep, maxsplit);
13362  
13363      PyErr_Format(PyExc_TypeError,
13364                   "must be str or None, not %.100s",
13365                   Py_TYPE(sep)->tp_name);
13366      return NULL;
13367  }
13368  
13369  /*[clinic input]
13370  str.splitlines as unicode_splitlines
13371  
13372      keepends: bool(accept={int}) = False
13373  
13374  Return a list of the lines in the string, breaking at line boundaries.
13375  
13376  Line breaks are not included in the resulting list unless keepends is given and
13377  true.
13378  [clinic start generated code]*/
13379  
13380  static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13381  unicode_splitlines_impl(PyObject *self, int keepends)
13382  /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13383  {
13384      return PyUnicode_Splitlines(self, keepends);
13385  }
13386  
13387  static
unicode_str(PyObject * self)13388  PyObject *unicode_str(PyObject *self)
13389  {
13390      return unicode_result_unchanged(self);
13391  }
13392  
13393  /*[clinic input]
13394  str.swapcase as unicode_swapcase
13395  
13396  Convert uppercase characters to lowercase and lowercase characters to uppercase.
13397  [clinic start generated code]*/
13398  
13399  static PyObject *
unicode_swapcase_impl(PyObject * self)13400  unicode_swapcase_impl(PyObject *self)
13401  /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13402  {
13403      if (PyUnicode_READY(self) == -1)
13404          return NULL;
13405      return case_operation(self, do_swapcase);
13406  }
13407  
13408  /*[clinic input]
13409  
13410  @staticmethod
13411  str.maketrans as unicode_maketrans
13412  
13413    x: object
13414  
13415    y: unicode=NULL
13416  
13417    z: unicode=NULL
13418  
13419    /
13420  
13421  Return a translation table usable for str.translate().
13422  
13423  If there is only one argument, it must be a dictionary mapping Unicode
13424  ordinals (integers) or characters to Unicode ordinals, strings or None.
13425  Character keys will be then converted to ordinals.
13426  If there are two arguments, they must be strings of equal length, and
13427  in the resulting dictionary, each character in x will be mapped to the
13428  character at the same position in y. If there is a third argument, it
13429  must be a string, whose characters will be mapped to None in the result.
13430  [clinic start generated code]*/
13431  
13432  static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13433  unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13434  /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13435  {
13436      PyObject *new = NULL, *key, *value;
13437      Py_ssize_t i = 0;
13438      int res;
13439  
13440      new = PyDict_New();
13441      if (!new)
13442          return NULL;
13443      if (y != NULL) {
13444          int x_kind, y_kind, z_kind;
13445          const void *x_data, *y_data, *z_data;
13446  
13447          /* x must be a string too, of equal length */
13448          if (!PyUnicode_Check(x)) {
13449              PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13450                              "be a string if there is a second argument");
13451              goto err;
13452          }
13453          if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13454              PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13455                              "arguments must have equal length");
13456              goto err;
13457          }
13458          /* create entries for translating chars in x to those in y */
13459          x_kind = PyUnicode_KIND(x);
13460          y_kind = PyUnicode_KIND(y);
13461          x_data = PyUnicode_DATA(x);
13462          y_data = PyUnicode_DATA(y);
13463          for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13464              key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13465              if (!key)
13466                  goto err;
13467              value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13468              if (!value) {
13469                  Py_DECREF(key);
13470                  goto err;
13471              }
13472              res = PyDict_SetItem(new, key, value);
13473              Py_DECREF(key);
13474              Py_DECREF(value);
13475              if (res < 0)
13476                  goto err;
13477          }
13478          /* create entries for deleting chars in z */
13479          if (z != NULL) {
13480              z_kind = PyUnicode_KIND(z);
13481              z_data = PyUnicode_DATA(z);
13482              for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13483                  key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13484                  if (!key)
13485                      goto err;
13486                  res = PyDict_SetItem(new, key, Py_None);
13487                  Py_DECREF(key);
13488                  if (res < 0)
13489                      goto err;
13490              }
13491          }
13492      } else {
13493          int kind;
13494          const void *data;
13495  
13496          /* x must be a dict */
13497          if (!PyDict_CheckExact(x)) {
13498              PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13499                              "to maketrans it must be a dict");
13500              goto err;
13501          }
13502          /* copy entries into the new dict, converting string keys to int keys */
13503          while (PyDict_Next(x, &i, &key, &value)) {
13504              if (PyUnicode_Check(key)) {
13505                  /* convert string keys to integer keys */
13506                  PyObject *newkey;
13507                  if (PyUnicode_GET_LENGTH(key) != 1) {
13508                      PyErr_SetString(PyExc_ValueError, "string keys in translate "
13509                                      "table must be of length 1");
13510                      goto err;
13511                  }
13512                  kind = PyUnicode_KIND(key);
13513                  data = PyUnicode_DATA(key);
13514                  newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13515                  if (!newkey)
13516                      goto err;
13517                  res = PyDict_SetItem(new, newkey, value);
13518                  Py_DECREF(newkey);
13519                  if (res < 0)
13520                      goto err;
13521              } else if (PyLong_Check(key)) {
13522                  /* just keep integer keys */
13523                  if (PyDict_SetItem(new, key, value) < 0)
13524                      goto err;
13525              } else {
13526                  PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13527                                  "be strings or integers");
13528                  goto err;
13529              }
13530          }
13531      }
13532      return new;
13533    err:
13534      Py_DECREF(new);
13535      return NULL;
13536  }
13537  
13538  /*[clinic input]
13539  str.translate as unicode_translate
13540  
13541      table: object
13542          Translation table, which must be a mapping of Unicode ordinals to
13543          Unicode ordinals, strings, or None.
13544      /
13545  
13546  Replace each character in the string using the given translation table.
13547  
13548  The table must implement lookup/indexing via __getitem__, for instance a
13549  dictionary or list.  If this operation raises LookupError, the character is
13550  left untouched.  Characters mapped to None are deleted.
13551  [clinic start generated code]*/
13552  
13553  static PyObject *
unicode_translate(PyObject * self,PyObject * table)13554  unicode_translate(PyObject *self, PyObject *table)
13555  /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13556  {
13557      return _PyUnicode_TranslateCharmap(self, table, "ignore");
13558  }
13559  
13560  /*[clinic input]
13561  str.upper as unicode_upper
13562  
13563  Return a copy of the string converted to uppercase.
13564  [clinic start generated code]*/
13565  
13566  static PyObject *
unicode_upper_impl(PyObject * self)13567  unicode_upper_impl(PyObject *self)
13568  /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13569  {
13570      if (PyUnicode_READY(self) == -1)
13571          return NULL;
13572      if (PyUnicode_IS_ASCII(self))
13573          return ascii_upper_or_lower(self, 0);
13574      return case_operation(self, do_upper);
13575  }
13576  
13577  /*[clinic input]
13578  str.zfill as unicode_zfill
13579  
13580      width: Py_ssize_t
13581      /
13582  
13583  Pad a numeric string with zeros on the left, to fill a field of the given width.
13584  
13585  The string is never truncated.
13586  [clinic start generated code]*/
13587  
13588  static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13589  unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13590  /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13591  {
13592      Py_ssize_t fill;
13593      PyObject *u;
13594      int kind;
13595      const void *data;
13596      Py_UCS4 chr;
13597  
13598      if (PyUnicode_READY(self) == -1)
13599          return NULL;
13600  
13601      if (PyUnicode_GET_LENGTH(self) >= width)
13602          return unicode_result_unchanged(self);
13603  
13604      fill = width - PyUnicode_GET_LENGTH(self);
13605  
13606      u = pad(self, fill, 0, '0');
13607  
13608      if (u == NULL)
13609          return NULL;
13610  
13611      kind = PyUnicode_KIND(u);
13612      data = PyUnicode_DATA(u);
13613      chr = PyUnicode_READ(kind, data, fill);
13614  
13615      if (chr == '+' || chr == '-') {
13616          /* move sign to beginning of string */
13617          PyUnicode_WRITE(kind, data, 0, chr);
13618          PyUnicode_WRITE(kind, data, fill, '0');
13619      }
13620  
13621      assert(_PyUnicode_CheckConsistency(u, 1));
13622      return u;
13623  }
13624  
13625  PyDoc_STRVAR(startswith__doc__,
13626               "S.startswith(prefix[, start[, end]]) -> bool\n\
13627  \n\
13628  Return True if S starts with the specified prefix, False otherwise.\n\
13629  With optional start, test S beginning at that position.\n\
13630  With optional end, stop comparing S at that position.\n\
13631  prefix can also be a tuple of strings to try.");
13632  
13633  static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13634  unicode_startswith(PyObject *self,
13635                     PyObject *args)
13636  {
13637      PyObject *subobj;
13638      PyObject *substring;
13639      Py_ssize_t start = 0;
13640      Py_ssize_t end = PY_SSIZE_T_MAX;
13641      int result;
13642  
13643      if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13644          return NULL;
13645      if (PyTuple_Check(subobj)) {
13646          Py_ssize_t i;
13647          for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13648              substring = PyTuple_GET_ITEM(subobj, i);
13649              if (!PyUnicode_Check(substring)) {
13650                  PyErr_Format(PyExc_TypeError,
13651                               "tuple for startswith must only contain str, "
13652                               "not %.100s",
13653                               Py_TYPE(substring)->tp_name);
13654                  return NULL;
13655              }
13656              result = tailmatch(self, substring, start, end, -1);
13657              if (result == -1)
13658                  return NULL;
13659              if (result) {
13660                  Py_RETURN_TRUE;
13661              }
13662          }
13663          /* nothing matched */
13664          Py_RETURN_FALSE;
13665      }
13666      if (!PyUnicode_Check(subobj)) {
13667          PyErr_Format(PyExc_TypeError,
13668                       "startswith first arg must be str or "
13669                       "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13670          return NULL;
13671      }
13672      result = tailmatch(self, subobj, start, end, -1);
13673      if (result == -1)
13674          return NULL;
13675      return PyBool_FromLong(result);
13676  }
13677  
13678  
/* Docstring for str.endswith(); it is attached to the "endswith" entry
   of unicode_methods[]. */
PyDoc_STRVAR(endswith__doc__,
             "S.endswith(suffix[, start[, end]]) -> bool\n\
\n\
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
suffix can also be a tuple of strings to try.");
13686  
13687  static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13688  unicode_endswith(PyObject *self,
13689                   PyObject *args)
13690  {
13691      PyObject *subobj;
13692      PyObject *substring;
13693      Py_ssize_t start = 0;
13694      Py_ssize_t end = PY_SSIZE_T_MAX;
13695      int result;
13696  
13697      if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13698          return NULL;
13699      if (PyTuple_Check(subobj)) {
13700          Py_ssize_t i;
13701          for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13702              substring = PyTuple_GET_ITEM(subobj, i);
13703              if (!PyUnicode_Check(substring)) {
13704                  PyErr_Format(PyExc_TypeError,
13705                               "tuple for endswith must only contain str, "
13706                               "not %.100s",
13707                               Py_TYPE(substring)->tp_name);
13708                  return NULL;
13709              }
13710              result = tailmatch(self, substring, start, end, +1);
13711              if (result == -1)
13712                  return NULL;
13713              if (result) {
13714                  Py_RETURN_TRUE;
13715              }
13716          }
13717          Py_RETURN_FALSE;
13718      }
13719      if (!PyUnicode_Check(subobj)) {
13720          PyErr_Format(PyExc_TypeError,
13721                       "endswith first arg must be str or "
13722                       "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13723          return NULL;
13724      }
13725      result = tailmatch(self, subobj, start, end, +1);
13726      if (result == -1)
13727          return NULL;
13728      return PyBool_FromLong(result);
13729  }
13730  
13731  static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13732  _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13733  {
13734      writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13735      writer->data = PyUnicode_DATA(writer->buffer);
13736  
13737      if (!writer->readonly) {
13738          writer->kind = PyUnicode_KIND(writer->buffer);
13739          writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13740      }
13741      else {
13742          /* use a value smaller than PyUnicode_1BYTE_KIND() so
13743             _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13744          writer->kind = PyUnicode_WCHAR_KIND;
13745          assert(writer->kind <= PyUnicode_1BYTE_KIND);
13746  
13747          /* Copy-on-write mode: set buffer size to 0 so
13748           * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13749           * next write. */
13750          writer->size = 0;
13751      }
13752  }
13753  
13754  void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13755  _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13756  {
13757      memset(writer, 0, sizeof(*writer));
13758  
13759      /* ASCII is the bare minimum */
13760      writer->min_char = 127;
13761  
13762      /* use a value smaller than PyUnicode_1BYTE_KIND() so
13763         _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13764      writer->kind = PyUnicode_WCHAR_KIND;
13765      assert(writer->kind <= PyUnicode_1BYTE_KIND);
13766  }
13767  
13768  // Initialize _PyUnicodeWriter with initial buffer
13769  static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)13770  _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13771  {
13772      memset(writer, 0, sizeof(*writer));
13773      writer->buffer = buffer;
13774      _PyUnicodeWriter_Update(writer);
13775      writer->min_length = writer->size;
13776  }
13777  
13778  int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13779  _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13780                                   Py_ssize_t length, Py_UCS4 maxchar)
13781  {
13782      Py_ssize_t newlen;
13783      PyObject *newbuffer;
13784  
13785      assert(maxchar <= MAX_UNICODE);
13786  
13787      /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13788      assert((maxchar > writer->maxchar && length >= 0)
13789             || length > 0);
13790  
13791      if (length > PY_SSIZE_T_MAX - writer->pos) {
13792          PyErr_NoMemory();
13793          return -1;
13794      }
13795      newlen = writer->pos + length;
13796  
13797      maxchar = Py_MAX(maxchar, writer->min_char);
13798  
13799      if (writer->buffer == NULL) {
13800          assert(!writer->readonly);
13801          if (writer->overallocate
13802              && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13803              /* overallocate to limit the number of realloc() */
13804              newlen += newlen / OVERALLOCATE_FACTOR;
13805          }
13806          if (newlen < writer->min_length)
13807              newlen = writer->min_length;
13808  
13809          writer->buffer = PyUnicode_New(newlen, maxchar);
13810          if (writer->buffer == NULL)
13811              return -1;
13812      }
13813      else if (newlen > writer->size) {
13814          if (writer->overallocate
13815              && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13816              /* overallocate to limit the number of realloc() */
13817              newlen += newlen / OVERALLOCATE_FACTOR;
13818          }
13819          if (newlen < writer->min_length)
13820              newlen = writer->min_length;
13821  
13822          if (maxchar > writer->maxchar || writer->readonly) {
13823              /* resize + widen */
13824              maxchar = Py_MAX(maxchar, writer->maxchar);
13825              newbuffer = PyUnicode_New(newlen, maxchar);
13826              if (newbuffer == NULL)
13827                  return -1;
13828              _PyUnicode_FastCopyCharacters(newbuffer, 0,
13829                                            writer->buffer, 0, writer->pos);
13830              Py_DECREF(writer->buffer);
13831              writer->readonly = 0;
13832          }
13833          else {
13834              newbuffer = resize_compact(writer->buffer, newlen);
13835              if (newbuffer == NULL)
13836                  return -1;
13837          }
13838          writer->buffer = newbuffer;
13839      }
13840      else if (maxchar > writer->maxchar) {
13841          assert(!writer->readonly);
13842          newbuffer = PyUnicode_New(writer->size, maxchar);
13843          if (newbuffer == NULL)
13844              return -1;
13845          _PyUnicode_FastCopyCharacters(newbuffer, 0,
13846                                        writer->buffer, 0, writer->pos);
13847          Py_SETREF(writer->buffer, newbuffer);
13848      }
13849      _PyUnicodeWriter_Update(writer);
13850      return 0;
13851  
13852  #undef OVERALLOCATE_FACTOR
13853  }
13854  
13855  int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13856  _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13857                                       enum PyUnicode_Kind kind)
13858  {
13859      Py_UCS4 maxchar;
13860  
13861      /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13862      assert(writer->kind < kind);
13863  
13864      switch (kind)
13865      {
13866      case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13867      case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13868      case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13869      default:
13870          Py_UNREACHABLE();
13871      }
13872  
13873      return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13874  }
13875  
13876  static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13877  _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13878  {
13879      assert(ch <= MAX_UNICODE);
13880      if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13881          return -1;
13882      PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13883      writer->pos++;
13884      return 0;
13885  }
13886  
/* Append the character `ch` to the writer.  This is the out-of-line entry
   point; the work is done by _PyUnicodeWriter_WriteCharInline().

   Return 0 on success, -1 on error. */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
}
13892  
13893  int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13894  _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13895  {
13896      Py_UCS4 maxchar;
13897      Py_ssize_t len;
13898  
13899      if (PyUnicode_READY(str) == -1)
13900          return -1;
13901      len = PyUnicode_GET_LENGTH(str);
13902      if (len == 0)
13903          return 0;
13904      maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13905      if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13906          if (writer->buffer == NULL && !writer->overallocate) {
13907              assert(_PyUnicode_CheckConsistency(str, 1));
13908              writer->readonly = 1;
13909              Py_INCREF(str);
13910              writer->buffer = str;
13911              _PyUnicodeWriter_Update(writer);
13912              writer->pos += len;
13913              return 0;
13914          }
13915          if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13916              return -1;
13917      }
13918      _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13919                                    str, 0, len);
13920      writer->pos += len;
13921      return 0;
13922  }
13923  
13924  int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13925  _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13926                                  Py_ssize_t start, Py_ssize_t end)
13927  {
13928      Py_UCS4 maxchar;
13929      Py_ssize_t len;
13930  
13931      if (PyUnicode_READY(str) == -1)
13932          return -1;
13933  
13934      assert(0 <= start);
13935      assert(end <= PyUnicode_GET_LENGTH(str));
13936      assert(start <= end);
13937  
13938      if (end == 0)
13939          return 0;
13940  
13941      if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13942          return _PyUnicodeWriter_WriteStr(writer, str);
13943  
13944      if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13945          maxchar = _PyUnicode_FindMaxChar(str, start, end);
13946      else
13947          maxchar = writer->maxchar;
13948      len = end - start;
13949  
13950      if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13951          return -1;
13952  
13953      _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13954                                    str, start, len);
13955      writer->pos += len;
13956      return 0;
13957  }
13958  
13959  int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13960  _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13961                                    const char *ascii, Py_ssize_t len)
13962  {
13963      if (len == -1)
13964          len = strlen(ascii);
13965  
13966      assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13967  
13968      if (writer->buffer == NULL && !writer->overallocate) {
13969          PyObject *str;
13970  
13971          str = _PyUnicode_FromASCII(ascii, len);
13972          if (str == NULL)
13973              return -1;
13974  
13975          writer->readonly = 1;
13976          writer->buffer = str;
13977          _PyUnicodeWriter_Update(writer);
13978          writer->pos += len;
13979          return 0;
13980      }
13981  
13982      if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13983          return -1;
13984  
13985      switch (writer->kind)
13986      {
13987      case PyUnicode_1BYTE_KIND:
13988      {
13989          const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13990          Py_UCS1 *data = writer->data;
13991  
13992          memcpy(data + writer->pos, str, len);
13993          break;
13994      }
13995      case PyUnicode_2BYTE_KIND:
13996      {
13997          _PyUnicode_CONVERT_BYTES(
13998              Py_UCS1, Py_UCS2,
13999              ascii, ascii + len,
14000              (Py_UCS2 *)writer->data + writer->pos);
14001          break;
14002      }
14003      case PyUnicode_4BYTE_KIND:
14004      {
14005          _PyUnicode_CONVERT_BYTES(
14006              Py_UCS1, Py_UCS4,
14007              ascii, ascii + len,
14008              (Py_UCS4 *)writer->data + writer->pos);
14009          break;
14010      }
14011      default:
14012          Py_UNREACHABLE();
14013      }
14014  
14015      writer->pos += len;
14016      return 0;
14017  }
14018  
14019  int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14020  _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14021                                     const char *str, Py_ssize_t len)
14022  {
14023      Py_UCS4 maxchar;
14024  
14025      maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14026      if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14027          return -1;
14028      unicode_write_cstr(writer->buffer, writer->pos, str, len);
14029      writer->pos += len;
14030      return 0;
14031  }
14032  
14033  PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14034  _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14035  {
14036      PyObject *str;
14037  
14038      if (writer->pos == 0) {
14039          Py_CLEAR(writer->buffer);
14040          _Py_RETURN_UNICODE_EMPTY();
14041      }
14042  
14043      str = writer->buffer;
14044      writer->buffer = NULL;
14045  
14046      if (writer->readonly) {
14047          assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14048          return str;
14049      }
14050  
14051      if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14052          PyObject *str2;
14053          str2 = resize_compact(str, writer->pos);
14054          if (str2 == NULL) {
14055              Py_DECREF(str);
14056              return NULL;
14057          }
14058          str = str2;
14059      }
14060  
14061      assert(_PyUnicode_CheckConsistency(str, 1));
14062      return unicode_result_ready(str);
14063  }
14064  
/* Release the writer's buffer, if any, and reset the pointer to NULL.
   The other writer fields are left untouched. */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
14070  
14071  #include "stringlib/unicode_format.h"
14072  
/* Docstring for str.format(); it is attached to the "format" entry of
   unicode_methods[]. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
14078  
/* Docstring for str.format_map(); it is attached to the "format_map" entry
   of unicode_methods[]. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
14084  
14085  /*[clinic input]
14086  str.__format__ as unicode___format__
14087  
14088      format_spec: unicode
14089      /
14090  
14091  Return a formatted version of the string as described by format_spec.
14092  [clinic start generated code]*/
14093  
14094  static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)14095  unicode___format___impl(PyObject *self, PyObject *format_spec)
14096  /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14097  {
14098      _PyUnicodeWriter writer;
14099      int ret;
14100  
14101      if (PyUnicode_READY(self) == -1)
14102          return NULL;
14103      _PyUnicodeWriter_Init(&writer);
14104      ret = _PyUnicode_FormatAdvancedWriter(&writer,
14105                                            self, format_spec, 0,
14106                                            PyUnicode_GET_LENGTH(format_spec));
14107      if (ret == -1) {
14108          _PyUnicodeWriter_Dealloc(&writer);
14109          return NULL;
14110      }
14111      return _PyUnicodeWriter_Finish(&writer);
14112  }
14113  
14114  /*[clinic input]
14115  str.__sizeof__ as unicode_sizeof
14116  
14117  Return the size of the string in memory, in bytes.
14118  [clinic start generated code]*/
14119  
14120  static PyObject *
unicode_sizeof_impl(PyObject * self)14121  unicode_sizeof_impl(PyObject *self)
14122  /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14123  {
14124      Py_ssize_t size;
14125  
14126      /* If it's a compact object, account for base structure +
14127         character data. */
14128      if (PyUnicode_IS_COMPACT_ASCII(self))
14129          size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14130      else if (PyUnicode_IS_COMPACT(self))
14131          size = sizeof(PyCompactUnicodeObject) +
14132              (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14133      else {
14134          /* If it is a two-block object, account for base object, and
14135             for character block if present. */
14136          size = sizeof(PyUnicodeObject);
14137          if (_PyUnicode_DATA_ANY(self))
14138              size += (PyUnicode_GET_LENGTH(self) + 1) *
14139                  PyUnicode_KIND(self);
14140      }
14141      /* If the wstr pointer is present, account for it unless it is shared
14142         with the data pointer. Check if the data is not shared. */
14143      if (_PyUnicode_HAS_WSTR_MEMORY(self))
14144          size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14145      if (_PyUnicode_HAS_UTF8_MEMORY(self))
14146          size += PyUnicode_UTF8_LENGTH(self) + 1;
14147  
14148      return PyLong_FromSsize_t(size);
14149  }
14150  
14151  static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14152  unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14153  {
14154      PyObject *copy = _PyUnicode_Copy(v);
14155      if (!copy)
14156          return NULL;
14157      return Py_BuildValue("(N)", copy);
14158  }
14159  
/* Method table of the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined outside this table
   (presumably generated by Argument Clinic -- confirm); the explicit
   {name, function, flags, doc} entries are the methods that parse their
   arguments by hand.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
    /* No docstring is given for __getnewargs__. */
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
14213  
14214  static PyObject *
unicode_mod(PyObject * v,PyObject * w)14215  unicode_mod(PyObject *v, PyObject *w)
14216  {
14217      if (!PyUnicode_Check(v))
14218          Py_RETURN_NOTIMPLEMENTED;
14219      return PyUnicode_Format(v, w);
14220  }
14221  
/* Number protocol of str.  Only nb_remainder is filled in (with
   unicode_mod); the slots after it are not listed and are therefore
   zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
14228  
/* Sequence protocol of str: length, concatenation, repetition, item access
   and containment.  The assignment and slice slots are left empty (0); the
   slots after sq_contains are not listed and are therefore
   zero-initialized. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
14239  
14240  static PyObject*
unicode_subscript(PyObject * self,PyObject * item)14241  unicode_subscript(PyObject* self, PyObject* item)
14242  {
14243      if (PyUnicode_READY(self) == -1)
14244          return NULL;
14245  
14246      if (_PyIndex_Check(item)) {
14247          Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14248          if (i == -1 && PyErr_Occurred())
14249              return NULL;
14250          if (i < 0)
14251              i += PyUnicode_GET_LENGTH(self);
14252          return unicode_getitem(self, i);
14253      } else if (PySlice_Check(item)) {
14254          Py_ssize_t start, stop, step, slicelength, i;
14255          size_t cur;
14256          PyObject *result;
14257          const void *src_data;
14258          void *dest_data;
14259          int src_kind, dest_kind;
14260          Py_UCS4 ch, max_char, kind_limit;
14261  
14262          if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14263              return NULL;
14264          }
14265          slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14266                                              &start, &stop, step);
14267  
14268          if (slicelength <= 0) {
14269              _Py_RETURN_UNICODE_EMPTY();
14270          } else if (start == 0 && step == 1 &&
14271                     slicelength == PyUnicode_GET_LENGTH(self)) {
14272              return unicode_result_unchanged(self);
14273          } else if (step == 1) {
14274              return PyUnicode_Substring(self,
14275                                         start, start + slicelength);
14276          }
14277          /* General case */
14278          src_kind = PyUnicode_KIND(self);
14279          src_data = PyUnicode_DATA(self);
14280          if (!PyUnicode_IS_ASCII(self)) {
14281              kind_limit = kind_maxchar_limit(src_kind);
14282              max_char = 0;
14283              for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14284                  ch = PyUnicode_READ(src_kind, src_data, cur);
14285                  if (ch > max_char) {
14286                      max_char = ch;
14287                      if (max_char >= kind_limit)
14288                          break;
14289                  }
14290              }
14291          }
14292          else
14293              max_char = 127;
14294          result = PyUnicode_New(slicelength, max_char);
14295          if (result == NULL)
14296              return NULL;
14297          dest_kind = PyUnicode_KIND(result);
14298          dest_data = PyUnicode_DATA(result);
14299  
14300          for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14301              Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14302              PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14303          }
14304          assert(_PyUnicode_CheckConsistency(result, 1));
14305          return result;
14306      } else {
14307          PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
14308                       Py_TYPE(item)->tp_name);
14309          return NULL;
14310      }
14311  }
14312  
/* Mapping protocol of str: length and subscription (indexing and slicing via
   unicode_subscript).  The item-assignment slot is left empty. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
14318  
14319  
14320  /* Helpers for PyUnicode_Format() */
14321  
/* State carried through one PyUnicode_Format() run.  Field notes marked
   "presumably" are inferred from the names only -- confirm at the use
   sites. */
struct unicode_formatter_t {
    /* The format arguments.  Treated as a tuple when arglen >= 0, and as a
       single value when arglen is negative (see
       unicode_format_getnextarg()). */
    PyObject *args;
    /* Presumably non-zero when `args` holds a reference that must be
       released at the end -- TODO confirm. */
    int args_owned;
    /* arglen: number of arguments, negative when `args` is not a tuple.
       argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    /* Presumably the mapping used for "%(name)s" lookups -- TODO confirm. */
    PyObject *dict;

    /* Presumably the kind, remaining length, current position, raw data
       and owning object of the format string -- TODO confirm. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    const void *fmtdata;
    PyObject *fmtstr;

    /* Writer that accumulates the formatted output. */
    _PyUnicodeWriter writer;
};
14335  
/* One parsed "%..." conversion specification.  Field notes marked
   "presumably" are inferred from the names only -- confirm at the use
   sites. */
struct unicode_format_arg_t {
    /* Conversion character; formatfloat() passes it on as the format code
       of PyOS_double_to_string(). */
    Py_UCS4 ch;
    /* Bitmask of F_* flags; formatfloat() tests F_ALT. */
    int flags;
    /* Presumably the minimum field width -- TODO confirm. */
    Py_ssize_t width;
    /* Precision; a negative value means "not given" (formatfloat() then
       falls back to 6). */
    int prec;
    /* Presumably non-zero when the conversion produces a signed number --
       TODO confirm. */
    int sign;
};
14343  
14344  static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14345  unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14346  {
14347      Py_ssize_t argidx = ctx->argidx;
14348  
14349      if (argidx < ctx->arglen) {
14350          ctx->argidx++;
14351          if (ctx->arglen < 0)
14352              return ctx->args;
14353          else
14354              return PyTuple_GetItem(ctx->args, argidx);
14355      }
14356      PyErr_SetString(PyExc_TypeError,
14357                      "not enough arguments for format string");
14358      return NULL;
14359  }
14360  
14361  /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14362  
14363  /* Format a float into the writer if the writer is not NULL, or into *p_output
14364     otherwise.
14365  
14366     Return 0 on success, raise an exception and return -1 on error. */
14367  static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14368  formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14369              PyObject **p_output,
14370              _PyUnicodeWriter *writer)
14371  {
14372      char *p;
14373      double x;
14374      Py_ssize_t len;
14375      int prec;
14376      int dtoa_flags = 0;
14377  
14378      x = PyFloat_AsDouble(v);
14379      if (x == -1.0 && PyErr_Occurred())
14380          return -1;
14381  
14382      prec = arg->prec;
14383      if (prec < 0)
14384          prec = 6;
14385  
14386      if (arg->flags & F_ALT)
14387          dtoa_flags |= Py_DTSF_ALT;
14388      p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14389      if (p == NULL)
14390          return -1;
14391      len = strlen(p);
14392      if (writer) {
14393          if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14394              PyMem_Free(p);
14395              return -1;
14396          }
14397      }
14398      else
14399          *p_output = _PyUnicode_FromASCII(p, len);
14400      PyMem_Free(p);
14401      return 0;
14402  }
14403  
14404  /* formatlong() emulates the format codes d, u, o, x and X, and
14405   * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14406   * Python's regular ints.
14407   * Return value:  a new PyUnicodeObject*, or NULL if error.
14408   *     The output string is of the form
14409   *         "-"? ("0x" | "0X")? digit+
14410   *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14411   *         set in flags.  The case of hex digits will be correct,
14412   *     There will be at least prec digits, zero-filled on the left if
14413   *         necessary to get that many.
14414   * val          object to be converted
14415   * flags        bitmask of format flags; only F_ALT is looked at
14416   * prec         minimum number of digits; 0-fill on left if needed
14417   * type         a character in [duoxX]; u acts the same as d
14418   *
14419   * CAUTION:  o, x and X conversions on regular ints can never
14420   * produce a '-' sign, but can for Python's unbounded ints.
14421   */
14422  PyObject *
_PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)14423  _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14424  {
14425      PyObject *result = NULL;
14426      char *buf;
14427      Py_ssize_t i;
14428      int sign;           /* 1 if '-', else 0 */
14429      int len;            /* number of characters */
14430      Py_ssize_t llen;
14431      int numdigits;      /* len == numnondigits + numdigits */
14432      int numnondigits = 0;
14433  
14434      /* Avoid exceeding SSIZE_T_MAX */
14435      if (prec > INT_MAX-3) {
14436          PyErr_SetString(PyExc_OverflowError,
14437                          "precision too large");
14438          return NULL;
14439      }
14440  
14441      assert(PyLong_Check(val));
14442  
14443      switch (type) {
14444      default:
14445          Py_UNREACHABLE();
14446      case 'd':
14447      case 'i':
14448      case 'u':
14449          /* int and int subclasses should print numerically when a numeric */
14450          /* format code is used (see issue18780) */
14451          result = PyNumber_ToBase(val, 10);
14452          break;
14453      case 'o':
14454          numnondigits = 2;
14455          result = PyNumber_ToBase(val, 8);
14456          break;
14457      case 'x':
14458      case 'X':
14459          numnondigits = 2;
14460          result = PyNumber_ToBase(val, 16);
14461          break;
14462      }
14463      if (!result)
14464          return NULL;
14465  
14466      assert(unicode_modifiable(result));
14467      assert(PyUnicode_IS_READY(result));
14468      assert(PyUnicode_IS_ASCII(result));
14469  
14470      /* To modify the string in-place, there can only be one reference. */
14471      if (Py_REFCNT(result) != 1) {
14472          Py_DECREF(result);
14473          PyErr_BadInternalCall();
14474          return NULL;
14475      }
14476      buf = PyUnicode_DATA(result);
14477      llen = PyUnicode_GET_LENGTH(result);
14478      if (llen > INT_MAX) {
14479          Py_DECREF(result);
14480          PyErr_SetString(PyExc_ValueError,
14481                          "string too large in _PyUnicode_FormatLong");
14482          return NULL;
14483      }
14484      len = (int)llen;
14485      sign = buf[0] == '-';
14486      numnondigits += sign;
14487      numdigits = len - numnondigits;
14488      assert(numdigits > 0);
14489  
14490      /* Get rid of base marker unless F_ALT */
14491      if (((alt) == 0 &&
14492          (type == 'o' || type == 'x' || type == 'X'))) {
14493          assert(buf[sign] == '0');
14494          assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14495                 buf[sign+1] == 'o');
14496          numnondigits -= 2;
14497          buf += 2;
14498          len -= 2;
14499          if (sign)
14500              buf[0] = '-';
14501          assert(len == numnondigits + numdigits);
14502          assert(numdigits > 0);
14503      }
14504  
14505      /* Fill with leading zeroes to meet minimum width. */
14506      if (prec > numdigits) {
14507          PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14508                                  numnondigits + prec);
14509          char *b1;
14510          if (!r1) {
14511              Py_DECREF(result);
14512              return NULL;
14513          }
14514          b1 = PyBytes_AS_STRING(r1);
14515          for (i = 0; i < numnondigits; ++i)
14516              *b1++ = *buf++;
14517          for (i = 0; i < prec - numdigits; i++)
14518              *b1++ = '0';
14519          for (i = 0; i < numdigits; i++)
14520              *b1++ = *buf++;
14521          *b1 = '\0';
14522          Py_DECREF(result);
14523          result = r1;
14524          buf = PyBytes_AS_STRING(result);
14525          len = numnondigits + prec;
14526      }
14527  
14528      /* Fix up case for hex conversions. */
14529      if (type == 'X') {
14530          /* Need to convert all lower case letters to upper case.
14531             and need to convert 0x to 0X (and -0x to -0X). */
14532          for (i = 0; i < len; i++)
14533              if (buf[i] >= 'a' && buf[i] <= 'x')
14534                  buf[i] -= 'a'-'A';
14535      }
14536      if (!PyUnicode_Check(result)
14537          || buf != PyUnicode_DATA(result)) {
14538          PyObject *unicode;
14539          unicode = _PyUnicode_FromASCII(buf, len);
14540          Py_DECREF(result);
14541          result = unicode;
14542      }
14543      else if (len != PyUnicode_GET_LENGTH(result)) {
14544          if (PyUnicode_Resize(&result, len) < 0)
14545              Py_CLEAR(result);
14546      }
14547      return result;
14548  }
14549  
14550  /* Format an integer or a float as an integer.
14551   * Return 1 if the number has been formatted into the writer,
14552   *        0 if the number has been formatted into *p_output
14553   *       -1 and raise an exception on error */
14554  static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14555  mainformatlong(PyObject *v,
14556                 struct unicode_format_arg_t *arg,
14557                 PyObject **p_output,
14558                 _PyUnicodeWriter *writer)
14559  {
14560      PyObject *iobj, *res;
14561      char type = (char)arg->ch;
14562  
14563      if (!PyNumber_Check(v))
14564          goto wrongtype;
14565  
14566      /* make sure number is a type of integer for o, x, and X */
14567      if (!PyLong_Check(v)) {
14568          if (type == 'o' || type == 'x' || type == 'X') {
14569              iobj = _PyNumber_Index(v);
14570          }
14571          else {
14572              iobj = PyNumber_Long(v);
14573          }
14574          if (iobj == NULL ) {
14575              if (PyErr_ExceptionMatches(PyExc_TypeError))
14576                  goto wrongtype;
14577              return -1;
14578          }
14579          assert(PyLong_Check(iobj));
14580      }
14581      else {
14582          iobj = v;
14583          Py_INCREF(iobj);
14584      }
14585  
14586      if (PyLong_CheckExact(v)
14587          && arg->width == -1 && arg->prec == -1
14588          && !(arg->flags & (F_SIGN | F_BLANK))
14589          && type != 'X')
14590      {
14591          /* Fast path */
14592          int alternate = arg->flags & F_ALT;
14593          int base;
14594  
14595          switch(type)
14596          {
14597              default:
14598                  Py_UNREACHABLE();
14599              case 'd':
14600              case 'i':
14601              case 'u':
14602                  base = 10;
14603                  break;
14604              case 'o':
14605                  base = 8;
14606                  break;
14607              case 'x':
14608              case 'X':
14609                  base = 16;
14610                  break;
14611          }
14612  
14613          if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14614              Py_DECREF(iobj);
14615              return -1;
14616          }
14617          Py_DECREF(iobj);
14618          return 1;
14619      }
14620  
14621      res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14622      Py_DECREF(iobj);
14623      if (res == NULL)
14624          return -1;
14625      *p_output = res;
14626      return 0;
14627  
14628  wrongtype:
14629      switch(type)
14630      {
14631          case 'o':
14632          case 'x':
14633          case 'X':
14634              PyErr_Format(PyExc_TypeError,
14635                      "%%%c format: an integer is required, "
14636                      "not %.200s",
14637                      type, Py_TYPE(v)->tp_name);
14638              break;
14639          default:
14640              PyErr_Format(PyExc_TypeError,
14641                      "%%%c format: a real number is required, "
14642                      "not %.200s",
14643                      type, Py_TYPE(v)->tp_name);
14644              break;
14645      }
14646      return -1;
14647  }
14648  
14649  static Py_UCS4
formatchar(PyObject * v)14650  formatchar(PyObject *v)
14651  {
14652      /* presume that the buffer is at least 3 characters long */
14653      if (PyUnicode_Check(v)) {
14654          if (PyUnicode_GET_LENGTH(v) == 1) {
14655              return PyUnicode_READ_CHAR(v, 0);
14656          }
14657          goto onError;
14658      }
14659      else {
14660          int overflow;
14661          long x = PyLong_AsLongAndOverflow(v, &overflow);
14662          if (x == -1 && PyErr_Occurred()) {
14663              if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14664                  goto onError;
14665              }
14666              return (Py_UCS4) -1;
14667          }
14668  
14669          if (x < 0 || x > MAX_UNICODE) {
14670              /* this includes an overflow in converting to C long */
14671              PyErr_SetString(PyExc_OverflowError,
14672                              "%c arg not in range(0x110000)");
14673              return (Py_UCS4) -1;
14674          }
14675  
14676          return (Py_UCS4) x;
14677      }
14678  
14679    onError:
14680      PyErr_SetString(PyExc_TypeError,
14681                      "%c requires int or char");
14682      return (Py_UCS4) -1;
14683  }
14684  
14685  /* Parse options of an argument: flags, width, precision.
14686     Handle also "%(name)" syntax.
14687  
14688     Return 0 if the argument has been formatted into arg->str.
14689     Return 1 if the argument has been written into ctx->writer,
14690     Raise an exception and return -1 on error. */
14691  static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14692  unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14693                           struct unicode_format_arg_t *arg)
14694  {
14695  #define FORMAT_READ(ctx) \
14696          PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14697  
14698      PyObject *v;
14699  
14700      if (arg->ch == '(') {
14701          /* Get argument value from a dictionary. Example: "%(name)s". */
14702          Py_ssize_t keystart;
14703          Py_ssize_t keylen;
14704          PyObject *key;
14705          int pcount = 1;
14706  
14707          if (ctx->dict == NULL) {
14708              PyErr_SetString(PyExc_TypeError,
14709                              "format requires a mapping");
14710              return -1;
14711          }
14712          ++ctx->fmtpos;
14713          --ctx->fmtcnt;
14714          keystart = ctx->fmtpos;
14715          /* Skip over balanced parentheses */
14716          while (pcount > 0 && --ctx->fmtcnt >= 0) {
14717              arg->ch = FORMAT_READ(ctx);
14718              if (arg->ch == ')')
14719                  --pcount;
14720              else if (arg->ch == '(')
14721                  ++pcount;
14722              ctx->fmtpos++;
14723          }
14724          keylen = ctx->fmtpos - keystart - 1;
14725          if (ctx->fmtcnt < 0 || pcount > 0) {
14726              PyErr_SetString(PyExc_ValueError,
14727                              "incomplete format key");
14728              return -1;
14729          }
14730          key = PyUnicode_Substring(ctx->fmtstr,
14731                                    keystart, keystart + keylen);
14732          if (key == NULL)
14733              return -1;
14734          if (ctx->args_owned) {
14735              ctx->args_owned = 0;
14736              Py_DECREF(ctx->args);
14737          }
14738          ctx->args = PyObject_GetItem(ctx->dict, key);
14739          Py_DECREF(key);
14740          if (ctx->args == NULL)
14741              return -1;
14742          ctx->args_owned = 1;
14743          ctx->arglen = -1;
14744          ctx->argidx = -2;
14745      }
14746  
14747      /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14748      while (--ctx->fmtcnt >= 0) {
14749          arg->ch = FORMAT_READ(ctx);
14750          ctx->fmtpos++;
14751          switch (arg->ch) {
14752          case '-': arg->flags |= F_LJUST; continue;
14753          case '+': arg->flags |= F_SIGN; continue;
14754          case ' ': arg->flags |= F_BLANK; continue;
14755          case '#': arg->flags |= F_ALT; continue;
14756          case '0': arg->flags |= F_ZERO; continue;
14757          }
14758          break;
14759      }
14760  
14761      /* Parse width. Example: "%10s" => width=10 */
14762      if (arg->ch == '*') {
14763          v = unicode_format_getnextarg(ctx);
14764          if (v == NULL)
14765              return -1;
14766          if (!PyLong_Check(v)) {
14767              PyErr_SetString(PyExc_TypeError,
14768                              "* wants int");
14769              return -1;
14770          }
14771          arg->width = PyLong_AsSsize_t(v);
14772          if (arg->width == -1 && PyErr_Occurred())
14773              return -1;
14774          if (arg->width < 0) {
14775              arg->flags |= F_LJUST;
14776              arg->width = -arg->width;
14777          }
14778          if (--ctx->fmtcnt >= 0) {
14779              arg->ch = FORMAT_READ(ctx);
14780              ctx->fmtpos++;
14781          }
14782      }
14783      else if (arg->ch >= '0' && arg->ch <= '9') {
14784          arg->width = arg->ch - '0';
14785          while (--ctx->fmtcnt >= 0) {
14786              arg->ch = FORMAT_READ(ctx);
14787              ctx->fmtpos++;
14788              if (arg->ch < '0' || arg->ch > '9')
14789                  break;
14790              /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14791                 mixing signed and unsigned comparison. Since arg->ch is between
14792                 '0' and '9', casting to int is safe. */
14793              if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14794                  PyErr_SetString(PyExc_ValueError,
14795                                  "width too big");
14796                  return -1;
14797              }
14798              arg->width = arg->width*10 + (arg->ch - '0');
14799          }
14800      }
14801  
14802      /* Parse precision. Example: "%.3f" => prec=3 */
14803      if (arg->ch == '.') {
14804          arg->prec = 0;
14805          if (--ctx->fmtcnt >= 0) {
14806              arg->ch = FORMAT_READ(ctx);
14807              ctx->fmtpos++;
14808          }
14809          if (arg->ch == '*') {
14810              v = unicode_format_getnextarg(ctx);
14811              if (v == NULL)
14812                  return -1;
14813              if (!PyLong_Check(v)) {
14814                  PyErr_SetString(PyExc_TypeError,
14815                                  "* wants int");
14816                  return -1;
14817              }
14818              arg->prec = _PyLong_AsInt(v);
14819              if (arg->prec == -1 && PyErr_Occurred())
14820                  return -1;
14821              if (arg->prec < 0)
14822                  arg->prec = 0;
14823              if (--ctx->fmtcnt >= 0) {
14824                  arg->ch = FORMAT_READ(ctx);
14825                  ctx->fmtpos++;
14826              }
14827          }
14828          else if (arg->ch >= '0' && arg->ch <= '9') {
14829              arg->prec = arg->ch - '0';
14830              while (--ctx->fmtcnt >= 0) {
14831                  arg->ch = FORMAT_READ(ctx);
14832                  ctx->fmtpos++;
14833                  if (arg->ch < '0' || arg->ch > '9')
14834                      break;
14835                  if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14836                      PyErr_SetString(PyExc_ValueError,
14837                                      "precision too big");
14838                      return -1;
14839                  }
14840                  arg->prec = arg->prec*10 + (arg->ch - '0');
14841              }
14842          }
14843      }
14844  
14845      /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14846      if (ctx->fmtcnt >= 0) {
14847          if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14848              if (--ctx->fmtcnt >= 0) {
14849                  arg->ch = FORMAT_READ(ctx);
14850                  ctx->fmtpos++;
14851              }
14852          }
14853      }
14854      if (ctx->fmtcnt < 0) {
14855          PyErr_SetString(PyExc_ValueError,
14856                          "incomplete format");
14857          return -1;
14858      }
14859      return 0;
14860  
14861  #undef FORMAT_READ
14862  }
14863  
14864  /* Format one argument. Supported conversion specifiers:
14865  
14866     - "s", "r", "a": any type
14867     - "i", "d", "u": int or float
14868     - "o", "x", "X": int
14869     - "e", "E", "f", "F", "g", "G": float
14870     - "c": int or str (1 character)
14871  
14872     When possible, the output is written directly into the Unicode writer
14873     (ctx->writer). A string is created when padding is required.
14874  
14875     Return 0 if the argument has been formatted into *p_str,
14876            1 if the argument has been written into ctx->writer,
14877           -1 on error. */
14878  static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14879  unicode_format_arg_format(struct unicode_formatter_t *ctx,
14880                            struct unicode_format_arg_t *arg,
14881                            PyObject **p_str)
14882  {
14883      PyObject *v;
14884      _PyUnicodeWriter *writer = &ctx->writer;
14885  
14886      if (ctx->fmtcnt == 0)
14887          ctx->writer.overallocate = 0;
14888  
14889      v = unicode_format_getnextarg(ctx);
14890      if (v == NULL)
14891          return -1;
14892  
14893  
14894      switch (arg->ch) {
14895      case 's':
14896      case 'r':
14897      case 'a':
14898          if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14899              /* Fast path */
14900              if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14901                  return -1;
14902              return 1;
14903          }
14904  
14905          if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14906              *p_str = v;
14907              Py_INCREF(*p_str);
14908          }
14909          else {
14910              if (arg->ch == 's')
14911                  *p_str = PyObject_Str(v);
14912              else if (arg->ch == 'r')
14913                  *p_str = PyObject_Repr(v);
14914              else
14915                  *p_str = PyObject_ASCII(v);
14916          }
14917          break;
14918  
14919      case 'i':
14920      case 'd':
14921      case 'u':
14922      case 'o':
14923      case 'x':
14924      case 'X':
14925      {
14926          int ret = mainformatlong(v, arg, p_str, writer);
14927          if (ret != 0)
14928              return ret;
14929          arg->sign = 1;
14930          break;
14931      }
14932  
14933      case 'e':
14934      case 'E':
14935      case 'f':
14936      case 'F':
14937      case 'g':
14938      case 'G':
14939          if (arg->width == -1 && arg->prec == -1
14940              && !(arg->flags & (F_SIGN | F_BLANK)))
14941          {
14942              /* Fast path */
14943              if (formatfloat(v, arg, NULL, writer) == -1)
14944                  return -1;
14945              return 1;
14946          }
14947  
14948          arg->sign = 1;
14949          if (formatfloat(v, arg, p_str, NULL) == -1)
14950              return -1;
14951          break;
14952  
14953      case 'c':
14954      {
14955          Py_UCS4 ch = formatchar(v);
14956          if (ch == (Py_UCS4) -1)
14957              return -1;
14958          if (arg->width == -1 && arg->prec == -1) {
14959              /* Fast path */
14960              if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14961                  return -1;
14962              return 1;
14963          }
14964          *p_str = PyUnicode_FromOrdinal(ch);
14965          break;
14966      }
14967  
14968      default:
14969          PyErr_Format(PyExc_ValueError,
14970                       "unsupported format character '%c' (0x%x) "
14971                       "at index %zd",
14972                       (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14973                       (int)arg->ch,
14974                       ctx->fmtpos - 1);
14975          return -1;
14976      }
14977      if (*p_str == NULL)
14978          return -1;
14979      assert (PyUnicode_Check(*p_str));
14980      return 0;
14981  }
14982  
14983  static int
unicode_format_arg_output(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject * str)14984  unicode_format_arg_output(struct unicode_formatter_t *ctx,
14985                            struct unicode_format_arg_t *arg,
14986                            PyObject *str)
14987  {
14988      Py_ssize_t len;
14989      enum PyUnicode_Kind kind;
14990      const void *pbuf;
14991      Py_ssize_t pindex;
14992      Py_UCS4 signchar;
14993      Py_ssize_t buflen;
14994      Py_UCS4 maxchar;
14995      Py_ssize_t sublen;
14996      _PyUnicodeWriter *writer = &ctx->writer;
14997      Py_UCS4 fill;
14998  
14999      fill = ' ';
15000      if (arg->sign && arg->flags & F_ZERO)
15001          fill = '0';
15002  
15003      if (PyUnicode_READY(str) == -1)
15004          return -1;
15005  
15006      len = PyUnicode_GET_LENGTH(str);
15007      if ((arg->width == -1 || arg->width <= len)
15008          && (arg->prec == -1 || arg->prec >= len)
15009          && !(arg->flags & (F_SIGN | F_BLANK)))
15010      {
15011          /* Fast path */
15012          if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15013              return -1;
15014          return 0;
15015      }
15016  
15017      /* Truncate the string for "s", "r" and "a" formats
15018         if the precision is set */
15019      if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15020          if (arg->prec >= 0 && len > arg->prec)
15021              len = arg->prec;
15022      }
15023  
15024      /* Adjust sign and width */
15025      kind = PyUnicode_KIND(str);
15026      pbuf = PyUnicode_DATA(str);
15027      pindex = 0;
15028      signchar = '\0';
15029      if (arg->sign) {
15030          Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15031          if (ch == '-' || ch == '+') {
15032              signchar = ch;
15033              len--;
15034              pindex++;
15035          }
15036          else if (arg->flags & F_SIGN)
15037              signchar = '+';
15038          else if (arg->flags & F_BLANK)
15039              signchar = ' ';
15040          else
15041              arg->sign = 0;
15042      }
15043      if (arg->width < len)
15044          arg->width = len;
15045  
15046      /* Prepare the writer */
15047      maxchar = writer->maxchar;
15048      if (!(arg->flags & F_LJUST)) {
15049          if (arg->sign) {
15050              if ((arg->width-1) > len)
15051                  maxchar = Py_MAX(maxchar, fill);
15052          }
15053          else {
15054              if (arg->width > len)
15055                  maxchar = Py_MAX(maxchar, fill);
15056          }
15057      }
15058      if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15059          Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
15060          maxchar = Py_MAX(maxchar, strmaxchar);
15061      }
15062  
15063      buflen = arg->width;
15064      if (arg->sign && len == arg->width)
15065          buflen++;
15066      if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
15067          return -1;
15068  
15069      /* Write the sign if needed */
15070      if (arg->sign) {
15071          if (fill != ' ') {
15072              PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15073              writer->pos += 1;
15074          }
15075          if (arg->width > len)
15076              arg->width--;
15077      }
15078  
15079      /* Write the numeric prefix for "x", "X" and "o" formats
15080         if the alternate form is used.
15081         For example, write "0x" for the "%#x" format. */
15082      if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15083          assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15084          assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15085          if (fill != ' ') {
15086              PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15087              PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15088              writer->pos += 2;
15089              pindex += 2;
15090          }
15091          arg->width -= 2;
15092          if (arg->width < 0)
15093              arg->width = 0;
15094          len -= 2;
15095      }
15096  
15097      /* Pad left with the fill character if needed */
15098      if (arg->width > len && !(arg->flags & F_LJUST)) {
15099          sublen = arg->width - len;
15100          unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
15101          writer->pos += sublen;
15102          arg->width = len;
15103      }
15104  
15105      /* If padding with spaces: write sign if needed and/or numeric prefix if
15106         the alternate form is used */
15107      if (fill == ' ') {
15108          if (arg->sign) {
15109              PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15110              writer->pos += 1;
15111          }
15112          if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15113              assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15114              assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15115              PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15116              PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15117              writer->pos += 2;
15118              pindex += 2;
15119          }
15120      }
15121  
15122      /* Write characters */
15123      if (len) {
15124          _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15125                                        str, pindex, len);
15126          writer->pos += len;
15127      }
15128  
15129      /* Pad right with the fill character if needed */
15130      if (arg->width > len) {
15131          sublen = arg->width - len;
15132          unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
15133          writer->pos += sublen;
15134      }
15135      return 0;
15136  }
15137  
15138  /* Helper of PyUnicode_Format(): format one arg.
15139     Return 0 on success, raise an exception and return -1 on error. */
15140  static int
unicode_format_arg(struct unicode_formatter_t * ctx)15141  unicode_format_arg(struct unicode_formatter_t *ctx)
15142  {
15143      struct unicode_format_arg_t arg;
15144      PyObject *str;
15145      int ret;
15146  
15147      arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15148      if (arg.ch == '%') {
15149          ctx->fmtpos++;
15150          ctx->fmtcnt--;
15151          if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15152              return -1;
15153          return 0;
15154      }
15155      arg.flags = 0;
15156      arg.width = -1;
15157      arg.prec = -1;
15158      arg.sign = 0;
15159      str = NULL;
15160  
15161      ret = unicode_format_arg_parse(ctx, &arg);
15162      if (ret == -1)
15163          return -1;
15164  
15165      ret = unicode_format_arg_format(ctx, &arg, &str);
15166      if (ret == -1)
15167          return -1;
15168  
15169      if (ret != 1) {
15170          ret = unicode_format_arg_output(ctx, &arg, str);
15171          Py_DECREF(str);
15172          if (ret == -1)
15173              return -1;
15174      }
15175  
15176      if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15177          PyErr_SetString(PyExc_TypeError,
15178                          "not all arguments converted during string formatting");
15179          return -1;
15180      }
15181      return 0;
15182  }
15183  
15184  PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)15185  PyUnicode_Format(PyObject *format, PyObject *args)
15186  {
15187      struct unicode_formatter_t ctx;
15188  
15189      if (format == NULL || args == NULL) {
15190          PyErr_BadInternalCall();
15191          return NULL;
15192      }
15193  
15194      if (ensure_unicode(format) < 0)
15195          return NULL;
15196  
15197      ctx.fmtstr = format;
15198      ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15199      ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15200      ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15201      ctx.fmtpos = 0;
15202  
15203      _PyUnicodeWriter_Init(&ctx.writer);
15204      ctx.writer.min_length = ctx.fmtcnt + 100;
15205      ctx.writer.overallocate = 1;
15206  
15207      if (PyTuple_Check(args)) {
15208          ctx.arglen = PyTuple_Size(args);
15209          ctx.argidx = 0;
15210      }
15211      else {
15212          ctx.arglen = -1;
15213          ctx.argidx = -2;
15214      }
15215      ctx.args_owned = 0;
15216      if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15217          ctx.dict = args;
15218      else
15219          ctx.dict = NULL;
15220      ctx.args = args;
15221  
15222      while (--ctx.fmtcnt >= 0) {
15223          if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15224              Py_ssize_t nonfmtpos;
15225  
15226              nonfmtpos = ctx.fmtpos++;
15227              while (ctx.fmtcnt >= 0 &&
15228                     PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15229                  ctx.fmtpos++;
15230                  ctx.fmtcnt--;
15231              }
15232              if (ctx.fmtcnt < 0) {
15233                  ctx.fmtpos--;
15234                  ctx.writer.overallocate = 0;
15235              }
15236  
15237              if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15238                                                  nonfmtpos, ctx.fmtpos) < 0)
15239                  goto onError;
15240          }
15241          else {
15242              ctx.fmtpos++;
15243              if (unicode_format_arg(&ctx) == -1)
15244                  goto onError;
15245          }
15246      }
15247  
15248      if (ctx.argidx < ctx.arglen && !ctx.dict) {
15249          PyErr_SetString(PyExc_TypeError,
15250                          "not all arguments converted during string formatting");
15251          goto onError;
15252      }
15253  
15254      if (ctx.args_owned) {
15255          Py_DECREF(ctx.args);
15256      }
15257      return _PyUnicodeWriter_Finish(&ctx.writer);
15258  
15259    onError:
15260      _PyUnicodeWriter_Dealloc(&ctx.writer);
15261      if (ctx.args_owned) {
15262          Py_DECREF(ctx.args);
15263      }
15264      return NULL;
15265  }
15266  
15267  static PyObject *
15268  unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15269  
15270  /*[clinic input]
15271  @classmethod
15272  str.__new__ as unicode_new
15273  
15274      object as x: object = NULL
15275      encoding: str = NULL
15276      errors: str = NULL
15277  
15278  [clinic start generated code]*/
15279  
15280  static PyObject *
unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)15281  unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15282                   const char *errors)
15283  /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15284  {
15285      PyObject *unicode;
15286      if (x == NULL) {
15287          unicode = unicode_new_empty();
15288      }
15289      else if (encoding == NULL && errors == NULL) {
15290          unicode = PyObject_Str(x);
15291      }
15292      else {
15293          unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15294      }
15295  
15296      if (unicode != NULL && type != &PyUnicode_Type) {
15297          Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15298      }
15299      return unicode;
15300  }
15301  
15302  static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * unicode)15303  unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15304  {
15305      PyObject *self;
15306      Py_ssize_t length, char_size;
15307      int share_wstr, share_utf8;
15308      unsigned int kind;
15309      void *data;
15310  
15311      assert(PyType_IsSubtype(type, &PyUnicode_Type));
15312      assert(_PyUnicode_CHECK(unicode));
15313      if (PyUnicode_READY(unicode) == -1) {
15314          return NULL;
15315      }
15316  
15317      self = type->tp_alloc(type, 0);
15318      if (self == NULL) {
15319          return NULL;
15320      }
15321      kind = PyUnicode_KIND(unicode);
15322      length = PyUnicode_GET_LENGTH(unicode);
15323  
15324      _PyUnicode_LENGTH(self) = length;
15325  #ifdef Py_DEBUG
15326      _PyUnicode_HASH(self) = -1;
15327  #else
15328      _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15329  #endif
15330      _PyUnicode_STATE(self).interned = 0;
15331      _PyUnicode_STATE(self).kind = kind;
15332      _PyUnicode_STATE(self).compact = 0;
15333      _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15334      _PyUnicode_STATE(self).ready = 1;
15335      _PyUnicode_WSTR(self) = NULL;
15336      _PyUnicode_UTF8_LENGTH(self) = 0;
15337      _PyUnicode_UTF8(self) = NULL;
15338      _PyUnicode_WSTR_LENGTH(self) = 0;
15339      _PyUnicode_DATA_ANY(self) = NULL;
15340  
15341      share_utf8 = 0;
15342      share_wstr = 0;
15343      if (kind == PyUnicode_1BYTE_KIND) {
15344          char_size = 1;
15345          if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15346              share_utf8 = 1;
15347      }
15348      else if (kind == PyUnicode_2BYTE_KIND) {
15349          char_size = 2;
15350          if (sizeof(wchar_t) == 2)
15351              share_wstr = 1;
15352      }
15353      else {
15354          assert(kind == PyUnicode_4BYTE_KIND);
15355          char_size = 4;
15356          if (sizeof(wchar_t) == 4)
15357              share_wstr = 1;
15358      }
15359  
15360      /* Ensure we won't overflow the length. */
15361      if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15362          PyErr_NoMemory();
15363          goto onError;
15364      }
15365      data = PyObject_Malloc((length + 1) * char_size);
15366      if (data == NULL) {
15367          PyErr_NoMemory();
15368          goto onError;
15369      }
15370  
15371      _PyUnicode_DATA_ANY(self) = data;
15372      if (share_utf8) {
15373          _PyUnicode_UTF8_LENGTH(self) = length;
15374          _PyUnicode_UTF8(self) = data;
15375      }
15376      if (share_wstr) {
15377          _PyUnicode_WSTR_LENGTH(self) = length;
15378          _PyUnicode_WSTR(self) = (wchar_t *)data;
15379      }
15380  
15381      memcpy(data, PyUnicode_DATA(unicode),
15382                kind * (length + 1));
15383      assert(_PyUnicode_CheckConsistency(self, 1));
15384  #ifdef Py_DEBUG
15385      _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15386  #endif
15387      return self;
15388  
15389  onError:
15390      Py_DECREF(self);
15391      return NULL;
15392  }
15393  
15394  void
_PyUnicode_ExactDealloc(PyObject * op)15395  _PyUnicode_ExactDealloc(PyObject *op)
15396  {
15397      assert(PyUnicode_CheckExact(op));
15398      unicode_dealloc(op);
15399  }
15400  
15401  PyDoc_STRVAR(unicode_doc,
15402  "str(object='') -> str\n\
15403  str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15404  \n\
15405  Create a new string object from the given object. If encoding or\n\
15406  errors is specified, then the object must expose a data buffer\n\
15407  that will be decoded using the given encoding and error handler.\n\
15408  Otherwise, returns the result of object.__str__() (if defined)\n\
15409  or repr(object).\n\
15410  encoding defaults to sys.getdefaultencoding().\n\
15411  errors defaults to 'strict'.");
15412  
15413  static PyObject *unicode_iter(PyObject *seq);
15414  
15415  PyTypeObject PyUnicode_Type = {
15416      PyVarObject_HEAD_INIT(&PyType_Type, 0)
15417      "str",                        /* tp_name */
15418      sizeof(PyUnicodeObject),      /* tp_basicsize */
15419      0,                            /* tp_itemsize */
15420      /* Slots */
15421      (destructor)unicode_dealloc,  /* tp_dealloc */
15422      0,                            /* tp_vectorcall_offset */
15423      0,                            /* tp_getattr */
15424      0,                            /* tp_setattr */
15425      0,                            /* tp_as_async */
15426      unicode_repr,                 /* tp_repr */
15427      &unicode_as_number,           /* tp_as_number */
15428      &unicode_as_sequence,         /* tp_as_sequence */
15429      &unicode_as_mapping,          /* tp_as_mapping */
15430      (hashfunc) unicode_hash,      /* tp_hash*/
15431      0,                            /* tp_call*/
15432      (reprfunc) unicode_str,       /* tp_str */
15433      PyObject_GenericGetAttr,      /* tp_getattro */
15434      0,                            /* tp_setattro */
15435      0,                            /* tp_as_buffer */
15436      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15437          Py_TPFLAGS_UNICODE_SUBCLASS |
15438          _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15439      unicode_doc,                  /* tp_doc */
15440      0,                            /* tp_traverse */
15441      0,                            /* tp_clear */
15442      PyUnicode_RichCompare,        /* tp_richcompare */
15443      0,                            /* tp_weaklistoffset */
15444      unicode_iter,                 /* tp_iter */
15445      0,                            /* tp_iternext */
15446      unicode_methods,              /* tp_methods */
15447      0,                            /* tp_members */
15448      0,                            /* tp_getset */
15449      0,                            /* tp_base */
15450      0,                            /* tp_dict */
15451      0,                            /* tp_descr_get */
15452      0,                            /* tp_descr_set */
15453      0,                            /* tp_dictoffset */
15454      0,                            /* tp_init */
15455      0,                            /* tp_alloc */
15456      unicode_new,                  /* tp_new */
15457      PyObject_Del,                 /* tp_free */
15458  };
15459  
15460  /* Initialize the Unicode implementation */
15461  
15462  void
_PyUnicode_InitState(PyInterpreterState * interp)15463  _PyUnicode_InitState(PyInterpreterState *interp)
15464  {
15465      if (!_Py_IsMainInterpreter(interp)) {
15466          return;
15467      }
15468  
15469      /* initialize the linebreak bloom filter */
15470      const Py_UCS2 linebreak[] = {
15471          0x000A, /* LINE FEED */
15472          0x000D, /* CARRIAGE RETURN */
15473          0x001C, /* FILE SEPARATOR */
15474          0x001D, /* GROUP SEPARATOR */
15475          0x001E, /* RECORD SEPARATOR */
15476          0x0085, /* NEXT LINE */
15477          0x2028, /* LINE SEPARATOR */
15478          0x2029, /* PARAGRAPH SEPARATOR */
15479      };
15480      bloom_linebreak = make_bloom_mask(
15481          PyUnicode_2BYTE_KIND, linebreak,
15482          Py_ARRAY_LENGTH(linebreak));
15483  }
15484  
15485  
15486  PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState * interp)15487  _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15488  {
15489      if (!_Py_IsMainInterpreter(interp)) {
15490          return _PyStatus_OK();
15491      }
15492  
15493  #ifdef Py_DEBUG
15494      assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
15495  
15496      for (int i = 0; i < 256; i++) {
15497          assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
15498      }
15499  #endif
15500  
15501      return _PyStatus_OK();
15502  }
15503  
15504  
15505  PyStatus
_PyUnicode_InitTypes(PyInterpreterState * interp)15506  _PyUnicode_InitTypes(PyInterpreterState *interp)
15507  {
15508      if (!_Py_IsMainInterpreter(interp)) {
15509          return _PyStatus_OK();
15510      }
15511  
15512      if (PyType_Ready(&EncodingMapType) < 0) {
15513          goto error;
15514      }
15515      if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15516          goto error;
15517      }
15518      if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15519          goto error;
15520      }
15521      return _PyStatus_OK();
15522  
15523  error:
15524      return _PyStatus_ERR("Can't initialize unicode types");
15525  }
15526  
15527  
15528  void
PyUnicode_InternInPlace(PyObject ** p)15529  PyUnicode_InternInPlace(PyObject **p)
15530  {
15531      PyObject *s = *p;
15532  #ifdef Py_DEBUG
15533      assert(s != NULL);
15534      assert(_PyUnicode_CHECK(s));
15535  #else
15536      if (s == NULL || !PyUnicode_Check(s)) {
15537          return;
15538      }
15539  #endif
15540  
15541      /* If it's a subclass, we don't really know what putting
15542         it in the interned dict might do. */
15543      if (!PyUnicode_CheckExact(s)) {
15544          return;
15545      }
15546  
15547      if (PyUnicode_CHECK_INTERNED(s)) {
15548          return;
15549      }
15550  
15551      if (PyUnicode_READY(s) == -1) {
15552          PyErr_Clear();
15553          return;
15554      }
15555  
15556      if (interned == NULL) {
15557          interned = PyDict_New();
15558          if (interned == NULL) {
15559              PyErr_Clear(); /* Don't leave an exception */
15560              return;
15561          }
15562      }
15563  
15564      PyObject *t = PyDict_SetDefault(interned, s, s);
15565      if (t == NULL) {
15566          PyErr_Clear();
15567          return;
15568      }
15569  
15570      if (t != s) {
15571          Py_INCREF(t);
15572          Py_SETREF(*p, t);
15573          return;
15574      }
15575  
15576      /* The two references in interned dict (key and value) are not counted by
15577         refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15578         this. */
15579      Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
15580      _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15581  }
15582  
15583  void
PyUnicode_InternImmortal(PyObject ** p)15584  PyUnicode_InternImmortal(PyObject **p)
15585  {
15586      if (PyErr_WarnEx(PyExc_DeprecationWarning,
15587              "PyUnicode_InternImmortal() is deprecated; "
15588              "use PyUnicode_InternInPlace() instead", 1) < 0)
15589      {
15590          // The function has no return value, the exception cannot
15591          // be reported to the caller, so just log it.
15592          PyErr_WriteUnraisable(NULL);
15593      }
15594  
15595      PyUnicode_InternInPlace(p);
15596      if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15597          _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15598          Py_INCREF(*p);
15599      }
15600  }
15601  
15602  PyObject *
PyUnicode_InternFromString(const char * cp)15603  PyUnicode_InternFromString(const char *cp)
15604  {
15605      PyObject *s = PyUnicode_FromString(cp);
15606      if (s == NULL)
15607          return NULL;
15608      PyUnicode_InternInPlace(&s);
15609      return s;
15610  }
15611  
15612  
15613  void
_PyUnicode_ClearInterned(PyInterpreterState * interp)15614  _PyUnicode_ClearInterned(PyInterpreterState *interp)
15615  {
15616      if (!_Py_IsMainInterpreter(interp)) {
15617          // interned dict is shared by all interpreters
15618          return;
15619      }
15620  
15621      if (interned == NULL) {
15622          return;
15623      }
15624      assert(PyDict_CheckExact(interned));
15625  
15626      /* Interned unicode strings are not forcibly deallocated; rather, we give
15627         them their stolen references back, and then clear and DECREF the
15628         interned dict. */
15629  
15630  #ifdef INTERNED_STATS
15631      fprintf(stderr, "releasing %zd interned strings\n",
15632              PyDict_GET_SIZE(interned));
15633  
15634      Py_ssize_t immortal_size = 0, mortal_size = 0;
15635  #endif
15636      Py_ssize_t pos = 0;
15637      PyObject *s, *ignored_value;
15638      while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15639          assert(PyUnicode_IS_READY(s));
15640  
15641          switch (PyUnicode_CHECK_INTERNED(s)) {
15642          case SSTATE_INTERNED_IMMORTAL:
15643              Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
15644  #ifdef INTERNED_STATS
15645              immortal_size += PyUnicode_GET_LENGTH(s);
15646  #endif
15647              break;
15648          case SSTATE_INTERNED_MORTAL:
15649              // Restore the two references (key and value) ignored
15650              // by PyUnicode_InternInPlace().
15651              Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
15652  #ifdef INTERNED_STATS
15653              mortal_size += PyUnicode_GET_LENGTH(s);
15654  #endif
15655              break;
15656          case SSTATE_NOT_INTERNED:
15657              /* fall through */
15658          default:
15659              Py_UNREACHABLE();
15660          }
15661          _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15662      }
15663  #ifdef INTERNED_STATS
15664      fprintf(stderr,
15665              "total size of all interned strings: %zd/%zd mortal/immortal\n",
15666              mortal_size, immortal_size);
15667  #endif
15668  
15669      PyDict_Clear(interned);
15670      Py_CLEAR(interned);
15671  }
15672  
15673  
15674  /********************* Unicode Iterator **************************/
15675  
15676  typedef struct {
15677      PyObject_HEAD
15678      Py_ssize_t it_index;
15679      PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
15680  } unicodeiterobject;
15681  
15682  static void
unicodeiter_dealloc(unicodeiterobject * it)15683  unicodeiter_dealloc(unicodeiterobject *it)
15684  {
15685      _PyObject_GC_UNTRACK(it);
15686      Py_XDECREF(it->it_seq);
15687      PyObject_GC_Del(it);
15688  }
15689  
15690  static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15691  unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15692  {
15693      Py_VISIT(it->it_seq);
15694      return 0;
15695  }
15696  
15697  static PyObject *
unicodeiter_next(unicodeiterobject * it)15698  unicodeiter_next(unicodeiterobject *it)
15699  {
15700      PyObject *seq;
15701  
15702      assert(it != NULL);
15703      seq = it->it_seq;
15704      if (seq == NULL)
15705          return NULL;
15706      assert(_PyUnicode_CHECK(seq));
15707  
15708      if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15709          int kind = PyUnicode_KIND(seq);
15710          const void *data = PyUnicode_DATA(seq);
15711          Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15712          it->it_index++;
15713          return unicode_char(chr);
15714      }
15715  
15716      it->it_seq = NULL;
15717      Py_DECREF(seq);
15718      return NULL;
15719  }
15720  
15721  static PyObject *
unicode_ascii_iter_next(unicodeiterobject * it)15722  unicode_ascii_iter_next(unicodeiterobject *it)
15723  {
15724      assert(it != NULL);
15725      PyObject *seq = it->it_seq;
15726      if (seq == NULL) {
15727          return NULL;
15728      }
15729      assert(_PyUnicode_CHECK(seq));
15730      assert(PyUnicode_IS_COMPACT_ASCII(seq));
15731      if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15732          const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
15733          Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
15734                                                data, it->it_index);
15735          it->it_index++;
15736          PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
15737          return Py_NewRef(item);
15738      }
15739      it->it_seq = NULL;
15740      Py_DECREF(seq);
15741      return NULL;
15742  }
15743  
15744  static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15745  unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15746  {
15747      Py_ssize_t len = 0;
15748      if (it->it_seq)
15749          len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15750      return PyLong_FromSsize_t(len);
15751  }
15752  
15753  PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15754  
15755  static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15756  unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15757  {
15758      PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
15759  
15760      /* _PyEval_GetBuiltin can invoke arbitrary code,
15761       * call must be before access of iterator pointers.
15762       * see issue #101765 */
15763  
15764      if (it->it_seq != NULL) {
15765          return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
15766      } else {
15767          PyObject *u = (PyObject *)_PyUnicode_New(0);
15768          if (u == NULL) {
15769              Py_XDECREF(iter);
15770              return NULL;
15771          }
15772          return Py_BuildValue("N(N)", iter, u);
15773      }
15774  }
15775  
15776  PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15777  
15778  static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15779  unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15780  {
15781      Py_ssize_t index = PyLong_AsSsize_t(state);
15782      if (index == -1 && PyErr_Occurred())
15783          return NULL;
15784      if (it->it_seq != NULL) {
15785          if (index < 0)
15786              index = 0;
15787          else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15788              index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15789          it->it_index = index;
15790      }
15791      Py_RETURN_NONE;
15792  }
15793  
15794  PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15795  
15796  static PyMethodDef unicodeiter_methods[] = {
15797      {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15798       length_hint_doc},
15799      {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15800       reduce_doc},
15801      {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15802       setstate_doc},
15803      {NULL,      NULL}       /* sentinel */
15804  };
15805  
15806  PyTypeObject PyUnicodeIter_Type = {
15807      PyVarObject_HEAD_INIT(&PyType_Type, 0)
15808      "str_iterator",         /* tp_name */
15809      sizeof(unicodeiterobject),      /* tp_basicsize */
15810      0,                  /* tp_itemsize */
15811      /* methods */
15812      (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15813      0,                  /* tp_vectorcall_offset */
15814      0,                  /* tp_getattr */
15815      0,                  /* tp_setattr */
15816      0,                  /* tp_as_async */
15817      0,                  /* tp_repr */
15818      0,                  /* tp_as_number */
15819      0,                  /* tp_as_sequence */
15820      0,                  /* tp_as_mapping */
15821      0,                  /* tp_hash */
15822      0,                  /* tp_call */
15823      0,                  /* tp_str */
15824      PyObject_GenericGetAttr,        /* tp_getattro */
15825      0,                  /* tp_setattro */
15826      0,                  /* tp_as_buffer */
15827      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15828      0,                  /* tp_doc */
15829      (traverseproc)unicodeiter_traverse, /* tp_traverse */
15830      0,                  /* tp_clear */
15831      0,                  /* tp_richcompare */
15832      0,                  /* tp_weaklistoffset */
15833      PyObject_SelfIter,          /* tp_iter */
15834      (iternextfunc)unicodeiter_next,     /* tp_iternext */
15835      unicodeiter_methods,            /* tp_methods */
15836      0,
15837  };
15838  
15839  PyTypeObject _PyUnicodeASCIIIter_Type = {
15840      PyVarObject_HEAD_INIT(&PyType_Type, 0)
15841      .tp_name = "str_ascii_iterator",
15842      .tp_basicsize = sizeof(unicodeiterobject),
15843      .tp_dealloc = (destructor)unicodeiter_dealloc,
15844      .tp_getattro = PyObject_GenericGetAttr,
15845      .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
15846      .tp_traverse = (traverseproc)unicodeiter_traverse,
15847      .tp_iter = PyObject_SelfIter,
15848      .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
15849      .tp_methods = unicodeiter_methods,
15850  };
15851  
15852  static PyObject *
unicode_iter(PyObject * seq)15853  unicode_iter(PyObject *seq)
15854  {
15855      unicodeiterobject *it;
15856  
15857      if (!PyUnicode_Check(seq)) {
15858          PyErr_BadInternalCall();
15859          return NULL;
15860      }
15861      if (PyUnicode_READY(seq) == -1)
15862          return NULL;
15863      if (PyUnicode_IS_COMPACT_ASCII(seq)) {
15864          it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
15865      }
15866      else {
15867          it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15868      }
15869      if (it == NULL)
15870          return NULL;
15871      it->it_index = 0;
15872      Py_INCREF(seq);
15873      it->it_seq = seq;
15874      _PyObject_GC_TRACK(it);
15875      return (PyObject *)it;
15876  }
15877  
15878  static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15879  encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15880  {
15881      int res;
15882      res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15883      if (res == -2) {
15884          PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15885          return -1;
15886      }
15887      if (res < 0) {
15888          PyErr_NoMemory();
15889          return -1;
15890      }
15891      return 0;
15892  }
15893  
15894  
15895  static int
config_get_codec_name(wchar_t ** config_encoding)15896  config_get_codec_name(wchar_t **config_encoding)
15897  {
15898      char *encoding;
15899      if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15900          return -1;
15901      }
15902  
15903      PyObject *name_obj = NULL;
15904      PyObject *codec = _PyCodec_Lookup(encoding);
15905      PyMem_RawFree(encoding);
15906  
15907      if (!codec)
15908          goto error;
15909  
15910      name_obj = PyObject_GetAttrString(codec, "name");
15911      Py_CLEAR(codec);
15912      if (!name_obj) {
15913          goto error;
15914      }
15915  
15916      wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15917      Py_DECREF(name_obj);
15918      if (wname == NULL) {
15919          goto error;
15920      }
15921  
15922      wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15923      if (raw_wname == NULL) {
15924          PyMem_Free(wname);
15925          PyErr_NoMemory();
15926          goto error;
15927      }
15928  
15929      PyMem_RawFree(*config_encoding);
15930      *config_encoding = raw_wname;
15931  
15932      PyMem_Free(wname);
15933      return 0;
15934  
15935  error:
15936      Py_XDECREF(codec);
15937      Py_XDECREF(name_obj);
15938      return -1;
15939  }
15940  
15941  
15942  static PyStatus
init_stdio_encoding(PyInterpreterState * interp)15943  init_stdio_encoding(PyInterpreterState *interp)
15944  {
15945      /* Update the stdio encoding to the normalized Python codec name. */
15946      PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15947      if (config_get_codec_name(&config->stdio_encoding) < 0) {
15948          return _PyStatus_ERR("failed to get the Python codec name "
15949                               "of the stdio encoding");
15950      }
15951      return _PyStatus_OK();
15952  }
15953  
15954  
15955  static int
init_fs_codec(PyInterpreterState * interp)15956  init_fs_codec(PyInterpreterState *interp)
15957  {
15958      const PyConfig *config = _PyInterpreterState_GetConfig(interp);
15959  
15960      _Py_error_handler error_handler;
15961      error_handler = get_error_handler_wide(config->filesystem_errors);
15962      if (error_handler == _Py_ERROR_UNKNOWN) {
15963          PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
15964          return -1;
15965      }
15966  
15967      char *encoding, *errors;
15968      if (encode_wstr_utf8(config->filesystem_encoding,
15969                           &encoding,
15970                           "filesystem_encoding") < 0) {
15971          return -1;
15972      }
15973  
15974      if (encode_wstr_utf8(config->filesystem_errors,
15975                           &errors,
15976                           "filesystem_errors") < 0) {
15977          PyMem_RawFree(encoding);
15978          return -1;
15979      }
15980  
15981      struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
15982      PyMem_RawFree(fs_codec->encoding);
15983      fs_codec->encoding = encoding;
15984      /* encoding has been normalized by init_fs_encoding() */
15985      fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
15986      PyMem_RawFree(fs_codec->errors);
15987      fs_codec->errors = errors;
15988      fs_codec->error_handler = error_handler;
15989  
15990  #ifdef _Py_FORCE_UTF8_FS_ENCODING
15991      assert(fs_codec->utf8 == 1);
15992  #endif
15993  
15994      /* At this point, PyUnicode_EncodeFSDefault() and
15995         PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15996         the C implementation of the filesystem encoding. */
15997  
15998      /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15999         global configuration variables. */
16000      if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16001                                    fs_codec->errors) < 0) {
16002          PyErr_NoMemory();
16003          return -1;
16004      }
16005      return 0;
16006  }
16007  
16008  
16009  static PyStatus
init_fs_encoding(PyThreadState * tstate)16010  init_fs_encoding(PyThreadState *tstate)
16011  {
16012      PyInterpreterState *interp = tstate->interp;
16013  
16014      /* Update the filesystem encoding to the normalized Python codec name.
16015         For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16016         (Python codec name). */
16017      PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16018      if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16019          _Py_DumpPathConfig(tstate);
16020          return _PyStatus_ERR("failed to get the Python codec "
16021                               "of the filesystem encoding");
16022      }
16023  
16024      if (init_fs_codec(interp) < 0) {
16025          return _PyStatus_ERR("cannot initialize filesystem codec");
16026      }
16027      return _PyStatus_OK();
16028  }
16029  
16030  
16031  PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)16032  _PyUnicode_InitEncodings(PyThreadState *tstate)
16033  {
16034      PyStatus status = init_fs_encoding(tstate);
16035      if (_PyStatus_EXCEPTION(status)) {
16036          return status;
16037      }
16038  
16039      return init_stdio_encoding(tstate->interp);
16040  }
16041  
16042  
16043  static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16044  _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16045  {
16046      PyMem_RawFree(fs_codec->encoding);
16047      fs_codec->encoding = NULL;
16048      fs_codec->utf8 = 0;
16049      PyMem_RawFree(fs_codec->errors);
16050      fs_codec->errors = NULL;
16051      fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16052  }
16053  
16054  
16055  #ifdef MS_WINDOWS
16056  int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)16057  _PyUnicode_EnableLegacyWindowsFSEncoding(void)
16058  {
16059      PyInterpreterState *interp = _PyInterpreterState_GET();
16060      PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
16061  
16062      /* Set the filesystem encoding to mbcs/replace (PEP 529) */
16063      wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
16064      wchar_t *errors = _PyMem_RawWcsdup(L"replace");
16065      if (encoding == NULL || errors == NULL) {
16066          PyMem_RawFree(encoding);
16067          PyMem_RawFree(errors);
16068          PyErr_NoMemory();
16069          return -1;
16070      }
16071  
16072      PyMem_RawFree(config->filesystem_encoding);
16073      config->filesystem_encoding = encoding;
16074      PyMem_RawFree(config->filesystem_errors);
16075      config->filesystem_errors = errors;
16076  
16077      return init_fs_codec(interp);
16078  }
16079  #endif
16080  
16081  
16082  #ifdef Py_DEBUG
16083  static inline int
unicode_is_finalizing(void)16084  unicode_is_finalizing(void)
16085  {
16086      return (interned == NULL);
16087  }
16088  #endif
16089  
16090  
16091  void
_PyUnicode_FiniTypes(PyInterpreterState * interp)16092  _PyUnicode_FiniTypes(PyInterpreterState *interp)
16093  {
16094      if (!_Py_IsMainInterpreter(interp)) {
16095          return;
16096      }
16097  
16098      _PyStaticType_Dealloc(&EncodingMapType);
16099      _PyStaticType_Dealloc(&PyFieldNameIter_Type);
16100      _PyStaticType_Dealloc(&PyFormatterIter_Type);
16101  }
16102  
16103  
/* Free the auxiliary buffers (wstr and utf8 representations) attached to
   a compact string object, without freeing the object itself.

   For an ASCII string only a wstr buffer is released.  For a non-ASCII
   compact string the wstr buffer is released unless it is the character
   data stored right after the object header, and the utf8 buffer is
   released if present.  Released pointers and their lengths are reset. */
static void
unicode_static_dealloc(PyObject *op)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);

    assert(ascii->state.compact);

    if (ascii->state.ascii) {
        if (ascii->wstr) {
            PyObject_Free(ascii->wstr);
            ascii->wstr = NULL;
        }
        return;
    }

    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    void *inline_data = (void *)(compact + 1);

    /* wstr may alias the inline character data; only free it when it is a
       separate buffer. */
    if (ascii->wstr && ascii->wstr != inline_data) {
        PyObject_Free(ascii->wstr);
        ascii->wstr = NULL;
        compact->wstr_length = 0;
    }
    if (compact->utf8) {
        PyObject_Free(compact->utf8);
        compact->utf8 = NULL;
        compact->utf8_length = 0;
    }
}
16131  
16132  
16133  void
_PyUnicode_Fini(PyInterpreterState * interp)16134  _PyUnicode_Fini(PyInterpreterState *interp)
16135  {
16136      struct _Py_unicode_state *state = &interp->unicode;
16137  
16138      if (_Py_IsMainInterpreter(interp)) {
16139          // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16140          assert(interned == NULL);
16141          // bpo-47182: force a unicodedata CAPI capsule re-import on
16142          // subsequent initialization of main interpreter.
16143          ucnhash_capi = NULL;
16144      }
16145  
16146      _PyUnicode_FiniEncodings(&state->fs_codec);
16147  
16148      unicode_clear_identifiers(state);
16149  
16150      // Clear the single character singletons
16151      for (int i = 0; i < 128; i++) {
16152          unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
16153      }
16154      for (int i = 0; i < 128; i++) {
16155          unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
16156      }
16157  }
16158  
16159  
16160  void
_PyStaticUnicode_Dealloc(PyObject * op)16161  _PyStaticUnicode_Dealloc(PyObject *op)
16162  {
16163      unicode_static_dealloc(op);
16164  }
16165  
16166  
16167  /* A _string module, to export formatter_parser and formatter_field_name_split
16168     to the string.Formatter class implemented in Python. */
16169  
16170  static PyMethodDef _string_methods[] = {
16171      {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16172       METH_O, PyDoc_STR("split the argument as a field name")},
16173      {"formatter_parser", (PyCFunction) formatter_parser,
16174       METH_O, PyDoc_STR("parse the argument as a format string")},
16175      {NULL, NULL}
16176  };
16177  
16178  static struct PyModuleDef _string_module = {
16179      PyModuleDef_HEAD_INIT,
16180      .m_name = "_string",
16181      .m_doc = PyDoc_STR("string helper module"),
16182      .m_size = 0,
16183      .m_methods = _string_methods,
16184  };
16185  
16186  PyMODINIT_FUNC
PyInit__string(void)16187  PyInit__string(void)
16188  {
16189      return PyModuleDef_Init(&_string_module);
16190  }
16191  
16192  
16193  #ifdef __cplusplus
16194  }
16195  #endif
16196