1 /* 2 3 Unicode implementation based on original code by Fredrik Lundh, 4 modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6 Major speed upgrades to the method implementations at the Reykjavik 7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9 Copyright (c) Corporation for National Research Initiatives. 10 11 -------------------------------------------------------------------- 12 The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17 By obtaining, using, and/or copying this software and/or its 18 associated documentation, you agree that you have read, understood, 19 and will comply with the following terms and conditions: 20 21 Permission to use, copy, modify, and distribute this software and its 22 associated documentation for any purpose and without fee is hereby 23 granted, provided that the above copyright notice appears in all 24 copies, and that both that copyright notice and this permission notice 25 appear in supporting documentation, and that the name of Secret Labs 26 AB or the author not be used in advertising or publicity pertaining to 27 distribution of the software without specific, written prior 28 permission. 29 30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37 -------------------------------------------------------------------- 38 39 */ 40 41 #define PY_SSIZE_T_CLEAN 42 #include "Python.h" 43 #include "pycore_abstract.h" // _PyIndex_Check() 44 #include "pycore_atomic_funcs.h" // _Py_atomic_size_get() 45 #include "pycore_bytesobject.h" // _PyBytes_Repeat() 46 #include "pycore_bytes_methods.h" // _Py_bytes_lower() 47 #include "pycore_format.h" // F_LJUST 48 #include "pycore_initconfig.h" // _PyStatus_OK() 49 #include "pycore_interp.h" // PyInterpreterState.fs_codec 50 #include "pycore_long.h" // _PyLong_FormatWriter() 51 #include "pycore_object.h" // _PyObject_GC_TRACK(), _Py_FatalRefcountError() 52 #include "pycore_pathconfig.h" // _Py_DumpPathConfig() 53 #include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding() 54 #include "pycore_pystate.h" // _PyInterpreterState_GET() 55 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI 56 #include "pycore_unicodeobject.h" // struct _Py_unicode_state 57 #include "stringlib/eq.h" // unicode_eq() 58 59 #ifdef MS_WINDOWS 60 #include <windows.h> 61 #endif 62 63 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 64 # include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar() 65 #endif 66 67 /* Uncomment to display statistics on interned strings at exit 68 in _PyUnicode_ClearInterned(). 
*/ 69 /* #define INTERNED_STATS 1 */ 70 71 72 /*[clinic input] 73 class str "PyObject *" "&PyUnicode_Type" 74 [clinic start generated code]*/ 75 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/ 76 77 /*[python input] 78 class Py_UCS4_converter(CConverter): 79 type = 'Py_UCS4' 80 converter = 'convert_uc' 81 82 def converter_init(self): 83 if self.default is not unspecified: 84 self.c_default = ascii(self.default) 85 if len(self.c_default) > 4 or self.c_default[0] != "'": 86 self.c_default = hex(ord(self.default)) 87 88 [python start generated code]*/ 89 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/ 90 91 /* --- Globals ------------------------------------------------------------ 92 93 NOTE: In the interpreter's initialization phase, some globals are currently 94 initialized dynamically as needed. In the process Unicode objects may 95 be created before the Unicode type is ready. 96 97 */ 98 99 100 #ifdef __cplusplus 101 extern "C" { 102 #endif 103 104 // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111). 105 // The value must be the same in fileutils.c. 106 #define MAX_UNICODE 0x10ffff 107 108 #ifdef Py_DEBUG 109 # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 110 #else 111 # define _PyUnicode_CHECK(op) PyUnicode_Check(op) 112 #endif 113 114 #define _PyUnicode_UTF8(op) \ 115 (_PyCompactUnicodeObject_CAST(op)->utf8) 116 #define PyUnicode_UTF8(op) \ 117 (assert(_PyUnicode_CHECK(op)), \ 118 assert(PyUnicode_IS_READY(op)), \ 119 PyUnicode_IS_COMPACT_ASCII(op) ? \ 120 ((char*)(_PyASCIIObject_CAST(op) + 1)) : \ 121 _PyUnicode_UTF8(op)) 122 #define _PyUnicode_UTF8_LENGTH(op) \ 123 (_PyCompactUnicodeObject_CAST(op)->utf8_length) 124 #define PyUnicode_UTF8_LENGTH(op) \ 125 (assert(_PyUnicode_CHECK(op)), \ 126 assert(PyUnicode_IS_READY(op)), \ 127 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 128 _PyASCIIObject_CAST(op)->length : \ 129 _PyUnicode_UTF8_LENGTH(op)) 130 #define _PyUnicode_WSTR(op) \ 131 (_PyASCIIObject_CAST(op)->wstr) 132 133 /* Don't use deprecated macro of unicodeobject.h */ 134 #undef PyUnicode_WSTR_LENGTH 135 #define PyUnicode_WSTR_LENGTH(op) \ 136 (PyUnicode_IS_COMPACT_ASCII(op) ? \ 137 _PyASCIIObject_CAST(op)->length : \ 138 _PyCompactUnicodeObject_CAST(op)->wstr_length) 139 #define _PyUnicode_WSTR_LENGTH(op) \ 140 (_PyCompactUnicodeObject_CAST(op)->wstr_length) 141 #define _PyUnicode_LENGTH(op) \ 142 (_PyASCIIObject_CAST(op)->length) 143 #define _PyUnicode_STATE(op) \ 144 (_PyASCIIObject_CAST(op)->state) 145 #define _PyUnicode_HASH(op) \ 146 (_PyASCIIObject_CAST(op)->hash) 147 #define _PyUnicode_KIND(op) \ 148 (assert(_PyUnicode_CHECK(op)), \ 149 _PyASCIIObject_CAST(op)->state.kind) 150 #define _PyUnicode_GET_LENGTH(op) \ 151 (assert(_PyUnicode_CHECK(op)), \ 152 _PyASCIIObject_CAST(op)->length) 153 #define _PyUnicode_DATA_ANY(op) \ 154 (_PyUnicodeObject_CAST(op)->data.any) 155 156 #undef PyUnicode_READY 157 #define PyUnicode_READY(op) \ 158 (assert(_PyUnicode_CHECK(op)), \ 159 (PyUnicode_IS_READY(op) ? 
\ 160 0 : \ 161 _PyUnicode_Ready(op))) 162 163 #define _PyUnicode_SHARE_UTF8(op) \ 164 (assert(_PyUnicode_CHECK(op)), \ 165 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 166 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 167 #define _PyUnicode_SHARE_WSTR(op) \ 168 (assert(_PyUnicode_CHECK(op)), \ 169 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 170 171 /* true if the Unicode object has an allocated UTF-8 memory block 172 (not shared with other data) */ 173 #define _PyUnicode_HAS_UTF8_MEMORY(op) \ 174 ((!PyUnicode_IS_COMPACT_ASCII(op) \ 175 && _PyUnicode_UTF8(op) \ 176 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 177 178 /* true if the Unicode object has an allocated wstr memory block 179 (not shared with other data) */ 180 #define _PyUnicode_HAS_WSTR_MEMORY(op) \ 181 ((_PyUnicode_WSTR(op) && \ 182 (!PyUnicode_IS_READY(op) || \ 183 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 184 185 /* Generic helper macro to convert characters of different types. 186 from_type and to_type have to be valid type names, begin and end 187 are pointers to the source characters which should be of type 188 "from_type *". to is a pointer of type "to_type *" and points to the 189 buffer where the result characters are written to. */ 190 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 191 do { \ 192 to_type *_to = (to_type *)(to); \ 193 const from_type *_iter = (const from_type *)(begin);\ 194 const from_type *_end = (const from_type *)(end);\ 195 Py_ssize_t n = (_end) - (_iter); \ 196 const from_type *_unrolled_end = \ 197 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 198 while (_iter < (_unrolled_end)) { \ 199 _to[0] = (to_type) _iter[0]; \ 200 _to[1] = (to_type) _iter[1]; \ 201 _to[2] = (to_type) _iter[2]; \ 202 _to[3] = (to_type) _iter[3]; \ 203 _iter += 4; _to += 4; \ 204 } \ 205 while (_iter < (_end)) \ 206 *_to++ = (to_type) *_iter++; \ 207 } while (0) 208 209 #define LATIN1(ch) \ 210 (ch < 128 \ 211 ? 
(PyObject*)&_Py_SINGLETON(strings).ascii[ch] \ 212 : (PyObject*)&_Py_SINGLETON(strings).latin1[ch - 128]) 213 214 #ifdef MS_WINDOWS 215 /* On Windows, overallocate by 50% is the best factor */ 216 # define OVERALLOCATE_FACTOR 2 217 #else 218 /* On Linux, overallocate by 25% is the best factor */ 219 # define OVERALLOCATE_FACTOR 4 220 #endif 221 222 /* This dictionary holds all interned unicode strings. Note that references 223 to strings in this dictionary are *not* counted in the string's ob_refcnt. 224 When the interned string reaches a refcnt of 0 the string deallocation 225 function will delete the reference from this dictionary. 226 227 Another way to look at this is that to say that the actual reference 228 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 229 */ 230 static PyObject *interned = NULL; 231 232 /* Forward declaration */ 233 static inline int 234 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); 235 static inline void 236 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer); 237 static PyObject * 238 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, 239 const char *errors); 240 static PyObject * 241 unicode_decode_utf8(const char *s, Py_ssize_t size, 242 _Py_error_handler error_handler, const char *errors, 243 Py_ssize_t *consumed); 244 #ifdef Py_DEBUG 245 static inline int unicode_is_finalizing(void); 246 static int unicode_is_singleton(PyObject *unicode); 247 #endif 248 249 250 // Return a borrowed reference to the empty string singleton. unicode_get_empty(void)251 static inline PyObject* unicode_get_empty(void) 252 { 253 _Py_DECLARE_STR(empty, ""); 254 return &_Py_STR(empty); 255 } 256 257 258 // Return a strong reference to the empty string singleton. 
unicode_new_empty(void)259 static inline PyObject* unicode_new_empty(void) 260 { 261 PyObject *empty = unicode_get_empty(); 262 Py_INCREF(empty); 263 return empty; 264 } 265 266 #define _Py_RETURN_UNICODE_EMPTY() \ 267 do { \ 268 return unicode_new_empty(); \ 269 } while (0) 270 271 static inline void unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)272 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value, 273 Py_ssize_t start, Py_ssize_t length) 274 { 275 assert(0 <= start); 276 assert(kind != PyUnicode_WCHAR_KIND); 277 switch (kind) { 278 case PyUnicode_1BYTE_KIND: { 279 assert(value <= 0xff); 280 Py_UCS1 ch = (unsigned char)value; 281 Py_UCS1 *to = (Py_UCS1 *)data + start; 282 memset(to, ch, length); 283 break; 284 } 285 case PyUnicode_2BYTE_KIND: { 286 assert(value <= 0xffff); 287 Py_UCS2 ch = (Py_UCS2)value; 288 Py_UCS2 *to = (Py_UCS2 *)data + start; 289 const Py_UCS2 *end = to + length; 290 for (; to < end; ++to) *to = ch; 291 break; 292 } 293 case PyUnicode_4BYTE_KIND: { 294 assert(value <= MAX_UNICODE); 295 Py_UCS4 ch = value; 296 Py_UCS4 * to = (Py_UCS4 *)data + start; 297 const Py_UCS4 *end = to + length; 298 for (; to < end; ++to) *to = ch; 299 break; 300 } 301 default: Py_UNREACHABLE(); 302 } 303 } 304 305 306 /* Fast detection of the most frequent whitespace characters */ 307 const unsigned char _Py_ascii_whitespace[] = { 308 0, 0, 0, 0, 0, 0, 0, 0, 309 /* case 0x0009: * CHARACTER TABULATION */ 310 /* case 0x000A: * LINE FEED */ 311 /* case 0x000B: * LINE TABULATION */ 312 /* case 0x000C: * FORM FEED */ 313 /* case 0x000D: * CARRIAGE RETURN */ 314 0, 1, 1, 1, 1, 1, 0, 0, 315 0, 0, 0, 0, 0, 0, 0, 0, 316 /* case 0x001C: * FILE SEPARATOR */ 317 /* case 0x001D: * GROUP SEPARATOR */ 318 /* case 0x001E: * RECORD SEPARATOR */ 319 /* case 0x001F: * UNIT SEPARATOR */ 320 0, 0, 0, 0, 1, 1, 1, 1, 321 /* case 0x0020: * SPACE */ 322 1, 0, 0, 0, 0, 0, 0, 0, 323 0, 0, 0, 0, 0, 0, 0, 0, 324 0, 0, 0, 0, 0, 
0, 0, 0, 325 0, 0, 0, 0, 0, 0, 0, 0, 326 327 0, 0, 0, 0, 0, 0, 0, 0, 328 0, 0, 0, 0, 0, 0, 0, 0, 329 0, 0, 0, 0, 0, 0, 0, 0, 330 0, 0, 0, 0, 0, 0, 0, 0, 331 0, 0, 0, 0, 0, 0, 0, 0, 332 0, 0, 0, 0, 0, 0, 0, 0, 333 0, 0, 0, 0, 0, 0, 0, 0, 334 0, 0, 0, 0, 0, 0, 0, 0 335 }; 336 337 /* forward */ 338 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 339 static PyObject* get_latin1_char(unsigned char ch); 340 static int unicode_modifiable(PyObject *unicode); 341 342 343 static PyObject * 344 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size); 345 static PyObject * 346 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 347 static PyObject * 348 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 349 350 static PyObject * 351 unicode_encode_call_errorhandler(const char *errors, 352 PyObject **errorHandler,const char *encoding, const char *reason, 353 PyObject *unicode, PyObject **exceptionObject, 354 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 355 356 static void 357 raise_encode_exception(PyObject **exceptionObject, 358 const char *encoding, 359 PyObject *unicode, 360 Py_ssize_t startpos, Py_ssize_t endpos, 361 const char *reason); 362 363 /* Same for linebreaks */ 364 static const unsigned char ascii_linebreak[] = { 365 0, 0, 0, 0, 0, 0, 0, 0, 366 /* 0x000A, * LINE FEED */ 367 /* 0x000B, * LINE TABULATION */ 368 /* 0x000C, * FORM FEED */ 369 /* 0x000D, * CARRIAGE RETURN */ 370 0, 0, 1, 1, 1, 1, 0, 0, 371 0, 0, 0, 0, 0, 0, 0, 0, 372 /* 0x001C, * FILE SEPARATOR */ 373 /* 0x001D, * GROUP SEPARATOR */ 374 /* 0x001E, * RECORD SEPARATOR */ 375 0, 0, 0, 0, 1, 1, 1, 0, 376 0, 0, 0, 0, 0, 0, 0, 0, 377 0, 0, 0, 0, 0, 0, 0, 0, 378 0, 0, 0, 0, 0, 0, 0, 0, 379 0, 0, 0, 0, 0, 0, 0, 0, 380 381 0, 0, 0, 0, 0, 0, 0, 0, 382 0, 0, 0, 0, 0, 0, 0, 0, 383 0, 0, 0, 0, 0, 0, 0, 0, 384 0, 0, 0, 0, 0, 0, 0, 0, 385 0, 0, 0, 0, 0, 0, 0, 0, 386 0, 0, 0, 0, 0, 0, 0, 0, 387 0, 0, 0, 0, 0, 0, 0, 0, 388 0, 0, 0, 0, 0, 0, 0, 0 389 }; 390 391 static int 
convert_uc(PyObject *obj, void *addr); 392 393 struct encoding_map; 394 #include "clinic/unicodeobject.c.h" 395 396 _Py_error_handler _Py_GetErrorHandler(const char * errors)397 _Py_GetErrorHandler(const char *errors) 398 { 399 if (errors == NULL || strcmp(errors, "strict") == 0) { 400 return _Py_ERROR_STRICT; 401 } 402 if (strcmp(errors, "surrogateescape") == 0) { 403 return _Py_ERROR_SURROGATEESCAPE; 404 } 405 if (strcmp(errors, "replace") == 0) { 406 return _Py_ERROR_REPLACE; 407 } 408 if (strcmp(errors, "ignore") == 0) { 409 return _Py_ERROR_IGNORE; 410 } 411 if (strcmp(errors, "backslashreplace") == 0) { 412 return _Py_ERROR_BACKSLASHREPLACE; 413 } 414 if (strcmp(errors, "surrogatepass") == 0) { 415 return _Py_ERROR_SURROGATEPASS; 416 } 417 if (strcmp(errors, "xmlcharrefreplace") == 0) { 418 return _Py_ERROR_XMLCHARREFREPLACE; 419 } 420 return _Py_ERROR_OTHER; 421 } 422 423 424 static _Py_error_handler get_error_handler_wide(const wchar_t * errors)425 get_error_handler_wide(const wchar_t *errors) 426 { 427 if (errors == NULL || wcscmp(errors, L"strict") == 0) { 428 return _Py_ERROR_STRICT; 429 } 430 if (wcscmp(errors, L"surrogateescape") == 0) { 431 return _Py_ERROR_SURROGATEESCAPE; 432 } 433 if (wcscmp(errors, L"replace") == 0) { 434 return _Py_ERROR_REPLACE; 435 } 436 if (wcscmp(errors, L"ignore") == 0) { 437 return _Py_ERROR_IGNORE; 438 } 439 if (wcscmp(errors, L"backslashreplace") == 0) { 440 return _Py_ERROR_BACKSLASHREPLACE; 441 } 442 if (wcscmp(errors, L"surrogatepass") == 0) { 443 return _Py_ERROR_SURROGATEPASS; 444 } 445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) { 446 return _Py_ERROR_XMLCHARREFREPLACE; 447 } 448 return _Py_ERROR_OTHER; 449 } 450 451 452 static inline int unicode_check_encoding_errors(const char * encoding,const char * errors)453 unicode_check_encoding_errors(const char *encoding, const char *errors) 454 { 455 if (encoding == NULL && errors == NULL) { 456 return 0; 457 } 458 459 PyInterpreterState *interp = 
_PyInterpreterState_GET(); 460 #ifndef Py_DEBUG 461 /* In release mode, only check in development mode (-X dev) */ 462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) { 463 return 0; 464 } 465 #else 466 /* Always check in debug mode */ 467 #endif 468 469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the 470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */ 471 if (!interp->unicode.fs_codec.encoding) { 472 return 0; 473 } 474 475 /* Disable checks during Python finalization. For example, it allows to 476 call _PyObject_Dump() during finalization for debugging purpose. */ 477 if (interp->finalizing) { 478 return 0; 479 } 480 481 if (encoding != NULL) { 482 PyObject *handler = _PyCodec_Lookup(encoding); 483 if (handler == NULL) { 484 return -1; 485 } 486 Py_DECREF(handler); 487 } 488 489 if (errors != NULL) { 490 PyObject *handler = PyCodec_LookupError(errors); 491 if (handler == NULL) { 492 return -1; 493 } 494 Py_DECREF(handler); 495 } 496 return 0; 497 } 498 499 500 int _PyUnicode_CheckConsistency(PyObject * op,int check_content)501 _PyUnicode_CheckConsistency(PyObject *op, int check_content) 502 { 503 #define CHECK(expr) \ 504 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0) 505 506 assert(op != NULL); 507 CHECK(PyUnicode_Check(op)); 508 509 PyASCIIObject *ascii = _PyASCIIObject_CAST(op); 510 unsigned int kind = ascii->state.kind; 511 512 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 513 CHECK(kind == PyUnicode_1BYTE_KIND); 514 CHECK(ascii->state.ready == 1); 515 } 516 else { 517 PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op); 518 void *data; 519 520 if (ascii->state.compact == 1) { 521 data = compact + 1; 522 CHECK(kind == PyUnicode_1BYTE_KIND 523 || kind == PyUnicode_2BYTE_KIND 524 || kind == PyUnicode_4BYTE_KIND); 525 CHECK(ascii->state.ascii == 0); 526 CHECK(ascii->state.ready == 1); 527 CHECK(compact->utf8 != data); 528 } 529 else { 530 
PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op); 531 532 data = unicode->data.any; 533 if (kind == PyUnicode_WCHAR_KIND) { 534 CHECK(ascii->length == 0); 535 CHECK(ascii->hash == -1); 536 CHECK(ascii->state.compact == 0); 537 CHECK(ascii->state.ascii == 0); 538 CHECK(ascii->state.ready == 0); 539 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED); 540 CHECK(ascii->wstr != NULL); 541 CHECK(data == NULL); 542 CHECK(compact->utf8 == NULL); 543 } 544 else { 545 CHECK(kind == PyUnicode_1BYTE_KIND 546 || kind == PyUnicode_2BYTE_KIND 547 || kind == PyUnicode_4BYTE_KIND); 548 CHECK(ascii->state.compact == 0); 549 CHECK(ascii->state.ready == 1); 550 CHECK(data != NULL); 551 if (ascii->state.ascii) { 552 CHECK(compact->utf8 == data); 553 CHECK(compact->utf8_length == ascii->length); 554 } 555 else 556 CHECK(compact->utf8 != data); 557 } 558 } 559 if (kind != PyUnicode_WCHAR_KIND) { 560 if ( 561 #if SIZEOF_WCHAR_T == 2 562 kind == PyUnicode_2BYTE_KIND 563 #else 564 kind == PyUnicode_4BYTE_KIND 565 #endif 566 ) 567 { 568 CHECK(ascii->wstr == data); 569 CHECK(compact->wstr_length == ascii->length); 570 } else 571 CHECK(ascii->wstr != data); 572 } 573 574 if (compact->utf8 == NULL) 575 CHECK(compact->utf8_length == 0); 576 if (ascii->wstr == NULL) 577 CHECK(compact->wstr_length == 0); 578 } 579 580 /* check that the best kind is used: O(n) operation */ 581 if (check_content && kind != PyUnicode_WCHAR_KIND) { 582 Py_ssize_t i; 583 Py_UCS4 maxchar = 0; 584 const void *data; 585 Py_UCS4 ch; 586 587 data = PyUnicode_DATA(ascii); 588 for (i=0; i < ascii->length; i++) 589 { 590 ch = PyUnicode_READ(kind, data, i); 591 if (ch > maxchar) 592 maxchar = ch; 593 } 594 if (kind == PyUnicode_1BYTE_KIND) { 595 if (ascii->state.ascii == 0) { 596 CHECK(maxchar >= 128); 597 CHECK(maxchar <= 255); 598 } 599 else 600 CHECK(maxchar < 128); 601 } 602 else if (kind == PyUnicode_2BYTE_KIND) { 603 CHECK(maxchar >= 0x100); 604 CHECK(maxchar <= 0xFFFF); 605 } 606 else { 607 CHECK(maxchar >= 
0x10000); 608 CHECK(maxchar <= MAX_UNICODE); 609 } 610 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0); 611 } 612 return 1; 613 614 #undef CHECK 615 } 616 617 618 static PyObject* unicode_result_wchar(PyObject * unicode)619 unicode_result_wchar(PyObject *unicode) 620 { 621 #ifndef Py_DEBUG 622 Py_ssize_t len; 623 624 len = _PyUnicode_WSTR_LENGTH(unicode); 625 if (len == 0) { 626 Py_DECREF(unicode); 627 _Py_RETURN_UNICODE_EMPTY(); 628 } 629 630 if (len == 1) { 631 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 632 if ((Py_UCS4)ch < 256) { 633 Py_DECREF(unicode); 634 return get_latin1_char((unsigned char)ch); 635 } 636 } 637 638 if (_PyUnicode_Ready(unicode) < 0) { 639 Py_DECREF(unicode); 640 return NULL; 641 } 642 #else 643 assert(Py_REFCNT(unicode) == 1); 644 645 /* don't make the result ready in debug mode to ensure that the caller 646 makes the string ready before using it */ 647 assert(_PyUnicode_CheckConsistency(unicode, 1)); 648 #endif 649 return unicode; 650 } 651 652 static PyObject* unicode_result_ready(PyObject * unicode)653 unicode_result_ready(PyObject *unicode) 654 { 655 Py_ssize_t length; 656 657 length = PyUnicode_GET_LENGTH(unicode); 658 if (length == 0) { 659 PyObject *empty = unicode_get_empty(); 660 if (unicode != empty) { 661 Py_DECREF(unicode); 662 Py_INCREF(empty); 663 } 664 return empty; 665 } 666 667 if (length == 1) { 668 int kind = PyUnicode_KIND(unicode); 669 if (kind == PyUnicode_1BYTE_KIND) { 670 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 671 Py_UCS1 ch = data[0]; 672 PyObject *latin1_char = LATIN1(ch); 673 if (unicode != latin1_char) { 674 Py_INCREF(latin1_char); 675 Py_DECREF(unicode); 676 } 677 return latin1_char; 678 } 679 } 680 681 assert(_PyUnicode_CheckConsistency(unicode, 1)); 682 return unicode; 683 } 684 685 static PyObject* unicode_result(PyObject * unicode)686 unicode_result(PyObject *unicode) 687 { 688 assert(_PyUnicode_CHECK(unicode)); 689 if (PyUnicode_IS_READY(unicode)) 690 return unicode_result_ready(unicode); 
691 else 692 return unicode_result_wchar(unicode); 693 } 694 695 static PyObject* unicode_result_unchanged(PyObject * unicode)696 unicode_result_unchanged(PyObject *unicode) 697 { 698 if (PyUnicode_CheckExact(unicode)) { 699 if (PyUnicode_READY(unicode) == -1) 700 return NULL; 701 Py_INCREF(unicode); 702 return unicode; 703 } 704 else 705 /* Subtype -- return genuine unicode string with the same value. */ 706 return _PyUnicode_Copy(unicode); 707 } 708 709 /* Implementation of the "backslashreplace" error handler for 8-bit encodings: 710 ASCII, Latin1, UTF-8, etc. */ 711 static char* backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)712 backslashreplace(_PyBytesWriter *writer, char *str, 713 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) 714 { 715 Py_ssize_t size, i; 716 Py_UCS4 ch; 717 enum PyUnicode_Kind kind; 718 const void *data; 719 720 assert(PyUnicode_IS_READY(unicode)); 721 kind = PyUnicode_KIND(unicode); 722 data = PyUnicode_DATA(unicode); 723 724 size = 0; 725 /* determine replacement size */ 726 for (i = collstart; i < collend; ++i) { 727 Py_ssize_t incr; 728 729 ch = PyUnicode_READ(kind, data, i); 730 if (ch < 0x100) 731 incr = 2+2; 732 else if (ch < 0x10000) 733 incr = 2+4; 734 else { 735 assert(ch <= MAX_UNICODE); 736 incr = 2+8; 737 } 738 if (size > PY_SSIZE_T_MAX - incr) { 739 PyErr_SetString(PyExc_OverflowError, 740 "encoded result is too long for a Python string"); 741 return NULL; 742 } 743 size += incr; 744 } 745 746 str = _PyBytesWriter_Prepare(writer, str, size); 747 if (str == NULL) 748 return NULL; 749 750 /* generate replacement */ 751 for (i = collstart; i < collend; ++i) { 752 ch = PyUnicode_READ(kind, data, i); 753 *str++ = '\\'; 754 if (ch >= 0x00010000) { 755 *str++ = 'U'; 756 *str++ = Py_hexdigits[(ch>>28)&0xf]; 757 *str++ = Py_hexdigits[(ch>>24)&0xf]; 758 *str++ = Py_hexdigits[(ch>>20)&0xf]; 759 *str++ = Py_hexdigits[(ch>>16)&0xf]; 760 *str++ = 
Py_hexdigits[(ch>>12)&0xf]; 761 *str++ = Py_hexdigits[(ch>>8)&0xf]; 762 } 763 else if (ch >= 0x100) { 764 *str++ = 'u'; 765 *str++ = Py_hexdigits[(ch>>12)&0xf]; 766 *str++ = Py_hexdigits[(ch>>8)&0xf]; 767 } 768 else 769 *str++ = 'x'; 770 *str++ = Py_hexdigits[(ch>>4)&0xf]; 771 *str++ = Py_hexdigits[ch&0xf]; 772 } 773 return str; 774 } 775 776 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: 777 ASCII, Latin1, UTF-8, etc. */ 778 static char* xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)779 xmlcharrefreplace(_PyBytesWriter *writer, char *str, 780 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) 781 { 782 Py_ssize_t size, i; 783 Py_UCS4 ch; 784 enum PyUnicode_Kind kind; 785 const void *data; 786 787 assert(PyUnicode_IS_READY(unicode)); 788 kind = PyUnicode_KIND(unicode); 789 data = PyUnicode_DATA(unicode); 790 791 size = 0; 792 /* determine replacement size */ 793 for (i = collstart; i < collend; ++i) { 794 Py_ssize_t incr; 795 796 ch = PyUnicode_READ(kind, data, i); 797 if (ch < 10) 798 incr = 2+1+1; 799 else if (ch < 100) 800 incr = 2+2+1; 801 else if (ch < 1000) 802 incr = 2+3+1; 803 else if (ch < 10000) 804 incr = 2+4+1; 805 else if (ch < 100000) 806 incr = 2+5+1; 807 else if (ch < 1000000) 808 incr = 2+6+1; 809 else { 810 assert(ch <= MAX_UNICODE); 811 incr = 2+7+1; 812 } 813 if (size > PY_SSIZE_T_MAX - incr) { 814 PyErr_SetString(PyExc_OverflowError, 815 "encoded result is too long for a Python string"); 816 return NULL; 817 } 818 size += incr; 819 } 820 821 str = _PyBytesWriter_Prepare(writer, str, size); 822 if (str == NULL) 823 return NULL; 824 825 /* generate replacement */ 826 for (i = collstart; i < collend; ++i) { 827 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 828 if (size < 0) { 829 return NULL; 830 } 831 str += size; 832 } 833 return str; 834 } 835 836 /* --- Bloom Filters ----------------------------------------------------- 
*/ 837 838 /* stuff to implement simple "bloom filters" for Unicode characters. 839 to keep things simple, we use a single bitmask, using the least 5 840 bits from each unicode characters as the bit index. */ 841 842 /* the linebreak mask is set up by _PyUnicode_Init() below */ 843 844 #if LONG_BIT >= 128 845 #define BLOOM_WIDTH 128 846 #elif LONG_BIT >= 64 847 #define BLOOM_WIDTH 64 848 #elif LONG_BIT >= 32 849 #define BLOOM_WIDTH 32 850 #else 851 #error "LONG_BIT is smaller than 32" 852 #endif 853 854 #define BLOOM_MASK unsigned long 855 856 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 857 858 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 859 860 #define BLOOM_LINEBREAK(ch) \ 861 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 862 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 863 864 static inline BLOOM_MASK make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)865 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len) 866 { 867 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 868 do { \ 869 TYPE *data = (TYPE *)PTR; \ 870 TYPE *end = data + LEN; \ 871 Py_UCS4 ch; \ 872 for (; data != end; data++) { \ 873 ch = *data; \ 874 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 875 } \ 876 break; \ 877 } while (0) 878 879 /* calculate simple bloom-style bitmask for a given unicode string */ 880 881 BLOOM_MASK mask; 882 883 mask = 0; 884 switch (kind) { 885 case PyUnicode_1BYTE_KIND: 886 BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 887 break; 888 case PyUnicode_2BYTE_KIND: 889 BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 890 break; 891 case PyUnicode_4BYTE_KIND: 892 BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 893 break; 894 default: 895 Py_UNREACHABLE(); 896 } 897 return mask; 898 899 #undef BLOOM_UPDATE 900 } 901 902 static int ensure_unicode(PyObject * obj)903 ensure_unicode(PyObject *obj) 904 { 905 if (!PyUnicode_Check(obj)) { 906 PyErr_Format(PyExc_TypeError, 907 "must be str, not %.100s", 908 Py_TYPE(obj)->tp_name); 909 return -1; 910 } 911 
return PyUnicode_READY(obj); 912 } 913 914 /* Compilation of templated routines */ 915 916 #define STRINGLIB_GET_EMPTY() unicode_get_empty() 917 918 #include "stringlib/asciilib.h" 919 #include "stringlib/fastsearch.h" 920 #include "stringlib/partition.h" 921 #include "stringlib/split.h" 922 #include "stringlib/count.h" 923 #include "stringlib/find.h" 924 #include "stringlib/find_max_char.h" 925 #include "stringlib/undef.h" 926 927 #include "stringlib/ucs1lib.h" 928 #include "stringlib/fastsearch.h" 929 #include "stringlib/partition.h" 930 #include "stringlib/split.h" 931 #include "stringlib/count.h" 932 #include "stringlib/find.h" 933 #include "stringlib/replace.h" 934 #include "stringlib/find_max_char.h" 935 #include "stringlib/undef.h" 936 937 #include "stringlib/ucs2lib.h" 938 #include "stringlib/fastsearch.h" 939 #include "stringlib/partition.h" 940 #include "stringlib/split.h" 941 #include "stringlib/count.h" 942 #include "stringlib/find.h" 943 #include "stringlib/replace.h" 944 #include "stringlib/find_max_char.h" 945 #include "stringlib/undef.h" 946 947 #include "stringlib/ucs4lib.h" 948 #include "stringlib/fastsearch.h" 949 #include "stringlib/partition.h" 950 #include "stringlib/split.h" 951 #include "stringlib/count.h" 952 #include "stringlib/find.h" 953 #include "stringlib/replace.h" 954 #include "stringlib/find_max_char.h" 955 #include "stringlib/undef.h" 956 957 _Py_COMP_DIAG_PUSH 958 _Py_COMP_DIAG_IGNORE_DEPR_DECLS 959 #include "stringlib/unicodedefs.h" 960 #include "stringlib/fastsearch.h" 961 #include "stringlib/count.h" 962 #include "stringlib/find.h" 963 #include "stringlib/undef.h" 964 _Py_COMP_DIAG_POP 965 966 #undef STRINGLIB_GET_EMPTY 967 968 /* --- Unicode Object ----------------------------------------------------- */ 969 970 static inline Py_ssize_t findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)971 findchar(const void *s, int kind, 972 Py_ssize_t size, Py_UCS4 ch, 973 int direction) 974 { 975 switch (kind) { 976 
case PyUnicode_1BYTE_KIND: 977 if ((Py_UCS1) ch != ch) 978 return -1; 979 if (direction > 0) 980 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch); 981 else 982 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch); 983 case PyUnicode_2BYTE_KIND: 984 if ((Py_UCS2) ch != ch) 985 return -1; 986 if (direction > 0) 987 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch); 988 else 989 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch); 990 case PyUnicode_4BYTE_KIND: 991 if (direction > 0) 992 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch); 993 else 994 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch); 995 default: 996 Py_UNREACHABLE(); 997 } 998 } 999 1000 #ifdef Py_DEBUG 1001 /* Fill the data of a Unicode string with invalid characters to detect bugs 1002 earlier. 1003 1004 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 1005 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 1006 invalid character in Unicode 6.0. 
*/ 1007 static void unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)1008 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 1009 { 1010 int kind = PyUnicode_KIND(unicode); 1011 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 1012 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 1013 if (length <= old_length) 1014 return; 1015 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 1016 } 1017 #endif 1018 1019 static PyObject* resize_compact(PyObject * unicode,Py_ssize_t length)1020 resize_compact(PyObject *unicode, Py_ssize_t length) 1021 { 1022 Py_ssize_t char_size; 1023 Py_ssize_t struct_size; 1024 Py_ssize_t new_size; 1025 int share_wstr; 1026 PyObject *new_unicode; 1027 #ifdef Py_DEBUG 1028 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 1029 #endif 1030 1031 assert(unicode_modifiable(unicode)); 1032 assert(PyUnicode_IS_READY(unicode)); 1033 assert(PyUnicode_IS_COMPACT(unicode)); 1034 1035 char_size = PyUnicode_KIND(unicode); 1036 if (PyUnicode_IS_ASCII(unicode)) 1037 struct_size = sizeof(PyASCIIObject); 1038 else 1039 struct_size = sizeof(PyCompactUnicodeObject); 1040 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 1041 1042 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 1043 PyErr_NoMemory(); 1044 return NULL; 1045 } 1046 new_size = (struct_size + (length + 1) * char_size); 1047 1048 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { 1049 PyObject_Free(_PyUnicode_UTF8(unicode)); 1050 _PyUnicode_UTF8(unicode) = NULL; 1051 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1052 } 1053 #ifdef Py_REF_DEBUG 1054 _Py_RefTotal--; 1055 #endif 1056 #ifdef Py_TRACE_REFS 1057 _Py_ForgetReference(unicode); 1058 #endif 1059 1060 new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size); 1061 if (new_unicode == NULL) { 1062 _Py_NewReference(unicode); 1063 PyErr_NoMemory(); 1064 return NULL; 1065 } 1066 unicode = new_unicode; 1067 _Py_NewReference(unicode); 1068 1069 _PyUnicode_LENGTH(unicode) = length; 1070 if (share_wstr) { 1071 
_PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 1072 if (!PyUnicode_IS_ASCII(unicode)) 1073 _PyUnicode_WSTR_LENGTH(unicode) = length; 1074 } 1075 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 1076 PyObject_Free(_PyUnicode_WSTR(unicode)); 1077 _PyUnicode_WSTR(unicode) = NULL; 1078 if (!PyUnicode_IS_ASCII(unicode)) 1079 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1080 } 1081 #ifdef Py_DEBUG 1082 unicode_fill_invalid(unicode, old_length); 1083 #endif 1084 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 1085 length, 0); 1086 assert(_PyUnicode_CheckConsistency(unicode, 0)); 1087 return unicode; 1088 } 1089 1090 static int resize_inplace(PyObject * unicode,Py_ssize_t length)1091 resize_inplace(PyObject *unicode, Py_ssize_t length) 1092 { 1093 wchar_t *wstr; 1094 Py_ssize_t new_size; 1095 assert(!PyUnicode_IS_COMPACT(unicode)); 1096 assert(Py_REFCNT(unicode) == 1); 1097 1098 if (PyUnicode_IS_READY(unicode)) { 1099 Py_ssize_t char_size; 1100 int share_wstr, share_utf8; 1101 void *data; 1102 #ifdef Py_DEBUG 1103 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 1104 #endif 1105 1106 data = _PyUnicode_DATA_ANY(unicode); 1107 char_size = PyUnicode_KIND(unicode); 1108 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 1109 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 1110 1111 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 1112 PyErr_NoMemory(); 1113 return -1; 1114 } 1115 new_size = (length + 1) * char_size; 1116 1117 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 1118 { 1119 PyObject_Free(_PyUnicode_UTF8(unicode)); 1120 _PyUnicode_UTF8(unicode) = NULL; 1121 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1122 } 1123 1124 data = (PyObject *)PyObject_Realloc(data, new_size); 1125 if (data == NULL) { 1126 PyErr_NoMemory(); 1127 return -1; 1128 } 1129 _PyUnicode_DATA_ANY(unicode) = data; 1130 if (share_wstr) { 1131 _PyUnicode_WSTR(unicode) = data; 1132 _PyUnicode_WSTR_LENGTH(unicode) = length; 1133 } 1134 if (share_utf8) { 1135 _PyUnicode_UTF8(unicode) = data; 1136 
_PyUnicode_UTF8_LENGTH(unicode) = length; 1137 } 1138 _PyUnicode_LENGTH(unicode) = length; 1139 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 1140 #ifdef Py_DEBUG 1141 unicode_fill_invalid(unicode, old_length); 1142 #endif 1143 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 1144 assert(_PyUnicode_CheckConsistency(unicode, 0)); 1145 return 0; 1146 } 1147 } 1148 assert(_PyUnicode_WSTR(unicode) != NULL); 1149 1150 /* check for integer overflow */ 1151 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) { 1152 PyErr_NoMemory(); 1153 return -1; 1154 } 1155 new_size = sizeof(wchar_t) * (length + 1); 1156 wstr = _PyUnicode_WSTR(unicode); 1157 wstr = PyObject_Realloc(wstr, new_size); 1158 if (!wstr) { 1159 PyErr_NoMemory(); 1160 return -1; 1161 } 1162 _PyUnicode_WSTR(unicode) = wstr; 1163 _PyUnicode_WSTR(unicode)[length] = 0; 1164 _PyUnicode_WSTR_LENGTH(unicode) = length; 1165 assert(_PyUnicode_CheckConsistency(unicode, 0)); 1166 return 0; 1167 } 1168 1169 static PyObject* resize_copy(PyObject * unicode,Py_ssize_t length)1170 resize_copy(PyObject *unicode, Py_ssize_t length) 1171 { 1172 Py_ssize_t copy_length; 1173 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 1174 PyObject *copy; 1175 1176 assert(PyUnicode_IS_READY(unicode)); 1177 1178 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 1179 if (copy == NULL) 1180 return NULL; 1181 1182 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 1183 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 1184 return copy; 1185 } 1186 else { 1187 PyObject *w; 1188 1189 w = (PyObject*)_PyUnicode_New(length); 1190 if (w == NULL) 1191 return NULL; 1192 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 1193 copy_length = Py_MIN(copy_length, length); 1194 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 1195 copy_length * sizeof(wchar_t)); 1196 return w; 1197 } 1198 } 1199 1200 /* We allocate one more byte to make sure the string is 1201 Ux0000 terminated; some 
code (e.g. new_identifier) 1202 relies on that. 1203 1204 XXX This allocator could further be enhanced by assuring that the 1205 free list never reduces its size below 1. 1206 1207 */ 1208 1209 static PyUnicodeObject * _PyUnicode_New(Py_ssize_t length)1210 _PyUnicode_New(Py_ssize_t length) 1211 { 1212 PyUnicodeObject *unicode; 1213 size_t new_size; 1214 1215 /* Optimization for empty strings */ 1216 if (length == 0) { 1217 return (PyUnicodeObject *)unicode_new_empty(); 1218 } 1219 1220 /* Ensure we won't overflow the size. */ 1221 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 1222 return (PyUnicodeObject *)PyErr_NoMemory(); 1223 } 1224 if (length < 0) { 1225 PyErr_SetString(PyExc_SystemError, 1226 "Negative size passed to _PyUnicode_New"); 1227 return NULL; 1228 } 1229 1230 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 1231 if (unicode == NULL) 1232 return NULL; 1233 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 1234 1235 _PyUnicode_WSTR_LENGTH(unicode) = length; 1236 _PyUnicode_HASH(unicode) = -1; 1237 _PyUnicode_STATE(unicode).interned = 0; 1238 _PyUnicode_STATE(unicode).kind = 0; 1239 _PyUnicode_STATE(unicode).compact = 0; 1240 _PyUnicode_STATE(unicode).ready = 0; 1241 _PyUnicode_STATE(unicode).ascii = 0; 1242 _PyUnicode_DATA_ANY(unicode) = NULL; 1243 _PyUnicode_LENGTH(unicode) = 0; 1244 _PyUnicode_UTF8(unicode) = NULL; 1245 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1246 1247 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size); 1248 if (!_PyUnicode_WSTR(unicode)) { 1249 Py_DECREF(unicode); 1250 PyErr_NoMemory(); 1251 return NULL; 1252 } 1253 1254 /* Initialize the first element to guard against cases where 1255 * the caller fails before initializing str -- unicode_resize() 1256 * reads str[0], and the Keep-Alive optimization can keep memory 1257 * allocated for str alive across a call to unicode_dealloc(unicode). 1258 * We don't want unicode_resize to read uninitialized memory in 1259 * that case. 
1260 */ 1261 _PyUnicode_WSTR(unicode)[0] = 0; 1262 _PyUnicode_WSTR(unicode)[length] = 0; 1263 1264 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 1265 return unicode; 1266 } 1267 1268 static const char* unicode_kind_name(PyObject * unicode)1269 unicode_kind_name(PyObject *unicode) 1270 { 1271 /* don't check consistency: unicode_kind_name() is called from 1272 _PyUnicode_Dump() */ 1273 if (!PyUnicode_IS_COMPACT(unicode)) 1274 { 1275 if (!PyUnicode_IS_READY(unicode)) 1276 return "wstr"; 1277 switch (PyUnicode_KIND(unicode)) 1278 { 1279 case PyUnicode_1BYTE_KIND: 1280 if (PyUnicode_IS_ASCII(unicode)) 1281 return "legacy ascii"; 1282 else 1283 return "legacy latin1"; 1284 case PyUnicode_2BYTE_KIND: 1285 return "legacy UCS2"; 1286 case PyUnicode_4BYTE_KIND: 1287 return "legacy UCS4"; 1288 default: 1289 return "<legacy invalid kind>"; 1290 } 1291 } 1292 assert(PyUnicode_IS_READY(unicode)); 1293 switch (PyUnicode_KIND(unicode)) { 1294 case PyUnicode_1BYTE_KIND: 1295 if (PyUnicode_IS_ASCII(unicode)) 1296 return "ascii"; 1297 else 1298 return "latin1"; 1299 case PyUnicode_2BYTE_KIND: 1300 return "UCS2"; 1301 case PyUnicode_4BYTE_KIND: 1302 return "UCS4"; 1303 default: 1304 return "<invalid compact kind>"; 1305 } 1306 } 1307 1308 #ifdef Py_DEBUG 1309 /* Functions wrapping macros for use in debugger */ _PyUnicode_utf8(void * unicode_raw)1310 const char *_PyUnicode_utf8(void *unicode_raw){ 1311 PyObject *unicode = _PyObject_CAST(unicode_raw); 1312 return PyUnicode_UTF8(unicode); 1313 } 1314 _PyUnicode_compact_data(void * unicode_raw)1315 const void *_PyUnicode_compact_data(void *unicode_raw) { 1316 PyObject *unicode = _PyObject_CAST(unicode_raw); 1317 return _PyUnicode_COMPACT_DATA(unicode); 1318 } _PyUnicode_data(void * unicode_raw)1319 const void *_PyUnicode_data(void *unicode_raw) { 1320 PyObject *unicode = _PyObject_CAST(unicode_raw); 1321 printf("obj %p\n", (void*)unicode); 1322 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 1323 printf("compact 
ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 1324 printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1)); 1325 printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1)); 1326 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 1327 return PyUnicode_DATA(unicode); 1328 } 1329 1330 void _PyUnicode_Dump(PyObject * op)1331 _PyUnicode_Dump(PyObject *op) 1332 { 1333 PyASCIIObject *ascii = _PyASCIIObject_CAST(op); 1334 PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op); 1335 PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op); 1336 const void *data; 1337 1338 if (ascii->state.compact) 1339 { 1340 if (ascii->state.ascii) 1341 data = (ascii + 1); 1342 else 1343 data = (compact + 1); 1344 } 1345 else 1346 data = unicode->data.any; 1347 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length); 1348 1349 if (ascii->wstr == data) 1350 printf("shared "); 1351 printf("wstr=%p", (void *)ascii->wstr); 1352 1353 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 1354 printf(" (%zu), ", compact->wstr_length); 1355 if (!ascii->state.compact && compact->utf8 == unicode->data.any) { 1356 printf("shared "); 1357 } 1358 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length); 1359 } 1360 printf(", data=%p\n", data); 1361 } 1362 #endif 1363 1364 1365 PyObject * PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1366 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 1367 { 1368 /* Optimization for empty strings */ 1369 if (size == 0) { 1370 return unicode_new_empty(); 1371 } 1372 1373 PyObject *obj; 1374 PyCompactUnicodeObject *unicode; 1375 void *data; 1376 enum PyUnicode_Kind kind; 1377 int is_sharing, is_ascii; 1378 Py_ssize_t char_size; 1379 Py_ssize_t struct_size; 1380 1381 is_ascii = 0; 1382 is_sharing = 0; 1383 struct_size = sizeof(PyCompactUnicodeObject); 1384 if (maxchar < 128) { 1385 kind = PyUnicode_1BYTE_KIND; 1386 char_size = 1; 1387 is_ascii = 1; 1388 struct_size = sizeof(PyASCIIObject); 
1389 } 1390 else if (maxchar < 256) { 1391 kind = PyUnicode_1BYTE_KIND; 1392 char_size = 1; 1393 } 1394 else if (maxchar < 65536) { 1395 kind = PyUnicode_2BYTE_KIND; 1396 char_size = 2; 1397 if (sizeof(wchar_t) == 2) 1398 is_sharing = 1; 1399 } 1400 else { 1401 if (maxchar > MAX_UNICODE) { 1402 PyErr_SetString(PyExc_SystemError, 1403 "invalid maximum character passed to PyUnicode_New"); 1404 return NULL; 1405 } 1406 kind = PyUnicode_4BYTE_KIND; 1407 char_size = 4; 1408 if (sizeof(wchar_t) == 4) 1409 is_sharing = 1; 1410 } 1411 1412 /* Ensure we won't overflow the size. */ 1413 if (size < 0) { 1414 PyErr_SetString(PyExc_SystemError, 1415 "Negative size passed to PyUnicode_New"); 1416 return NULL; 1417 } 1418 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1419 return PyErr_NoMemory(); 1420 1421 /* Duplicated allocation code from _PyObject_New() instead of a call to 1422 * PyObject_New() so we are able to allocate space for the object and 1423 * it's data buffer. 1424 */ 1425 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size); 1426 if (obj == NULL) { 1427 return PyErr_NoMemory(); 1428 } 1429 _PyObject_Init(obj, &PyUnicode_Type); 1430 1431 unicode = (PyCompactUnicodeObject *)obj; 1432 if (is_ascii) 1433 data = ((PyASCIIObject*)obj) + 1; 1434 else 1435 data = unicode + 1; 1436 _PyUnicode_LENGTH(unicode) = size; 1437 _PyUnicode_HASH(unicode) = -1; 1438 _PyUnicode_STATE(unicode).interned = 0; 1439 _PyUnicode_STATE(unicode).kind = kind; 1440 _PyUnicode_STATE(unicode).compact = 1; 1441 _PyUnicode_STATE(unicode).ready = 1; 1442 _PyUnicode_STATE(unicode).ascii = is_ascii; 1443 if (is_ascii) { 1444 ((char*)data)[size] = 0; 1445 _PyUnicode_WSTR(unicode) = NULL; 1446 } 1447 else if (kind == PyUnicode_1BYTE_KIND) { 1448 ((char*)data)[size] = 0; 1449 _PyUnicode_WSTR(unicode) = NULL; 1450 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1451 unicode->utf8 = NULL; 1452 unicode->utf8_length = 0; 1453 } 1454 else { 1455 unicode->utf8 = NULL; 1456 
unicode->utf8_length = 0; 1457 if (kind == PyUnicode_2BYTE_KIND) 1458 ((Py_UCS2*)data)[size] = 0; 1459 else /* kind == PyUnicode_4BYTE_KIND */ 1460 ((Py_UCS4*)data)[size] = 0; 1461 if (is_sharing) { 1462 _PyUnicode_WSTR_LENGTH(unicode) = size; 1463 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1464 } 1465 else { 1466 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1467 _PyUnicode_WSTR(unicode) = NULL; 1468 } 1469 } 1470 #ifdef Py_DEBUG 1471 unicode_fill_invalid((PyObject*)unicode, 0); 1472 #endif 1473 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1474 return obj; 1475 } 1476 1477 #if SIZEOF_WCHAR_T == 2 1478 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1479 will decode surrogate pairs, the other conversions are implemented as macros 1480 for efficiency. 1481 1482 This function assumes that unicode can hold one more code point than wstr 1483 characters for a terminating null character. */ 1484 static void unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1485 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1486 PyObject *unicode) 1487 { 1488 const wchar_t *iter; 1489 Py_UCS4 *ucs4_out; 1490 1491 assert(unicode != NULL); 1492 assert(_PyUnicode_CHECK(unicode)); 1493 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1494 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1495 1496 for (iter = begin; iter < end; ) { 1497 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1498 _PyUnicode_GET_LENGTH(unicode))); 1499 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1500 && (iter+1) < end 1501 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1502 { 1503 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1504 iter += 2; 1505 } 1506 else { 1507 *ucs4_out++ = *iter; 1508 iter++; 1509 } 1510 } 1511 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1512 _PyUnicode_GET_LENGTH(unicode))); 1513 1514 } 1515 #endif 1516 1517 static int unicode_check_modifiable(PyObject * unicode)1518 
unicode_check_modifiable(PyObject *unicode) 1519 { 1520 if (!unicode_modifiable(unicode)) { 1521 PyErr_SetString(PyExc_SystemError, 1522 "Cannot modify a string currently used"); 1523 return -1; 1524 } 1525 return 0; 1526 } 1527 1528 static int _copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1529 _copy_characters(PyObject *to, Py_ssize_t to_start, 1530 PyObject *from, Py_ssize_t from_start, 1531 Py_ssize_t how_many, int check_maxchar) 1532 { 1533 unsigned int from_kind, to_kind; 1534 const void *from_data; 1535 void *to_data; 1536 1537 assert(0 <= how_many); 1538 assert(0 <= from_start); 1539 assert(0 <= to_start); 1540 assert(PyUnicode_Check(from)); 1541 assert(PyUnicode_IS_READY(from)); 1542 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1543 1544 assert(PyUnicode_Check(to)); 1545 assert(PyUnicode_IS_READY(to)); 1546 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1547 1548 if (how_many == 0) 1549 return 0; 1550 1551 from_kind = PyUnicode_KIND(from); 1552 from_data = PyUnicode_DATA(from); 1553 to_kind = PyUnicode_KIND(to); 1554 to_data = PyUnicode_DATA(to); 1555 1556 #ifdef Py_DEBUG 1557 if (!check_maxchar 1558 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1559 { 1560 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1561 Py_UCS4 ch; 1562 Py_ssize_t i; 1563 for (i=0; i < how_many; i++) { 1564 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1565 assert(ch <= to_maxchar); 1566 } 1567 } 1568 #endif 1569 1570 if (from_kind == to_kind) { 1571 if (check_maxchar 1572 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1573 { 1574 /* Writing Latin-1 characters into an ASCII string requires to 1575 check that all written characters are pure ASCII */ 1576 Py_UCS4 max_char; 1577 max_char = ucs1lib_find_max_char(from_data, 1578 (const Py_UCS1*)from_data + how_many); 1579 if (max_char >= 128) 1580 return -1; 1581 } 1582 
memcpy((char*)to_data + to_kind * to_start, 1583 (const char*)from_data + from_kind * from_start, 1584 to_kind * how_many); 1585 } 1586 else if (from_kind == PyUnicode_1BYTE_KIND 1587 && to_kind == PyUnicode_2BYTE_KIND) 1588 { 1589 _PyUnicode_CONVERT_BYTES( 1590 Py_UCS1, Py_UCS2, 1591 PyUnicode_1BYTE_DATA(from) + from_start, 1592 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1593 PyUnicode_2BYTE_DATA(to) + to_start 1594 ); 1595 } 1596 else if (from_kind == PyUnicode_1BYTE_KIND 1597 && to_kind == PyUnicode_4BYTE_KIND) 1598 { 1599 _PyUnicode_CONVERT_BYTES( 1600 Py_UCS1, Py_UCS4, 1601 PyUnicode_1BYTE_DATA(from) + from_start, 1602 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1603 PyUnicode_4BYTE_DATA(to) + to_start 1604 ); 1605 } 1606 else if (from_kind == PyUnicode_2BYTE_KIND 1607 && to_kind == PyUnicode_4BYTE_KIND) 1608 { 1609 _PyUnicode_CONVERT_BYTES( 1610 Py_UCS2, Py_UCS4, 1611 PyUnicode_2BYTE_DATA(from) + from_start, 1612 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1613 PyUnicode_4BYTE_DATA(to) + to_start 1614 ); 1615 } 1616 else { 1617 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1618 1619 if (!check_maxchar) { 1620 if (from_kind == PyUnicode_2BYTE_KIND 1621 && to_kind == PyUnicode_1BYTE_KIND) 1622 { 1623 _PyUnicode_CONVERT_BYTES( 1624 Py_UCS2, Py_UCS1, 1625 PyUnicode_2BYTE_DATA(from) + from_start, 1626 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1627 PyUnicode_1BYTE_DATA(to) + to_start 1628 ); 1629 } 1630 else if (from_kind == PyUnicode_4BYTE_KIND 1631 && to_kind == PyUnicode_1BYTE_KIND) 1632 { 1633 _PyUnicode_CONVERT_BYTES( 1634 Py_UCS4, Py_UCS1, 1635 PyUnicode_4BYTE_DATA(from) + from_start, 1636 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1637 PyUnicode_1BYTE_DATA(to) + to_start 1638 ); 1639 } 1640 else if (from_kind == PyUnicode_4BYTE_KIND 1641 && to_kind == PyUnicode_2BYTE_KIND) 1642 { 1643 _PyUnicode_CONVERT_BYTES( 1644 Py_UCS4, Py_UCS2, 1645 PyUnicode_4BYTE_DATA(from) + from_start, 1646 
PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1647 PyUnicode_2BYTE_DATA(to) + to_start 1648 ); 1649 } 1650 else { 1651 Py_UNREACHABLE(); 1652 } 1653 } 1654 else { 1655 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1656 Py_UCS4 ch; 1657 Py_ssize_t i; 1658 1659 for (i=0; i < how_many; i++) { 1660 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1661 if (ch > to_maxchar) 1662 return -1; 1663 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1664 } 1665 } 1666 } 1667 return 0; 1668 } 1669 1670 void _PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1671 _PyUnicode_FastCopyCharacters( 1672 PyObject *to, Py_ssize_t to_start, 1673 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1674 { 1675 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1676 } 1677 1678 Py_ssize_t PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1679 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1680 PyObject *from, Py_ssize_t from_start, 1681 Py_ssize_t how_many) 1682 { 1683 int err; 1684 1685 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1686 PyErr_BadInternalCall(); 1687 return -1; 1688 } 1689 1690 if (PyUnicode_READY(from) == -1) 1691 return -1; 1692 if (PyUnicode_READY(to) == -1) 1693 return -1; 1694 1695 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) { 1696 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1697 return -1; 1698 } 1699 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) { 1700 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1701 return -1; 1702 } 1703 if (how_many < 0) { 1704 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative"); 1705 return -1; 1706 } 1707 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many); 1708 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1709 
PyErr_Format(PyExc_SystemError, 1710 "Cannot write %zi characters at %zi " 1711 "in a string of %zi characters", 1712 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1713 return -1; 1714 } 1715 1716 if (how_many == 0) 1717 return 0; 1718 1719 if (unicode_check_modifiable(to)) 1720 return -1; 1721 1722 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1723 if (err) { 1724 PyErr_Format(PyExc_SystemError, 1725 "Cannot copy %s characters " 1726 "into a string of %s characters", 1727 unicode_kind_name(from), 1728 unicode_kind_name(to)); 1729 return -1; 1730 } 1731 return how_many; 1732 } 1733 1734 /* Find the maximum code point and count the number of surrogate pairs so a 1735 correct string length can be computed before converting a string to UCS4. 1736 This function counts single surrogates as a character and not as a pair. 1737 1738 Return 0 on success, or -1 on error. */ 1739 static int find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1740 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1741 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1742 { 1743 const wchar_t *iter; 1744 Py_UCS4 ch; 1745 1746 assert(num_surrogates != NULL && maxchar != NULL); 1747 *num_surrogates = 0; 1748 *maxchar = 0; 1749 1750 for (iter = begin; iter < end; ) { 1751 #if SIZEOF_WCHAR_T == 2 1752 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1753 && (iter+1) < end 1754 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1755 { 1756 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1757 ++(*num_surrogates); 1758 iter += 2; 1759 } 1760 else 1761 #endif 1762 { 1763 ch = *iter; 1764 iter++; 1765 } 1766 if (ch > *maxchar) { 1767 *maxchar = ch; 1768 if (*maxchar > MAX_UNICODE) { 1769 PyErr_Format(PyExc_ValueError, 1770 "character U+%x is not in range [U+0000; U+%x]", 1771 ch, MAX_UNICODE); 1772 return -1; 1773 } 1774 } 1775 } 1776 return 0; 1777 } 1778 1779 int _PyUnicode_Ready(PyObject * unicode)1780 
_PyUnicode_Ready(PyObject *unicode) 1781 { 1782 wchar_t *end; 1783 Py_UCS4 maxchar = 0; 1784 Py_ssize_t num_surrogates; 1785 #if SIZEOF_WCHAR_T == 2 1786 Py_ssize_t length_wo_surrogates; 1787 #endif 1788 1789 /* _PyUnicode_Ready() is only intended for old-style API usage where 1790 strings were created using _PyObject_New() and where no canonical 1791 representation (the str field) has been set yet aka strings 1792 which are not yet ready. */ 1793 assert(_PyUnicode_CHECK(unicode)); 1794 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1795 assert(_PyUnicode_WSTR(unicode) != NULL); 1796 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1797 assert(_PyUnicode_UTF8(unicode) == NULL); 1798 /* Actually, it should neither be interned nor be anything else: */ 1799 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1800 1801 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1802 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1803 &maxchar, &num_surrogates) == -1) 1804 return -1; 1805 1806 if (maxchar < 256) { 1807 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1808 if (!_PyUnicode_DATA_ANY(unicode)) { 1809 PyErr_NoMemory(); 1810 return -1; 1811 } 1812 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1813 _PyUnicode_WSTR(unicode), end, 1814 PyUnicode_1BYTE_DATA(unicode)); 1815 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1816 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1817 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1818 if (maxchar < 128) { 1819 _PyUnicode_STATE(unicode).ascii = 1; 1820 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1821 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1822 } 1823 else { 1824 _PyUnicode_STATE(unicode).ascii = 0; 1825 _PyUnicode_UTF8(unicode) = NULL; 1826 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1827 } 1828 PyObject_Free(_PyUnicode_WSTR(unicode)); 1829 _PyUnicode_WSTR(unicode) 
= NULL; 1830 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1831 } 1832 /* In this case we might have to convert down from 4-byte native 1833 wchar_t to 2-byte unicode. */ 1834 else if (maxchar < 65536) { 1835 assert(num_surrogates == 0 && 1836 "FindMaxCharAndNumSurrogatePairs() messed up"); 1837 1838 #if SIZEOF_WCHAR_T == 2 1839 /* We can share representations and are done. */ 1840 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1841 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1842 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1843 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1844 _PyUnicode_UTF8(unicode) = NULL; 1845 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1846 #else 1847 /* sizeof(wchar_t) == 4 */ 1848 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc( 1849 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1850 if (!_PyUnicode_DATA_ANY(unicode)) { 1851 PyErr_NoMemory(); 1852 return -1; 1853 } 1854 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1855 _PyUnicode_WSTR(unicode), end, 1856 PyUnicode_2BYTE_DATA(unicode)); 1857 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1858 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1859 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1860 _PyUnicode_UTF8(unicode) = NULL; 1861 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1862 PyObject_Free(_PyUnicode_WSTR(unicode)); 1863 _PyUnicode_WSTR(unicode) = NULL; 1864 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1865 #endif 1866 } 1867 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */ 1868 else { 1869 #if SIZEOF_WCHAR_T == 2 1870 /* in case the native representation is 2-bytes, we need to allocate a 1871 new normalized 4-byte version. 
*/ 1872 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1873 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) { 1874 PyErr_NoMemory(); 1875 return -1; 1876 } 1877 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1)); 1878 if (!_PyUnicode_DATA_ANY(unicode)) { 1879 PyErr_NoMemory(); 1880 return -1; 1881 } 1882 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1883 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1884 _PyUnicode_UTF8(unicode) = NULL; 1885 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1886 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1887 _PyUnicode_STATE(unicode).ready = 1; 1888 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1889 PyObject_Free(_PyUnicode_WSTR(unicode)); 1890 _PyUnicode_WSTR(unicode) = NULL; 1891 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1892 #else 1893 assert(num_surrogates == 0); 1894 1895 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1896 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1897 _PyUnicode_UTF8(unicode) = NULL; 1898 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1899 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1900 #endif 1901 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1902 } 1903 _PyUnicode_STATE(unicode).ready = 1; 1904 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1905 return 0; 1906 } 1907 1908 static void unicode_dealloc(PyObject * unicode)1909 unicode_dealloc(PyObject *unicode) 1910 { 1911 #ifdef Py_DEBUG 1912 if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) { 1913 _Py_FatalRefcountError("deallocating an Unicode singleton"); 1914 } 1915 #endif 1916 1917 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1918 case SSTATE_NOT_INTERNED: 1919 break; 1920 case SSTATE_INTERNED_MORTAL: 1921 { 1922 /* Revive the dead object temporarily. PyDict_DelItem() removes two 1923 references (key and value) which were ignored by 1924 PyUnicode_InternInPlace(). 
Use refcnt=3 rather than refcnt=2 1925 to prevent calling unicode_dealloc() again. Adjust refcnt after 1926 PyDict_DelItem(). */ 1927 assert(Py_REFCNT(unicode) == 0); 1928 Py_SET_REFCNT(unicode, 3); 1929 if (PyDict_DelItem(interned, unicode) != 0) { 1930 _PyErr_WriteUnraisableMsg("deletion of interned string failed", 1931 NULL); 1932 } 1933 assert(Py_REFCNT(unicode) == 1); 1934 Py_SET_REFCNT(unicode, 0); 1935 break; 1936 } 1937 1938 case SSTATE_INTERNED_IMMORTAL: 1939 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died"); 1940 break; 1941 1942 default: 1943 Py_UNREACHABLE(); 1944 } 1945 1946 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 1947 PyObject_Free(_PyUnicode_WSTR(unicode)); 1948 } 1949 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { 1950 PyObject_Free(_PyUnicode_UTF8(unicode)); 1951 } 1952 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) { 1953 PyObject_Free(_PyUnicode_DATA_ANY(unicode)); 1954 } 1955 1956 Py_TYPE(unicode)->tp_free(unicode); 1957 } 1958 1959 #ifdef Py_DEBUG 1960 static int unicode_is_singleton(PyObject * unicode)1961 unicode_is_singleton(PyObject *unicode) 1962 { 1963 if (unicode == &_Py_STR(empty)) { 1964 return 1; 1965 } 1966 1967 PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode); 1968 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) { 1969 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1970 if (ch < 256 && LATIN1(ch) == unicode) { 1971 return 1; 1972 } 1973 } 1974 return 0; 1975 } 1976 #endif 1977 1978 static int unicode_modifiable(PyObject * unicode)1979 unicode_modifiable(PyObject *unicode) 1980 { 1981 assert(_PyUnicode_CHECK(unicode)); 1982 if (Py_REFCNT(unicode) != 1) 1983 return 0; 1984 if (_PyUnicode_HASH(unicode) != -1) 1985 return 0; 1986 if (PyUnicode_CHECK_INTERNED(unicode)) 1987 return 0; 1988 if (!PyUnicode_CheckExact(unicode)) 1989 return 0; 1990 #ifdef Py_DEBUG 1991 /* singleton refcount is greater than 1 */ 1992 assert(!unicode_is_singleton(unicode)); 1993 #endif 1994 return 
1; 1995 } 1996 1997 static int unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1998 unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1999 { 2000 PyObject *unicode; 2001 Py_ssize_t old_length; 2002 2003 assert(p_unicode != NULL); 2004 unicode = *p_unicode; 2005 2006 assert(unicode != NULL); 2007 assert(PyUnicode_Check(unicode)); 2008 assert(0 <= length); 2009 2010 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 2011 old_length = PyUnicode_WSTR_LENGTH(unicode); 2012 else 2013 old_length = PyUnicode_GET_LENGTH(unicode); 2014 if (old_length == length) 2015 return 0; 2016 2017 if (length == 0) { 2018 PyObject *empty = unicode_new_empty(); 2019 Py_SETREF(*p_unicode, empty); 2020 return 0; 2021 } 2022 2023 if (!unicode_modifiable(unicode)) { 2024 PyObject *copy = resize_copy(unicode, length); 2025 if (copy == NULL) 2026 return -1; 2027 Py_SETREF(*p_unicode, copy); 2028 return 0; 2029 } 2030 2031 if (PyUnicode_IS_COMPACT(unicode)) { 2032 PyObject *new_unicode = resize_compact(unicode, length); 2033 if (new_unicode == NULL) 2034 return -1; 2035 *p_unicode = new_unicode; 2036 return 0; 2037 } 2038 return resize_inplace(unicode, length); 2039 } 2040 2041 int PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2042 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 2043 { 2044 PyObject *unicode; 2045 if (p_unicode == NULL) { 2046 PyErr_BadInternalCall(); 2047 return -1; 2048 } 2049 unicode = *p_unicode; 2050 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 2051 { 2052 PyErr_BadInternalCall(); 2053 return -1; 2054 } 2055 return unicode_resize(p_unicode, length); 2056 } 2057 2058 /* Copy an ASCII or latin1 char* string into a Python Unicode string. 2059 2060 WARNING: The function doesn't copy the terminating null character and 2061 doesn't check the maximum character (may write a latin1 character in an 2062 ASCII string). 
*/ 2063 static void unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2064 unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 2065 const char *str, Py_ssize_t len) 2066 { 2067 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 2068 const void *data = PyUnicode_DATA(unicode); 2069 const char *end = str + len; 2070 2071 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 2072 switch (kind) { 2073 case PyUnicode_1BYTE_KIND: { 2074 #ifdef Py_DEBUG 2075 if (PyUnicode_IS_ASCII(unicode)) { 2076 Py_UCS4 maxchar = ucs1lib_find_max_char( 2077 (const Py_UCS1*)str, 2078 (const Py_UCS1*)str + len); 2079 assert(maxchar < 128); 2080 } 2081 #endif 2082 memcpy((char *) data + index, str, len); 2083 break; 2084 } 2085 case PyUnicode_2BYTE_KIND: { 2086 Py_UCS2 *start = (Py_UCS2 *)data + index; 2087 Py_UCS2 *ucs2 = start; 2088 2089 for (; str < end; ++ucs2, ++str) 2090 *ucs2 = (Py_UCS2)*str; 2091 2092 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 2093 break; 2094 } 2095 case PyUnicode_4BYTE_KIND: { 2096 Py_UCS4 *start = (Py_UCS4 *)data + index; 2097 Py_UCS4 *ucs4 = start; 2098 2099 for (; str < end; ++ucs4, ++str) 2100 *ucs4 = (Py_UCS4)*str; 2101 2102 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 2103 break; 2104 } 2105 default: 2106 Py_UNREACHABLE(); 2107 } 2108 } 2109 2110 static PyObject* get_latin1_char(Py_UCS1 ch)2111 get_latin1_char(Py_UCS1 ch) 2112 { 2113 return Py_NewRef(LATIN1(ch)); 2114 } 2115 2116 static PyObject* unicode_char(Py_UCS4 ch)2117 unicode_char(Py_UCS4 ch) 2118 { 2119 PyObject *unicode; 2120 2121 assert(ch <= MAX_UNICODE); 2122 2123 if (ch < 256) { 2124 return get_latin1_char(ch); 2125 } 2126 2127 unicode = PyUnicode_New(1, ch); 2128 if (unicode == NULL) 2129 return NULL; 2130 2131 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); 2132 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 2133 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; 2134 } else { 2135 assert(PyUnicode_KIND(unicode) 
== PyUnicode_4BYTE_KIND); 2136 PyUnicode_4BYTE_DATA(unicode)[0] = ch; 2137 } 2138 assert(_PyUnicode_CheckConsistency(unicode, 1)); 2139 return unicode; 2140 } 2141 2142 PyObject * PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2143 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 2144 { 2145 if (u == NULL) { 2146 if (size > 0) { 2147 if (PyErr_WarnEx(PyExc_DeprecationWarning, 2148 "PyUnicode_FromUnicode(NULL, size) is deprecated; " 2149 "use PyUnicode_New() instead", 1) < 0) { 2150 return NULL; 2151 } 2152 } 2153 return (PyObject*)_PyUnicode_New(size); 2154 } 2155 2156 if (size < 0) { 2157 PyErr_BadInternalCall(); 2158 return NULL; 2159 } 2160 2161 return PyUnicode_FromWideChar(u, size); 2162 } 2163 2164 PyObject * PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2165 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) 2166 { 2167 PyObject *unicode; 2168 Py_UCS4 maxchar = 0; 2169 Py_ssize_t num_surrogates; 2170 2171 if (u == NULL && size != 0) { 2172 PyErr_BadInternalCall(); 2173 return NULL; 2174 } 2175 2176 if (size == -1) { 2177 size = wcslen(u); 2178 } 2179 2180 /* If the Unicode data is known at construction time, we can apply 2181 some optimizations which share commonly used objects. */ 2182 2183 /* Optimization for empty strings */ 2184 if (size == 0) 2185 _Py_RETURN_UNICODE_EMPTY(); 2186 2187 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 2188 /* Oracle Solaris uses non-Unicode internal wchar_t form for 2189 non-Unicode locales and hence needs conversion to UCS-4 first. 
*/ 2190 if (_Py_LocaleUsesNonUnicodeWchar()) { 2191 wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size); 2192 if (!converted) { 2193 return NULL; 2194 } 2195 PyObject *unicode = _PyUnicode_FromUCS4(converted, size); 2196 PyMem_Free(converted); 2197 return unicode; 2198 } 2199 #endif 2200 2201 /* Single character Unicode objects in the Latin-1 range are 2202 shared when using this constructor */ 2203 if (size == 1 && (Py_UCS4)*u < 256) 2204 return get_latin1_char((unsigned char)*u); 2205 2206 /* If not empty and not single character, copy the Unicode data 2207 into the new object */ 2208 if (find_maxchar_surrogates(u, u + size, 2209 &maxchar, &num_surrogates) == -1) 2210 return NULL; 2211 2212 unicode = PyUnicode_New(size - num_surrogates, maxchar); 2213 if (!unicode) 2214 return NULL; 2215 2216 switch (PyUnicode_KIND(unicode)) { 2217 case PyUnicode_1BYTE_KIND: 2218 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 2219 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 2220 break; 2221 case PyUnicode_2BYTE_KIND: 2222 #if Py_UNICODE_SIZE == 2 2223 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 2224 #else 2225 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 2226 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 2227 #endif 2228 break; 2229 case PyUnicode_4BYTE_KIND: 2230 #if SIZEOF_WCHAR_T == 2 2231 /* This is the only case which has to process surrogates, thus 2232 a simple copy loop is not enough and we need a function. 
*/ 2233 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 2234 #else 2235 assert(num_surrogates == 0); 2236 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 2237 #endif 2238 break; 2239 default: 2240 Py_UNREACHABLE(); 2241 } 2242 2243 return unicode_result(unicode); 2244 } 2245 2246 PyObject * PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2247 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 2248 { 2249 if (size < 0) { 2250 PyErr_SetString(PyExc_SystemError, 2251 "Negative size passed to PyUnicode_FromStringAndSize"); 2252 return NULL; 2253 } 2254 if (u != NULL) { 2255 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 2256 } 2257 else { 2258 if (size > 0) { 2259 if (PyErr_WarnEx(PyExc_DeprecationWarning, 2260 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; " 2261 "use PyUnicode_New() instead", 1) < 0) { 2262 return NULL; 2263 } 2264 } 2265 return (PyObject *)_PyUnicode_New(size); 2266 } 2267 } 2268 2269 PyObject * PyUnicode_FromString(const char * u)2270 PyUnicode_FromString(const char *u) 2271 { 2272 size_t size = strlen(u); 2273 if (size > PY_SSIZE_T_MAX) { 2274 PyErr_SetString(PyExc_OverflowError, "input too long"); 2275 return NULL; 2276 } 2277 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 2278 } 2279 2280 2281 PyObject * _PyUnicode_FromId(_Py_Identifier * id)2282 _PyUnicode_FromId(_Py_Identifier *id) 2283 { 2284 PyInterpreterState *interp = _PyInterpreterState_GET(); 2285 struct _Py_unicode_ids *ids = &interp->unicode.ids; 2286 2287 Py_ssize_t index = _Py_atomic_size_get(&id->index); 2288 if (index < 0) { 2289 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids; 2290 2291 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK); 2292 // Check again to detect concurrent access. Another thread can have 2293 // initialized the index while this thread waited for the lock. 
2294 index = _Py_atomic_size_get(&id->index); 2295 if (index < 0) { 2296 assert(rt_ids->next_index < PY_SSIZE_T_MAX); 2297 index = rt_ids->next_index; 2298 rt_ids->next_index++; 2299 _Py_atomic_size_set(&id->index, index); 2300 } 2301 PyThread_release_lock(rt_ids->lock); 2302 } 2303 assert(index >= 0); 2304 2305 PyObject *obj; 2306 if (index < ids->size) { 2307 obj = ids->array[index]; 2308 if (obj) { 2309 // Return a borrowed reference 2310 return obj; 2311 } 2312 } 2313 2314 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string), 2315 NULL, NULL); 2316 if (!obj) { 2317 return NULL; 2318 } 2319 PyUnicode_InternInPlace(&obj); 2320 2321 if (index >= ids->size) { 2322 // Overallocate to reduce the number of realloc 2323 Py_ssize_t new_size = Py_MAX(index * 2, 16); 2324 Py_ssize_t item_size = sizeof(ids->array[0]); 2325 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size); 2326 if (new_array == NULL) { 2327 PyErr_NoMemory(); 2328 return NULL; 2329 } 2330 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size); 2331 ids->array = new_array; 2332 ids->size = new_size; 2333 } 2334 2335 // The array stores a strong reference 2336 ids->array[index] = obj; 2337 2338 // Return a borrowed reference 2339 return obj; 2340 } 2341 2342 2343 static void unicode_clear_identifiers(struct _Py_unicode_state * state)2344 unicode_clear_identifiers(struct _Py_unicode_state *state) 2345 { 2346 struct _Py_unicode_ids *ids = &state->ids; 2347 for (Py_ssize_t i=0; i < ids->size; i++) { 2348 Py_XDECREF(ids->array[i]); 2349 } 2350 ids->size = 0; 2351 PyMem_Free(ids->array); 2352 ids->array = NULL; 2353 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid 2354 // after Py_Finalize(). 
2355 } 2356 2357 2358 /* Internal function, doesn't check maximum character */ 2359 2360 PyObject* _PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2361 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 2362 { 2363 const unsigned char *s = (const unsigned char *)buffer; 2364 PyObject *unicode; 2365 if (size == 1) { 2366 #ifdef Py_DEBUG 2367 assert((unsigned char)s[0] < 128); 2368 #endif 2369 return get_latin1_char(s[0]); 2370 } 2371 unicode = PyUnicode_New(size, 127); 2372 if (!unicode) 2373 return NULL; 2374 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 2375 assert(_PyUnicode_CheckConsistency(unicode, 1)); 2376 return unicode; 2377 } 2378 2379 static Py_UCS4 kind_maxchar_limit(unsigned int kind)2380 kind_maxchar_limit(unsigned int kind) 2381 { 2382 switch (kind) { 2383 case PyUnicode_1BYTE_KIND: 2384 return 0x80; 2385 case PyUnicode_2BYTE_KIND: 2386 return 0x100; 2387 case PyUnicode_4BYTE_KIND: 2388 return 0x10000; 2389 default: 2390 Py_UNREACHABLE(); 2391 } 2392 } 2393 2394 static PyObject* _PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2395 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) 2396 { 2397 PyObject *res; 2398 unsigned char max_char; 2399 2400 if (size == 0) { 2401 _Py_RETURN_UNICODE_EMPTY(); 2402 } 2403 assert(size > 0); 2404 if (size == 1) { 2405 return get_latin1_char(u[0]); 2406 } 2407 2408 max_char = ucs1lib_find_max_char(u, u + size); 2409 res = PyUnicode_New(size, max_char); 2410 if (!res) 2411 return NULL; 2412 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 2413 assert(_PyUnicode_CheckConsistency(res, 1)); 2414 return res; 2415 } 2416 2417 static PyObject* _PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2418 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 2419 { 2420 PyObject *res; 2421 Py_UCS2 max_char; 2422 2423 if (size == 0) 2424 _Py_RETURN_UNICODE_EMPTY(); 2425 assert(size > 0); 2426 if (size == 1) 2427 return unicode_char(u[0]); 2428 2429 max_char = ucs2lib_find_max_char(u, u + size); 2430 res = 
PyUnicode_New(size, max_char); 2431 if (!res) 2432 return NULL; 2433 if (max_char >= 256) 2434 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 2435 else { 2436 _PyUnicode_CONVERT_BYTES( 2437 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 2438 } 2439 assert(_PyUnicode_CheckConsistency(res, 1)); 2440 return res; 2441 } 2442 2443 static PyObject* _PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2444 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 2445 { 2446 PyObject *res; 2447 Py_UCS4 max_char; 2448 2449 if (size == 0) 2450 _Py_RETURN_UNICODE_EMPTY(); 2451 assert(size > 0); 2452 if (size == 1) 2453 return unicode_char(u[0]); 2454 2455 max_char = ucs4lib_find_max_char(u, u + size); 2456 res = PyUnicode_New(size, max_char); 2457 if (!res) 2458 return NULL; 2459 if (max_char < 256) 2460 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2461 PyUnicode_1BYTE_DATA(res)); 2462 else if (max_char < 0x10000) 2463 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2464 PyUnicode_2BYTE_DATA(res)); 2465 else 2466 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2467 assert(_PyUnicode_CheckConsistency(res, 1)); 2468 return res; 2469 } 2470 2471 PyObject* PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2472 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2473 { 2474 if (size < 0) { 2475 PyErr_SetString(PyExc_ValueError, "size must be positive"); 2476 return NULL; 2477 } 2478 switch (kind) { 2479 case PyUnicode_1BYTE_KIND: 2480 return _PyUnicode_FromUCS1(buffer, size); 2481 case PyUnicode_2BYTE_KIND: 2482 return _PyUnicode_FromUCS2(buffer, size); 2483 case PyUnicode_4BYTE_KIND: 2484 return _PyUnicode_FromUCS4(buffer, size); 2485 default: 2486 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2487 return NULL; 2488 } 2489 } 2490 2491 Py_UCS4 _PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2492 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, 
Py_ssize_t end) 2493 { 2494 enum PyUnicode_Kind kind; 2495 const void *startptr, *endptr; 2496 2497 assert(PyUnicode_IS_READY(unicode)); 2498 assert(0 <= start); 2499 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2500 assert(start <= end); 2501 2502 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2503 return PyUnicode_MAX_CHAR_VALUE(unicode); 2504 2505 if (start == end) 2506 return 127; 2507 2508 if (PyUnicode_IS_ASCII(unicode)) 2509 return 127; 2510 2511 kind = PyUnicode_KIND(unicode); 2512 startptr = PyUnicode_DATA(unicode); 2513 endptr = (char *)startptr + end * kind; 2514 startptr = (char *)startptr + start * kind; 2515 switch(kind) { 2516 case PyUnicode_1BYTE_KIND: 2517 return ucs1lib_find_max_char(startptr, endptr); 2518 case PyUnicode_2BYTE_KIND: 2519 return ucs2lib_find_max_char(startptr, endptr); 2520 case PyUnicode_4BYTE_KIND: 2521 return ucs4lib_find_max_char(startptr, endptr); 2522 default: 2523 Py_UNREACHABLE(); 2524 } 2525 } 2526 2527 /* Ensure that a string uses the most efficient storage, if it is not the 2528 case: create a new string with of the right kind. Write NULL into *p_unicode 2529 on error. 
*/ 2530 static void unicode_adjust_maxchar(PyObject ** p_unicode)2531 unicode_adjust_maxchar(PyObject **p_unicode) 2532 { 2533 PyObject *unicode, *copy; 2534 Py_UCS4 max_char; 2535 Py_ssize_t len; 2536 unsigned int kind; 2537 2538 assert(p_unicode != NULL); 2539 unicode = *p_unicode; 2540 assert(PyUnicode_IS_READY(unicode)); 2541 if (PyUnicode_IS_ASCII(unicode)) 2542 return; 2543 2544 len = PyUnicode_GET_LENGTH(unicode); 2545 kind = PyUnicode_KIND(unicode); 2546 if (kind == PyUnicode_1BYTE_KIND) { 2547 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2548 max_char = ucs1lib_find_max_char(u, u + len); 2549 if (max_char >= 128) 2550 return; 2551 } 2552 else if (kind == PyUnicode_2BYTE_KIND) { 2553 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2554 max_char = ucs2lib_find_max_char(u, u + len); 2555 if (max_char >= 256) 2556 return; 2557 } 2558 else if (kind == PyUnicode_4BYTE_KIND) { 2559 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2560 max_char = ucs4lib_find_max_char(u, u + len); 2561 if (max_char >= 0x10000) 2562 return; 2563 } 2564 else 2565 Py_UNREACHABLE(); 2566 2567 copy = PyUnicode_New(len, max_char); 2568 if (copy != NULL) 2569 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2570 Py_DECREF(unicode); 2571 *p_unicode = copy; 2572 } 2573 2574 PyObject* _PyUnicode_Copy(PyObject * unicode)2575 _PyUnicode_Copy(PyObject *unicode) 2576 { 2577 Py_ssize_t length; 2578 PyObject *copy; 2579 2580 if (!PyUnicode_Check(unicode)) { 2581 PyErr_BadInternalCall(); 2582 return NULL; 2583 } 2584 if (PyUnicode_READY(unicode) == -1) 2585 return NULL; 2586 2587 length = PyUnicode_GET_LENGTH(unicode); 2588 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2589 if (!copy) 2590 return NULL; 2591 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2592 2593 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2594 length * PyUnicode_KIND(unicode)); 2595 assert(_PyUnicode_CheckConsistency(copy, 1)); 2596 return copy; 2597 } 2598 2599 2600 /* Widen 
Unicode objects to larger buffers. Don't write terminating null 2601 character. Return NULL on error. */ 2602 2603 static void* unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2604 unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind) 2605 { 2606 void *result; 2607 2608 assert(skind < kind); 2609 switch (kind) { 2610 case PyUnicode_2BYTE_KIND: 2611 result = PyMem_New(Py_UCS2, len); 2612 if (!result) 2613 return PyErr_NoMemory(); 2614 assert(skind == PyUnicode_1BYTE_KIND); 2615 _PyUnicode_CONVERT_BYTES( 2616 Py_UCS1, Py_UCS2, 2617 (const Py_UCS1 *)data, 2618 ((const Py_UCS1 *)data) + len, 2619 result); 2620 return result; 2621 case PyUnicode_4BYTE_KIND: 2622 result = PyMem_New(Py_UCS4, len); 2623 if (!result) 2624 return PyErr_NoMemory(); 2625 if (skind == PyUnicode_2BYTE_KIND) { 2626 _PyUnicode_CONVERT_BYTES( 2627 Py_UCS2, Py_UCS4, 2628 (const Py_UCS2 *)data, 2629 ((const Py_UCS2 *)data) + len, 2630 result); 2631 } 2632 else { 2633 assert(skind == PyUnicode_1BYTE_KIND); 2634 _PyUnicode_CONVERT_BYTES( 2635 Py_UCS1, Py_UCS4, 2636 (const Py_UCS1 *)data, 2637 ((const Py_UCS1 *)data) + len, 2638 result); 2639 } 2640 return result; 2641 default: 2642 Py_UNREACHABLE(); 2643 return NULL; 2644 } 2645 } 2646 2647 static Py_UCS4* as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2648 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2649 int copy_null) 2650 { 2651 int kind; 2652 const void *data; 2653 Py_ssize_t len, targetlen; 2654 if (PyUnicode_READY(string) == -1) 2655 return NULL; 2656 kind = PyUnicode_KIND(string); 2657 data = PyUnicode_DATA(string); 2658 len = PyUnicode_GET_LENGTH(string); 2659 targetlen = len; 2660 if (copy_null) 2661 targetlen++; 2662 if (!target) { 2663 target = PyMem_New(Py_UCS4, targetlen); 2664 if (!target) { 2665 PyErr_NoMemory(); 2666 return NULL; 2667 } 2668 } 2669 else { 2670 if (targetsize < targetlen) { 2671 
PyErr_Format(PyExc_SystemError, 2672 "string is longer than the buffer"); 2673 if (copy_null && 0 < targetsize) 2674 target[0] = 0; 2675 return NULL; 2676 } 2677 } 2678 if (kind == PyUnicode_1BYTE_KIND) { 2679 const Py_UCS1 *start = (const Py_UCS1 *) data; 2680 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2681 } 2682 else if (kind == PyUnicode_2BYTE_KIND) { 2683 const Py_UCS2 *start = (const Py_UCS2 *) data; 2684 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2685 } 2686 else if (kind == PyUnicode_4BYTE_KIND) { 2687 memcpy(target, data, len * sizeof(Py_UCS4)); 2688 } 2689 else { 2690 Py_UNREACHABLE(); 2691 } 2692 if (copy_null) 2693 target[len] = 0; 2694 return target; 2695 } 2696 2697 Py_UCS4* PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2698 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2699 int copy_null) 2700 { 2701 if (target == NULL || targetsize < 0) { 2702 PyErr_BadInternalCall(); 2703 return NULL; 2704 } 2705 return as_ucs4(string, target, targetsize, copy_null); 2706 } 2707 2708 Py_UCS4* PyUnicode_AsUCS4Copy(PyObject * string)2709 PyUnicode_AsUCS4Copy(PyObject *string) 2710 { 2711 return as_ucs4(string, NULL, 0, 1); 2712 } 2713 2714 /* maximum number of characters required for output of %lld or %p. 2715 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2716 plus 1 for the sign. 53/22 is an upper bound for log10(256). 
*/ 2717 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2718 2719 static int unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2720 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, 2721 Py_ssize_t width, Py_ssize_t precision) 2722 { 2723 Py_ssize_t length, fill, arglen; 2724 Py_UCS4 maxchar; 2725 2726 if (PyUnicode_READY(str) == -1) 2727 return -1; 2728 2729 length = PyUnicode_GET_LENGTH(str); 2730 if ((precision == -1 || precision >= length) 2731 && width <= length) 2732 return _PyUnicodeWriter_WriteStr(writer, str); 2733 2734 if (precision != -1) 2735 length = Py_MIN(precision, length); 2736 2737 arglen = Py_MAX(length, width); 2738 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 2739 maxchar = _PyUnicode_FindMaxChar(str, 0, length); 2740 else 2741 maxchar = writer->maxchar; 2742 2743 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) 2744 return -1; 2745 2746 if (width > length) { 2747 fill = width - length; 2748 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) 2749 return -1; 2750 writer->pos += fill; 2751 } 2752 2753 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 2754 str, 0, length); 2755 writer->pos += length; 2756 return 0; 2757 } 2758 2759 static int unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2760 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, 2761 Py_ssize_t width, Py_ssize_t precision) 2762 { 2763 /* UTF-8 */ 2764 Py_ssize_t length; 2765 PyObject *unicode; 2766 int res; 2767 2768 if (precision == -1) { 2769 length = strlen(str); 2770 } 2771 else { 2772 length = 0; 2773 while (length < precision && str[length]) { 2774 length++; 2775 } 2776 } 2777 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); 2778 if (unicode == NULL) 2779 return -1; 2780 2781 res = unicode_fromformat_write_str(writer, unicode, width, 
-1); 2782 Py_DECREF(unicode); 2783 return res; 2784 } 2785 2786 static const char* unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2787 unicode_fromformat_arg(_PyUnicodeWriter *writer, 2788 const char *f, va_list *vargs) 2789 { 2790 const char *p; 2791 Py_ssize_t len; 2792 int zeropad; 2793 Py_ssize_t width; 2794 Py_ssize_t precision; 2795 int longflag; 2796 int longlongflag; 2797 int size_tflag; 2798 Py_ssize_t fill; 2799 2800 p = f; 2801 f++; 2802 zeropad = 0; 2803 if (*f == '0') { 2804 zeropad = 1; 2805 f++; 2806 } 2807 2808 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2809 width = -1; 2810 if (Py_ISDIGIT((unsigned)*f)) { 2811 width = *f - '0'; 2812 f++; 2813 while (Py_ISDIGIT((unsigned)*f)) { 2814 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2815 PyErr_SetString(PyExc_ValueError, 2816 "width too big"); 2817 return NULL; 2818 } 2819 width = (width * 10) + (*f - '0'); 2820 f++; 2821 } 2822 } 2823 precision = -1; 2824 if (*f == '.') { 2825 f++; 2826 if (Py_ISDIGIT((unsigned)*f)) { 2827 precision = (*f - '0'); 2828 f++; 2829 while (Py_ISDIGIT((unsigned)*f)) { 2830 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2831 PyErr_SetString(PyExc_ValueError, 2832 "precision too big"); 2833 return NULL; 2834 } 2835 precision = (precision * 10) + (*f - '0'); 2836 f++; 2837 } 2838 } 2839 if (*f == '%') { 2840 /* "%.3%s" => f points to "3" */ 2841 f--; 2842 } 2843 } 2844 if (*f == '\0') { 2845 /* bogus format "%.123" => go backward, f points to "3" */ 2846 f--; 2847 } 2848 2849 /* Handle %ld, %lu, %lld and %llu. */ 2850 longflag = 0; 2851 longlongflag = 0; 2852 size_tflag = 0; 2853 if (*f == 'l') { 2854 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2855 longflag = 1; 2856 ++f; 2857 } 2858 else if (f[1] == 'l' && 2859 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2860 longlongflag = 1; 2861 f += 2; 2862 } 2863 } 2864 /* handle the size_t flag. 
*/ 2865 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2866 size_tflag = 1; 2867 ++f; 2868 } 2869 2870 if (f[1] == '\0') 2871 writer->overallocate = 0; 2872 2873 switch (*f) { 2874 case 'c': 2875 { 2876 int ordinal = va_arg(*vargs, int); 2877 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2878 PyErr_SetString(PyExc_OverflowError, 2879 "character argument not in range(0x110000)"); 2880 return NULL; 2881 } 2882 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) 2883 return NULL; 2884 break; 2885 } 2886 2887 case 'i': 2888 case 'd': 2889 case 'u': 2890 case 'x': 2891 { 2892 /* used by sprintf */ 2893 char buffer[MAX_LONG_LONG_CHARS]; 2894 Py_ssize_t arglen; 2895 2896 if (*f == 'u') { 2897 if (longflag) { 2898 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long)); 2899 } 2900 else if (longlongflag) { 2901 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long)); 2902 } 2903 else if (size_tflag) { 2904 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t)); 2905 } 2906 else { 2907 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int)); 2908 } 2909 } 2910 else if (*f == 'x') { 2911 len = sprintf(buffer, "%x", va_arg(*vargs, int)); 2912 } 2913 else { 2914 if (longflag) { 2915 len = sprintf(buffer, "%li", va_arg(*vargs, long)); 2916 } 2917 else if (longlongflag) { 2918 len = sprintf(buffer, "%lli", va_arg(*vargs, long long)); 2919 } 2920 else if (size_tflag) { 2921 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t)); 2922 } 2923 else { 2924 len = sprintf(buffer, "%i", va_arg(*vargs, int)); 2925 } 2926 } 2927 assert(len >= 0); 2928 2929 if (precision < len) 2930 precision = len; 2931 2932 arglen = Py_MAX(precision, width); 2933 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) 2934 return NULL; 2935 2936 if (width > precision) { 2937 Py_UCS4 fillchar; 2938 fill = width - precision; 2939 fillchar = zeropad?'0':' '; 2940 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) 2941 return NULL; 2942 
writer->pos += fill; 2943 } 2944 if (precision > len) { 2945 fill = precision - len; 2946 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) 2947 return NULL; 2948 writer->pos += fill; 2949 } 2950 2951 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) 2952 return NULL; 2953 break; 2954 } 2955 2956 case 'p': 2957 { 2958 char number[MAX_LONG_LONG_CHARS]; 2959 2960 len = sprintf(number, "%p", va_arg(*vargs, void*)); 2961 assert(len >= 0); 2962 2963 /* %p is ill-defined: ensure leading 0x. */ 2964 if (number[1] == 'X') 2965 number[1] = 'x'; 2966 else if (number[1] != 'x') { 2967 memmove(number + 2, number, 2968 strlen(number) + 1); 2969 number[0] = '0'; 2970 number[1] = 'x'; 2971 len += 2; 2972 } 2973 2974 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) 2975 return NULL; 2976 break; 2977 } 2978 2979 case 's': 2980 { 2981 /* UTF-8 */ 2982 const char *s = va_arg(*vargs, const char*); 2983 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) 2984 return NULL; 2985 break; 2986 } 2987 2988 case 'U': 2989 { 2990 PyObject *obj = va_arg(*vargs, PyObject *); 2991 assert(obj && _PyUnicode_CHECK(obj)); 2992 2993 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2994 return NULL; 2995 break; 2996 } 2997 2998 case 'V': 2999 { 3000 PyObject *obj = va_arg(*vargs, PyObject *); 3001 const char *str = va_arg(*vargs, const char *); 3002 if (obj) { 3003 assert(_PyUnicode_CHECK(obj)); 3004 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 3005 return NULL; 3006 } 3007 else { 3008 assert(str != NULL); 3009 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) 3010 return NULL; 3011 } 3012 break; 3013 } 3014 3015 case 'S': 3016 { 3017 PyObject *obj = va_arg(*vargs, PyObject *); 3018 PyObject *str; 3019 assert(obj); 3020 str = PyObject_Str(obj); 3021 if (!str) 3022 return NULL; 3023 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { 3024 Py_DECREF(str); 
3025 return NULL; 3026 } 3027 Py_DECREF(str); 3028 break; 3029 } 3030 3031 case 'R': 3032 { 3033 PyObject *obj = va_arg(*vargs, PyObject *); 3034 PyObject *repr; 3035 assert(obj); 3036 repr = PyObject_Repr(obj); 3037 if (!repr) 3038 return NULL; 3039 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { 3040 Py_DECREF(repr); 3041 return NULL; 3042 } 3043 Py_DECREF(repr); 3044 break; 3045 } 3046 3047 case 'A': 3048 { 3049 PyObject *obj = va_arg(*vargs, PyObject *); 3050 PyObject *ascii; 3051 assert(obj); 3052 ascii = PyObject_ASCII(obj); 3053 if (!ascii) 3054 return NULL; 3055 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { 3056 Py_DECREF(ascii); 3057 return NULL; 3058 } 3059 Py_DECREF(ascii); 3060 break; 3061 } 3062 3063 case '%': 3064 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 3065 return NULL; 3066 break; 3067 3068 default: 3069 /* if we stumble upon an unknown formatting code, copy the rest 3070 of the format string to the output string. (we cannot just 3071 skip the code, since there's no way to know what's in the 3072 argument list) */ 3073 len = strlen(p); 3074 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) 3075 return NULL; 3076 f = p+len; 3077 return f; 3078 } 3079 3080 f++; 3081 return f; 3082 } 3083 3084 PyObject * PyUnicode_FromFormatV(const char * format,va_list vargs)3085 PyUnicode_FromFormatV(const char *format, va_list vargs) 3086 { 3087 va_list vargs2; 3088 const char *f; 3089 _PyUnicodeWriter writer; 3090 3091 _PyUnicodeWriter_Init(&writer); 3092 writer.min_length = strlen(format) + 100; 3093 writer.overallocate = 1; 3094 3095 // Copy varags to be able to pass a reference to a subfunction. 
3096 va_copy(vargs2, vargs); 3097 3098 for (f = format; *f; ) { 3099 if (*f == '%') { 3100 f = unicode_fromformat_arg(&writer, f, &vargs2); 3101 if (f == NULL) 3102 goto fail; 3103 } 3104 else { 3105 const char *p; 3106 Py_ssize_t len; 3107 3108 p = f; 3109 do 3110 { 3111 if ((unsigned char)*p > 127) { 3112 PyErr_Format(PyExc_ValueError, 3113 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 3114 "string, got a non-ASCII byte: 0x%02x", 3115 (unsigned char)*p); 3116 goto fail; 3117 } 3118 p++; 3119 } 3120 while (*p != '\0' && *p != '%'); 3121 len = p - f; 3122 3123 if (*p == '\0') 3124 writer.overallocate = 0; 3125 3126 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) 3127 goto fail; 3128 3129 f = p; 3130 } 3131 } 3132 va_end(vargs2); 3133 return _PyUnicodeWriter_Finish(&writer); 3134 3135 fail: 3136 va_end(vargs2); 3137 _PyUnicodeWriter_Dealloc(&writer); 3138 return NULL; 3139 } 3140 3141 PyObject * PyUnicode_FromFormat(const char * format,...)3142 PyUnicode_FromFormat(const char *format, ...) 
3143 { 3144 PyObject* ret; 3145 va_list vargs; 3146 3147 #ifdef HAVE_STDARG_PROTOTYPES 3148 va_start(vargs, format); 3149 #else 3150 va_start(vargs); 3151 #endif 3152 ret = PyUnicode_FromFormatV(format, vargs); 3153 va_end(vargs); 3154 return ret; 3155 } 3156 3157 static Py_ssize_t unicode_get_widechar_size(PyObject * unicode)3158 unicode_get_widechar_size(PyObject *unicode) 3159 { 3160 Py_ssize_t res; 3161 3162 assert(unicode != NULL); 3163 assert(_PyUnicode_CHECK(unicode)); 3164 3165 #if USE_UNICODE_WCHAR_CACHE 3166 if (_PyUnicode_WSTR(unicode) != NULL) { 3167 return PyUnicode_WSTR_LENGTH(unicode); 3168 } 3169 #endif /* USE_UNICODE_WCHAR_CACHE */ 3170 assert(PyUnicode_IS_READY(unicode)); 3171 3172 res = _PyUnicode_LENGTH(unicode); 3173 #if SIZEOF_WCHAR_T == 2 3174 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3175 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode); 3176 const Py_UCS4 *end = s + res; 3177 for (; s < end; ++s) { 3178 if (*s > 0xFFFF) { 3179 ++res; 3180 } 3181 } 3182 } 3183 #endif 3184 return res; 3185 } 3186 3187 static void unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3188 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size) 3189 { 3190 assert(unicode != NULL); 3191 assert(_PyUnicode_CHECK(unicode)); 3192 3193 #if USE_UNICODE_WCHAR_CACHE 3194 const wchar_t *wstr = _PyUnicode_WSTR(unicode); 3195 if (wstr != NULL) { 3196 memcpy(w, wstr, size * sizeof(wchar_t)); 3197 return; 3198 } 3199 #else /* USE_UNICODE_WCHAR_CACHE */ 3200 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) { 3201 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t)); 3202 return; 3203 } 3204 #endif /* USE_UNICODE_WCHAR_CACHE */ 3205 assert(PyUnicode_IS_READY(unicode)); 3206 3207 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3208 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode); 3209 for (; size--; ++s, ++w) { 3210 *w = *s; 3211 } 3212 } 3213 else { 3214 #if SIZEOF_WCHAR_T == 4 3215 assert(PyUnicode_KIND(unicode) == 
PyUnicode_2BYTE_KIND); 3216 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode); 3217 for (; size--; ++s, ++w) { 3218 *w = *s; 3219 } 3220 #else 3221 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 3222 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode); 3223 for (; size--; ++s, ++w) { 3224 Py_UCS4 ch = *s; 3225 if (ch > 0xFFFF) { 3226 assert(ch <= MAX_UNICODE); 3227 /* encode surrogate pair in this case */ 3228 *w++ = Py_UNICODE_HIGH_SURROGATE(ch); 3229 if (!size--) 3230 break; 3231 *w = Py_UNICODE_LOW_SURROGATE(ch); 3232 } 3233 else { 3234 *w = ch; 3235 } 3236 } 3237 #endif 3238 } 3239 } 3240 3241 #ifdef HAVE_WCHAR_H 3242 3243 /* Convert a Unicode object to a wide character string. 3244 3245 - If w is NULL: return the number of wide characters (including the null 3246 character) required to convert the unicode object. Ignore size argument. 3247 3248 - Otherwise: return the number of wide characters (excluding the null 3249 character) written into w. Write at most size wide characters (including 3250 the null character). */ 3251 Py_ssize_t PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3252 PyUnicode_AsWideChar(PyObject *unicode, 3253 wchar_t *w, 3254 Py_ssize_t size) 3255 { 3256 Py_ssize_t res; 3257 3258 if (unicode == NULL) { 3259 PyErr_BadInternalCall(); 3260 return -1; 3261 } 3262 if (!PyUnicode_Check(unicode)) { 3263 PyErr_BadArgument(); 3264 return -1; 3265 } 3266 3267 res = unicode_get_widechar_size(unicode); 3268 if (w == NULL) { 3269 return res + 1; 3270 } 3271 3272 if (size > res) { 3273 size = res + 1; 3274 } 3275 else { 3276 res = size; 3277 } 3278 unicode_copy_as_widechar(unicode, w, size); 3279 3280 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 3281 /* Oracle Solaris uses non-Unicode internal wchar_t form for 3282 non-Unicode locales and hence needs conversion first. 
*/ 3283 if (_Py_LocaleUsesNonUnicodeWchar()) { 3284 if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) { 3285 return -1; 3286 } 3287 } 3288 #endif 3289 3290 return res; 3291 } 3292 3293 wchar_t* PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3294 PyUnicode_AsWideCharString(PyObject *unicode, 3295 Py_ssize_t *size) 3296 { 3297 wchar_t *buffer; 3298 Py_ssize_t buflen; 3299 3300 if (unicode == NULL) { 3301 PyErr_BadInternalCall(); 3302 return NULL; 3303 } 3304 if (!PyUnicode_Check(unicode)) { 3305 PyErr_BadArgument(); 3306 return NULL; 3307 } 3308 3309 buflen = unicode_get_widechar_size(unicode); 3310 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1)); 3311 if (buffer == NULL) { 3312 PyErr_NoMemory(); 3313 return NULL; 3314 } 3315 unicode_copy_as_widechar(unicode, buffer, buflen + 1); 3316 3317 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 3318 /* Oracle Solaris uses non-Unicode internal wchar_t form for 3319 non-Unicode locales and hence needs conversion first. */ 3320 if (_Py_LocaleUsesNonUnicodeWchar()) { 3321 if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) { 3322 return NULL; 3323 } 3324 } 3325 #endif 3326 3327 if (size != NULL) { 3328 *size = buflen; 3329 } 3330 else if (wcslen(buffer) != (size_t)buflen) { 3331 PyMem_Free(buffer); 3332 PyErr_SetString(PyExc_ValueError, 3333 "embedded null character"); 3334 return NULL; 3335 } 3336 return buffer; 3337 } 3338 3339 #endif /* HAVE_WCHAR_H */ 3340 3341 int _PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3342 _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr) 3343 { 3344 wchar_t **p = (wchar_t **)ptr; 3345 if (obj == NULL) { 3346 #if !USE_UNICODE_WCHAR_CACHE 3347 PyMem_Free(*p); 3348 #endif /* USE_UNICODE_WCHAR_CACHE */ 3349 *p = NULL; 3350 return 1; 3351 } 3352 if (PyUnicode_Check(obj)) { 3353 #if USE_UNICODE_WCHAR_CACHE 3354 *p = (wchar_t *)_PyUnicode_AsUnicode(obj); 3355 if (*p == NULL) { 3356 return 0; 3357 } 3358 return 1; 3359 #else /* 
USE_UNICODE_WCHAR_CACHE */ 3360 *p = PyUnicode_AsWideCharString(obj, NULL); 3361 if (*p == NULL) { 3362 return 0; 3363 } 3364 return Py_CLEANUP_SUPPORTED; 3365 #endif /* USE_UNICODE_WCHAR_CACHE */ 3366 } 3367 PyErr_Format(PyExc_TypeError, 3368 "argument must be str, not %.50s", 3369 Py_TYPE(obj)->tp_name); 3370 return 0; 3371 } 3372 3373 int _PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3374 _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr) 3375 { 3376 wchar_t **p = (wchar_t **)ptr; 3377 if (obj == NULL) { 3378 #if !USE_UNICODE_WCHAR_CACHE 3379 PyMem_Free(*p); 3380 #endif /* USE_UNICODE_WCHAR_CACHE */ 3381 *p = NULL; 3382 return 1; 3383 } 3384 if (obj == Py_None) { 3385 *p = NULL; 3386 return 1; 3387 } 3388 if (PyUnicode_Check(obj)) { 3389 #if USE_UNICODE_WCHAR_CACHE 3390 *p = (wchar_t *)_PyUnicode_AsUnicode(obj); 3391 if (*p == NULL) { 3392 return 0; 3393 } 3394 return 1; 3395 #else /* USE_UNICODE_WCHAR_CACHE */ 3396 *p = PyUnicode_AsWideCharString(obj, NULL); 3397 if (*p == NULL) { 3398 return 0; 3399 } 3400 return Py_CLEANUP_SUPPORTED; 3401 #endif /* USE_UNICODE_WCHAR_CACHE */ 3402 } 3403 PyErr_Format(PyExc_TypeError, 3404 "argument must be str or None, not %.50s", 3405 Py_TYPE(obj)->tp_name); 3406 return 0; 3407 } 3408 3409 PyObject * PyUnicode_FromOrdinal(int ordinal)3410 PyUnicode_FromOrdinal(int ordinal) 3411 { 3412 if (ordinal < 0 || ordinal > MAX_UNICODE) { 3413 PyErr_SetString(PyExc_ValueError, 3414 "chr() arg not in range(0x110000)"); 3415 return NULL; 3416 } 3417 3418 return unicode_char((Py_UCS4)ordinal); 3419 } 3420 3421 PyObject * PyUnicode_FromObject(PyObject * obj)3422 PyUnicode_FromObject(PyObject *obj) 3423 { 3424 /* XXX Perhaps we should make this API an alias of 3425 PyObject_Str() instead ?! 
*/ 3426 if (PyUnicode_CheckExact(obj)) { 3427 if (PyUnicode_READY(obj) == -1) 3428 return NULL; 3429 Py_INCREF(obj); 3430 return obj; 3431 } 3432 if (PyUnicode_Check(obj)) { 3433 /* For a Unicode subtype that's not a Unicode object, 3434 return a true Unicode object with the same data. */ 3435 return _PyUnicode_Copy(obj); 3436 } 3437 PyErr_Format(PyExc_TypeError, 3438 "Can't convert '%.100s' object to str implicitly", 3439 Py_TYPE(obj)->tp_name); 3440 return NULL; 3441 } 3442 3443 PyObject * PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3444 PyUnicode_FromEncodedObject(PyObject *obj, 3445 const char *encoding, 3446 const char *errors) 3447 { 3448 Py_buffer buffer; 3449 PyObject *v; 3450 3451 if (obj == NULL) { 3452 PyErr_BadInternalCall(); 3453 return NULL; 3454 } 3455 3456 /* Decoding bytes objects is the most common case and should be fast */ 3457 if (PyBytes_Check(obj)) { 3458 if (PyBytes_GET_SIZE(obj) == 0) { 3459 if (unicode_check_encoding_errors(encoding, errors) < 0) { 3460 return NULL; 3461 } 3462 _Py_RETURN_UNICODE_EMPTY(); 3463 } 3464 return PyUnicode_Decode( 3465 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 3466 encoding, errors); 3467 } 3468 3469 if (PyUnicode_Check(obj)) { 3470 PyErr_SetString(PyExc_TypeError, 3471 "decoding str is not supported"); 3472 return NULL; 3473 } 3474 3475 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 3476 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 3477 PyErr_Format(PyExc_TypeError, 3478 "decoding to str: need a bytes-like object, %.80s found", 3479 Py_TYPE(obj)->tp_name); 3480 return NULL; 3481 } 3482 3483 if (buffer.len == 0) { 3484 PyBuffer_Release(&buffer); 3485 if (unicode_check_encoding_errors(encoding, errors) < 0) { 3486 return NULL; 3487 } 3488 _Py_RETURN_UNICODE_EMPTY(); 3489 } 3490 3491 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 3492 PyBuffer_Release(&buffer); 3493 return v; 3494 } 3495 3496 /* Normalize 
an encoding name: similar to encodings.normalize_encoding(), but 3497 also convert to lowercase. Return 1 on success, or 0 on error (encoding is 3498 longer than lower_len-1). */ 3499 int _Py_normalize_encoding(const char * encoding,char * lower,size_t lower_len)3500 _Py_normalize_encoding(const char *encoding, 3501 char *lower, 3502 size_t lower_len) 3503 { 3504 const char *e; 3505 char *l; 3506 char *l_end; 3507 int punct; 3508 3509 assert(encoding != NULL); 3510 3511 e = encoding; 3512 l = lower; 3513 l_end = &lower[lower_len - 1]; 3514 punct = 0; 3515 while (1) { 3516 char c = *e; 3517 if (c == 0) { 3518 break; 3519 } 3520 3521 if (Py_ISALNUM(c) || c == '.') { 3522 if (punct && l != lower) { 3523 if (l == l_end) { 3524 return 0; 3525 } 3526 *l++ = '_'; 3527 } 3528 punct = 0; 3529 3530 if (l == l_end) { 3531 return 0; 3532 } 3533 *l++ = Py_TOLOWER(c); 3534 } 3535 else { 3536 punct = 1; 3537 } 3538 3539 e++; 3540 } 3541 *l = '\0'; 3542 return 1; 3543 } 3544 3545 PyObject * PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3546 PyUnicode_Decode(const char *s, 3547 Py_ssize_t size, 3548 const char *encoding, 3549 const char *errors) 3550 { 3551 PyObject *buffer = NULL, *unicode; 3552 Py_buffer info; 3553 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */ 3554 3555 if (unicode_check_encoding_errors(encoding, errors) < 0) { 3556 return NULL; 3557 } 3558 3559 if (size == 0) { 3560 _Py_RETURN_UNICODE_EMPTY(); 3561 } 3562 3563 if (encoding == NULL) { 3564 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3565 } 3566 3567 /* Shortcuts for common default encodings */ 3568 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { 3569 char *lower = buflower; 3570 3571 /* Fast paths */ 3572 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') { 3573 lower += 3; 3574 if (*lower == '_') { 3575 /* Match "utf8" and "utf_8" */ 3576 lower++; 3577 } 3578 3579 if (lower[0] == '8' && lower[1] == 
0) { 3580 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3581 } 3582 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) { 3583 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3584 } 3585 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) { 3586 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3587 } 3588 } 3589 else { 3590 if (strcmp(lower, "ascii") == 0 3591 || strcmp(lower, "us_ascii") == 0) { 3592 return PyUnicode_DecodeASCII(s, size, errors); 3593 } 3594 #ifdef MS_WINDOWS 3595 else if (strcmp(lower, "mbcs") == 0) { 3596 return PyUnicode_DecodeMBCS(s, size, errors); 3597 } 3598 #endif 3599 else if (strcmp(lower, "latin1") == 0 3600 || strcmp(lower, "latin_1") == 0 3601 || strcmp(lower, "iso_8859_1") == 0 3602 || strcmp(lower, "iso8859_1") == 0) { 3603 return PyUnicode_DecodeLatin1(s, size, errors); 3604 } 3605 } 3606 } 3607 3608 /* Decode via the codec registry */ 3609 buffer = NULL; 3610 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3611 goto onError; 3612 buffer = PyMemoryView_FromBuffer(&info); 3613 if (buffer == NULL) 3614 goto onError; 3615 unicode = _PyCodec_DecodeText(buffer, encoding, errors); 3616 if (unicode == NULL) 3617 goto onError; 3618 if (!PyUnicode_Check(unicode)) { 3619 PyErr_Format(PyExc_TypeError, 3620 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3621 "use codecs.decode() to decode to arbitrary types", 3622 encoding, 3623 Py_TYPE(unicode)->tp_name); 3624 Py_DECREF(unicode); 3625 goto onError; 3626 } 3627 Py_DECREF(buffer); 3628 return unicode_result(unicode); 3629 3630 onError: 3631 Py_XDECREF(buffer); 3632 return NULL; 3633 } 3634 3635 PyObject * PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3636 PyUnicode_AsDecodedObject(PyObject *unicode, 3637 const char *encoding, 3638 const char *errors) 3639 { 3640 if (!PyUnicode_Check(unicode)) { 3641 PyErr_BadArgument(); 3642 return NULL; 3643 } 3644 3645 if 
(PyErr_WarnEx(PyExc_DeprecationWarning, 3646 "PyUnicode_AsDecodedObject() is deprecated; " 3647 "use PyCodec_Decode() to decode from str", 1) < 0) 3648 return NULL; 3649 3650 if (encoding == NULL) 3651 encoding = PyUnicode_GetDefaultEncoding(); 3652 3653 /* Decode via the codec registry */ 3654 return PyCodec_Decode(unicode, encoding, errors); 3655 } 3656 3657 PyObject * PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3658 PyUnicode_AsDecodedUnicode(PyObject *unicode, 3659 const char *encoding, 3660 const char *errors) 3661 { 3662 PyObject *v; 3663 3664 if (!PyUnicode_Check(unicode)) { 3665 PyErr_BadArgument(); 3666 goto onError; 3667 } 3668 3669 if (PyErr_WarnEx(PyExc_DeprecationWarning, 3670 "PyUnicode_AsDecodedUnicode() is deprecated; " 3671 "use PyCodec_Decode() to decode from str to str", 1) < 0) 3672 return NULL; 3673 3674 if (encoding == NULL) 3675 encoding = PyUnicode_GetDefaultEncoding(); 3676 3677 /* Decode via the codec registry */ 3678 v = PyCodec_Decode(unicode, encoding, errors); 3679 if (v == NULL) 3680 goto onError; 3681 if (!PyUnicode_Check(v)) { 3682 PyErr_Format(PyExc_TypeError, 3683 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3684 "use codecs.decode() to decode to arbitrary types", 3685 encoding, 3686 Py_TYPE(unicode)->tp_name); 3687 Py_DECREF(v); 3688 goto onError; 3689 } 3690 return unicode_result(v); 3691 3692 onError: 3693 return NULL; 3694 } 3695 3696 PyObject * PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3697 PyUnicode_AsEncodedObject(PyObject *unicode, 3698 const char *encoding, 3699 const char *errors) 3700 { 3701 PyObject *v; 3702 3703 if (!PyUnicode_Check(unicode)) { 3704 PyErr_BadArgument(); 3705 goto onError; 3706 } 3707 3708 if (PyErr_WarnEx(PyExc_DeprecationWarning, 3709 "PyUnicode_AsEncodedObject() is deprecated; " 3710 "use PyUnicode_AsEncodedString() to encode from str to bytes " 3711 "or PyCodec_Encode() for generic encoding", 1) < 0) 
3712 return NULL; 3713 3714 if (encoding == NULL) 3715 encoding = PyUnicode_GetDefaultEncoding(); 3716 3717 /* Encode via the codec registry */ 3718 v = PyCodec_Encode(unicode, encoding, errors); 3719 if (v == NULL) 3720 goto onError; 3721 return v; 3722 3723 onError: 3724 return NULL; 3725 } 3726 3727 3728 static PyObject * unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3729 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler, 3730 int current_locale) 3731 { 3732 Py_ssize_t wlen; 3733 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3734 if (wstr == NULL) { 3735 return NULL; 3736 } 3737 3738 if ((size_t)wlen != wcslen(wstr)) { 3739 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3740 PyMem_Free(wstr); 3741 return NULL; 3742 } 3743 3744 char *str; 3745 size_t error_pos; 3746 const char *reason; 3747 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason, 3748 current_locale, error_handler); 3749 PyMem_Free(wstr); 3750 3751 if (res != 0) { 3752 if (res == -2) { 3753 PyObject *exc; 3754 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns", 3755 "locale", unicode, 3756 (Py_ssize_t)error_pos, 3757 (Py_ssize_t)(error_pos+1), 3758 reason); 3759 if (exc != NULL) { 3760 PyCodec_StrictErrors(exc); 3761 Py_DECREF(exc); 3762 } 3763 } 3764 else if (res == -3) { 3765 PyErr_SetString(PyExc_ValueError, "unsupported error handler"); 3766 } 3767 else { 3768 PyErr_NoMemory(); 3769 } 3770 return NULL; 3771 } 3772 3773 PyObject *bytes = PyBytes_FromString(str); 3774 PyMem_RawFree(str); 3775 return bytes; 3776 } 3777 3778 PyObject * PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3779 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3780 { 3781 _Py_error_handler error_handler = _Py_GetErrorHandler(errors); 3782 return unicode_encode_locale(unicode, error_handler, 1); 3783 } 3784 3785 PyObject * PyUnicode_EncodeFSDefault(PyObject * unicode)3786 
PyUnicode_EncodeFSDefault(PyObject *unicode) 3787 { 3788 PyInterpreterState *interp = _PyInterpreterState_GET(); 3789 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec; 3790 if (fs_codec->utf8) { 3791 return unicode_encode_utf8(unicode, 3792 fs_codec->error_handler, 3793 fs_codec->errors); 3794 } 3795 #ifndef _Py_FORCE_UTF8_FS_ENCODING 3796 else if (fs_codec->encoding) { 3797 return PyUnicode_AsEncodedString(unicode, 3798 fs_codec->encoding, 3799 fs_codec->errors); 3800 } 3801 #endif 3802 else { 3803 /* Before _PyUnicode_InitEncodings() is called, the Python codec 3804 machinery is not ready and so cannot be used: 3805 use wcstombs() in this case. */ 3806 const PyConfig *config = _PyInterpreterState_GetConfig(interp); 3807 const wchar_t *filesystem_errors = config->filesystem_errors; 3808 assert(filesystem_errors != NULL); 3809 _Py_error_handler errors = get_error_handler_wide(filesystem_errors); 3810 assert(errors != _Py_ERROR_UNKNOWN); 3811 #ifdef _Py_FORCE_UTF8_FS_ENCODING 3812 return unicode_encode_utf8(unicode, errors, NULL); 3813 #else 3814 return unicode_encode_locale(unicode, errors, 0); 3815 #endif 3816 } 3817 } 3818 3819 PyObject * PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3820 PyUnicode_AsEncodedString(PyObject *unicode, 3821 const char *encoding, 3822 const char *errors) 3823 { 3824 PyObject *v; 3825 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */ 3826 3827 if (!PyUnicode_Check(unicode)) { 3828 PyErr_BadArgument(); 3829 return NULL; 3830 } 3831 3832 if (unicode_check_encoding_errors(encoding, errors) < 0) { 3833 return NULL; 3834 } 3835 3836 if (encoding == NULL) { 3837 return _PyUnicode_AsUTF8String(unicode, errors); 3838 } 3839 3840 /* Shortcuts for common default encodings */ 3841 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { 3842 char *lower = buflower; 3843 3844 /* Fast paths */ 3845 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') { 
3846 lower += 3; 3847 if (*lower == '_') { 3848 /* Match "utf8" and "utf_8" */ 3849 lower++; 3850 } 3851 3852 if (lower[0] == '8' && lower[1] == 0) { 3853 return _PyUnicode_AsUTF8String(unicode, errors); 3854 } 3855 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) { 3856 return _PyUnicode_EncodeUTF16(unicode, errors, 0); 3857 } 3858 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) { 3859 return _PyUnicode_EncodeUTF32(unicode, errors, 0); 3860 } 3861 } 3862 else { 3863 if (strcmp(lower, "ascii") == 0 3864 || strcmp(lower, "us_ascii") == 0) { 3865 return _PyUnicode_AsASCIIString(unicode, errors); 3866 } 3867 #ifdef MS_WINDOWS 3868 else if (strcmp(lower, "mbcs") == 0) { 3869 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3870 } 3871 #endif 3872 else if (strcmp(lower, "latin1") == 0 || 3873 strcmp(lower, "latin_1") == 0 || 3874 strcmp(lower, "iso_8859_1") == 0 || 3875 strcmp(lower, "iso8859_1") == 0) { 3876 return _PyUnicode_AsLatin1String(unicode, errors); 3877 } 3878 } 3879 } 3880 3881 /* Encode via the codec registry */ 3882 v = _PyCodec_EncodeText(unicode, encoding, errors); 3883 if (v == NULL) 3884 return NULL; 3885 3886 /* The normal path */ 3887 if (PyBytes_Check(v)) 3888 return v; 3889 3890 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3891 if (PyByteArray_Check(v)) { 3892 int error; 3893 PyObject *b; 3894 3895 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3896 "encoder %s returned bytearray instead of bytes; " 3897 "use codecs.encode() to encode to arbitrary types", 3898 encoding); 3899 if (error) { 3900 Py_DECREF(v); 3901 return NULL; 3902 } 3903 3904 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), 3905 PyByteArray_GET_SIZE(v)); 3906 Py_DECREF(v); 3907 return b; 3908 } 3909 3910 PyErr_Format(PyExc_TypeError, 3911 "'%.400s' encoder returned '%.400s' instead of 'bytes'; " 3912 "use codecs.encode() to encode to arbitrary types", 3913 encoding, 3914 Py_TYPE(v)->tp_name); 3915 
Py_DECREF(v); 3916 return NULL; 3917 } 3918 3919 PyObject * PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3920 PyUnicode_AsEncodedUnicode(PyObject *unicode, 3921 const char *encoding, 3922 const char *errors) 3923 { 3924 PyObject *v; 3925 3926 if (!PyUnicode_Check(unicode)) { 3927 PyErr_BadArgument(); 3928 goto onError; 3929 } 3930 3931 if (PyErr_WarnEx(PyExc_DeprecationWarning, 3932 "PyUnicode_AsEncodedUnicode() is deprecated; " 3933 "use PyCodec_Encode() to encode from str to str", 1) < 0) 3934 return NULL; 3935 3936 if (encoding == NULL) 3937 encoding = PyUnicode_GetDefaultEncoding(); 3938 3939 /* Encode via the codec registry */ 3940 v = PyCodec_Encode(unicode, encoding, errors); 3941 if (v == NULL) 3942 goto onError; 3943 if (!PyUnicode_Check(v)) { 3944 PyErr_Format(PyExc_TypeError, 3945 "'%.400s' encoder returned '%.400s' instead of 'str'; " 3946 "use codecs.encode() to encode to arbitrary types", 3947 encoding, 3948 Py_TYPE(v)->tp_name); 3949 Py_DECREF(v); 3950 goto onError; 3951 } 3952 return v; 3953 3954 onError: 3955 return NULL; 3956 } 3957 3958 static PyObject* unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3959 unicode_decode_locale(const char *str, Py_ssize_t len, 3960 _Py_error_handler errors, int current_locale) 3961 { 3962 if (str[len] != '\0' || (size_t)len != strlen(str)) { 3963 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3964 return NULL; 3965 } 3966 3967 wchar_t *wstr; 3968 size_t wlen; 3969 const char *reason; 3970 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason, 3971 current_locale, errors); 3972 if (res != 0) { 3973 if (res == -2) { 3974 PyObject *exc; 3975 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns", 3976 "locale", str, len, 3977 (Py_ssize_t)wlen, 3978 (Py_ssize_t)(wlen + 1), 3979 reason); 3980 if (exc != NULL) { 3981 PyCodec_StrictErrors(exc); 3982 Py_DECREF(exc); 3983 } 3984 } 3985 else if (res == -3) { 3986 
PyErr_SetString(PyExc_ValueError, "unsupported error handler"); 3987 } 3988 else { 3989 PyErr_NoMemory(); 3990 } 3991 return NULL; 3992 } 3993 3994 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen); 3995 PyMem_RawFree(wstr); 3996 return unicode; 3997 } 3998 3999 PyObject* PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)4000 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 4001 const char *errors) 4002 { 4003 _Py_error_handler error_handler = _Py_GetErrorHandler(errors); 4004 return unicode_decode_locale(str, len, error_handler, 1); 4005 } 4006 4007 PyObject* PyUnicode_DecodeLocale(const char * str,const char * errors)4008 PyUnicode_DecodeLocale(const char *str, const char *errors) 4009 { 4010 Py_ssize_t size = (Py_ssize_t)strlen(str); 4011 _Py_error_handler error_handler = _Py_GetErrorHandler(errors); 4012 return unicode_decode_locale(str, size, error_handler, 1); 4013 } 4014 4015 4016 PyObject* PyUnicode_DecodeFSDefault(const char * s)4017 PyUnicode_DecodeFSDefault(const char *s) { 4018 Py_ssize_t size = (Py_ssize_t)strlen(s); 4019 return PyUnicode_DecodeFSDefaultAndSize(s, size); 4020 } 4021 4022 PyObject* PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)4023 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 4024 { 4025 PyInterpreterState *interp = _PyInterpreterState_GET(); 4026 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec; 4027 if (fs_codec->utf8) { 4028 return unicode_decode_utf8(s, size, 4029 fs_codec->error_handler, 4030 fs_codec->errors, 4031 NULL); 4032 } 4033 #ifndef _Py_FORCE_UTF8_FS_ENCODING 4034 else if (fs_codec->encoding) { 4035 return PyUnicode_Decode(s, size, 4036 fs_codec->encoding, 4037 fs_codec->errors); 4038 } 4039 #endif 4040 else { 4041 /* Before _PyUnicode_InitEncodings() is called, the Python codec 4042 machinery is not ready and so cannot be used: 4043 use mbstowcs() in this case. 
*/ 4044 const PyConfig *config = _PyInterpreterState_GetConfig(interp); 4045 const wchar_t *filesystem_errors = config->filesystem_errors; 4046 assert(filesystem_errors != NULL); 4047 _Py_error_handler errors = get_error_handler_wide(filesystem_errors); 4048 assert(errors != _Py_ERROR_UNKNOWN); 4049 #ifdef _Py_FORCE_UTF8_FS_ENCODING 4050 return unicode_decode_utf8(s, size, errors, NULL, NULL); 4051 #else 4052 return unicode_decode_locale(s, size, errors, 0); 4053 #endif 4054 } 4055 } 4056 4057 4058 int PyUnicode_FSConverter(PyObject * arg,void * addr)4059 PyUnicode_FSConverter(PyObject* arg, void* addr) 4060 { 4061 PyObject *path = NULL; 4062 PyObject *output = NULL; 4063 Py_ssize_t size; 4064 const char *data; 4065 if (arg == NULL) { 4066 Py_DECREF(*(PyObject**)addr); 4067 *(PyObject**)addr = NULL; 4068 return 1; 4069 } 4070 path = PyOS_FSPath(arg); 4071 if (path == NULL) { 4072 return 0; 4073 } 4074 if (PyBytes_Check(path)) { 4075 output = path; 4076 } 4077 else { // PyOS_FSPath() guarantees its returned value is bytes or str. 
4078 output = PyUnicode_EncodeFSDefault(path); 4079 Py_DECREF(path); 4080 if (!output) { 4081 return 0; 4082 } 4083 assert(PyBytes_Check(output)); 4084 } 4085 4086 size = PyBytes_GET_SIZE(output); 4087 data = PyBytes_AS_STRING(output); 4088 if ((size_t)size != strlen(data)) { 4089 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 4090 Py_DECREF(output); 4091 return 0; 4092 } 4093 *(PyObject**)addr = output; 4094 return Py_CLEANUP_SUPPORTED; 4095 } 4096 4097 4098 int PyUnicode_FSDecoder(PyObject * arg,void * addr)4099 PyUnicode_FSDecoder(PyObject* arg, void* addr) 4100 { 4101 int is_buffer = 0; 4102 PyObject *path = NULL; 4103 PyObject *output = NULL; 4104 if (arg == NULL) { 4105 Py_DECREF(*(PyObject**)addr); 4106 *(PyObject**)addr = NULL; 4107 return 1; 4108 } 4109 4110 is_buffer = PyObject_CheckBuffer(arg); 4111 if (!is_buffer) { 4112 path = PyOS_FSPath(arg); 4113 if (path == NULL) { 4114 return 0; 4115 } 4116 } 4117 else { 4118 path = arg; 4119 Py_INCREF(arg); 4120 } 4121 4122 if (PyUnicode_Check(path)) { 4123 output = path; 4124 } 4125 else if (PyBytes_Check(path) || is_buffer) { 4126 PyObject *path_bytes = NULL; 4127 4128 if (!PyBytes_Check(path) && 4129 PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 4130 "path should be string, bytes, or os.PathLike, not %.200s", 4131 Py_TYPE(arg)->tp_name)) { 4132 Py_DECREF(path); 4133 return 0; 4134 } 4135 path_bytes = PyBytes_FromObject(path); 4136 Py_DECREF(path); 4137 if (!path_bytes) { 4138 return 0; 4139 } 4140 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes), 4141 PyBytes_GET_SIZE(path_bytes)); 4142 Py_DECREF(path_bytes); 4143 if (!output) { 4144 return 0; 4145 } 4146 } 4147 else { 4148 PyErr_Format(PyExc_TypeError, 4149 "path should be string, bytes, or os.PathLike, not %.200s", 4150 Py_TYPE(arg)->tp_name); 4151 Py_DECREF(path); 4152 return 0; 4153 } 4154 if (PyUnicode_READY(output) == -1) { 4155 Py_DECREF(output); 4156 return 0; 4157 } 4158 if (findchar(PyUnicode_DATA(output), 
PyUnicode_KIND(output), 4159 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 4160 PyErr_SetString(PyExc_ValueError, "embedded null character"); 4161 Py_DECREF(output); 4162 return 0; 4163 } 4164 *(PyObject**)addr = output; 4165 return Py_CLEANUP_SUPPORTED; 4166 } 4167 4168 4169 static int unicode_fill_utf8(PyObject *unicode); 4170 4171 const char * PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4172 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 4173 { 4174 if (!PyUnicode_Check(unicode)) { 4175 PyErr_BadArgument(); 4176 return NULL; 4177 } 4178 if (PyUnicode_READY(unicode) == -1) 4179 return NULL; 4180 4181 if (PyUnicode_UTF8(unicode) == NULL) { 4182 if (unicode_fill_utf8(unicode) == -1) { 4183 return NULL; 4184 } 4185 } 4186 4187 if (psize) 4188 *psize = PyUnicode_UTF8_LENGTH(unicode); 4189 return PyUnicode_UTF8(unicode); 4190 } 4191 4192 const char * PyUnicode_AsUTF8(PyObject * unicode)4193 PyUnicode_AsUTF8(PyObject *unicode) 4194 { 4195 return PyUnicode_AsUTF8AndSize(unicode, NULL); 4196 } 4197 4198 Py_UNICODE * PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4199 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 4200 { 4201 if (!PyUnicode_Check(unicode)) { 4202 PyErr_BadArgument(); 4203 return NULL; 4204 } 4205 Py_UNICODE *w = _PyUnicode_WSTR(unicode); 4206 if (w == NULL) { 4207 /* Non-ASCII compact unicode object */ 4208 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND); 4209 assert(PyUnicode_IS_READY(unicode)); 4210 4211 Py_ssize_t wlen = unicode_get_widechar_size(unicode); 4212 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 4213 PyErr_NoMemory(); 4214 return NULL; 4215 } 4216 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1)); 4217 if (w == NULL) { 4218 PyErr_NoMemory(); 4219 return NULL; 4220 } 4221 unicode_copy_as_widechar(unicode, w, wlen + 1); 4222 _PyUnicode_WSTR(unicode) = w; 4223 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) { 4224 _PyUnicode_WSTR_LENGTH(unicode) = wlen; 
4225 } 4226 } 4227 if (size != NULL) 4228 *size = PyUnicode_WSTR_LENGTH(unicode); 4229 return w; 4230 } 4231 4232 /* Deprecated APIs */ 4233 4234 _Py_COMP_DIAG_PUSH 4235 _Py_COMP_DIAG_IGNORE_DEPR_DECLS 4236 4237 Py_UNICODE * PyUnicode_AsUnicode(PyObject * unicode)4238 PyUnicode_AsUnicode(PyObject *unicode) 4239 { 4240 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 4241 } 4242 4243 const Py_UNICODE * _PyUnicode_AsUnicode(PyObject * unicode)4244 _PyUnicode_AsUnicode(PyObject *unicode) 4245 { 4246 Py_ssize_t size; 4247 const Py_UNICODE *wstr; 4248 4249 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size); 4250 if (wstr && wcslen(wstr) != (size_t)size) { 4251 PyErr_SetString(PyExc_ValueError, "embedded null character"); 4252 return NULL; 4253 } 4254 return wstr; 4255 } 4256 4257 4258 Py_ssize_t PyUnicode_GetSize(PyObject * unicode)4259 PyUnicode_GetSize(PyObject *unicode) 4260 { 4261 if (!PyUnicode_Check(unicode)) { 4262 PyErr_BadArgument(); 4263 goto onError; 4264 } 4265 if (_PyUnicode_WSTR(unicode) == NULL) { 4266 if (PyUnicode_AsUnicode(unicode) == NULL) 4267 goto onError; 4268 } 4269 return PyUnicode_WSTR_LENGTH(unicode); 4270 4271 onError: 4272 return -1; 4273 } 4274 4275 _Py_COMP_DIAG_POP 4276 4277 Py_ssize_t PyUnicode_GetLength(PyObject * unicode)4278 PyUnicode_GetLength(PyObject *unicode) 4279 { 4280 if (!PyUnicode_Check(unicode)) { 4281 PyErr_BadArgument(); 4282 return -1; 4283 } 4284 if (PyUnicode_READY(unicode) == -1) 4285 return -1; 4286 return PyUnicode_GET_LENGTH(unicode); 4287 } 4288 4289 Py_UCS4 PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4290 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 4291 { 4292 const void *data; 4293 int kind; 4294 4295 if (!PyUnicode_Check(unicode)) { 4296 PyErr_BadArgument(); 4297 return (Py_UCS4)-1; 4298 } 4299 if (PyUnicode_READY(unicode) == -1) { 4300 return (Py_UCS4)-1; 4301 } 4302 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4303 PyErr_SetString(PyExc_IndexError, "string index out of 
range"); 4304 return (Py_UCS4)-1; 4305 } 4306 data = PyUnicode_DATA(unicode); 4307 kind = PyUnicode_KIND(unicode); 4308 return PyUnicode_READ(kind, data, index); 4309 } 4310 4311 int PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4312 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 4313 { 4314 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 4315 PyErr_BadArgument(); 4316 return -1; 4317 } 4318 assert(PyUnicode_IS_READY(unicode)); 4319 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4320 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4321 return -1; 4322 } 4323 if (unicode_check_modifiable(unicode)) 4324 return -1; 4325 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 4326 PyErr_SetString(PyExc_ValueError, "character out of range"); 4327 return -1; 4328 } 4329 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 4330 index, ch); 4331 return 0; 4332 } 4333 4334 const char * PyUnicode_GetDefaultEncoding(void)4335 PyUnicode_GetDefaultEncoding(void) 4336 { 4337 return "utf-8"; 4338 } 4339 4340 /* create or adjust a UnicodeDecodeError */ 4341 static void make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4342 make_decode_exception(PyObject **exceptionObject, 4343 const char *encoding, 4344 const char *input, Py_ssize_t length, 4345 Py_ssize_t startpos, Py_ssize_t endpos, 4346 const char *reason) 4347 { 4348 if (*exceptionObject == NULL) { 4349 *exceptionObject = PyUnicodeDecodeError_Create( 4350 encoding, input, length, startpos, endpos, reason); 4351 } 4352 else { 4353 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 4354 goto onError; 4355 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 4356 goto onError; 4357 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4358 goto onError; 4359 } 4360 return; 4361 4362 onError: 4363 
Py_CLEAR(*exceptionObject); 4364 } 4365 4366 #ifdef MS_WINDOWS 4367 static int widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4368 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize) 4369 { 4370 if (newsize > *size) { 4371 wchar_t *newbuf = *buf; 4372 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) { 4373 PyErr_NoMemory(); 4374 return -1; 4375 } 4376 *buf = newbuf; 4377 } 4378 *size = newsize; 4379 return 0; 4380 } 4381 4382 /* error handling callback helper: 4383 build arguments, call the callback and check the arguments, 4384 if no exception occurred, copy the replacement to the output 4385 and adjust various state variables. 4386 return 0 on success, -1 on error 4387 */ 4388 4389 static int unicode_decode_call_errorhandler_wchar(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,wchar_t ** buf,Py_ssize_t * bufsize,Py_ssize_t * outpos)4390 unicode_decode_call_errorhandler_wchar( 4391 const char *errors, PyObject **errorHandler, 4392 const char *encoding, const char *reason, 4393 const char **input, const char **inend, Py_ssize_t *startinpos, 4394 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4395 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos) 4396 { 4397 static const char *argparse = "Un;decoding error handler must return (str, int) tuple"; 4398 4399 PyObject *restuple = NULL; 4400 PyObject *repunicode = NULL; 4401 Py_ssize_t outsize; 4402 Py_ssize_t insize; 4403 Py_ssize_t requiredsize; 4404 Py_ssize_t newpos; 4405 PyObject *inputobj = NULL; 4406 Py_ssize_t repwlen; 4407 4408 if (*errorHandler == NULL) { 4409 *errorHandler = PyCodec_LookupError(errors); 4410 if (*errorHandler == NULL) 4411 goto onError; 4412 } 4413 4414 make_decode_exception(exceptionObject, 4415 encoding, 4416 *input, *inend - *input, 4417 *startinpos, 
*endinpos, 4418 reason); 4419 if (*exceptionObject == NULL) 4420 goto onError; 4421 4422 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject); 4423 if (restuple == NULL) 4424 goto onError; 4425 if (!PyTuple_Check(restuple)) { 4426 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4427 goto onError; 4428 } 4429 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos)) 4430 goto onError; 4431 4432 /* Copy back the bytes variables, which might have been modified by the 4433 callback */ 4434 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4435 if (!inputobj) 4436 goto onError; 4437 *input = PyBytes_AS_STRING(inputobj); 4438 insize = PyBytes_GET_SIZE(inputobj); 4439 *inend = *input + insize; 4440 /* we can DECREF safely, as the exception has another reference, 4441 so the object won't go away. */ 4442 Py_DECREF(inputobj); 4443 4444 if (newpos<0) 4445 newpos = insize+newpos; 4446 if (newpos<0 || newpos>insize) { 4447 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4448 goto onError; 4449 } 4450 4451 #if USE_UNICODE_WCHAR_CACHE 4452 _Py_COMP_DIAG_PUSH 4453 _Py_COMP_DIAG_IGNORE_DEPR_DECLS 4454 repwlen = PyUnicode_GetSize(repunicode); 4455 if (repwlen < 0) 4456 goto onError; 4457 _Py_COMP_DIAG_POP 4458 #else /* USE_UNICODE_WCHAR_CACHE */ 4459 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0); 4460 if (repwlen < 0) 4461 goto onError; 4462 repwlen--; 4463 #endif /* USE_UNICODE_WCHAR_CACHE */ 4464 /* need more space? 
(at least enough for what we 4465 have+the replacement+the rest of the string (starting 4466 at the new input position), so we won't have to check space 4467 when there are no errors in the rest of the string) */ 4468 requiredsize = *outpos; 4469 if (requiredsize > PY_SSIZE_T_MAX - repwlen) 4470 goto overflow; 4471 requiredsize += repwlen; 4472 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 4473 goto overflow; 4474 requiredsize += insize - newpos; 4475 outsize = *bufsize; 4476 if (requiredsize > outsize) { 4477 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 4478 requiredsize = 2*outsize; 4479 if (widechar_resize(buf, bufsize, requiredsize) < 0) { 4480 goto onError; 4481 } 4482 } 4483 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen); 4484 *outpos += repwlen; 4485 *endinpos = newpos; 4486 *inptr = *input + newpos; 4487 4488 /* we made it! */ 4489 Py_DECREF(restuple); 4490 return 0; 4491 4492 overflow: 4493 PyErr_SetString(PyExc_OverflowError, 4494 "decoded result is too long for a Python string"); 4495 4496 onError: 4497 Py_XDECREF(restuple); 4498 return -1; 4499 } 4500 #endif /* MS_WINDOWS */ 4501 4502 static int unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4503 unicode_decode_call_errorhandler_writer( 4504 const char *errors, PyObject **errorHandler, 4505 const char *encoding, const char *reason, 4506 const char **input, const char **inend, Py_ssize_t *startinpos, 4507 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4508 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4509 { 4510 static const char *argparse = "Un;decoding error handler must return (str, int) tuple"; 4511 4512 PyObject *restuple = NULL; 4513 PyObject *repunicode = NULL; 4514 Py_ssize_t 
insize; 4515 Py_ssize_t newpos; 4516 Py_ssize_t replen; 4517 Py_ssize_t remain; 4518 PyObject *inputobj = NULL; 4519 int need_to_grow = 0; 4520 const char *new_inptr; 4521 4522 if (*errorHandler == NULL) { 4523 *errorHandler = PyCodec_LookupError(errors); 4524 if (*errorHandler == NULL) 4525 goto onError; 4526 } 4527 4528 make_decode_exception(exceptionObject, 4529 encoding, 4530 *input, *inend - *input, 4531 *startinpos, *endinpos, 4532 reason); 4533 if (*exceptionObject == NULL) 4534 goto onError; 4535 4536 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject); 4537 if (restuple == NULL) 4538 goto onError; 4539 if (!PyTuple_Check(restuple)) { 4540 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4541 goto onError; 4542 } 4543 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos)) 4544 goto onError; 4545 4546 /* Copy back the bytes variables, which might have been modified by the 4547 callback */ 4548 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4549 if (!inputobj) 4550 goto onError; 4551 remain = *inend - *input - *endinpos; 4552 *input = PyBytes_AS_STRING(inputobj); 4553 insize = PyBytes_GET_SIZE(inputobj); 4554 *inend = *input + insize; 4555 /* we can DECREF safely, as the exception has another reference, 4556 so the object won't go away. */ 4557 Py_DECREF(inputobj); 4558 4559 if (newpos<0) 4560 newpos = insize+newpos; 4561 if (newpos<0 || newpos>insize) { 4562 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4563 goto onError; 4564 } 4565 4566 replen = PyUnicode_GET_LENGTH(repunicode); 4567 if (replen > 1) { 4568 writer->min_length += replen - 1; 4569 need_to_grow = 1; 4570 } 4571 new_inptr = *input + newpos; 4572 if (*inend - new_inptr > remain) { 4573 /* We don't know the decoding algorithm here so we make the worst 4574 assumption that one byte decodes to one unicode character. 
4575 If unfortunately one byte could decode to more unicode characters, 4576 the decoder may write out-of-bound then. Is it possible for the 4577 algorithms using this function? */ 4578 writer->min_length += *inend - new_inptr - remain; 4579 need_to_grow = 1; 4580 } 4581 if (need_to_grow) { 4582 writer->overallocate = 1; 4583 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos, 4584 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1) 4585 goto onError; 4586 } 4587 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4588 goto onError; 4589 4590 *endinpos = newpos; 4591 *inptr = new_inptr; 4592 4593 /* we made it! */ 4594 Py_DECREF(restuple); 4595 return 0; 4596 4597 onError: 4598 Py_XDECREF(restuple); 4599 return -1; 4600 } 4601 4602 /* --- UTF-7 Codec -------------------------------------------------------- */ 4603 4604 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 4605 4606 /* Three simple macros defining base-64. */ 4607 4608 /* Is c a base-64 character? */ 4609 4610 #define IS_BASE64(c) \ 4611 (((c) >= 'A' && (c) <= 'Z') || \ 4612 ((c) >= 'a' && (c) <= 'z') || \ 4613 ((c) >= '0' && (c) <= '9') || \ 4614 (c) == '+' || (c) == '/') 4615 4616 /* given that c is a base-64 character, what is its base-64 value? */ 4617 4618 #define FROM_BASE64(c) \ 4619 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4620 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4621 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4622 (c) == '+' ? 62 : 63) 4623 4624 /* What is the base-64 character of the bottom 6 bits of n? */ 4625 4626 #define TO_BASE64(n) \ 4627 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4628 4629 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4630 * decoded as itself. We are permissive on decoding; the only ASCII 4631 * byte not decoding to itself is the + which begins a base64 4632 * string. 
*/ 4633 4634 #define DECODE_DIRECT(c) \ 4635 ((c) <= 127 && (c) != '+') 4636 4637 /* The UTF-7 encoder treats ASCII characters differently according to 4638 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4639 * the above). See RFC2152. This array identifies these different 4640 * sets: 4641 * 0 : "Set D" 4642 * alphanumeric and '(),-./:? 4643 * 1 : "Set O" 4644 * !"#$%&*;<=>@[]^_`{|} 4645 * 2 : "whitespace" 4646 * ht nl cr sp 4647 * 3 : special (must be base64 encoded) 4648 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 4649 */ 4650 4651 static 4652 char utf7_category[128] = { 4653 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 4654 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4655 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 4656 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4657 /* sp ! " # $ % & ' ( ) * + , - . / */ 4658 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 4659 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 4660 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4661 /* @ A B C D E F G H I J K L M N O */ 4662 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4663 /* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4664 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4665 /* ` a b c d e f g h i j k l m n o */ 4666 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4667 /* p q r s t u v w x y z { | } ~ del */ 4668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4669 }; 4670 4671 /* ENCODE_DIRECT: this character should be encoded as itself. The 4672 * answer depends on whether we are encoding set O as itself, and also 4673 * on whether we are encoding whitespace as itself. RFC2152 makes it 4674 * clear that the answers to these questions vary between 4675 * applications, so this code needs to be flexible. 
*/ 4676 4677 #define ENCODE_DIRECT(c, directO, directWS) \ 4678 ((c) < 128 && (c) > 0 && \ 4679 ((utf7_category[(c)] == 0) || \ 4680 (directWS && (utf7_category[(c)] == 2)) || \ 4681 (directO && (utf7_category[(c)] == 1)))) 4682 4683 PyObject * PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4684 PyUnicode_DecodeUTF7(const char *s, 4685 Py_ssize_t size, 4686 const char *errors) 4687 { 4688 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 4689 } 4690 4691 /* The decoder. The only state we preserve is our read position, 4692 * i.e. how many characters we have consumed. So if we end in the 4693 * middle of a shift sequence we have to back off the read position 4694 * and the output to the beginning of the sequence, otherwise we lose 4695 * all the shift state (seen bits, number of bits seen, high 4696 * surrogate). */ 4697 4698 PyObject * PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4699 PyUnicode_DecodeUTF7Stateful(const char *s, 4700 Py_ssize_t size, 4701 const char *errors, 4702 Py_ssize_t *consumed) 4703 { 4704 const char *starts = s; 4705 Py_ssize_t startinpos; 4706 Py_ssize_t endinpos; 4707 const char *e; 4708 _PyUnicodeWriter writer; 4709 const char *errmsg = ""; 4710 int inShift = 0; 4711 Py_ssize_t shiftOutStart; 4712 unsigned int base64bits = 0; 4713 unsigned long base64buffer = 0; 4714 Py_UCS4 surrogate = 0; 4715 PyObject *errorHandler = NULL; 4716 PyObject *exc = NULL; 4717 4718 if (size == 0) { 4719 if (consumed) 4720 *consumed = 0; 4721 _Py_RETURN_UNICODE_EMPTY(); 4722 } 4723 4724 /* Start off assuming it's all ASCII. Widen later as necessary. 
*/ 4725 _PyUnicodeWriter_Init(&writer); 4726 writer.min_length = size; 4727 4728 shiftOutStart = 0; 4729 e = s + size; 4730 4731 while (s < e) { 4732 Py_UCS4 ch; 4733 restart: 4734 ch = (unsigned char) *s; 4735 4736 if (inShift) { /* in a base-64 section */ 4737 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4738 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4739 base64bits += 6; 4740 s++; 4741 if (base64bits >= 16) { 4742 /* we have enough bits for a UTF-16 value */ 4743 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4744 base64bits -= 16; 4745 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4746 assert(outCh <= 0xffff); 4747 if (surrogate) { 4748 /* expecting a second surrogate */ 4749 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4750 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4751 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0) 4752 goto onError; 4753 surrogate = 0; 4754 continue; 4755 } 4756 else { 4757 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) 4758 goto onError; 4759 surrogate = 0; 4760 } 4761 } 4762 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4763 /* first surrogate */ 4764 surrogate = outCh; 4765 } 4766 else { 4767 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0) 4768 goto onError; 4769 } 4770 } 4771 } 4772 else { /* now leaving a base-64 section */ 4773 inShift = 0; 4774 if (base64bits > 0) { /* left-over bits */ 4775 if (base64bits >= 6) { 4776 /* We've seen at least one base-64 character */ 4777 s++; 4778 errmsg = "partial character in shift sequence"; 4779 goto utf7Error; 4780 } 4781 else { 4782 /* Some bits remain; they should be zero */ 4783 if (base64buffer != 0) { 4784 s++; 4785 errmsg = "non-zero padding bits in shift sequence"; 4786 goto utf7Error; 4787 } 4788 } 4789 } 4790 if (surrogate && DECODE_DIRECT(ch)) { 4791 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) 4792 goto onError; 4793 } 4794 surrogate = 0; 4795 if (ch == '-') { 4796 /* '-' is 
absorbed; other terminating 4797 characters are preserved */ 4798 s++; 4799 } 4800 } 4801 } 4802 else if ( ch == '+' ) { 4803 startinpos = s-starts; 4804 s++; /* consume '+' */ 4805 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4806 s++; 4807 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0) 4808 goto onError; 4809 } 4810 else if (s < e && !IS_BASE64(*s)) { 4811 s++; 4812 errmsg = "ill-formed sequence"; 4813 goto utf7Error; 4814 } 4815 else { /* begin base64-encoded section */ 4816 inShift = 1; 4817 surrogate = 0; 4818 shiftOutStart = writer.pos; 4819 base64bits = 0; 4820 base64buffer = 0; 4821 } 4822 } 4823 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4824 s++; 4825 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4826 goto onError; 4827 } 4828 else { 4829 startinpos = s-starts; 4830 s++; 4831 errmsg = "unexpected special character"; 4832 goto utf7Error; 4833 } 4834 continue; 4835 utf7Error: 4836 endinpos = s-starts; 4837 if (unicode_decode_call_errorhandler_writer( 4838 errors, &errorHandler, 4839 "utf7", errmsg, 4840 &starts, &e, &startinpos, &endinpos, &exc, &s, 4841 &writer)) 4842 goto onError; 4843 } 4844 4845 /* end of string */ 4846 4847 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4848 /* if we're in an inconsistent state, that's an error */ 4849 inShift = 0; 4850 if (surrogate || 4851 (base64bits >= 6) || 4852 (base64bits > 0 && base64buffer != 0)) { 4853 endinpos = size; 4854 if (unicode_decode_call_errorhandler_writer( 4855 errors, &errorHandler, 4856 "utf7", "unterminated shift sequence", 4857 &starts, &e, &startinpos, &endinpos, &exc, &s, 4858 &writer)) 4859 goto onError; 4860 if (s < e) 4861 goto restart; 4862 } 4863 } 4864 4865 /* return state */ 4866 if (consumed) { 4867 if (inShift) { 4868 *consumed = startinpos; 4869 if (writer.pos != shiftOutStart && writer.maxchar > 127) { 4870 PyObject *result = PyUnicode_FromKindAndData( 4871 writer.kind, writer.data, shiftOutStart); 4872 
Py_XDECREF(errorHandler); 4873 Py_XDECREF(exc); 4874 _PyUnicodeWriter_Dealloc(&writer); 4875 return result; 4876 } 4877 writer.pos = shiftOutStart; /* back off output */ 4878 } 4879 else { 4880 *consumed = s-starts; 4881 } 4882 } 4883 4884 Py_XDECREF(errorHandler); 4885 Py_XDECREF(exc); 4886 return _PyUnicodeWriter_Finish(&writer); 4887 4888 onError: 4889 Py_XDECREF(errorHandler); 4890 Py_XDECREF(exc); 4891 _PyUnicodeWriter_Dealloc(&writer); 4892 return NULL; 4893 } 4894 4895 4896 PyObject * _PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4897 _PyUnicode_EncodeUTF7(PyObject *str, 4898 int base64SetO, 4899 int base64WhiteSpace, 4900 const char *errors) 4901 { 4902 int kind; 4903 const void *data; 4904 Py_ssize_t len; 4905 PyObject *v; 4906 int inShift = 0; 4907 Py_ssize_t i; 4908 unsigned int base64bits = 0; 4909 unsigned long base64buffer = 0; 4910 char * out; 4911 const char * start; 4912 4913 if (PyUnicode_READY(str) == -1) 4914 return NULL; 4915 kind = PyUnicode_KIND(str); 4916 data = PyUnicode_DATA(str); 4917 len = PyUnicode_GET_LENGTH(str); 4918 4919 if (len == 0) 4920 return PyBytes_FromStringAndSize(NULL, 0); 4921 4922 /* It might be possible to tighten this worst case */ 4923 if (len > PY_SSIZE_T_MAX / 8) 4924 return PyErr_NoMemory(); 4925 v = PyBytes_FromStringAndSize(NULL, len * 8); 4926 if (v == NULL) 4927 return NULL; 4928 4929 start = out = PyBytes_AS_STRING(v); 4930 for (i = 0; i < len; ++i) { 4931 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4932 4933 if (inShift) { 4934 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4935 /* shifting out */ 4936 if (base64bits) { /* output remaining bits */ 4937 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4938 base64buffer = 0; 4939 base64bits = 0; 4940 } 4941 inShift = 0; 4942 /* Characters not in the BASE64 set implicitly unshift the sequence 4943 so no '-' is required, except if the character is itself a '-' */ 4944 if (IS_BASE64(ch) || ch == '-') { 
4945 *out++ = '-'; 4946 } 4947 *out++ = (char) ch; 4948 } 4949 else { 4950 goto encode_char; 4951 } 4952 } 4953 else { /* not in a shift sequence */ 4954 if (ch == '+') { 4955 *out++ = '+'; 4956 *out++ = '-'; 4957 } 4958 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4959 *out++ = (char) ch; 4960 } 4961 else { 4962 *out++ = '+'; 4963 inShift = 1; 4964 goto encode_char; 4965 } 4966 } 4967 continue; 4968 encode_char: 4969 if (ch >= 0x10000) { 4970 assert(ch <= MAX_UNICODE); 4971 4972 /* code first surrogate */ 4973 base64bits += 16; 4974 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch); 4975 while (base64bits >= 6) { 4976 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4977 base64bits -= 6; 4978 } 4979 /* prepare second surrogate */ 4980 ch = Py_UNICODE_LOW_SURROGATE(ch); 4981 } 4982 base64bits += 16; 4983 base64buffer = (base64buffer << 16) | ch; 4984 while (base64bits >= 6) { 4985 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4986 base64bits -= 6; 4987 } 4988 } 4989 if (base64bits) 4990 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4991 if (inShift) 4992 *out++ = '-'; 4993 if (_PyBytes_Resize(&v, out - start) < 0) 4994 return NULL; 4995 return v; 4996 } 4997 4998 #undef IS_BASE64 4999 #undef FROM_BASE64 5000 #undef TO_BASE64 5001 #undef DECODE_DIRECT 5002 #undef ENCODE_DIRECT 5003 5004 /* --- UTF-8 Codec -------------------------------------------------------- */ 5005 5006 PyObject * PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)5007 PyUnicode_DecodeUTF8(const char *s, 5008 Py_ssize_t size, 5009 const char *errors) 5010 { 5011 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 5012 } 5013 5014 #include "stringlib/asciilib.h" 5015 #include "stringlib/codecs.h" 5016 #include "stringlib/undef.h" 5017 5018 #include "stringlib/ucs1lib.h" 5019 #include "stringlib/codecs.h" 5020 #include "stringlib/undef.h" 5021 5022 #include "stringlib/ucs2lib.h" 5023 #include "stringlib/codecs.h" 5024 
#include "stringlib/undef.h" 5025 5026 #include "stringlib/ucs4lib.h" 5027 #include "stringlib/codecs.h" 5028 #include "stringlib/undef.h" 5029 5030 /* Mask to quickly check whether a C 'size_t' contains a 5031 non-ASCII, UTF8-encoded char. */ 5032 #if (SIZEOF_SIZE_T == 8) 5033 # define ASCII_CHAR_MASK 0x8080808080808080ULL 5034 #elif (SIZEOF_SIZE_T == 4) 5035 # define ASCII_CHAR_MASK 0x80808080U 5036 #else 5037 # error C 'size_t' size should be either 4 or 8! 5038 #endif 5039 5040 static Py_ssize_t ascii_decode(const char * start,const char * end,Py_UCS1 * dest)5041 ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 5042 { 5043 const char *p = start; 5044 5045 #if SIZEOF_SIZE_T <= SIZEOF_VOID_P 5046 assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T)); 5047 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) { 5048 /* Fast path, see in STRINGLIB(utf8_decode) for 5049 an explanation. */ 5050 /* Help allocation */ 5051 const char *_p = p; 5052 Py_UCS1 * q = dest; 5053 while (_p + SIZEOF_SIZE_T <= end) { 5054 size_t value = *(const size_t *) _p; 5055 if (value & ASCII_CHAR_MASK) 5056 break; 5057 *((size_t *)q) = value; 5058 _p += SIZEOF_SIZE_T; 5059 q += SIZEOF_SIZE_T; 5060 } 5061 p = _p; 5062 while (p < end) { 5063 if ((unsigned char)*p & 0x80) 5064 break; 5065 *q++ = *p++; 5066 } 5067 return p - start; 5068 } 5069 #endif 5070 while (p < end) { 5071 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 5072 for an explanation. 
*/ 5073 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) { 5074 /* Help allocation */ 5075 const char *_p = p; 5076 while (_p + SIZEOF_SIZE_T <= end) { 5077 size_t value = *(const size_t *) _p; 5078 if (value & ASCII_CHAR_MASK) 5079 break; 5080 _p += SIZEOF_SIZE_T; 5081 } 5082 p = _p; 5083 if (_p == end) 5084 break; 5085 } 5086 if ((unsigned char)*p & 0x80) 5087 break; 5088 ++p; 5089 } 5090 memcpy(dest, start, p - start); 5091 return p - start; 5092 } 5093 5094 static PyObject * unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)5095 unicode_decode_utf8(const char *s, Py_ssize_t size, 5096 _Py_error_handler error_handler, const char *errors, 5097 Py_ssize_t *consumed) 5098 { 5099 if (size == 0) { 5100 if (consumed) 5101 *consumed = 0; 5102 _Py_RETURN_UNICODE_EMPTY(); 5103 } 5104 5105 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 5106 if (size == 1 && (unsigned char)s[0] < 128) { 5107 if (consumed) { 5108 *consumed = 1; 5109 } 5110 return get_latin1_char((unsigned char)s[0]); 5111 } 5112 5113 const char *starts = s; 5114 const char *end = s + size; 5115 5116 // fast path: try ASCII string. 5117 PyObject *u = PyUnicode_New(size, 127); 5118 if (u == NULL) { 5119 return NULL; 5120 } 5121 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u)); 5122 if (s == end) { 5123 return u; 5124 } 5125 5126 // Use _PyUnicodeWriter after fast path is failed. 
5127 _PyUnicodeWriter writer; 5128 _PyUnicodeWriter_InitWithBuffer(&writer, u); 5129 writer.pos = s - starts; 5130 5131 Py_ssize_t startinpos, endinpos; 5132 const char *errmsg = ""; 5133 PyObject *error_handler_obj = NULL; 5134 PyObject *exc = NULL; 5135 5136 while (s < end) { 5137 Py_UCS4 ch; 5138 int kind = writer.kind; 5139 5140 if (kind == PyUnicode_1BYTE_KIND) { 5141 if (PyUnicode_IS_ASCII(writer.buffer)) 5142 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); 5143 else 5144 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos); 5145 } else if (kind == PyUnicode_2BYTE_KIND) { 5146 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos); 5147 } else { 5148 assert(kind == PyUnicode_4BYTE_KIND); 5149 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos); 5150 } 5151 5152 switch (ch) { 5153 case 0: 5154 if (s == end || consumed) 5155 goto End; 5156 errmsg = "unexpected end of data"; 5157 startinpos = s - starts; 5158 endinpos = end - starts; 5159 break; 5160 case 1: 5161 errmsg = "invalid start byte"; 5162 startinpos = s - starts; 5163 endinpos = startinpos + 1; 5164 break; 5165 case 2: 5166 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2 5167 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF) 5168 { 5169 /* Truncated surrogate code in range D800-DFFF */ 5170 goto End; 5171 } 5172 /* fall through */ 5173 case 3: 5174 case 4: 5175 errmsg = "invalid continuation byte"; 5176 startinpos = s - starts; 5177 endinpos = startinpos + ch - 1; 5178 break; 5179 default: 5180 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5181 goto onError; 5182 continue; 5183 } 5184 5185 if (error_handler == _Py_ERROR_UNKNOWN) 5186 error_handler = _Py_GetErrorHandler(errors); 5187 5188 switch (error_handler) { 5189 case _Py_ERROR_IGNORE: 5190 s += (endinpos - startinpos); 5191 break; 5192 5193 case _Py_ERROR_REPLACE: 5194 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) 5195 goto onError; 5196 s += (endinpos - startinpos); 
5197 break; 5198 5199 case _Py_ERROR_SURROGATEESCAPE: 5200 { 5201 Py_ssize_t i; 5202 5203 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) 5204 goto onError; 5205 for (i=startinpos; i<endinpos; i++) { 5206 ch = (Py_UCS4)(unsigned char)(starts[i]); 5207 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, 5208 ch + 0xdc00); 5209 writer.pos++; 5210 } 5211 s += (endinpos - startinpos); 5212 break; 5213 } 5214 5215 default: 5216 if (unicode_decode_call_errorhandler_writer( 5217 errors, &error_handler_obj, 5218 "utf-8", errmsg, 5219 &starts, &end, &startinpos, &endinpos, &exc, &s, 5220 &writer)) 5221 goto onError; 5222 } 5223 } 5224 5225 End: 5226 if (consumed) 5227 *consumed = s - starts; 5228 5229 Py_XDECREF(error_handler_obj); 5230 Py_XDECREF(exc); 5231 return _PyUnicodeWriter_Finish(&writer); 5232 5233 onError: 5234 Py_XDECREF(error_handler_obj); 5235 Py_XDECREF(exc); 5236 _PyUnicodeWriter_Dealloc(&writer); 5237 return NULL; 5238 } 5239 5240 5241 PyObject * PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5242 PyUnicode_DecodeUTF8Stateful(const char *s, 5243 Py_ssize_t size, 5244 const char *errors, 5245 Py_ssize_t *consumed) 5246 { 5247 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed); 5248 } 5249 5250 5251 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is 5252 non-zero, use strict error handler otherwise. 5253 5254 On success, write a pointer to a newly allocated wide character string into 5255 *wstr (use PyMem_RawFree() to free the memory) and write the output length 5256 (in number of wchar_t units) into *wlen (if wlen is set). 5257 5258 On memory allocation failure, return -1. 5259 5260 On decoding error (if surrogateescape is zero), return -2. If wlen is 5261 non-NULL, write the start of the illegal byte sequence into *wlen. If reason 5262 is not NULL, write the decoding error message into *reason. 
*/ 5263 int _Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5264 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen, 5265 const char **reason, _Py_error_handler errors) 5266 { 5267 const char *orig_s = s; 5268 const char *e; 5269 wchar_t *unicode; 5270 Py_ssize_t outpos; 5271 5272 int surrogateescape = 0; 5273 int surrogatepass = 0; 5274 switch (errors) 5275 { 5276 case _Py_ERROR_STRICT: 5277 break; 5278 case _Py_ERROR_SURROGATEESCAPE: 5279 surrogateescape = 1; 5280 break; 5281 case _Py_ERROR_SURROGATEPASS: 5282 surrogatepass = 1; 5283 break; 5284 default: 5285 return -3; 5286 } 5287 5288 /* Note: size will always be longer than the resulting Unicode 5289 character count */ 5290 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) { 5291 return -1; 5292 } 5293 5294 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); 5295 if (!unicode) { 5296 return -1; 5297 } 5298 5299 /* Unpack UTF-8 encoded data */ 5300 e = s + size; 5301 outpos = 0; 5302 while (s < e) { 5303 Py_UCS4 ch; 5304 #if SIZEOF_WCHAR_T == 4 5305 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 5306 #else 5307 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 5308 #endif 5309 if (ch > 0xFF) { 5310 #if SIZEOF_WCHAR_T == 4 5311 Py_UNREACHABLE(); 5312 #else 5313 assert(ch > 0xFFFF && ch <= MAX_UNICODE); 5314 /* write a surrogate pair */ 5315 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 5316 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 5317 #endif 5318 } 5319 else { 5320 if (!ch && s == e) { 5321 break; 5322 } 5323 5324 if (surrogateescape) { 5325 unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 5326 } 5327 else { 5328 /* Is it a valid three-byte code? 
*/ 5329 if (surrogatepass 5330 && (e - s) >= 3 5331 && (s[0] & 0xf0) == 0xe0 5332 && (s[1] & 0xc0) == 0x80 5333 && (s[2] & 0xc0) == 0x80) 5334 { 5335 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 5336 s += 3; 5337 unicode[outpos++] = ch; 5338 } 5339 else { 5340 PyMem_RawFree(unicode ); 5341 if (reason != NULL) { 5342 switch (ch) { 5343 case 0: 5344 *reason = "unexpected end of data"; 5345 break; 5346 case 1: 5347 *reason = "invalid start byte"; 5348 break; 5349 /* 2, 3, 4 */ 5350 default: 5351 *reason = "invalid continuation byte"; 5352 break; 5353 } 5354 } 5355 if (wlen != NULL) { 5356 *wlen = s - orig_s; 5357 } 5358 return -2; 5359 } 5360 } 5361 } 5362 } 5363 unicode[outpos] = L'\0'; 5364 if (wlen) { 5365 *wlen = outpos; 5366 } 5367 *wstr = unicode; 5368 return 0; 5369 } 5370 5371 5372 wchar_t* _Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5373 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen, 5374 size_t *wlen) 5375 { 5376 wchar_t *wstr; 5377 int res = _Py_DecodeUTF8Ex(arg, arglen, 5378 &wstr, wlen, 5379 NULL, _Py_ERROR_SURROGATEESCAPE); 5380 if (res != 0) { 5381 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */ 5382 assert(res != -3); 5383 if (wlen) { 5384 *wlen = (size_t)res; 5385 } 5386 return NULL; 5387 } 5388 return wstr; 5389 } 5390 5391 5392 /* UTF-8 encoder using the surrogateescape error handler . 5393 5394 On success, return 0 and write the newly allocated character string (use 5395 PyMem_Free() to free the memory) into *str. 5396 5397 On encoding failure, return -2 and write the position of the invalid 5398 surrogate character into *error_pos (if error_pos is set) and the decoding 5399 error message into *reason (if reason is set). 5400 5401 On memory allocation failure, return -1. 
*/ 5402 int _Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5403 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, 5404 const char **reason, int raw_malloc, _Py_error_handler errors) 5405 { 5406 const Py_ssize_t max_char_size = 4; 5407 Py_ssize_t len = wcslen(text); 5408 5409 assert(len >= 0); 5410 5411 int surrogateescape = 0; 5412 int surrogatepass = 0; 5413 switch (errors) 5414 { 5415 case _Py_ERROR_STRICT: 5416 break; 5417 case _Py_ERROR_SURROGATEESCAPE: 5418 surrogateescape = 1; 5419 break; 5420 case _Py_ERROR_SURROGATEPASS: 5421 surrogatepass = 1; 5422 break; 5423 default: 5424 return -3; 5425 } 5426 5427 if (len > PY_SSIZE_T_MAX / max_char_size - 1) { 5428 return -1; 5429 } 5430 char *bytes; 5431 if (raw_malloc) { 5432 bytes = PyMem_RawMalloc((len + 1) * max_char_size); 5433 } 5434 else { 5435 bytes = PyMem_Malloc((len + 1) * max_char_size); 5436 } 5437 if (bytes == NULL) { 5438 return -1; 5439 } 5440 5441 char *p = bytes; 5442 Py_ssize_t i; 5443 for (i = 0; i < len; ) { 5444 Py_ssize_t ch_pos = i; 5445 Py_UCS4 ch = text[i]; 5446 i++; 5447 #if Py_UNICODE_SIZE == 2 5448 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) 5449 && i < len 5450 && Py_UNICODE_IS_LOW_SURROGATE(text[i])) 5451 { 5452 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]); 5453 i++; 5454 } 5455 #endif 5456 5457 if (ch < 0x80) { 5458 /* Encode ASCII */ 5459 *p++ = (char) ch; 5460 5461 } 5462 else if (ch < 0x0800) { 5463 /* Encode Latin-1 */ 5464 *p++ = (char)(0xc0 | (ch >> 6)); 5465 *p++ = (char)(0x80 | (ch & 0x3f)); 5466 } 5467 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) { 5468 /* surrogateescape error handler */ 5469 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) { 5470 if (error_pos != NULL) { 5471 *error_pos = (size_t)ch_pos; 5472 } 5473 if (reason != NULL) { 5474 *reason = "encoding error"; 5475 } 5476 if (raw_malloc) { 5477 PyMem_RawFree(bytes); 5478 } 5479 else { 5480 
PyMem_Free(bytes); 5481 } 5482 return -2; 5483 } 5484 *p++ = (char)(ch & 0xff); 5485 } 5486 else if (ch < 0x10000) { 5487 *p++ = (char)(0xe0 | (ch >> 12)); 5488 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 5489 *p++ = (char)(0x80 | (ch & 0x3f)); 5490 } 5491 else { /* ch >= 0x10000 */ 5492 assert(ch <= MAX_UNICODE); 5493 /* Encode UCS4 Unicode ordinals */ 5494 *p++ = (char)(0xf0 | (ch >> 18)); 5495 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 5496 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 5497 *p++ = (char)(0x80 | (ch & 0x3f)); 5498 } 5499 } 5500 *p++ = '\0'; 5501 5502 size_t final_size = (p - bytes); 5503 char *bytes2; 5504 if (raw_malloc) { 5505 bytes2 = PyMem_RawRealloc(bytes, final_size); 5506 } 5507 else { 5508 bytes2 = PyMem_Realloc(bytes, final_size); 5509 } 5510 if (bytes2 == NULL) { 5511 if (error_pos != NULL) { 5512 *error_pos = (size_t)-1; 5513 } 5514 if (raw_malloc) { 5515 PyMem_RawFree(bytes); 5516 } 5517 else { 5518 PyMem_Free(bytes); 5519 } 5520 return -1; 5521 } 5522 *str = bytes2; 5523 return 0; 5524 } 5525 5526 5527 /* Primary internal function which creates utf8 encoded bytes objects. 5528 5529 Allocation strategy: if the string is short, convert into a stack buffer 5530 and allocate exactly as much space needed at the end. Else allocate the 5531 maximum possible needed (4 result bytes per Unicode character), and return 5532 the excess memory at the end. 
5533 */ 5534 static PyObject * unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5535 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, 5536 const char *errors) 5537 { 5538 if (!PyUnicode_Check(unicode)) { 5539 PyErr_BadArgument(); 5540 return NULL; 5541 } 5542 5543 if (PyUnicode_READY(unicode) == -1) 5544 return NULL; 5545 5546 if (PyUnicode_UTF8(unicode)) 5547 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 5548 PyUnicode_UTF8_LENGTH(unicode)); 5549 5550 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 5551 const void *data = PyUnicode_DATA(unicode); 5552 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode); 5553 5554 _PyBytesWriter writer; 5555 char *end; 5556 5557 switch (kind) { 5558 default: 5559 Py_UNREACHABLE(); 5560 case PyUnicode_1BYTE_KIND: 5561 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 5562 assert(!PyUnicode_IS_ASCII(unicode)); 5563 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); 5564 break; 5565 case PyUnicode_2BYTE_KIND: 5566 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); 5567 break; 5568 case PyUnicode_4BYTE_KIND: 5569 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); 5570 break; 5571 } 5572 5573 if (end == NULL) { 5574 _PyBytesWriter_Dealloc(&writer); 5575 return NULL; 5576 } 5577 return _PyBytesWriter_Finish(&writer, end); 5578 } 5579 5580 static int unicode_fill_utf8(PyObject * unicode)5581 unicode_fill_utf8(PyObject *unicode) 5582 { 5583 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 5584 assert(!PyUnicode_IS_ASCII(unicode)); 5585 5586 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 5587 const void *data = PyUnicode_DATA(unicode); 5588 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode); 5589 5590 _PyBytesWriter writer; 5591 char *end; 5592 5593 switch (kind) { 5594 default: 5595 Py_UNREACHABLE(); 5596 case PyUnicode_1BYTE_KIND: 5597 
end = ucs1lib_utf8_encoder(&writer, unicode, data, size, 5598 _Py_ERROR_STRICT, NULL); 5599 break; 5600 case PyUnicode_2BYTE_KIND: 5601 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, 5602 _Py_ERROR_STRICT, NULL); 5603 break; 5604 case PyUnicode_4BYTE_KIND: 5605 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, 5606 _Py_ERROR_STRICT, NULL); 5607 break; 5608 } 5609 if (end == NULL) { 5610 _PyBytesWriter_Dealloc(&writer); 5611 return -1; 5612 } 5613 5614 const char *start = writer.use_small_buffer ? writer.small_buffer : 5615 PyBytes_AS_STRING(writer.buffer); 5616 Py_ssize_t len = end - start; 5617 5618 char *cache = PyObject_Malloc(len + 1); 5619 if (cache == NULL) { 5620 _PyBytesWriter_Dealloc(&writer); 5621 PyErr_NoMemory(); 5622 return -1; 5623 } 5624 _PyUnicode_UTF8(unicode) = cache; 5625 _PyUnicode_UTF8_LENGTH(unicode) = len; 5626 memcpy(cache, start, len); 5627 cache[len] = '\0'; 5628 _PyBytesWriter_Dealloc(&writer); 5629 return 0; 5630 } 5631 5632 PyObject * _PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5633 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 5634 { 5635 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors); 5636 } 5637 5638 5639 PyObject * PyUnicode_AsUTF8String(PyObject * unicode)5640 PyUnicode_AsUTF8String(PyObject *unicode) 5641 { 5642 return _PyUnicode_AsUTF8String(unicode, NULL); 5643 } 5644 5645 /* --- UTF-32 Codec ------------------------------------------------------- */ 5646 5647 PyObject * PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5648 PyUnicode_DecodeUTF32(const char *s, 5649 Py_ssize_t size, 5650 const char *errors, 5651 int *byteorder) 5652 { 5653 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 5654 } 5655 5656 PyObject * PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5657 PyUnicode_DecodeUTF32Stateful(const char *s, 5658 Py_ssize_t 
size, 5659 const char *errors, 5660 int *byteorder, 5661 Py_ssize_t *consumed) 5662 { 5663 const char *starts = s; 5664 Py_ssize_t startinpos; 5665 Py_ssize_t endinpos; 5666 _PyUnicodeWriter writer; 5667 const unsigned char *q, *e; 5668 int le, bo = 0; /* assume native ordering by default */ 5669 const char *encoding; 5670 const char *errmsg = ""; 5671 PyObject *errorHandler = NULL; 5672 PyObject *exc = NULL; 5673 5674 q = (const unsigned char *)s; 5675 e = q + size; 5676 5677 if (byteorder) 5678 bo = *byteorder; 5679 5680 /* Check for BOM marks (U+FEFF) in the input and adjust current 5681 byte order setting accordingly. In native mode, the leading BOM 5682 mark is skipped, in all other modes, it is copied to the output 5683 stream as-is (giving a ZWNBSP character). */ 5684 if (bo == 0 && size >= 4) { 5685 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5686 if (bom == 0x0000FEFF) { 5687 bo = -1; 5688 q += 4; 5689 } 5690 else if (bom == 0xFFFE0000) { 5691 bo = 1; 5692 q += 4; 5693 } 5694 if (byteorder) 5695 *byteorder = bo; 5696 } 5697 5698 if (q == e) { 5699 if (consumed) 5700 *consumed = size; 5701 _Py_RETURN_UNICODE_EMPTY(); 5702 } 5703 5704 #ifdef WORDS_BIGENDIAN 5705 le = bo < 0; 5706 #else 5707 le = bo <= 0; 5708 #endif 5709 encoding = le ? 
"utf-32-le" : "utf-32-be"; 5710 5711 _PyUnicodeWriter_Init(&writer); 5712 writer.min_length = (e - q + 3) / 4; 5713 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5714 goto onError; 5715 5716 while (1) { 5717 Py_UCS4 ch = 0; 5718 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 5719 5720 if (e - q >= 4) { 5721 enum PyUnicode_Kind kind = writer.kind; 5722 void *data = writer.data; 5723 const unsigned char *last = e - 4; 5724 Py_ssize_t pos = writer.pos; 5725 if (le) { 5726 do { 5727 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5728 if (ch > maxch) 5729 break; 5730 if (kind != PyUnicode_1BYTE_KIND && 5731 Py_UNICODE_IS_SURROGATE(ch)) 5732 break; 5733 PyUnicode_WRITE(kind, data, pos++, ch); 5734 q += 4; 5735 } while (q <= last); 5736 } 5737 else { 5738 do { 5739 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 5740 if (ch > maxch) 5741 break; 5742 if (kind != PyUnicode_1BYTE_KIND && 5743 Py_UNICODE_IS_SURROGATE(ch)) 5744 break; 5745 PyUnicode_WRITE(kind, data, pos++, ch); 5746 q += 4; 5747 } while (q <= last); 5748 } 5749 writer.pos = pos; 5750 } 5751 5752 if (Py_UNICODE_IS_SURROGATE(ch)) { 5753 errmsg = "code point in surrogate code point range(0xd800, 0xe000)"; 5754 startinpos = ((const char *)q) - starts; 5755 endinpos = startinpos + 4; 5756 } 5757 else if (ch <= maxch) { 5758 if (q == e || consumed) 5759 break; 5760 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5761 errmsg = "truncated data"; 5762 startinpos = ((const char *)q) - starts; 5763 endinpos = ((const char *)e) - starts; 5764 } 5765 else { 5766 if (ch < 0x110000) { 5767 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5768 goto onError; 5769 q += 4; 5770 continue; 5771 } 5772 errmsg = "code point not in range(0x110000)"; 5773 startinpos = ((const char *)q) - starts; 5774 endinpos = startinpos + 4; 5775 } 5776 5777 /* The remaining input chars are ignored if the callback 5778 chooses to skip the input */ 5779 if (unicode_decode_call_errorhandler_writer( 5780 errors, &errorHandler, 5781 encoding, errmsg, 5782 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5783 &writer)) 5784 goto onError; 5785 } 5786 5787 if (consumed) 5788 *consumed = (const char *)q-starts; 5789 5790 Py_XDECREF(errorHandler); 5791 Py_XDECREF(exc); 5792 return _PyUnicodeWriter_Finish(&writer); 5793 5794 onError: 5795 _PyUnicodeWriter_Dealloc(&writer); 5796 Py_XDECREF(errorHandler); 5797 Py_XDECREF(exc); 5798 return NULL; 5799 } 5800 5801 PyObject * _PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5802 _PyUnicode_EncodeUTF32(PyObject *str, 5803 const char *errors, 5804 int byteorder) 5805 { 5806 enum PyUnicode_Kind kind; 5807 const void *data; 5808 Py_ssize_t len; 5809 PyObject *v; 5810 uint32_t *out; 5811 #if PY_LITTLE_ENDIAN 5812 int native_ordering = byteorder <= 0; 5813 #else 5814 int native_ordering = byteorder >= 0; 5815 #endif 5816 const char *encoding; 5817 Py_ssize_t nsize, pos; 5818 PyObject *errorHandler = NULL; 5819 PyObject *exc = NULL; 5820 PyObject *rep = NULL; 5821 5822 if (!PyUnicode_Check(str)) { 5823 PyErr_BadArgument(); 5824 return NULL; 5825 } 5826 if (PyUnicode_READY(str) == -1) 5827 return NULL; 5828 kind = PyUnicode_KIND(str); 5829 data = PyUnicode_DATA(str); 5830 len = PyUnicode_GET_LENGTH(str); 5831 5832 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) 5833 return 
PyErr_NoMemory(); 5834 nsize = len + (byteorder == 0); 5835 v = PyBytes_FromStringAndSize(NULL, nsize * 4); 5836 if (v == NULL) 5837 return NULL; 5838 5839 /* output buffer is 4-bytes aligned */ 5840 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); 5841 out = (uint32_t *)PyBytes_AS_STRING(v); 5842 if (byteorder == 0) 5843 *out++ = 0xFEFF; 5844 if (len == 0) 5845 goto done; 5846 5847 if (byteorder == -1) 5848 encoding = "utf-32-le"; 5849 else if (byteorder == 1) 5850 encoding = "utf-32-be"; 5851 else 5852 encoding = "utf-32"; 5853 5854 if (kind == PyUnicode_1BYTE_KIND) { 5855 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5856 goto done; 5857 } 5858 5859 pos = 0; 5860 while (pos < len) { 5861 Py_ssize_t newpos, repsize, moreunits; 5862 5863 if (kind == PyUnicode_2BYTE_KIND) { 5864 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, 5865 &out, native_ordering); 5866 } 5867 else { 5868 assert(kind == PyUnicode_4BYTE_KIND); 5869 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos, 5870 &out, native_ordering); 5871 } 5872 if (pos == len) 5873 break; 5874 5875 rep = unicode_encode_call_errorhandler( 5876 errors, &errorHandler, 5877 encoding, "surrogates not allowed", 5878 str, &exc, pos, pos + 1, &newpos); 5879 if (!rep) 5880 goto error; 5881 5882 if (PyBytes_Check(rep)) { 5883 repsize = PyBytes_GET_SIZE(rep); 5884 if (repsize & 3) { 5885 raise_encode_exception(&exc, encoding, 5886 str, pos, pos + 1, 5887 "surrogates not allowed"); 5888 goto error; 5889 } 5890 moreunits = repsize / 4; 5891 } 5892 else { 5893 assert(PyUnicode_Check(rep)); 5894 if (PyUnicode_READY(rep) < 0) 5895 goto error; 5896 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5897 if (!PyUnicode_IS_ASCII(rep)) { 5898 raise_encode_exception(&exc, encoding, 5899 str, pos, pos + 1, 5900 "surrogates not allowed"); 5901 goto error; 5902 } 5903 } 5904 moreunits += pos - newpos; 5905 pos = newpos; 5906 5907 /* four bytes are reserved for each surrogate */ 
5908 if (moreunits > 0) { 5909 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v); 5910 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) { 5911 /* integer overflow */ 5912 PyErr_NoMemory(); 5913 goto error; 5914 } 5915 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0) 5916 goto error; 5917 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos; 5918 } 5919 5920 if (PyBytes_Check(rep)) { 5921 memcpy(out, PyBytes_AS_STRING(rep), repsize); 5922 out += repsize / 4; 5923 } else /* rep is unicode */ { 5924 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5925 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5926 &out, native_ordering); 5927 } 5928 5929 Py_CLEAR(rep); 5930 } 5931 5932 /* Cut back to size actually needed. This is necessary for, for example, 5933 encoding of a string containing isolated surrogates and the 'ignore' 5934 handler is used. */ 5935 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5936 if (nsize != PyBytes_GET_SIZE(v)) 5937 _PyBytes_Resize(&v, nsize); 5938 Py_XDECREF(errorHandler); 5939 Py_XDECREF(exc); 5940 done: 5941 return v; 5942 error: 5943 Py_XDECREF(rep); 5944 Py_XDECREF(errorHandler); 5945 Py_XDECREF(exc); 5946 Py_XDECREF(v); 5947 return NULL; 5948 } 5949 5950 PyObject * PyUnicode_AsUTF32String(PyObject * unicode)5951 PyUnicode_AsUTF32String(PyObject *unicode) 5952 { 5953 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5954 } 5955 5956 /* --- UTF-16 Codec ------------------------------------------------------- */ 5957 5958 PyObject * PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5959 PyUnicode_DecodeUTF16(const char *s, 5960 Py_ssize_t size, 5961 const char *errors, 5962 int *byteorder) 5963 { 5964 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5965 } 5966 5967 PyObject * PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5968 
PyUnicode_DecodeUTF16Stateful(const char *s, 5969 Py_ssize_t size, 5970 const char *errors, 5971 int *byteorder, 5972 Py_ssize_t *consumed) 5973 { 5974 const char *starts = s; 5975 Py_ssize_t startinpos; 5976 Py_ssize_t endinpos; 5977 _PyUnicodeWriter writer; 5978 const unsigned char *q, *e; 5979 int bo = 0; /* assume native ordering by default */ 5980 int native_ordering; 5981 const char *errmsg = ""; 5982 PyObject *errorHandler = NULL; 5983 PyObject *exc = NULL; 5984 const char *encoding; 5985 5986 q = (const unsigned char *)s; 5987 e = q + size; 5988 5989 if (byteorder) 5990 bo = *byteorder; 5991 5992 /* Check for BOM marks (U+FEFF) in the input and adjust current 5993 byte order setting accordingly. In native mode, the leading BOM 5994 mark is skipped, in all other modes, it is copied to the output 5995 stream as-is (giving a ZWNBSP character). */ 5996 if (bo == 0 && size >= 2) { 5997 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5998 if (bom == 0xFEFF) { 5999 q += 2; 6000 bo = -1; 6001 } 6002 else if (bom == 0xFFFE) { 6003 q += 2; 6004 bo = 1; 6005 } 6006 if (byteorder) 6007 *byteorder = bo; 6008 } 6009 6010 if (q == e) { 6011 if (consumed) 6012 *consumed = size; 6013 _Py_RETURN_UNICODE_EMPTY(); 6014 } 6015 6016 #if PY_LITTLE_ENDIAN 6017 native_ordering = bo <= 0; 6018 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; 6019 #else 6020 native_ordering = bo >= 0; 6021 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le"; 6022 #endif 6023 6024 /* Note: size will always be longer than the resulting Unicode 6025 character count normally. Error handler will take care of 6026 resizing when needed. 
*/ 6027 _PyUnicodeWriter_Init(&writer); 6028 writer.min_length = (e - q + 1) / 2; 6029 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 6030 goto onError; 6031 6032 while (1) { 6033 Py_UCS4 ch = 0; 6034 if (e - q >= 2) { 6035 int kind = writer.kind; 6036 if (kind == PyUnicode_1BYTE_KIND) { 6037 if (PyUnicode_IS_ASCII(writer.buffer)) 6038 ch = asciilib_utf16_decode(&q, e, 6039 (Py_UCS1*)writer.data, &writer.pos, 6040 native_ordering); 6041 else 6042 ch = ucs1lib_utf16_decode(&q, e, 6043 (Py_UCS1*)writer.data, &writer.pos, 6044 native_ordering); 6045 } else if (kind == PyUnicode_2BYTE_KIND) { 6046 ch = ucs2lib_utf16_decode(&q, e, 6047 (Py_UCS2*)writer.data, &writer.pos, 6048 native_ordering); 6049 } else { 6050 assert(kind == PyUnicode_4BYTE_KIND); 6051 ch = ucs4lib_utf16_decode(&q, e, 6052 (Py_UCS4*)writer.data, &writer.pos, 6053 native_ordering); 6054 } 6055 } 6056 6057 switch (ch) 6058 { 6059 case 0: 6060 /* remaining byte at the end? (size should be even) */ 6061 if (q == e || consumed) 6062 goto End; 6063 errmsg = "truncated data"; 6064 startinpos = ((const char *)q) - starts; 6065 endinpos = ((const char *)e) - starts; 6066 break; 6067 /* The remaining input chars are ignored if the callback 6068 chooses to skip the input */ 6069 case 1: 6070 q -= 2; 6071 if (consumed) 6072 goto End; 6073 errmsg = "unexpected end of data"; 6074 startinpos = ((const char *)q) - starts; 6075 endinpos = ((const char *)e) - starts; 6076 break; 6077 case 2: 6078 errmsg = "illegal encoding"; 6079 startinpos = ((const char *)q) - 2 - starts; 6080 endinpos = startinpos + 2; 6081 break; 6082 case 3: 6083 errmsg = "illegal UTF-16 surrogate"; 6084 startinpos = ((const char *)q) - 4 - starts; 6085 endinpos = startinpos + 2; 6086 break; 6087 default: 6088 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 6089 goto onError; 6090 continue; 6091 } 6092 6093 if (unicode_decode_call_errorhandler_writer( 6094 errors, 6095 &errorHandler, 6096 encoding, errmsg, 6097 
&starts, 6098 (const char **)&e, 6099 &startinpos, 6100 &endinpos, 6101 &exc, 6102 (const char **)&q, 6103 &writer)) 6104 goto onError; 6105 } 6106 6107 End: 6108 if (consumed) 6109 *consumed = (const char *)q-starts; 6110 6111 Py_XDECREF(errorHandler); 6112 Py_XDECREF(exc); 6113 return _PyUnicodeWriter_Finish(&writer); 6114 6115 onError: 6116 _PyUnicodeWriter_Dealloc(&writer); 6117 Py_XDECREF(errorHandler); 6118 Py_XDECREF(exc); 6119 return NULL; 6120 } 6121 6122 PyObject * _PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)6123 _PyUnicode_EncodeUTF16(PyObject *str, 6124 const char *errors, 6125 int byteorder) 6126 { 6127 enum PyUnicode_Kind kind; 6128 const void *data; 6129 Py_ssize_t len; 6130 PyObject *v; 6131 unsigned short *out; 6132 Py_ssize_t pairs; 6133 #if PY_BIG_ENDIAN 6134 int native_ordering = byteorder >= 0; 6135 #else 6136 int native_ordering = byteorder <= 0; 6137 #endif 6138 const char *encoding; 6139 Py_ssize_t nsize, pos; 6140 PyObject *errorHandler = NULL; 6141 PyObject *exc = NULL; 6142 PyObject *rep = NULL; 6143 6144 if (!PyUnicode_Check(str)) { 6145 PyErr_BadArgument(); 6146 return NULL; 6147 } 6148 if (PyUnicode_READY(str) == -1) 6149 return NULL; 6150 kind = PyUnicode_KIND(str); 6151 data = PyUnicode_DATA(str); 6152 len = PyUnicode_GET_LENGTH(str); 6153 6154 pairs = 0; 6155 if (kind == PyUnicode_4BYTE_KIND) { 6156 const Py_UCS4 *in = (const Py_UCS4 *)data; 6157 const Py_UCS4 *end = in + len; 6158 while (in < end) { 6159 if (*in++ >= 0x10000) { 6160 pairs++; 6161 } 6162 } 6163 } 6164 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) { 6165 return PyErr_NoMemory(); 6166 } 6167 nsize = len + pairs + (byteorder == 0); 6168 v = PyBytes_FromStringAndSize(NULL, nsize * 2); 6169 if (v == NULL) { 6170 return NULL; 6171 } 6172 6173 /* output buffer is 2-bytes aligned */ 6174 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 6175 out = (unsigned short *)PyBytes_AS_STRING(v); 6176 if (byteorder == 0) { 6177 *out++ = 0xFEFF; 
6178 } 6179 if (len == 0) { 6180 goto done; 6181 } 6182 6183 if (kind == PyUnicode_1BYTE_KIND) { 6184 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); 6185 goto done; 6186 } 6187 6188 if (byteorder < 0) { 6189 encoding = "utf-16-le"; 6190 } 6191 else if (byteorder > 0) { 6192 encoding = "utf-16-be"; 6193 } 6194 else { 6195 encoding = "utf-16"; 6196 } 6197 6198 pos = 0; 6199 while (pos < len) { 6200 Py_ssize_t newpos, repsize, moreunits; 6201 6202 if (kind == PyUnicode_2BYTE_KIND) { 6203 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, 6204 &out, native_ordering); 6205 } 6206 else { 6207 assert(kind == PyUnicode_4BYTE_KIND); 6208 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, 6209 &out, native_ordering); 6210 } 6211 if (pos == len) 6212 break; 6213 6214 rep = unicode_encode_call_errorhandler( 6215 errors, &errorHandler, 6216 encoding, "surrogates not allowed", 6217 str, &exc, pos, pos + 1, &newpos); 6218 if (!rep) 6219 goto error; 6220 6221 if (PyBytes_Check(rep)) { 6222 repsize = PyBytes_GET_SIZE(rep); 6223 if (repsize & 1) { 6224 raise_encode_exception(&exc, encoding, 6225 str, pos, pos + 1, 6226 "surrogates not allowed"); 6227 goto error; 6228 } 6229 moreunits = repsize / 2; 6230 } 6231 else { 6232 assert(PyUnicode_Check(rep)); 6233 if (PyUnicode_READY(rep) < 0) 6234 goto error; 6235 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 6236 if (!PyUnicode_IS_ASCII(rep)) { 6237 raise_encode_exception(&exc, encoding, 6238 str, pos, pos + 1, 6239 "surrogates not allowed"); 6240 goto error; 6241 } 6242 } 6243 moreunits += pos - newpos; 6244 pos = newpos; 6245 6246 /* two bytes are reserved for each surrogate */ 6247 if (moreunits > 0) { 6248 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); 6249 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) { 6250 /* integer overflow */ 6251 PyErr_NoMemory(); 6252 goto error; 6253 } 6254 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * 
moreunits) < 0) 6255 goto error; 6256 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; 6257 } 6258 6259 if (PyBytes_Check(rep)) { 6260 memcpy(out, PyBytes_AS_STRING(rep), repsize); 6261 out += repsize / 2; 6262 } else /* rep is unicode */ { 6263 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 6264 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, 6265 &out, native_ordering); 6266 } 6267 6268 Py_CLEAR(rep); 6269 } 6270 6271 /* Cut back to size actually needed. This is necessary for, for example, 6272 encoding of a string containing isolated surrogates and the 'ignore' handler 6273 is used. */ 6274 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 6275 if (nsize != PyBytes_GET_SIZE(v)) 6276 _PyBytes_Resize(&v, nsize); 6277 Py_XDECREF(errorHandler); 6278 Py_XDECREF(exc); 6279 done: 6280 return v; 6281 error: 6282 Py_XDECREF(rep); 6283 Py_XDECREF(errorHandler); 6284 Py_XDECREF(exc); 6285 Py_XDECREF(v); 6286 return NULL; 6287 #undef STORECHAR 6288 } 6289 6290 PyObject * PyUnicode_AsUTF16String(PyObject * unicode)6291 PyUnicode_AsUTF16String(PyObject *unicode) 6292 { 6293 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 6294 } 6295 6296 /* --- Unicode Escape Codec ----------------------------------------------- */ 6297 6298 static _PyUnicode_Name_CAPI *ucnhash_capi = NULL; 6299 6300 PyObject * _PyUnicode_DecodeUnicodeEscapeInternal(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed,const char ** first_invalid_escape)6301 _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, 6302 Py_ssize_t size, 6303 const char *errors, 6304 Py_ssize_t *consumed, 6305 const char **first_invalid_escape) 6306 { 6307 const char *starts = s; 6308 _PyUnicodeWriter writer; 6309 const char *end; 6310 PyObject *errorHandler = NULL; 6311 PyObject *exc = NULL; 6312 6313 // so we can remember if we've seen an invalid escape char or not 6314 *first_invalid_escape = NULL; 6315 6316 if (size == 0) { 6317 if (consumed) { 6318 *consumed = 0; 
6319 } 6320 _Py_RETURN_UNICODE_EMPTY(); 6321 } 6322 /* Escaped strings will always be longer than the resulting 6323 Unicode string, so we start with size here and then reduce the 6324 length after conversion to the true value. 6325 (but if the error callback returns a long replacement string 6326 we'll have to allocate more space) */ 6327 _PyUnicodeWriter_Init(&writer); 6328 writer.min_length = size; 6329 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { 6330 goto onError; 6331 } 6332 6333 end = s + size; 6334 while (s < end) { 6335 unsigned char c = (unsigned char) *s++; 6336 Py_UCS4 ch; 6337 int count; 6338 const char *message; 6339 6340 #define WRITE_ASCII_CHAR(ch) \ 6341 do { \ 6342 assert(ch <= 127); \ 6343 assert(writer.pos < writer.size); \ 6344 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 6345 } while(0) 6346 6347 #define WRITE_CHAR(ch) \ 6348 do { \ 6349 if (ch <= writer.maxchar) { \ 6350 assert(writer.pos < writer.size); \ 6351 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 6352 } \ 6353 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ 6354 goto onError; \ 6355 } \ 6356 } while(0) 6357 6358 /* Non-escape characters are interpreted as Unicode ordinals */ 6359 if (c != '\\') { 6360 WRITE_CHAR(c); 6361 continue; 6362 } 6363 6364 Py_ssize_t startinpos = s - starts - 1; 6365 /* \ - Escapes */ 6366 if (s >= end) { 6367 message = "\\ at end of string"; 6368 goto incomplete; 6369 } 6370 c = (unsigned char) *s++; 6371 6372 assert(writer.pos < writer.size); 6373 switch (c) { 6374 6375 /* \x escapes */ 6376 case '\n': continue; 6377 case '\\': WRITE_ASCII_CHAR('\\'); continue; 6378 case '\'': WRITE_ASCII_CHAR('\''); continue; 6379 case '\"': WRITE_ASCII_CHAR('\"'); continue; 6380 case 'b': WRITE_ASCII_CHAR('\b'); continue; 6381 /* FF */ 6382 case 'f': WRITE_ASCII_CHAR('\014'); continue; 6383 case 't': WRITE_ASCII_CHAR('\t'); continue; 6384 case 'n': WRITE_ASCII_CHAR('\n'); continue; 6385 case 'r': 
WRITE_ASCII_CHAR('\r'); continue; 6386 /* VT */ 6387 case 'v': WRITE_ASCII_CHAR('\013'); continue; 6388 /* BEL, not classic C */ 6389 case 'a': WRITE_ASCII_CHAR('\007'); continue; 6390 6391 /* \OOO (octal) escapes */ 6392 case '0': case '1': case '2': case '3': 6393 case '4': case '5': case '6': case '7': 6394 ch = c - '0'; 6395 if (s < end && '0' <= *s && *s <= '7') { 6396 ch = (ch<<3) + *s++ - '0'; 6397 if (s < end && '0' <= *s && *s <= '7') { 6398 ch = (ch<<3) + *s++ - '0'; 6399 } 6400 } 6401 if (ch > 0377) { 6402 if (*first_invalid_escape == NULL) { 6403 *first_invalid_escape = s-3; /* Back up 3 chars, since we've 6404 already incremented s. */ 6405 } 6406 } 6407 WRITE_CHAR(ch); 6408 continue; 6409 6410 /* hex escapes */ 6411 /* \xXX */ 6412 case 'x': 6413 count = 2; 6414 message = "truncated \\xXX escape"; 6415 goto hexescape; 6416 6417 /* \uXXXX */ 6418 case 'u': 6419 count = 4; 6420 message = "truncated \\uXXXX escape"; 6421 goto hexescape; 6422 6423 /* \UXXXXXXXX */ 6424 case 'U': 6425 count = 8; 6426 message = "truncated \\UXXXXXXXX escape"; 6427 hexescape: 6428 for (ch = 0; count; ++s, --count) { 6429 if (s >= end) { 6430 goto incomplete; 6431 } 6432 c = (unsigned char)*s; 6433 ch <<= 4; 6434 if (c >= '0' && c <= '9') { 6435 ch += c - '0'; 6436 } 6437 else if (c >= 'a' && c <= 'f') { 6438 ch += c - ('a' - 10); 6439 } 6440 else if (c >= 'A' && c <= 'F') { 6441 ch += c - ('A' - 10); 6442 } 6443 else { 6444 goto error; 6445 } 6446 } 6447 6448 /* when we get here, ch is a 32-bit unicode character */ 6449 if (ch > MAX_UNICODE) { 6450 message = "illegal Unicode character"; 6451 goto error; 6452 } 6453 6454 WRITE_CHAR(ch); 6455 continue; 6456 6457 /* \N{name} */ 6458 case 'N': 6459 if (ucnhash_capi == NULL) { 6460 /* load the unicode data module */ 6461 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 6462 PyUnicodeData_CAPSULE_NAME, 1); 6463 if (ucnhash_capi == NULL) { 6464 PyErr_SetString( 6465 PyExc_UnicodeError, 6466 "\\N escapes not supported (can't 
load unicodedata module)" 6467 ); 6468 goto onError; 6469 } 6470 } 6471 6472 message = "malformed \\N character escape"; 6473 if (s >= end) { 6474 goto incomplete; 6475 } 6476 if (*s == '{') { 6477 const char *start = ++s; 6478 size_t namelen; 6479 /* look for the closing brace */ 6480 while (s < end && *s != '}') 6481 s++; 6482 if (s >= end) { 6483 goto incomplete; 6484 } 6485 namelen = s - start; 6486 if (namelen) { 6487 /* found a name. look it up in the unicode database */ 6488 s++; 6489 ch = 0xffffffff; /* in case 'getcode' messes up */ 6490 if (namelen <= INT_MAX && 6491 ucnhash_capi->getcode(start, (int)namelen, 6492 &ch, 0)) { 6493 assert(ch <= MAX_UNICODE); 6494 WRITE_CHAR(ch); 6495 continue; 6496 } 6497 message = "unknown Unicode character name"; 6498 } 6499 } 6500 goto error; 6501 6502 default: 6503 if (*first_invalid_escape == NULL) { 6504 *first_invalid_escape = s-1; /* Back up one char, since we've 6505 already incremented s. */ 6506 } 6507 WRITE_ASCII_CHAR('\\'); 6508 WRITE_CHAR(c); 6509 continue; 6510 } 6511 6512 incomplete: 6513 if (consumed) { 6514 *consumed = startinpos; 6515 break; 6516 } 6517 error:; 6518 Py_ssize_t endinpos = s-starts; 6519 writer.min_length = end - s + writer.pos; 6520 if (unicode_decode_call_errorhandler_writer( 6521 errors, &errorHandler, 6522 "unicodeescape", message, 6523 &starts, &end, &startinpos, &endinpos, &exc, &s, 6524 &writer)) { 6525 goto onError; 6526 } 6527 assert(end - s <= writer.size - writer.pos); 6528 6529 #undef WRITE_ASCII_CHAR 6530 #undef WRITE_CHAR 6531 } 6532 6533 Py_XDECREF(errorHandler); 6534 Py_XDECREF(exc); 6535 return _PyUnicodeWriter_Finish(&writer); 6536 6537 onError: 6538 _PyUnicodeWriter_Dealloc(&writer); 6539 Py_XDECREF(errorHandler); 6540 Py_XDECREF(exc); 6541 return NULL; 6542 } 6543 6544 PyObject * _PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6545 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s, 6546 Py_ssize_t size, 
6547 const char *errors, 6548 Py_ssize_t *consumed) 6549 { 6550 const char *first_invalid_escape; 6551 PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors, 6552 consumed, 6553 &first_invalid_escape); 6554 if (result == NULL) 6555 return NULL; 6556 if (first_invalid_escape != NULL) { 6557 unsigned char c = *first_invalid_escape; 6558 if ('4' <= c && c <= '7') { 6559 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 6560 "invalid octal escape sequence '\\%.3s'", 6561 first_invalid_escape) < 0) 6562 { 6563 Py_DECREF(result); 6564 return NULL; 6565 } 6566 } 6567 else { 6568 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 6569 "invalid escape sequence '\\%c'", 6570 c) < 0) 6571 { 6572 Py_DECREF(result); 6573 return NULL; 6574 } 6575 } 6576 } 6577 return result; 6578 } 6579 6580 PyObject * PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6581 PyUnicode_DecodeUnicodeEscape(const char *s, 6582 Py_ssize_t size, 6583 const char *errors) 6584 { 6585 return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL); 6586 } 6587 6588 /* Return a Unicode-Escape string version of the Unicode object. */ 6589 6590 PyObject * PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6591 PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 6592 { 6593 Py_ssize_t i, len; 6594 PyObject *repr; 6595 char *p; 6596 enum PyUnicode_Kind kind; 6597 const void *data; 6598 Py_ssize_t expandsize; 6599 6600 /* Initial allocation is based on the longest-possible character 6601 escape. 6602 6603 For UCS1 strings it's '\xxx', 4 bytes per source character. 6604 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 6605 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 
6606 */ 6607 6608 if (!PyUnicode_Check(unicode)) { 6609 PyErr_BadArgument(); 6610 return NULL; 6611 } 6612 if (PyUnicode_READY(unicode) == -1) { 6613 return NULL; 6614 } 6615 6616 len = PyUnicode_GET_LENGTH(unicode); 6617 if (len == 0) { 6618 return PyBytes_FromStringAndSize(NULL, 0); 6619 } 6620 6621 kind = PyUnicode_KIND(unicode); 6622 data = PyUnicode_DATA(unicode); 6623 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6624 bytes, and 1 byte characters 4. */ 6625 expandsize = kind * 2 + 2; 6626 if (len > PY_SSIZE_T_MAX / expandsize) { 6627 return PyErr_NoMemory(); 6628 } 6629 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6630 if (repr == NULL) { 6631 return NULL; 6632 } 6633 6634 p = PyBytes_AS_STRING(repr); 6635 for (i = 0; i < len; i++) { 6636 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6637 6638 /* U+0000-U+00ff range */ 6639 if (ch < 0x100) { 6640 if (ch >= ' ' && ch < 127) { 6641 if (ch != '\\') { 6642 /* Copy printable US ASCII as-is */ 6643 *p++ = (char) ch; 6644 } 6645 /* Escape backslashes */ 6646 else { 6647 *p++ = '\\'; 6648 *p++ = '\\'; 6649 } 6650 } 6651 6652 /* Map special whitespace to '\t', \n', '\r' */ 6653 else if (ch == '\t') { 6654 *p++ = '\\'; 6655 *p++ = 't'; 6656 } 6657 else if (ch == '\n') { 6658 *p++ = '\\'; 6659 *p++ = 'n'; 6660 } 6661 else if (ch == '\r') { 6662 *p++ = '\\'; 6663 *p++ = 'r'; 6664 } 6665 6666 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */ 6667 else { 6668 *p++ = '\\'; 6669 *p++ = 'x'; 6670 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6671 *p++ = Py_hexdigits[ch & 0x000F]; 6672 } 6673 } 6674 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */ 6675 else if (ch < 0x10000) { 6676 *p++ = '\\'; 6677 *p++ = 'u'; 6678 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 6679 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 6680 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6681 *p++ = Py_hexdigits[ch & 0x000F]; 6682 } 6683 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */ 
else {

            /* Make sure that the first two digits are zero */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            /* Emit '\U00' followed by the six hex digits of ch */
            *p++ = '\\';
            *p++ = 'U';
            *p++ = '0';
            *p++ = '0';
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
            *p++ = Py_hexdigits[ch & 0x0000000F];
        }
    }

    assert(p - PyBytes_AS_STRING(repr) > 0);
    /* Resize repr to the number of bytes actually written through p */
    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
        return NULL;
    }
    return repr;
}

/* --- Raw Unicode Escape Codec ------------------------------------------- */

/* Decode the `size` bytes at `s` from the raw-unicode-escape encoding.

   Bytes other than a backslash are copied through as the code points
   0-255.  A backslash followed by 'u' or 'U' introduces 4 or 8 hex digits
   giving the code point; a backslash followed by any other byte is written
   literally, followed by that byte.

   `errors` names the error handler that is invoked (through
   unicode_decode_call_errorhandler_writer()) for a malformed or
   out-of-range escape.

   `consumed` selects stateful decoding.  When it is non-NULL, an escape that
   is cut off by the end of the input stops the decoding and *consumed
   receives the offset of its backslash.  *consumed is written only in that
   case and for empty input; after a complete decode it is left untouched.
   When `consumed` is NULL, a truncated escape is passed to the error handler
   and a lone trailing backslash is copied through literally.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL on error. */
PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
                                          Py_ssize_t size,
                                          const char *errors,
                                          Py_ssize_t *consumed)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. (But the decoding error
       handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

        /* Write one code point: directly into the buffer when it fits the
           writer's current maxchar, otherwise through the writer (which may
           fail, hence the goto). */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals.
           A backslash that is the very last byte is also copied through
           when decoding is not stateful (consumed == NULL). */
        if (c != '\\' || (s >= end && !consumed)) {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash that starts this escape */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            assert(consumed);
            // Assign message only to silence a compiler warning;
            // the value is never used on this path.
            message = "\\ at end of string";
            goto incomplete;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* Not an escape: emit the backslash and the byte after it */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        for (ch = 0; count; ++s, --count) {
            if (s >= end) {
                goto incomplete;
            }
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                /* Non-hex byte inside the escape: reported with the
                   "truncated ..." message chosen above */
                goto error;
            }
        }
        if (ch > MAX_UNICODE) {
            message = "\\Uxxxxxxxx out of range";
            goto error;
        }
        WRITE_CHAR(ch);
        continue;

      incomplete:
        /* Stateful decoding: report where the unfinished escape starts and
           stop.  Otherwise fall through and treat it as an error. */
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;   /* empty statement: a label cannot directly precede a declaration */
        Py_ssize_t endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}

/* Non-stateful variant: passes consumed == NULL to the stateful decoder. */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    return
_PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL); 6852 } 6853 6854 6855 PyObject * PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6856 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6857 { 6858 PyObject *repr; 6859 char *p; 6860 Py_ssize_t expandsize, pos; 6861 int kind; 6862 const void *data; 6863 Py_ssize_t len; 6864 6865 if (!PyUnicode_Check(unicode)) { 6866 PyErr_BadArgument(); 6867 return NULL; 6868 } 6869 if (PyUnicode_READY(unicode) == -1) { 6870 return NULL; 6871 } 6872 kind = PyUnicode_KIND(unicode); 6873 data = PyUnicode_DATA(unicode); 6874 len = PyUnicode_GET_LENGTH(unicode); 6875 if (kind == PyUnicode_1BYTE_KIND) { 6876 return PyBytes_FromStringAndSize(data, len); 6877 } 6878 6879 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6880 bytes, and 1 byte characters 4. */ 6881 expandsize = kind * 2 + 2; 6882 6883 if (len > PY_SSIZE_T_MAX / expandsize) { 6884 return PyErr_NoMemory(); 6885 } 6886 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6887 if (repr == NULL) { 6888 return NULL; 6889 } 6890 if (len == 0) { 6891 return repr; 6892 } 6893 6894 p = PyBytes_AS_STRING(repr); 6895 for (pos = 0; pos < len; pos++) { 6896 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6897 6898 /* U+0000-U+00ff range: Copy 8-bit characters as-is */ 6899 if (ch < 0x100) { 6900 *p++ = (char) ch; 6901 } 6902 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */ 6903 else if (ch < 0x10000) { 6904 *p++ = '\\'; 6905 *p++ = 'u'; 6906 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6907 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6908 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6909 *p++ = Py_hexdigits[ch & 15]; 6910 } 6911 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */ 6912 else { 6913 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff); 6914 *p++ = '\\'; 6915 *p++ = 'U'; 6916 *p++ = '0'; 6917 *p++ = '0'; 6918 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6919 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6920 *p++ = 
Py_hexdigits[(ch >> 12) & 0xf]; 6921 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6922 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6923 *p++ = Py_hexdigits[ch & 15]; 6924 } 6925 } 6926 6927 assert(p > PyBytes_AS_STRING(repr)); 6928 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) { 6929 return NULL; 6930 } 6931 return repr; 6932 } 6933 6934 /* --- Latin-1 Codec ------------------------------------------------------ */ 6935 6936 PyObject * PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6937 PyUnicode_DecodeLatin1(const char *s, 6938 Py_ssize_t size, 6939 const char *errors) 6940 { 6941 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 6942 return _PyUnicode_FromUCS1((const unsigned char*)s, size); 6943 } 6944 6945 /* create or adjust a UnicodeEncodeError */ 6946 static void make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6947 make_encode_exception(PyObject **exceptionObject, 6948 const char *encoding, 6949 PyObject *unicode, 6950 Py_ssize_t startpos, Py_ssize_t endpos, 6951 const char *reason) 6952 { 6953 if (*exceptionObject == NULL) { 6954 *exceptionObject = PyObject_CallFunction( 6955 PyExc_UnicodeEncodeError, "sOnns", 6956 encoding, unicode, startpos, endpos, reason); 6957 } 6958 else { 6959 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6960 goto onError; 6961 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6962 goto onError; 6963 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6964 goto onError; 6965 return; 6966 onError: 6967 Py_CLEAR(*exceptionObject); 6968 } 6969 } 6970 6971 /* raises a UnicodeEncodeError */ 6972 static void raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6973 raise_encode_exception(PyObject **exceptionObject, 6974 const char *encoding, 6975 PyObject *unicode, 
Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason)
{
    /* Build (or update) the exception object, then hand it to the "strict"
       handler.  Nothing is raised if the exception could not be created. */
    make_encode_exception(exceptionObject,
                          encoding, unicode, startpos, endpos, reason);
    if (*exceptionObject != NULL)
        PyCodec_StrictErrors(*exceptionObject);
}

/* Error handling callback helper:
   look up the handler named by `errors` (cached in *errorHandler), build or
   update the exception in *exceptionObject, call the handler with it and
   validate the result.

   The handler must return a (str-or-bytes, int) tuple.  The int is stored in
   *newpos; a negative value is taken relative to the length of `unicode`,
   and a position outside [0, len] raises IndexError.

   Returns a new reference to the replacement object, which the caller has
   to release, or NULL on error. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
                                 PyObject **errorHandler,
                                 const char *encoding, const char *reason,
                                 PyObject *unicode, PyObject **exceptionObject,
                                 Py_ssize_t startpos, Py_ssize_t endpos,
                                 Py_ssize_t *newpos)
{
    /* PyArg_ParseTuple format ("On") followed by ';' and the error message;
       &argparse[3] below is the message alone. */
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
    Py_ssize_t len;
    PyObject *restuple;
    PyObject *resunicode;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            return NULL;
    }

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    len = PyUnicode_GET_LENGTH(unicode);

    make_encode_exception(exceptionObject,
                          encoding, unicode, startpos, endpos, reason);
    if (*exceptionObject == NULL)
        return NULL;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        return NULL;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        Py_DECREF(restuple);
        return NULL;
    }
    if (!PyArg_ParseTuple(restuple, argparse,
                          &resunicode, newpos)) {
        Py_DECREF(restuple);
        return NULL;
    }
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        Py_DECREF(restuple);
        return NULL;
    }
    /* A negative position counts from the end of the string */
    if (*newpos<0)
        *newpos = len + *newpos;
    if (*newpos<0 || *newpos>len) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
        Py_DECREF(restuple);
        return NULL;
    }
    /* Take our own reference to the replacement before the tuple goes away */
    Py_INCREF(resunicode);
    Py_DECREF(restuple);
    return resunicode;
}

/* Encode `unicode` into bytes, one byte per character, accepting only code
   points below `limit`.  A limit of 256 is reported as "latin-1" in error
   messages, any other limit as "ascii".

   Runs of characters >= limit are handled according to `errors`: the
   handlers recognised by _Py_GetErrorHandler() are implemented inline, any
   other name goes through unicode_encode_call_errorhandler().

   Returns the result of _PyBytesWriter_Finish(), or NULL on error. */
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
                    const char *errors,
                    const Py_UCS4 limit)
{
    /* input state */
    Py_ssize_t pos=0, size;
    int kind;
    const void *data;
    /* pointer into the output */
    char *str;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    PyObject *rep = NULL;
    /* output object */
    _PyBytesWriter writer;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    size = PyUnicode_GET_LENGTH(unicode);
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    _PyBytesWriter_Init(&writer);
    str = _PyBytesWriter_Alloc(&writer, size);
    if (str == NULL)
        return NULL;

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* can we encode this? */
        if (ch < limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)ch;
            ++pos;
        }
        else {
            Py_ssize_t newpos, i;
            /* startpos for collecting unencodable chars */
            Py_ssize_t collstart = pos;
            Py_ssize_t collend = collstart + 1;
            /* find all unencodable characters */

            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                ++collend;

            /* Only overallocate the buffer if it's not the last write */
            writer.overallocate = (collend < size);

            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (error_handler == _Py_ERROR_UNKNOWN)
                error_handler = _Py_GetErrorHandler(errors);

            switch (error_handler) {
            case _Py_ERROR_STRICT:
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                goto onError;

            case _Py_ERROR_REPLACE:
                /* one '?' per unencodable character; the space was already
                   reserved by the initial allocation */
                memset(str, '?', collend - collstart);
                str += (collend - collstart);
                /* fall through */
            case _Py_ERROR_IGNORE:
                pos = collend;
                break;

            case _Py_ERROR_BACKSLASHREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = backslashreplace(&writer, str,
                                       unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_XMLCHARREFREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = xmlcharrefreplace(&writer, str,
                                        unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_SURROGATEESCAPE:
                /* Map U+DC80..U+DCFF back to the bytes 0x80..0xFF; stop at
                   the first character outside that range. */
                for (i = collstart; i < collend; ++i) {
                    ch = PyUnicode_READ(kind, data, i);
                    if (ch < 0xdc80 || 0xdcff < ch) {
                        /* Not a UTF-8b surrogate */
                        break;
                    }
                    *str++ = (char)(ch - 0xdc00);
                    ++pos;
                }
                if (i >= collend)
                    break;
                /* The rest of the run is handed to the generic handler */
                collstart = pos;
                assert(collstart != collend);
                /* fall through */

            default:
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
                                                       encoding, reason, unicode, &exc,
                                                       collstart, collend, &newpos);
                if (rep == NULL)
                    goto onError;

                if (newpos < collstart) {
                    /* The handler moved backwards: the characters between
                       newpos and collstart will be encoded again, so reserve
                       room for them. */
                    writer.overallocate = 1;
                    str = _PyBytesWriter_Prepare(&writer, str,
                                                 collstart - newpos);
                    if (str == NULL)
                        goto onError;
                }
                else {
                    /* subtract preallocated bytes */
                    writer.min_size -= newpos - collstart;
                    /* Only overallocate the buffer if it's not the last write */
                    writer.overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    /* Directly copy bytes result to output. */
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyBytes_AS_STRING(rep),
                                                    PyBytes_GET_SIZE(rep));
                }
                else {
                    assert(PyUnicode_Check(rep));

                    if (PyUnicode_READY(rep) < 0)
                        goto onError;

                    if (limit == 256 ?
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
                        !PyUnicode_IS_ASCII(rep))
                    {
                        /* Not all characters are smaller than limit */
                        raise_encode_exception(&exc, encoding, unicode,
                                               collstart, collend, reason);
                        goto onError;
                    }
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyUnicode_DATA(rep),
                                                    PyUnicode_GET_LENGTH(rep));
                }
                if (str == NULL)
                    goto onError;

                /* Resume at the position chosen by the handler */
                pos = newpos;
                Py_CLEAR(rep);
            }

            /* If overallocation was disabled, ensure that it was the last
               write. Otherwise, we missed an optimization */
            assert(writer.overallocate || pos == size);
        }
    }

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyBytesWriter_Finish(&writer, str);

  onError:
    Py_XDECREF(rep);
    _PyBytesWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}

/* Encode `unicode` to Latin-1 bytes using the error handler named by
   `errors`.  A non-str argument is rejected with PyErr_BadArgument(). */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    /* Fast path: if it is a one-byte string, construct
       bytes object directly. */
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                         PyUnicode_GET_LENGTH(unicode));
    /* Non-Latin-1 characters present. Defer to above function to
       raise the exception. */
    return unicode_encode_ucs1(unicode, errors, 256);
}

/* Same as _PyUnicode_AsLatin1String() with errors == NULL. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    return _PyUnicode_AsLatin1String(unicode, NULL);
}

/* --- 7-bit ASCII Codec -------------------------------------------------- */

/* Decode the `size` bytes at `s` as ASCII.

   The input is first decoded optimistically with ascii_decode(); only when
   that stops before the end are the remaining bytes processed one at a
   time, with bytes >= 128 handled according to `errors`.

   Returns a str object, or NULL on error. */
PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{
    const char *starts = s;
    const char *e = s + size;
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        return get_latin1_char((unsigned char)s[0]);
    }

    // Shortcut for simple case
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
    if (outpos == size) {
        return u;
    }

    /* Slow path: continue after the bytes already decoded, writing into
       the same object through a writer. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = outpos;

    s += outpos;
    /* Local copies of the writer's kind/data; refreshed after every call
       that may change them. */
    int kind = writer.kind;
    void *data = writer.data;
    Py_ssize_t startinpos, endinpos;

    while (s < e) {
        unsigned char c = (unsigned char)*s;
        if (c < 128) {
            PyUnicode_WRITE(kind, data, writer.pos, c);
            writer.pos++;
            ++s;
            continue;
        }

        /* byte outside range 0x00..0x7f: call the error handler */

        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler)
        {
        case _Py_ERROR_REPLACE:
        case _Py_ERROR_SURROGATEESCAPE:
            /* Fast-path: the error handler only writes one character,
               but we may switch to UCS2 at the first write */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            kind = writer.kind;
            data = writer.data;

            /* "replace" writes U+FFFD, "surrogateescape" writes the byte
               offset by 0xdc00 */
            if (error_handler == _Py_ERROR_REPLACE)
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
            else
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
            writer.pos++;
            ++s;
            break;

        case _Py_ERROR_IGNORE:
            ++s;
            break;

        default:
            startinpos = s-starts;
            endinpos = startinpos + 1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            kind = writer.kind;
            data = writer.data;
        }
    }
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}

/* Encode `unicode` to ASCII bytes using the error handler named by
   `errors`.  A non-str argument is rejected with PyErr_BadArgument(). */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    /* Fast path: if it is an ASCII-only string, construct bytes object
       directly. Else defer to above function to raise the exception. */
    if (PyUnicode_IS_ASCII(unicode))
        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                         PyUnicode_GET_LENGTH(unicode));
    return unicode_encode_ucs1(unicode, errors, 128);
}

/* Same as _PyUnicode_AsASCIIString() with errors == NULL. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    return _PyUnicode_AsASCIIString(unicode, NULL);
}

#ifdef MS_WINDOWS

/* --- MBCS codecs for Windows -------------------------------------------- */

/* When int is narrower than size_t, a buffer can be larger than a single
   int-sized call can take, so it is processed in chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases also and avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif

/* Return the name used for `code_page` in error messages.

   For CP_ACP, CP_UTF7 and CP_UTF8 a string literal is returned and *obj is
   set to NULL.  For any other code page *obj receives a bytes object
   holding "cp<number>" and the returned pointer refers to its buffer, so
   the caller has to keep *obj alive while the name is in use.  Returns NULL
   if that object cannot be created. */
static const char*
code_page_name(UINT code_page, PyObject **obj)
{
    *obj = NULL;
    if (code_page == CP_ACP)
        return "mbcs";
    if (code_page == CP_UTF7)
        return "CP_UTF7";
    if (code_page == CP_UTF8)
        return "CP_UTF8";

    *obj = PyBytes_FromFormat("cp%u", code_page);
    if (*obj ==
NULL)
        return NULL;
    return PyBytes_AS_STRING(*obj);
}

/* Return the MultiByteToWideChar() flags to use when decoding from
   `code_page`: 0 for CP_UTF7, MB_ERR_INVALID_CHARS for everything else. */
static DWORD
decode_code_page_flags(UINT code_page)
{
    if (code_page == CP_UTF7) {
        /* The CP_UTF7 decoder only supports flags=0 */
        return 0;
    }
    else
        return MB_ERR_INVALID_CHARS;
}

/*
 * Decode a byte string from a Windows code page in strict mode, appending
 * the result to the wchar_t buffer *buf (whose current length is *bufsize).
 *
 * Returns the consumed size (insize) on success, returns -2 on a decode
 * error, or raises an OSError and returns -1 on any other error.
 */
static int
decode_code_page_strict(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in,
                        int insize)
{
    DWORD flags = MB_ERR_INVALID_CHARS;
    wchar_t *out;
    /* NOTE(review): outsize is an unsigned DWORD, so the "<= 0" tests below
       are effectively "== 0". */
    DWORD outsize;

    /* First get the size of the result */
    assert(insize > 0);
    while ((outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize, NULL, 0)) <= 0)
    {
        /* Retry once with flags=0 if the code page rejected the flags;
           any other failure (or a failure with flags already 0) is final. */
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
            goto error;
        }
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
        flags = 0;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
        return -1;
    }
    out = *buf + n;

    /* Do the conversion */
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
    if (outsize <= 0)
        goto error;
    return insize;

error:
    /* GetLastError() must be read before any other call can overwrite it */
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
        return -2;
    PyErr_SetFromWindowsErr(0);
    return -1;
}

/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
*
 * The input is decoded one character at a time and appended to the wchar_t
 * buffer *buf; *bufsize is updated to the new length.
 *
 * Returns the consumed size on success, or raises an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* Room for Py_ARRAY_LENGTH(buffer) wchar_t per input byte, checked
       against overflow first */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character: try 1 input byte, then grow the window by one
           byte at a time until the conversion succeeds */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* Report a single undecodable byte to the error handler */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            /* Recompute out from *buf, which is passed by address to the
               handler call above */
            out = *buf + outpos;
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

error:
    /* Shared exit path: ret is still -1 unless the decode completed */
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}

/* Decode the `size` bytes at `s` from the Windows code page `code_page`.

   Each chunk is first decoded in strict mode; if that reports a decode
   error (-2) the chunk is decoded again with the error handler named by
   `errors`.  When NEED_RETRY is defined, input longer than
   DECODING_CHUNK_SIZE is processed in several chunks.

   If `consumed` is non-NULL it receives the number of bytes decoded, and
   the last chunk is decoded with final == 0.

   Returns a str object, or NULL on error. */
static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    wchar_t *buf = NULL;
    Py_ssize_t bufsize = 0;
    int chunk_size, final, converted, done;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }
    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (consumed)
        *consumed = 0;

    do
    {
#ifdef NEED_RETRY
        if (size > DECODING_CHUNK_SIZE) {
            chunk_size = DECODING_CHUNK_SIZE;
            final = 0;
            done = 0;
        }
        else
#endif
        {
            chunk_size = (int)size;
            final = (consumed == NULL);
            done = 1;
        }

        /* Nothing left to decode: return what has been accumulated, or the
           empty string if nothing was decoded at all */
        if (chunk_size == 0 && done) {
            if (buf != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        converted = decode_code_page_strict(code_page, &buf, &bufsize,
                                            s, chunk_size);
        if (converted == -2)
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
                                                s, chunk_size,
                                                errors, final);
        assert(converted != 0 || done);

        if (converted < 0) {
            PyMem_Free(buf);
            return NULL;
        }

        if (consumed)
            *consumed += converted;

        s += converted;
        size -= converted;
    } while (!done);

    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
    PyMem_Free(buf);
    return v;
}

/* Public entry point: decode from an arbitrary Windows code page. */
PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,
                                 const char *s,
                                 Py_ssize_t size,
                                 const char *errors,
                                 Py_ssize_t *consumed)
{
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
}

/* Public entry point: stateful decode using the code page CP_ACP. */
PyObject *
PyUnicode_DecodeMBCSStateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
}

/* Public entry point: non-stateful decode using CP_ACP (consumed == NULL). */
PyObject *
PyUnicode_DecodeMBCS(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}

/* Return the WideCharToMultiByte() flags to use when encoding to
   `code_page` with the error handler named by `errors`:
   WC_ERR_INVALID_CHARS for CP_UTF8, 0 for CP_UTF7, and for every other code
   page 0 with the "replace" handler or WC_NO_BEST_FIT_CHARS otherwise. */
static DWORD
encode_code_page_flags(UINT code_page, const char *errors)
{
    if (code_page == CP_UTF8) {
        return WC_ERR_INVALID_CHARS;
    }
    else if (code_page == CP_UTF7) {
        /* CP_UTF7 only supports flags=0 */
        return 0;
    }
    else {
        if (errors != NULL && strcmp(errors, "replace") == 0)
            return 0;
        else
            return WC_NO_BEST_FIT_CHARS;
    }
}

/*
 * Encode a Unicode string to a Windows code page into a byte string in strict
 *
mode.
 *
 * The `len` characters of `unicode` starting at `offset` are encoded and
 * appended to *outbytes, which is created if it is NULL.
 *
 * Returns 0 on success, returns -2 on an encode error, or raises an OSError
 * and returns -1 on any other error.
 */
static int
encode_code_page_strict(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t offset, int len,
                        const char* errors)
{
    BOOL usedDefaultChar = FALSE;
    /* NOTE(review): this initial value is overwritten unconditionally by the
       if/else below. */
    BOOL *pusedDefaultChar = &usedDefaultChar;
    int outsize;
    wchar_t *p;
    Py_ssize_t size;
    /* NOTE(review): `errors` is not read anywhere in this function; the
       flags are always computed as for errors == NULL. */
    const DWORD flags = encode_code_page_flags(code_page, NULL);
    char *out;
    /* Create a substring so that we can get the UTF-16 representation
       of just the slice under consideration. */
    PyObject *substring;
    int ret = -1;

    assert(len > 0);

    /* No "used default char" reporting is requested for CP_UTF8 and CP_UTF7
       (presumably because WideCharToMultiByte() requires a NULL
       lpUsedDefaultChar for these code pages -- confirm against the Win32
       documentation). */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    substring = PyUnicode_Substring(unicode, offset, offset+len);
    if (substring == NULL)
        return -1;
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    /* substring stays referenced until `done`, where it is released */
    p = PyUnicode_AsUnicodeAndSize(substring, &size);
    if (p == NULL) {
        Py_DECREF(substring);
        return -1;
    }
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    /* p is a separate buffer here: substring is released right away and p
       is freed at `done` */
    p = PyUnicode_AsWideCharString(substring, &size);
    Py_CLEAR(substring);
    if (p == NULL) {
        return -1;
    }
#endif /* USE_UNICODE_WCHAR_CACHE */
    assert(size <= INT_MAX);

    /* First get the size of the result */
    outsize = WideCharToMultiByte(code_page, flags,
                                  p, (int)size,
                                  NULL, 0,
                                  NULL, pusedDefaultChar);
    if (outsize <= 0)
        goto error;
    /* If we used a default char, then we failed! */
    if (pusedDefaultChar && *pusedDefaultChar) {
        ret = -2;
        goto done;
    }

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL) {
            goto done;
        }
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        const Py_ssize_t n = PyBytes_Size(*outbytes);
        if (outsize > PY_SSIZE_T_MAX - n) {
            PyErr_NoMemory();
            goto done;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
            goto done;
        }
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Do the conversion */
    outsize = WideCharToMultiByte(code_page, flags,
                                  p, (int)size,
                                  out, outsize,
                                  NULL, pusedDefaultChar);
    if (outsize <= 0)
        goto error;
    if (pusedDefaultChar && *pusedDefaultChar) {
        ret = -2;
        goto done;
    }
    ret = 0;

done:
    /* Single exit point: release whichever of substring / p is still owned */
#if USE_UNICODE_WCHAR_CACHE
    Py_DECREF(substring);
#else /* USE_UNICODE_WCHAR_CACHE */
    PyMem_Free(p);
#endif /* USE_UNICODE_WCHAR_CACHE */
    return ret;

error:
    /* GetLastError() must be read before any other call can overwrite it */
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
        ret = -2;
        goto done;
    }
    PyErr_SetFromWindowsErr(0);
    goto done;
}

/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * Returns consumed characters if succeed, or raise an OSError and returns
 * -1 on other error.
7839 */ 7840 static int encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7841 encode_code_page_errors(UINT code_page, PyObject **outbytes, 7842 PyObject *unicode, Py_ssize_t unicode_offset, 7843 Py_ssize_t insize, const char* errors) 7844 { 7845 const DWORD flags = encode_code_page_flags(code_page, errors); 7846 Py_ssize_t pos = unicode_offset; 7847 Py_ssize_t endin = unicode_offset + insize; 7848 /* Ideally, we should get reason from FormatMessage. This is the Windows 7849 2000 English version of the message. */ 7850 const char *reason = "invalid character"; 7851 /* 4=maximum length of a UTF-8 sequence */ 7852 char buffer[4]; 7853 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7854 Py_ssize_t outsize; 7855 char *out; 7856 PyObject *errorHandler = NULL; 7857 PyObject *exc = NULL; 7858 PyObject *encoding_obj = NULL; 7859 const char *encoding; 7860 Py_ssize_t newpos, newoutsize; 7861 PyObject *rep; 7862 int ret = -1; 7863 7864 assert(insize > 0); 7865 7866 encoding = code_page_name(code_page, &encoding_obj); 7867 if (encoding == NULL) 7868 return -1; 7869 7870 if (errors == NULL || strcmp(errors, "strict") == 0) { 7871 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7872 then we raise a UnicodeEncodeError. 
*/ 7873 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7874 if (exc != NULL) { 7875 PyCodec_StrictErrors(exc); 7876 Py_DECREF(exc); 7877 } 7878 Py_XDECREF(encoding_obj); 7879 return -1; 7880 } 7881 7882 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7883 pusedDefaultChar = &usedDefaultChar; 7884 else 7885 pusedDefaultChar = NULL; 7886 7887 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7888 PyErr_NoMemory(); 7889 goto error; 7890 } 7891 outsize = insize * Py_ARRAY_LENGTH(buffer); 7892 7893 if (*outbytes == NULL) { 7894 /* Create string object */ 7895 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7896 if (*outbytes == NULL) 7897 goto error; 7898 out = PyBytes_AS_STRING(*outbytes); 7899 } 7900 else { 7901 /* Extend string object */ 7902 Py_ssize_t n = PyBytes_Size(*outbytes); 7903 if (n > PY_SSIZE_T_MAX - outsize) { 7904 PyErr_NoMemory(); 7905 goto error; 7906 } 7907 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7908 goto error; 7909 out = PyBytes_AS_STRING(*outbytes) + n; 7910 } 7911 7912 /* Encode the string character per character */ 7913 while (pos < endin) 7914 { 7915 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7916 wchar_t chars[2]; 7917 int charsize; 7918 if (ch < 0x10000) { 7919 chars[0] = (wchar_t)ch; 7920 charsize = 1; 7921 } 7922 else { 7923 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7924 chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7925 charsize = 2; 7926 } 7927 7928 outsize = WideCharToMultiByte(code_page, flags, 7929 chars, charsize, 7930 buffer, Py_ARRAY_LENGTH(buffer), 7931 NULL, pusedDefaultChar); 7932 if (outsize > 0) { 7933 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7934 { 7935 pos++; 7936 memcpy(out, buffer, outsize); 7937 out += outsize; 7938 continue; 7939 } 7940 } 7941 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7942 PyErr_SetFromWindowsErr(0); 7943 goto error; 7944 } 7945 7946 rep = unicode_encode_call_errorhandler( 7947 errors, &errorHandler, encoding, reason, 7948 unicode, &exc, 
7949 pos, pos + 1, &newpos); 7950 if (rep == NULL) 7951 goto error; 7952 7953 Py_ssize_t morebytes = pos - newpos; 7954 if (PyBytes_Check(rep)) { 7955 outsize = PyBytes_GET_SIZE(rep); 7956 morebytes += outsize; 7957 if (morebytes > 0) { 7958 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7959 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes; 7960 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7961 Py_DECREF(rep); 7962 goto error; 7963 } 7964 out = PyBytes_AS_STRING(*outbytes) + offset; 7965 } 7966 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7967 out += outsize; 7968 } 7969 else { 7970 Py_ssize_t i; 7971 enum PyUnicode_Kind kind; 7972 const void *data; 7973 7974 if (PyUnicode_READY(rep) == -1) { 7975 Py_DECREF(rep); 7976 goto error; 7977 } 7978 7979 outsize = PyUnicode_GET_LENGTH(rep); 7980 morebytes += outsize; 7981 if (morebytes > 0) { 7982 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7983 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes; 7984 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7985 Py_DECREF(rep); 7986 goto error; 7987 } 7988 out = PyBytes_AS_STRING(*outbytes) + offset; 7989 } 7990 kind = PyUnicode_KIND(rep); 7991 data = PyUnicode_DATA(rep); 7992 for (i=0; i < outsize; i++) { 7993 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7994 if (ch > 127) { 7995 raise_encode_exception(&exc, 7996 encoding, unicode, 7997 pos, pos + 1, 7998 "unable to encode error handler result to ASCII"); 7999 Py_DECREF(rep); 8000 goto error; 8001 } 8002 *out = (unsigned char)ch; 8003 out++; 8004 } 8005 } 8006 pos = newpos; 8007 Py_DECREF(rep); 8008 } 8009 /* write a NUL byte */ 8010 *out = 0; 8011 outsize = out - PyBytes_AS_STRING(*outbytes); 8012 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 8013 if (_PyBytes_Resize(outbytes, outsize) < 0) 8014 goto error; 8015 ret = 0; 8016 8017 error: 8018 Py_XDECREF(encoding_obj); 8019 Py_XDECREF(errorHandler); 8020 Py_XDECREF(exc); 8021 return ret; 8022 } 8023 8024 static PyObject * encode_code_page(int 
code_page,PyObject * unicode,const char * errors)8025 encode_code_page(int code_page, 8026 PyObject *unicode, 8027 const char *errors) 8028 { 8029 Py_ssize_t len; 8030 PyObject *outbytes = NULL; 8031 Py_ssize_t offset; 8032 int chunk_len, ret, done; 8033 8034 if (!PyUnicode_Check(unicode)) { 8035 PyErr_BadArgument(); 8036 return NULL; 8037 } 8038 8039 if (PyUnicode_READY(unicode) == -1) 8040 return NULL; 8041 len = PyUnicode_GET_LENGTH(unicode); 8042 8043 if (code_page < 0) { 8044 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 8045 return NULL; 8046 } 8047 8048 if (len == 0) 8049 return PyBytes_FromStringAndSize(NULL, 0); 8050 8051 offset = 0; 8052 do 8053 { 8054 #ifdef NEED_RETRY 8055 if (len > DECODING_CHUNK_SIZE) { 8056 chunk_len = DECODING_CHUNK_SIZE; 8057 done = 0; 8058 } 8059 else 8060 #endif 8061 { 8062 chunk_len = (int)len; 8063 done = 1; 8064 } 8065 8066 ret = encode_code_page_strict(code_page, &outbytes, 8067 unicode, offset, chunk_len, 8068 errors); 8069 if (ret == -2) 8070 ret = encode_code_page_errors(code_page, &outbytes, 8071 unicode, offset, 8072 chunk_len, errors); 8073 if (ret < 0) { 8074 Py_XDECREF(outbytes); 8075 return NULL; 8076 } 8077 8078 offset += chunk_len; 8079 len -= chunk_len; 8080 } while (!done); 8081 8082 return outbytes; 8083 } 8084 8085 PyObject * PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)8086 PyUnicode_EncodeCodePage(int code_page, 8087 PyObject *unicode, 8088 const char *errors) 8089 { 8090 return encode_code_page(code_page, unicode, errors); 8091 } 8092 8093 PyObject * PyUnicode_AsMBCSString(PyObject * unicode)8094 PyUnicode_AsMBCSString(PyObject *unicode) 8095 { 8096 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 8097 } 8098 8099 #undef NEED_RETRY 8100 8101 #endif /* MS_WINDOWS */ 8102 8103 /* --- Character Mapping Codec -------------------------------------------- */ 8104 8105 static int charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const 
char * errors,_PyUnicodeWriter * writer)8106 charmap_decode_string(const char *s, 8107 Py_ssize_t size, 8108 PyObject *mapping, 8109 const char *errors, 8110 _PyUnicodeWriter *writer) 8111 { 8112 const char *starts = s; 8113 const char *e; 8114 Py_ssize_t startinpos, endinpos; 8115 PyObject *errorHandler = NULL, *exc = NULL; 8116 Py_ssize_t maplen; 8117 enum PyUnicode_Kind mapkind; 8118 const void *mapdata; 8119 Py_UCS4 x; 8120 unsigned char ch; 8121 8122 if (PyUnicode_READY(mapping) == -1) 8123 return -1; 8124 8125 maplen = PyUnicode_GET_LENGTH(mapping); 8126 mapdata = PyUnicode_DATA(mapping); 8127 mapkind = PyUnicode_KIND(mapping); 8128 8129 e = s + size; 8130 8131 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { 8132 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 8133 * is disabled in encoding aliases, latin1 is preferred because 8134 * its implementation is faster. */ 8135 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata; 8136 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 8137 Py_UCS4 maxchar = writer->maxchar; 8138 8139 assert (writer->kind == PyUnicode_1BYTE_KIND); 8140 while (s < e) { 8141 ch = *s; 8142 x = mapdata_ucs1[ch]; 8143 if (x > maxchar) { 8144 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) 8145 goto onError; 8146 maxchar = writer->maxchar; 8147 outdata = (Py_UCS1 *)writer->data; 8148 } 8149 outdata[writer->pos] = x; 8150 writer->pos++; 8151 ++s; 8152 } 8153 return 0; 8154 } 8155 8156 while (s < e) { 8157 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 8158 enum PyUnicode_Kind outkind = writer->kind; 8159 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata; 8160 if (outkind == PyUnicode_1BYTE_KIND) { 8161 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 8162 Py_UCS4 maxchar = writer->maxchar; 8163 while (s < e) { 8164 ch = *s; 8165 x = mapdata_ucs2[ch]; 8166 if (x > maxchar) 8167 goto Error; 8168 outdata[writer->pos] = x; 8169 writer->pos++; 8170 ++s; 8171 } 8172 break; 8173 } 8174 else if (outkind == 
PyUnicode_2BYTE_KIND) { 8175 Py_UCS2 *outdata = (Py_UCS2 *)writer->data; 8176 while (s < e) { 8177 ch = *s; 8178 x = mapdata_ucs2[ch]; 8179 if (x == 0xFFFE) 8180 goto Error; 8181 outdata[writer->pos] = x; 8182 writer->pos++; 8183 ++s; 8184 } 8185 break; 8186 } 8187 } 8188 ch = *s; 8189 8190 if (ch < maplen) 8191 x = PyUnicode_READ(mapkind, mapdata, ch); 8192 else 8193 x = 0xfffe; /* invalid value */ 8194 Error: 8195 if (x == 0xfffe) 8196 { 8197 /* undefined mapping */ 8198 startinpos = s-starts; 8199 endinpos = startinpos+1; 8200 if (unicode_decode_call_errorhandler_writer( 8201 errors, &errorHandler, 8202 "charmap", "character maps to <undefined>", 8203 &starts, &e, &startinpos, &endinpos, &exc, &s, 8204 writer)) { 8205 goto onError; 8206 } 8207 continue; 8208 } 8209 8210 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0) 8211 goto onError; 8212 ++s; 8213 } 8214 Py_XDECREF(errorHandler); 8215 Py_XDECREF(exc); 8216 return 0; 8217 8218 onError: 8219 Py_XDECREF(errorHandler); 8220 Py_XDECREF(exc); 8221 return -1; 8222 } 8223 8224 static int charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8225 charmap_decode_mapping(const char *s, 8226 Py_ssize_t size, 8227 PyObject *mapping, 8228 const char *errors, 8229 _PyUnicodeWriter *writer) 8230 { 8231 const char *starts = s; 8232 const char *e; 8233 Py_ssize_t startinpos, endinpos; 8234 PyObject *errorHandler = NULL, *exc = NULL; 8235 unsigned char ch; 8236 PyObject *key, *item = NULL; 8237 8238 e = s + size; 8239 8240 while (s < e) { 8241 ch = *s; 8242 8243 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 8244 key = PyLong_FromLong((long)ch); 8245 if (key == NULL) 8246 goto onError; 8247 8248 item = PyObject_GetItem(mapping, key); 8249 Py_DECREF(key); 8250 if (item == NULL) { 8251 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8252 /* No mapping found means: mapping is undefined. 
*/ 8253 PyErr_Clear(); 8254 goto Undefined; 8255 } else 8256 goto onError; 8257 } 8258 8259 /* Apply mapping */ 8260 if (item == Py_None) 8261 goto Undefined; 8262 if (PyLong_Check(item)) { 8263 long value = PyLong_AS_LONG(item); 8264 if (value == 0xFFFE) 8265 goto Undefined; 8266 if (value < 0 || value > MAX_UNICODE) { 8267 PyErr_Format(PyExc_TypeError, 8268 "character mapping must be in range(0x%x)", 8269 (unsigned long)MAX_UNICODE + 1); 8270 goto onError; 8271 } 8272 8273 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 8274 goto onError; 8275 } 8276 else if (PyUnicode_Check(item)) { 8277 if (PyUnicode_READY(item) == -1) 8278 goto onError; 8279 if (PyUnicode_GET_LENGTH(item) == 1) { 8280 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 8281 if (value == 0xFFFE) 8282 goto Undefined; 8283 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 8284 goto onError; 8285 } 8286 else { 8287 writer->overallocate = 1; 8288 if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 8289 goto onError; 8290 } 8291 } 8292 else { 8293 /* wrong return value */ 8294 PyErr_SetString(PyExc_TypeError, 8295 "character mapping must return integer, None or str"); 8296 goto onError; 8297 } 8298 Py_CLEAR(item); 8299 ++s; 8300 continue; 8301 8302 Undefined: 8303 /* undefined mapping */ 8304 Py_CLEAR(item); 8305 startinpos = s-starts; 8306 endinpos = startinpos+1; 8307 if (unicode_decode_call_errorhandler_writer( 8308 errors, &errorHandler, 8309 "charmap", "character maps to <undefined>", 8310 &starts, &e, &startinpos, &endinpos, &exc, &s, 8311 writer)) { 8312 goto onError; 8313 } 8314 } 8315 Py_XDECREF(errorHandler); 8316 Py_XDECREF(exc); 8317 return 0; 8318 8319 onError: 8320 Py_XDECREF(item); 8321 Py_XDECREF(errorHandler); 8322 Py_XDECREF(exc); 8323 return -1; 8324 } 8325 8326 PyObject * PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8327 PyUnicode_DecodeCharmap(const char *s, 8328 Py_ssize_t size, 8329 PyObject *mapping, 8330 const char 
*errors) 8331 { 8332 _PyUnicodeWriter writer; 8333 8334 /* Default to Latin-1 */ 8335 if (mapping == NULL) 8336 return PyUnicode_DecodeLatin1(s, size, errors); 8337 8338 if (size == 0) 8339 _Py_RETURN_UNICODE_EMPTY(); 8340 _PyUnicodeWriter_Init(&writer); 8341 writer.min_length = size; 8342 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 8343 goto onError; 8344 8345 if (PyUnicode_CheckExact(mapping)) { 8346 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 8347 goto onError; 8348 } 8349 else { 8350 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 8351 goto onError; 8352 } 8353 return _PyUnicodeWriter_Finish(&writer); 8354 8355 onError: 8356 _PyUnicodeWriter_Dealloc(&writer); 8357 return NULL; 8358 } 8359 8360 /* Charmap encoding: the lookup table */ 8361 8362 /*[clinic input] 8363 class EncodingMap "struct encoding_map *" "&EncodingMapType" 8364 [clinic start generated code]*/ 8365 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/ 8366 8367 struct encoding_map { 8368 PyObject_HEAD 8369 unsigned char level1[32]; 8370 int count2, count3; 8371 unsigned char level23[1]; 8372 }; 8373 8374 /*[clinic input] 8375 EncodingMap.size 8376 8377 Return the size (in bytes) of this object. 
8378 [clinic start generated code]*/ 8379 8380 static PyObject * EncodingMap_size_impl(struct encoding_map * self)8381 EncodingMap_size_impl(struct encoding_map *self) 8382 /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/ 8383 { 8384 return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 + 8385 128*self->count3); 8386 } 8387 8388 static PyMethodDef encoding_map_methods[] = { 8389 ENCODINGMAP_SIZE_METHODDEF 8390 {NULL, NULL} 8391 }; 8392 8393 static PyTypeObject EncodingMapType = { 8394 PyVarObject_HEAD_INIT(NULL, 0) 8395 .tp_name = "EncodingMap", 8396 .tp_basicsize = sizeof(struct encoding_map), 8397 /* methods */ 8398 .tp_flags = Py_TPFLAGS_DEFAULT, 8399 .tp_methods = encoding_map_methods, 8400 }; 8401 8402 PyObject* PyUnicode_BuildEncodingMap(PyObject * string)8403 PyUnicode_BuildEncodingMap(PyObject* string) 8404 { 8405 PyObject *result; 8406 struct encoding_map *mresult; 8407 int i; 8408 int need_dict = 0; 8409 unsigned char level1[32]; 8410 unsigned char level2[512]; 8411 unsigned char *mlevel1, *mlevel2, *mlevel3; 8412 int count2 = 0, count3 = 0; 8413 int kind; 8414 const void *data; 8415 Py_ssize_t length; 8416 Py_UCS4 ch; 8417 8418 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 8419 PyErr_BadArgument(); 8420 return NULL; 8421 } 8422 kind = PyUnicode_KIND(string); 8423 data = PyUnicode_DATA(string); 8424 length = PyUnicode_GET_LENGTH(string); 8425 length = Py_MIN(length, 256); 8426 memset(level1, 0xFF, sizeof level1); 8427 memset(level2, 0xFF, sizeof level2); 8428 8429 /* If there isn't a one-to-one mapping of NULL to \0, 8430 or if there are non-BMP characters, we need to use 8431 a mapping dictionary. 
*/ 8432 if (PyUnicode_READ(kind, data, 0) != 0) 8433 need_dict = 1; 8434 for (i = 1; i < length; i++) { 8435 int l1, l2; 8436 ch = PyUnicode_READ(kind, data, i); 8437 if (ch == 0 || ch > 0xFFFF) { 8438 need_dict = 1; 8439 break; 8440 } 8441 if (ch == 0xFFFE) 8442 /* unmapped character */ 8443 continue; 8444 l1 = ch >> 11; 8445 l2 = ch >> 7; 8446 if (level1[l1] == 0xFF) 8447 level1[l1] = count2++; 8448 if (level2[l2] == 0xFF) 8449 level2[l2] = count3++; 8450 } 8451 8452 if (count2 >= 0xFF || count3 >= 0xFF) 8453 need_dict = 1; 8454 8455 if (need_dict) { 8456 PyObject *result = PyDict_New(); 8457 PyObject *key, *value; 8458 if (!result) 8459 return NULL; 8460 for (i = 0; i < length; i++) { 8461 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 8462 value = PyLong_FromLong(i); 8463 if (!key || !value) 8464 goto failed1; 8465 if (PyDict_SetItem(result, key, value) == -1) 8466 goto failed1; 8467 Py_DECREF(key); 8468 Py_DECREF(value); 8469 } 8470 return result; 8471 failed1: 8472 Py_XDECREF(key); 8473 Py_XDECREF(value); 8474 Py_DECREF(result); 8475 return NULL; 8476 } 8477 8478 /* Create a three-level trie */ 8479 result = PyObject_Malloc(sizeof(struct encoding_map) + 8480 16*count2 + 128*count3 - 1); 8481 if (!result) { 8482 return PyErr_NoMemory(); 8483 } 8484 8485 _PyObject_Init(result, &EncodingMapType); 8486 mresult = (struct encoding_map*)result; 8487 mresult->count2 = count2; 8488 mresult->count3 = count3; 8489 mlevel1 = mresult->level1; 8490 mlevel2 = mresult->level23; 8491 mlevel3 = mresult->level23 + 16*count2; 8492 memcpy(mlevel1, level1, 32); 8493 memset(mlevel2, 0xFF, 16*count2); 8494 memset(mlevel3, 0, 128*count3); 8495 count3 = 0; 8496 for (i = 1; i < length; i++) { 8497 int o1, o2, o3, i2, i3; 8498 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8499 if (ch == 0xFFFE) 8500 /* unmapped character */ 8501 continue; 8502 o1 = ch>>11; 8503 o2 = (ch>>7) & 0xF; 8504 i2 = 16*mlevel1[o1] + o2; 8505 if (mlevel2[i2] == 0xFF) 8506 mlevel2[i2] = count3++; 8507 o3 = ch 
& 0x7F; 8508 i3 = 128*mlevel2[i2] + o3; 8509 mlevel3[i3] = i; 8510 } 8511 return result; 8512 } 8513 8514 static int encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8515 encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 8516 { 8517 struct encoding_map *map = (struct encoding_map*)mapping; 8518 int l1 = c>>11; 8519 int l2 = (c>>7) & 0xF; 8520 int l3 = c & 0x7F; 8521 int i; 8522 8523 if (c > 0xFFFF) 8524 return -1; 8525 if (c == 0) 8526 return 0; 8527 /* level 1*/ 8528 i = map->level1[l1]; 8529 if (i == 0xFF) { 8530 return -1; 8531 } 8532 /* level 2*/ 8533 i = map->level23[16*i+l2]; 8534 if (i == 0xFF) { 8535 return -1; 8536 } 8537 /* level 3 */ 8538 i = map->level23[16*map->count2 + 128*i + l3]; 8539 if (i == 0) { 8540 return -1; 8541 } 8542 return i; 8543 } 8544 8545 /* Lookup the character ch in the mapping. If the character 8546 can't be found, Py_None is returned (or NULL, if another 8547 error occurred). */ 8548 static PyObject * charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8549 charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 8550 { 8551 PyObject *w = PyLong_FromLong((long)c); 8552 PyObject *x; 8553 8554 if (w == NULL) 8555 return NULL; 8556 x = PyObject_GetItem(mapping, w); 8557 Py_DECREF(w); 8558 if (x == NULL) { 8559 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8560 /* No mapping found means: mapping is undefined. 
*/ 8561 PyErr_Clear(); 8562 Py_RETURN_NONE; 8563 } else 8564 return NULL; 8565 } 8566 else if (x == Py_None) 8567 return x; 8568 else if (PyLong_Check(x)) { 8569 long value = PyLong_AS_LONG(x); 8570 if (value < 0 || value > 255) { 8571 PyErr_SetString(PyExc_TypeError, 8572 "character mapping must be in range(256)"); 8573 Py_DECREF(x); 8574 return NULL; 8575 } 8576 return x; 8577 } 8578 else if (PyBytes_Check(x)) 8579 return x; 8580 else { 8581 /* wrong return value */ 8582 PyErr_Format(PyExc_TypeError, 8583 "character mapping must return integer, bytes or None, not %.400s", 8584 Py_TYPE(x)->tp_name); 8585 Py_DECREF(x); 8586 return NULL; 8587 } 8588 } 8589 8590 static int charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8591 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8592 { 8593 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8594 /* exponentially overallocate to minimize reallocations */ 8595 if (requiredsize < 2*outsize) 8596 requiredsize = 2*outsize; 8597 if (_PyBytes_Resize(outobj, requiredsize)) 8598 return -1; 8599 return 0; 8600 } 8601 8602 typedef enum charmapencode_result { 8603 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8604 } charmapencode_result; 8605 /* lookup the character, put the result in the output string and adjust 8606 various state variables. Resize the output bytes object if not enough 8607 space is available. Return a new reference to the object that 8608 was put in the output buffer, or Py_None, if the mapping was undefined 8609 (in which case no character was written) or NULL, if a 8610 reallocation error occurred. 
The caller must decref the result */ 8611 static charmapencode_result charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8612 charmapencode_output(Py_UCS4 c, PyObject *mapping, 8613 PyObject **outobj, Py_ssize_t *outpos) 8614 { 8615 PyObject *rep; 8616 char *outstart; 8617 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8618 8619 if (Py_IS_TYPE(mapping, &EncodingMapType)) { 8620 int res = encoding_map_lookup(c, mapping); 8621 Py_ssize_t requiredsize = *outpos+1; 8622 if (res == -1) 8623 return enc_FAILED; 8624 if (outsize<requiredsize) 8625 if (charmapencode_resize(outobj, outpos, requiredsize)) 8626 return enc_EXCEPTION; 8627 outstart = PyBytes_AS_STRING(*outobj); 8628 outstart[(*outpos)++] = (char)res; 8629 return enc_SUCCESS; 8630 } 8631 8632 rep = charmapencode_lookup(c, mapping); 8633 if (rep==NULL) 8634 return enc_EXCEPTION; 8635 else if (rep==Py_None) { 8636 Py_DECREF(rep); 8637 return enc_FAILED; 8638 } else { 8639 if (PyLong_Check(rep)) { 8640 Py_ssize_t requiredsize = *outpos+1; 8641 if (outsize<requiredsize) 8642 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8643 Py_DECREF(rep); 8644 return enc_EXCEPTION; 8645 } 8646 outstart = PyBytes_AS_STRING(*outobj); 8647 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8648 } 8649 else { 8650 const char *repchars = PyBytes_AS_STRING(rep); 8651 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8652 Py_ssize_t requiredsize = *outpos+repsize; 8653 if (outsize<requiredsize) 8654 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8655 Py_DECREF(rep); 8656 return enc_EXCEPTION; 8657 } 8658 outstart = PyBytes_AS_STRING(*outobj); 8659 memcpy(outstart + *outpos, repchars, repsize); 8660 *outpos += repsize; 8661 } 8662 } 8663 Py_DECREF(rep); 8664 return enc_SUCCESS; 8665 } 8666 8667 /* handle an error in PyUnicode_EncodeCharmap 8668 Return 0 on success, -1 on error */ 8669 static int charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject 
** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8670 charmap_encoding_error( 8671 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8672 PyObject **exceptionObject, 8673 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors, 8674 PyObject **res, Py_ssize_t *respos) 8675 { 8676 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8677 Py_ssize_t size, repsize; 8678 Py_ssize_t newpos; 8679 enum PyUnicode_Kind kind; 8680 const void *data; 8681 Py_ssize_t index; 8682 /* startpos for collecting unencodable chars */ 8683 Py_ssize_t collstartpos = *inpos; 8684 Py_ssize_t collendpos = *inpos+1; 8685 Py_ssize_t collpos; 8686 const char *encoding = "charmap"; 8687 const char *reason = "character maps to <undefined>"; 8688 charmapencode_result x; 8689 Py_UCS4 ch; 8690 int val; 8691 8692 if (PyUnicode_READY(unicode) == -1) 8693 return -1; 8694 size = PyUnicode_GET_LENGTH(unicode); 8695 /* find all unencodable characters */ 8696 while (collendpos < size) { 8697 PyObject *rep; 8698 if (Py_IS_TYPE(mapping, &EncodingMapType)) { 8699 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8700 val = encoding_map_lookup(ch, mapping); 8701 if (val != -1) 8702 break; 8703 ++collendpos; 8704 continue; 8705 } 8706 8707 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8708 rep = charmapencode_lookup(ch, mapping); 8709 if (rep==NULL) 8710 return -1; 8711 else if (rep!=Py_None) { 8712 Py_DECREF(rep); 8713 break; 8714 } 8715 Py_DECREF(rep); 8716 ++collendpos; 8717 } 8718 /* cache callback name lookup 8719 * (if not done yet, i.e. 
it's the first error) */ 8720 if (*error_handler == _Py_ERROR_UNKNOWN) 8721 *error_handler = _Py_GetErrorHandler(errors); 8722 8723 switch (*error_handler) { 8724 case _Py_ERROR_STRICT: 8725 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8726 return -1; 8727 8728 case _Py_ERROR_REPLACE: 8729 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8730 x = charmapencode_output('?', mapping, res, respos); 8731 if (x==enc_EXCEPTION) { 8732 return -1; 8733 } 8734 else if (x==enc_FAILED) { 8735 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8736 return -1; 8737 } 8738 } 8739 /* fall through */ 8740 case _Py_ERROR_IGNORE: 8741 *inpos = collendpos; 8742 break; 8743 8744 case _Py_ERROR_XMLCHARREFREPLACE: 8745 /* generate replacement (temporarily (mis)uses p) */ 8746 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8747 char buffer[2+29+1+1]; 8748 char *cp; 8749 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8750 for (cp = buffer; *cp; ++cp) { 8751 x = charmapencode_output(*cp, mapping, res, respos); 8752 if (x==enc_EXCEPTION) 8753 return -1; 8754 else if (x==enc_FAILED) { 8755 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8756 return -1; 8757 } 8758 } 8759 } 8760 *inpos = collendpos; 8761 break; 8762 8763 default: 8764 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj, 8765 encoding, reason, unicode, exceptionObject, 8766 collstartpos, collendpos, &newpos); 8767 if (repunicode == NULL) 8768 return -1; 8769 if (PyBytes_Check(repunicode)) { 8770 /* Directly copy bytes result to output. */ 8771 Py_ssize_t outsize = PyBytes_Size(*res); 8772 Py_ssize_t requiredsize; 8773 repsize = PyBytes_Size(repunicode); 8774 requiredsize = *respos + repsize; 8775 if (requiredsize > outsize) 8776 /* Make room for all additional bytes. 
*/ 8777 if (charmapencode_resize(res, respos, requiredsize)) { 8778 Py_DECREF(repunicode); 8779 return -1; 8780 } 8781 memcpy(PyBytes_AsString(*res) + *respos, 8782 PyBytes_AsString(repunicode), repsize); 8783 *respos += repsize; 8784 *inpos = newpos; 8785 Py_DECREF(repunicode); 8786 break; 8787 } 8788 /* generate replacement */ 8789 if (PyUnicode_READY(repunicode) == -1) { 8790 Py_DECREF(repunicode); 8791 return -1; 8792 } 8793 repsize = PyUnicode_GET_LENGTH(repunicode); 8794 data = PyUnicode_DATA(repunicode); 8795 kind = PyUnicode_KIND(repunicode); 8796 for (index = 0; index < repsize; index++) { 8797 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8798 x = charmapencode_output(repch, mapping, res, respos); 8799 if (x==enc_EXCEPTION) { 8800 Py_DECREF(repunicode); 8801 return -1; 8802 } 8803 else if (x==enc_FAILED) { 8804 Py_DECREF(repunicode); 8805 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8806 return -1; 8807 } 8808 } 8809 *inpos = newpos; 8810 Py_DECREF(repunicode); 8811 } 8812 return 0; 8813 } 8814 8815 PyObject * _PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8816 _PyUnicode_EncodeCharmap(PyObject *unicode, 8817 PyObject *mapping, 8818 const char *errors) 8819 { 8820 /* output object */ 8821 PyObject *res = NULL; 8822 /* current input position */ 8823 Py_ssize_t inpos = 0; 8824 Py_ssize_t size; 8825 /* current output position */ 8826 Py_ssize_t respos = 0; 8827 PyObject *error_handler_obj = NULL; 8828 PyObject *exc = NULL; 8829 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 8830 const void *data; 8831 int kind; 8832 8833 if (PyUnicode_READY(unicode) == -1) 8834 return NULL; 8835 size = PyUnicode_GET_LENGTH(unicode); 8836 data = PyUnicode_DATA(unicode); 8837 kind = PyUnicode_KIND(unicode); 8838 8839 /* Default to Latin-1 */ 8840 if (mapping == NULL) 8841 return unicode_encode_ucs1(unicode, errors, 256); 8842 8843 /* allocate enough for a simple encoding without 
8844 replacements, if we need more, we'll resize */ 8845 res = PyBytes_FromStringAndSize(NULL, size); 8846 if (res == NULL) 8847 goto onError; 8848 if (size == 0) 8849 return res; 8850 8851 while (inpos<size) { 8852 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8853 /* try to encode it */ 8854 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8855 if (x==enc_EXCEPTION) /* error */ 8856 goto onError; 8857 if (x==enc_FAILED) { /* unencodable character */ 8858 if (charmap_encoding_error(unicode, &inpos, mapping, 8859 &exc, 8860 &error_handler, &error_handler_obj, errors, 8861 &res, &respos)) { 8862 goto onError; 8863 } 8864 } 8865 else 8866 /* done with this character => adjust input position */ 8867 ++inpos; 8868 } 8869 8870 /* Resize if we allocated to much */ 8871 if (respos<PyBytes_GET_SIZE(res)) 8872 if (_PyBytes_Resize(&res, respos) < 0) 8873 goto onError; 8874 8875 Py_XDECREF(exc); 8876 Py_XDECREF(error_handler_obj); 8877 return res; 8878 8879 onError: 8880 Py_XDECREF(res); 8881 Py_XDECREF(exc); 8882 Py_XDECREF(error_handler_obj); 8883 return NULL; 8884 } 8885 8886 PyObject * PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8887 PyUnicode_AsCharmapString(PyObject *unicode, 8888 PyObject *mapping) 8889 { 8890 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8891 PyErr_BadArgument(); 8892 return NULL; 8893 } 8894 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8895 } 8896 8897 /* create or adjust a UnicodeTranslateError */ 8898 static void make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8899 make_translate_exception(PyObject **exceptionObject, 8900 PyObject *unicode, 8901 Py_ssize_t startpos, Py_ssize_t endpos, 8902 const char *reason) 8903 { 8904 if (*exceptionObject == NULL) { 8905 *exceptionObject = _PyUnicodeTranslateError_Create( 8906 unicode, startpos, endpos, reason); 8907 } 8908 else { 8909 if 
(PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8910 goto onError; 8911 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8912 goto onError; 8913 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8914 goto onError; 8915 return; 8916 onError: 8917 Py_CLEAR(*exceptionObject); 8918 } 8919 } 8920 8921 /* error handling callback helper: 8922 build arguments, call the callback and check the arguments, 8923 put the result into newpos and return the replacement string, which 8924 has to be freed by the caller */ 8925 static PyObject * unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8926 unicode_translate_call_errorhandler(const char *errors, 8927 PyObject **errorHandler, 8928 const char *reason, 8929 PyObject *unicode, PyObject **exceptionObject, 8930 Py_ssize_t startpos, Py_ssize_t endpos, 8931 Py_ssize_t *newpos) 8932 { 8933 static const char *argparse = "Un;translating error handler must return (str, int) tuple"; 8934 8935 Py_ssize_t i_newpos; 8936 PyObject *restuple; 8937 PyObject *resunicode; 8938 8939 if (*errorHandler == NULL) { 8940 *errorHandler = PyCodec_LookupError(errors); 8941 if (*errorHandler == NULL) 8942 return NULL; 8943 } 8944 8945 make_translate_exception(exceptionObject, 8946 unicode, startpos, endpos, reason); 8947 if (*exceptionObject == NULL) 8948 return NULL; 8949 8950 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject); 8951 if (restuple == NULL) 8952 return NULL; 8953 if (!PyTuple_Check(restuple)) { 8954 PyErr_SetString(PyExc_TypeError, &argparse[3]); 8955 Py_DECREF(restuple); 8956 return NULL; 8957 } 8958 if (!PyArg_ParseTuple(restuple, argparse, 8959 &resunicode, &i_newpos)) { 8960 Py_DECREF(restuple); 8961 return NULL; 8962 } 8963 if (i_newpos<0) 8964 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8965 else 8966 *newpos = i_newpos; 
8967 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8968 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8969 Py_DECREF(restuple); 8970 return NULL; 8971 } 8972 Py_INCREF(resunicode); 8973 Py_DECREF(restuple); 8974 return resunicode; 8975 } 8976 8977 /* Lookup the character ch in the mapping and put the result in result, 8978 which must be decrefed by the caller. 8979 Return 0 on success, -1 on error */ 8980 static int charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8981 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8982 { 8983 PyObject *w = PyLong_FromLong((long)c); 8984 PyObject *x; 8985 8986 if (w == NULL) 8987 return -1; 8988 x = PyObject_GetItem(mapping, w); 8989 Py_DECREF(w); 8990 if (x == NULL) { 8991 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8992 /* No mapping found means: use 1:1 mapping. */ 8993 PyErr_Clear(); 8994 *result = NULL; 8995 return 0; 8996 } else 8997 return -1; 8998 } 8999 else if (x == Py_None) { 9000 *result = x; 9001 return 0; 9002 } 9003 else if (PyLong_Check(x)) { 9004 long value = PyLong_AS_LONG(x); 9005 if (value < 0 || value > MAX_UNICODE) { 9006 PyErr_Format(PyExc_ValueError, 9007 "character mapping must be in range(0x%x)", 9008 MAX_UNICODE+1); 9009 Py_DECREF(x); 9010 return -1; 9011 } 9012 *result = x; 9013 return 0; 9014 } 9015 else if (PyUnicode_Check(x)) { 9016 *result = x; 9017 return 0; 9018 } 9019 else { 9020 /* wrong return value */ 9021 PyErr_SetString(PyExc_TypeError, 9022 "character mapping must return integer, None or str"); 9023 Py_DECREF(x); 9024 return -1; 9025 } 9026 } 9027 9028 /* lookup the character, write the result into the writer. 9029 Return 1 if the result was written into the writer, return 0 if the mapping 9030 was undefined, raise an exception return -1 on error. 
*/ 9031 static int charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)9032 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, 9033 _PyUnicodeWriter *writer) 9034 { 9035 PyObject *item; 9036 9037 if (charmaptranslate_lookup(ch, mapping, &item)) 9038 return -1; 9039 9040 if (item == NULL) { 9041 /* not found => default to 1:1 mapping */ 9042 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 9043 return -1; 9044 } 9045 return 1; 9046 } 9047 9048 if (item == Py_None) { 9049 Py_DECREF(item); 9050 return 0; 9051 } 9052 9053 if (PyLong_Check(item)) { 9054 long ch = (Py_UCS4)PyLong_AS_LONG(item); 9055 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 9056 used it */ 9057 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 9058 Py_DECREF(item); 9059 return -1; 9060 } 9061 Py_DECREF(item); 9062 return 1; 9063 } 9064 9065 if (!PyUnicode_Check(item)) { 9066 Py_DECREF(item); 9067 return -1; 9068 } 9069 9070 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { 9071 Py_DECREF(item); 9072 return -1; 9073 } 9074 9075 Py_DECREF(item); 9076 return 1; 9077 } 9078 9079 static int unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9080 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, 9081 Py_UCS1 *translate) 9082 { 9083 PyObject *item = NULL; 9084 int ret = 0; 9085 9086 if (charmaptranslate_lookup(ch, mapping, &item)) { 9087 return -1; 9088 } 9089 9090 if (item == Py_None) { 9091 /* deletion */ 9092 translate[ch] = 0xfe; 9093 } 9094 else if (item == NULL) { 9095 /* not found => default to 1:1 mapping */ 9096 translate[ch] = ch; 9097 return 1; 9098 } 9099 else if (PyLong_Check(item)) { 9100 long replace = PyLong_AS_LONG(item); 9101 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 9102 used it */ 9103 if (127 < replace) { 9104 /* invalid character or character outside ASCII: 9105 skip the fast translate */ 9106 goto exit; 9107 } 9108 translate[ch] = (Py_UCS1)replace; 9109 } 
9110 else if (PyUnicode_Check(item)) { 9111 Py_UCS4 replace; 9112 9113 if (PyUnicode_READY(item) == -1) { 9114 Py_DECREF(item); 9115 return -1; 9116 } 9117 if (PyUnicode_GET_LENGTH(item) != 1) 9118 goto exit; 9119 9120 replace = PyUnicode_READ_CHAR(item, 0); 9121 if (replace > 127) 9122 goto exit; 9123 translate[ch] = (Py_UCS1)replace; 9124 } 9125 else { 9126 /* not None, NULL, long or unicode */ 9127 goto exit; 9128 } 9129 ret = 1; 9130 9131 exit: 9132 Py_DECREF(item); 9133 return ret; 9134 } 9135 9136 /* Fast path for ascii => ascii translation. Return 1 if the whole string 9137 was translated into writer, return 0 if the input string was partially 9138 translated into writer, raise an exception and return -1 on error. */ 9139 static int unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9140 unicode_fast_translate(PyObject *input, PyObject *mapping, 9141 _PyUnicodeWriter *writer, int ignore, 9142 Py_ssize_t *input_pos) 9143 { 9144 Py_UCS1 ascii_table[128], ch, ch2; 9145 Py_ssize_t len; 9146 const Py_UCS1 *in, *end; 9147 Py_UCS1 *out; 9148 int res = 0; 9149 9150 len = PyUnicode_GET_LENGTH(input); 9151 9152 memset(ascii_table, 0xff, 128); 9153 9154 in = PyUnicode_1BYTE_DATA(input); 9155 end = in + len; 9156 9157 assert(PyUnicode_IS_ASCII(writer->buffer)); 9158 assert(PyUnicode_GET_LENGTH(writer->buffer) == len); 9159 out = PyUnicode_1BYTE_DATA(writer->buffer); 9160 9161 for (; in < end; in++) { 9162 ch = *in; 9163 ch2 = ascii_table[ch]; 9164 if (ch2 == 0xff) { 9165 int translate = unicode_fast_translate_lookup(mapping, ch, 9166 ascii_table); 9167 if (translate < 0) 9168 return -1; 9169 if (translate == 0) 9170 goto exit; 9171 ch2 = ascii_table[ch]; 9172 } 9173 if (ch2 == 0xfe) { 9174 if (ignore) 9175 continue; 9176 goto exit; 9177 } 9178 assert(ch2 < 128); 9179 *out = ch2; 9180 out++; 9181 } 9182 res = 1; 9183 9184 exit: 9185 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer); 9186 
*input_pos = in - PyUnicode_1BYTE_DATA(input); 9187 return res; 9188 } 9189 9190 static PyObject * _PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9191 _PyUnicode_TranslateCharmap(PyObject *input, 9192 PyObject *mapping, 9193 const char *errors) 9194 { 9195 /* input object */ 9196 const void *data; 9197 Py_ssize_t size, i; 9198 int kind; 9199 /* output buffer */ 9200 _PyUnicodeWriter writer; 9201 /* error handler */ 9202 const char *reason = "character maps to <undefined>"; 9203 PyObject *errorHandler = NULL; 9204 PyObject *exc = NULL; 9205 int ignore; 9206 int res; 9207 9208 if (mapping == NULL) { 9209 PyErr_BadArgument(); 9210 return NULL; 9211 } 9212 9213 if (PyUnicode_READY(input) == -1) 9214 return NULL; 9215 data = PyUnicode_DATA(input); 9216 kind = PyUnicode_KIND(input); 9217 size = PyUnicode_GET_LENGTH(input); 9218 9219 if (size == 0) 9220 return PyUnicode_FromObject(input); 9221 9222 /* allocate enough for a simple 1:1 translation without 9223 replacements, if we need more, we'll resize */ 9224 _PyUnicodeWriter_Init(&writer); 9225 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 9226 goto onError; 9227 9228 ignore = (errors != NULL && strcmp(errors, "ignore") == 0); 9229 9230 if (PyUnicode_READY(input) == -1) 9231 return NULL; 9232 if (PyUnicode_IS_ASCII(input)) { 9233 res = unicode_fast_translate(input, mapping, &writer, ignore, &i); 9234 if (res < 0) { 9235 _PyUnicodeWriter_Dealloc(&writer); 9236 return NULL; 9237 } 9238 if (res == 1) 9239 return _PyUnicodeWriter_Finish(&writer); 9240 } 9241 else { 9242 i = 0; 9243 } 9244 9245 while (i<size) { 9246 /* try to encode it */ 9247 int translate; 9248 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 9249 Py_ssize_t newpos; 9250 /* startpos for collecting untranslatable chars */ 9251 Py_ssize_t collstart; 9252 Py_ssize_t collend; 9253 Py_UCS4 ch; 9254 9255 ch = PyUnicode_READ(kind, data, i); 9256 translate = charmaptranslate_output(ch, mapping, 
&writer); 9257 if (translate < 0) 9258 goto onError; 9259 9260 if (translate != 0) { 9261 /* it worked => adjust input pointer */ 9262 ++i; 9263 continue; 9264 } 9265 9266 /* untranslatable character */ 9267 collstart = i; 9268 collend = i+1; 9269 9270 /* find all untranslatable characters */ 9271 while (collend < size) { 9272 PyObject *x; 9273 ch = PyUnicode_READ(kind, data, collend); 9274 if (charmaptranslate_lookup(ch, mapping, &x)) 9275 goto onError; 9276 Py_XDECREF(x); 9277 if (x != Py_None) 9278 break; 9279 ++collend; 9280 } 9281 9282 if (ignore) { 9283 i = collend; 9284 } 9285 else { 9286 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 9287 reason, input, &exc, 9288 collstart, collend, &newpos); 9289 if (repunicode == NULL) 9290 goto onError; 9291 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { 9292 Py_DECREF(repunicode); 9293 goto onError; 9294 } 9295 Py_DECREF(repunicode); 9296 i = newpos; 9297 } 9298 } 9299 Py_XDECREF(exc); 9300 Py_XDECREF(errorHandler); 9301 return _PyUnicodeWriter_Finish(&writer); 9302 9303 onError: 9304 _PyUnicodeWriter_Dealloc(&writer); 9305 Py_XDECREF(exc); 9306 Py_XDECREF(errorHandler); 9307 return NULL; 9308 } 9309 9310 PyObject * PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9311 PyUnicode_Translate(PyObject *str, 9312 PyObject *mapping, 9313 const char *errors) 9314 { 9315 if (ensure_unicode(str) < 0) 9316 return NULL; 9317 return _PyUnicode_TranslateCharmap(str, mapping, errors); 9318 } 9319 9320 PyObject * _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9321 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 9322 { 9323 if (!PyUnicode_Check(unicode)) { 9324 PyErr_BadInternalCall(); 9325 return NULL; 9326 } 9327 if (PyUnicode_READY(unicode) == -1) 9328 return NULL; 9329 if (PyUnicode_IS_ASCII(unicode)) { 9330 /* If the string is already ASCII, just return the same string */ 9331 Py_INCREF(unicode); 9332 return unicode; 9333 } 9334 9335 
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); 9336 PyObject *result = PyUnicode_New(len, 127); 9337 if (result == NULL) { 9338 return NULL; 9339 } 9340 9341 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result); 9342 int kind = PyUnicode_KIND(unicode); 9343 const void *data = PyUnicode_DATA(unicode); 9344 Py_ssize_t i; 9345 for (i = 0; i < len; ++i) { 9346 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9347 if (ch < 127) { 9348 out[i] = ch; 9349 } 9350 else if (Py_UNICODE_ISSPACE(ch)) { 9351 out[i] = ' '; 9352 } 9353 else { 9354 int decimal = Py_UNICODE_TODECIMAL(ch); 9355 if (decimal < 0) { 9356 out[i] = '?'; 9357 out[i+1] = '\0'; 9358 _PyUnicode_LENGTH(result) = i + 1; 9359 break; 9360 } 9361 out[i] = '0' + decimal; 9362 } 9363 } 9364 9365 assert(_PyUnicode_CheckConsistency(result, 1)); 9366 return result; 9367 } 9368 9369 /* --- Helpers ------------------------------------------------------------ */ 9370 9371 /* helper macro to fixup start/end slice values */ 9372 #define ADJUST_INDICES(start, end, len) \ 9373 if (end > len) \ 9374 end = len; \ 9375 else if (end < 0) { \ 9376 end += len; \ 9377 if (end < 0) \ 9378 end = 0; \ 9379 } \ 9380 if (start < 0) { \ 9381 start += len; \ 9382 if (start < 0) \ 9383 start = 0; \ 9384 } 9385 9386 static Py_ssize_t any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9387 any_find_slice(PyObject* s1, PyObject* s2, 9388 Py_ssize_t start, 9389 Py_ssize_t end, 9390 int direction) 9391 { 9392 int kind1, kind2; 9393 const void *buf1, *buf2; 9394 Py_ssize_t len1, len2, result; 9395 9396 kind1 = PyUnicode_KIND(s1); 9397 kind2 = PyUnicode_KIND(s2); 9398 if (kind1 < kind2) 9399 return -1; 9400 9401 len1 = PyUnicode_GET_LENGTH(s1); 9402 len2 = PyUnicode_GET_LENGTH(s2); 9403 ADJUST_INDICES(start, end, len1); 9404 if (end - start < len2) 9405 return -1; 9406 9407 buf1 = PyUnicode_DATA(s1); 9408 buf2 = PyUnicode_DATA(s2); 9409 if (len2 == 1) { 9410 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 9411 result = 
findchar((const char *)buf1 + kind1*start, 9412 kind1, end - start, ch, direction); 9413 if (result == -1) 9414 return -1; 9415 else 9416 return start + result; 9417 } 9418 9419 if (kind2 != kind1) { 9420 buf2 = unicode_askind(kind2, buf2, len2, kind1); 9421 if (!buf2) 9422 return -2; 9423 } 9424 9425 if (direction > 0) { 9426 switch (kind1) { 9427 case PyUnicode_1BYTE_KIND: 9428 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9429 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 9430 else 9431 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 9432 break; 9433 case PyUnicode_2BYTE_KIND: 9434 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 9435 break; 9436 case PyUnicode_4BYTE_KIND: 9437 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 9438 break; 9439 default: 9440 Py_UNREACHABLE(); 9441 } 9442 } 9443 else { 9444 switch (kind1) { 9445 case PyUnicode_1BYTE_KIND: 9446 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9447 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 9448 else 9449 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9450 break; 9451 case PyUnicode_2BYTE_KIND: 9452 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9453 break; 9454 case PyUnicode_4BYTE_KIND: 9455 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9456 break; 9457 default: 9458 Py_UNREACHABLE(); 9459 } 9460 } 9461 9462 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2))); 9463 if (kind2 != kind1) 9464 PyMem_Free((void *)buf2); 9465 9466 return result; 9467 } 9468 9469 /* _PyUnicode_InsertThousandsGrouping() helper functions */ 9470 #include "stringlib/localeutil.h" 9471 9472 /** 9473 * InsertThousandsGrouping: 9474 * @writer: Unicode writer. 9475 * @n_buffer: Number of characters in @buffer. 9476 * @digits: Digits we're reading from. If count is non-NULL, this is unused. 9477 * @d_pos: Start of digits string. 
9478 * @n_digits: The number of digits in the string, in which we want 9479 * to put the grouping chars. 9480 * @min_width: The minimum width of the digits in the output string. 9481 * Output will be zero-padded on the left to fill. 9482 * @grouping: see definition in localeconv(). 9483 * @thousands_sep: see definition in localeconv(). 9484 * 9485 * There are 2 modes: counting and filling. If @writer is NULL, 9486 * we are in counting mode, else filling mode. 9487 * If counting, the required buffer size is returned. 9488 * If filling, we know the buffer will be large enough, so we don't 9489 * need to pass in the buffer size. 9490 * Inserts thousand grouping characters (as defined by grouping and 9491 * thousands_sep) into @writer. 9492 * 9493 * Return value: -1 on error, number of characters otherwise. 9494 **/ 9495 Py_ssize_t _PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9496 _PyUnicode_InsertThousandsGrouping( 9497 _PyUnicodeWriter *writer, 9498 Py_ssize_t n_buffer, 9499 PyObject *digits, 9500 Py_ssize_t d_pos, 9501 Py_ssize_t n_digits, 9502 Py_ssize_t min_width, 9503 const char *grouping, 9504 PyObject *thousands_sep, 9505 Py_UCS4 *maxchar) 9506 { 9507 min_width = Py_MAX(0, min_width); 9508 if (writer) { 9509 assert(digits != NULL); 9510 assert(maxchar == NULL); 9511 } 9512 else { 9513 assert(digits == NULL); 9514 assert(maxchar != NULL); 9515 } 9516 assert(0 <= d_pos); 9517 assert(0 <= n_digits); 9518 assert(grouping != NULL); 9519 9520 if (digits != NULL) { 9521 if (PyUnicode_READY(digits) == -1) { 9522 return -1; 9523 } 9524 } 9525 if (PyUnicode_READY(thousands_sep) == -1) { 9526 return -1; 9527 } 9528 9529 Py_ssize_t count = 0; 9530 Py_ssize_t n_zeros; 9531 int loop_broken = 0; 9532 int use_separator = 0; /* First time through, don't append the 9533 separator. 
They only go between 9534 groups. */ 9535 Py_ssize_t buffer_pos; 9536 Py_ssize_t digits_pos; 9537 Py_ssize_t len; 9538 Py_ssize_t n_chars; 9539 Py_ssize_t remaining = n_digits; /* Number of chars remaining to 9540 be looked at */ 9541 /* A generator that returns all of the grouping widths, until it 9542 returns 0. */ 9543 GroupGenerator groupgen; 9544 GroupGenerator_init(&groupgen, grouping); 9545 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 9546 9547 /* if digits are not grouped, thousands separator 9548 should be an empty string */ 9549 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0)); 9550 9551 digits_pos = d_pos + n_digits; 9552 if (writer) { 9553 buffer_pos = writer->pos + n_buffer; 9554 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer)); 9555 assert(digits_pos <= PyUnicode_GET_LENGTH(digits)); 9556 } 9557 else { 9558 buffer_pos = n_buffer; 9559 } 9560 9561 if (!writer) { 9562 *maxchar = 127; 9563 } 9564 9565 while ((len = GroupGenerator_next(&groupgen)) > 0) { 9566 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1)); 9567 n_zeros = Py_MAX(0, len - remaining); 9568 n_chars = Py_MAX(0, Py_MIN(remaining, len)); 9569 9570 /* Use n_zero zero's and n_chars chars */ 9571 9572 /* Count only, don't do anything. */ 9573 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars; 9574 9575 /* Copy into the writer. */ 9576 InsertThousandsGrouping_fill(writer, &buffer_pos, 9577 digits, &digits_pos, 9578 n_chars, n_zeros, 9579 use_separator ? thousands_sep : NULL, 9580 thousands_sep_len, maxchar); 9581 9582 /* Use a separator next time. */ 9583 use_separator = 1; 9584 9585 remaining -= n_chars; 9586 min_width -= len; 9587 9588 if (remaining <= 0 && min_width <= 0) { 9589 loop_broken = 1; 9590 break; 9591 } 9592 min_width -= thousands_sep_len; 9593 } 9594 if (!loop_broken) { 9595 /* We left the loop without using a break statement. 
*/ 9596 9597 len = Py_MAX(Py_MAX(remaining, min_width), 1); 9598 n_zeros = Py_MAX(0, len - remaining); 9599 n_chars = Py_MAX(0, Py_MIN(remaining, len)); 9600 9601 /* Use n_zero zero's and n_chars chars */ 9602 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars; 9603 9604 /* Copy into the writer. */ 9605 InsertThousandsGrouping_fill(writer, &buffer_pos, 9606 digits, &digits_pos, 9607 n_chars, n_zeros, 9608 use_separator ? thousands_sep : NULL, 9609 thousands_sep_len, maxchar); 9610 } 9611 return count; 9612 } 9613 9614 9615 Py_ssize_t PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9616 PyUnicode_Count(PyObject *str, 9617 PyObject *substr, 9618 Py_ssize_t start, 9619 Py_ssize_t end) 9620 { 9621 Py_ssize_t result; 9622 int kind1, kind2; 9623 const void *buf1 = NULL, *buf2 = NULL; 9624 Py_ssize_t len1, len2; 9625 9626 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9627 return -1; 9628 9629 kind1 = PyUnicode_KIND(str); 9630 kind2 = PyUnicode_KIND(substr); 9631 if (kind1 < kind2) 9632 return 0; 9633 9634 len1 = PyUnicode_GET_LENGTH(str); 9635 len2 = PyUnicode_GET_LENGTH(substr); 9636 ADJUST_INDICES(start, end, len1); 9637 if (end - start < len2) 9638 return 0; 9639 9640 buf1 = PyUnicode_DATA(str); 9641 buf2 = PyUnicode_DATA(substr); 9642 if (kind2 != kind1) { 9643 buf2 = unicode_askind(kind2, buf2, len2, kind1); 9644 if (!buf2) 9645 goto onError; 9646 } 9647 9648 switch (kind1) { 9649 case PyUnicode_1BYTE_KIND: 9650 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr)) 9651 result = asciilib_count( 9652 ((const Py_UCS1*)buf1) + start, end - start, 9653 buf2, len2, PY_SSIZE_T_MAX 9654 ); 9655 else 9656 result = ucs1lib_count( 9657 ((const Py_UCS1*)buf1) + start, end - start, 9658 buf2, len2, PY_SSIZE_T_MAX 9659 ); 9660 break; 9661 case PyUnicode_2BYTE_KIND: 9662 result = ucs2lib_count( 9663 ((const Py_UCS2*)buf1) + start, end - start, 9664 buf2, len2, PY_SSIZE_T_MAX 9665 ); 9666 break; 9667 case 
PyUnicode_4BYTE_KIND: 9668 result = ucs4lib_count( 9669 ((const Py_UCS4*)buf1) + start, end - start, 9670 buf2, len2, PY_SSIZE_T_MAX 9671 ); 9672 break; 9673 default: 9674 Py_UNREACHABLE(); 9675 } 9676 9677 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr))); 9678 if (kind2 != kind1) 9679 PyMem_Free((void *)buf2); 9680 9681 return result; 9682 onError: 9683 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr))); 9684 if (kind2 != kind1) 9685 PyMem_Free((void *)buf2); 9686 return -1; 9687 } 9688 9689 Py_ssize_t PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9690 PyUnicode_Find(PyObject *str, 9691 PyObject *substr, 9692 Py_ssize_t start, 9693 Py_ssize_t end, 9694 int direction) 9695 { 9696 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9697 return -2; 9698 9699 return any_find_slice(str, substr, start, end, direction); 9700 } 9701 9702 Py_ssize_t PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9703 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9704 Py_ssize_t start, Py_ssize_t end, 9705 int direction) 9706 { 9707 int kind; 9708 Py_ssize_t len, result; 9709 if (PyUnicode_READY(str) == -1) 9710 return -2; 9711 len = PyUnicode_GET_LENGTH(str); 9712 ADJUST_INDICES(start, end, len); 9713 if (end - start < 1) 9714 return -1; 9715 kind = PyUnicode_KIND(str); 9716 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9717 kind, end-start, ch, direction); 9718 if (result == -1) 9719 return -1; 9720 else 9721 return start + result; 9722 } 9723 9724 static int tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9725 tailmatch(PyObject *self, 9726 PyObject *substring, 9727 Py_ssize_t start, 9728 Py_ssize_t end, 9729 int direction) 9730 { 9731 int kind_self; 9732 int kind_sub; 9733 const void *data_self; 9734 const void *data_sub; 9735 Py_ssize_t offset; 9736 Py_ssize_t i; 9737 Py_ssize_t end_sub; 9738 9739 if 
(PyUnicode_READY(self) == -1 || 9740 PyUnicode_READY(substring) == -1) 9741 return -1; 9742 9743 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9744 end -= PyUnicode_GET_LENGTH(substring); 9745 if (end < start) 9746 return 0; 9747 9748 if (PyUnicode_GET_LENGTH(substring) == 0) 9749 return 1; 9750 9751 kind_self = PyUnicode_KIND(self); 9752 data_self = PyUnicode_DATA(self); 9753 kind_sub = PyUnicode_KIND(substring); 9754 data_sub = PyUnicode_DATA(substring); 9755 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9756 9757 if (direction > 0) 9758 offset = end; 9759 else 9760 offset = start; 9761 9762 if (PyUnicode_READ(kind_self, data_self, offset) == 9763 PyUnicode_READ(kind_sub, data_sub, 0) && 9764 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9765 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9766 /* If both are of the same kind, memcmp is sufficient */ 9767 if (kind_self == kind_sub) { 9768 return ! memcmp((char *)data_self + 9769 (offset * PyUnicode_KIND(substring)), 9770 data_sub, 9771 PyUnicode_GET_LENGTH(substring) * 9772 PyUnicode_KIND(substring)); 9773 } 9774 /* otherwise we have to compare each character by first accessing it */ 9775 else { 9776 /* We do not need to compare 0 and len(substring)-1 because 9777 the if statement above ensured already that they are equal 9778 when we end up here. 
*/ 9779 for (i = 1; i < end_sub; ++i) { 9780 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9781 PyUnicode_READ(kind_sub, data_sub, i)) 9782 return 0; 9783 } 9784 return 1; 9785 } 9786 } 9787 9788 return 0; 9789 } 9790 9791 Py_ssize_t PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9792 PyUnicode_Tailmatch(PyObject *str, 9793 PyObject *substr, 9794 Py_ssize_t start, 9795 Py_ssize_t end, 9796 int direction) 9797 { 9798 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9799 return -1; 9800 9801 return tailmatch(str, substr, start, end, direction); 9802 } 9803 9804 static PyObject * ascii_upper_or_lower(PyObject * self,int lower)9805 ascii_upper_or_lower(PyObject *self, int lower) 9806 { 9807 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9808 const char *data = PyUnicode_DATA(self); 9809 char *resdata; 9810 PyObject *res; 9811 9812 res = PyUnicode_New(len, 127); 9813 if (res == NULL) 9814 return NULL; 9815 resdata = PyUnicode_DATA(res); 9816 if (lower) 9817 _Py_bytes_lower(resdata, data, len); 9818 else 9819 _Py_bytes_upper(resdata, data, len); 9820 return res; 9821 } 9822 9823 static Py_UCS4 handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)9824 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i) 9825 { 9826 Py_ssize_t j; 9827 int final_sigma; 9828 Py_UCS4 c = 0; /* initialize to prevent gcc warning */ 9829 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9830 9831 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9832 9833 where ! is a negation and \p{xxx} is a character with property xxx. 
9834 */ 9835 for (j = i - 1; j >= 0; j--) { 9836 c = PyUnicode_READ(kind, data, j); 9837 if (!_PyUnicode_IsCaseIgnorable(c)) 9838 break; 9839 } 9840 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9841 if (final_sigma) { 9842 for (j = i + 1; j < length; j++) { 9843 c = PyUnicode_READ(kind, data, j); 9844 if (!_PyUnicode_IsCaseIgnorable(c)) 9845 break; 9846 } 9847 final_sigma = j == length || !_PyUnicode_IsCased(c); 9848 } 9849 return (final_sigma) ? 0x3C2 : 0x3C3; 9850 } 9851 9852 static int lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9853 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, 9854 Py_UCS4 c, Py_UCS4 *mapped) 9855 { 9856 /* Obscure special case. */ 9857 if (c == 0x3A3) { 9858 mapped[0] = handle_capital_sigma(kind, data, length, i); 9859 return 1; 9860 } 9861 return _PyUnicode_ToLowerFull(c, mapped); 9862 } 9863 9864 static Py_ssize_t do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9865 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9866 { 9867 Py_ssize_t i, k = 0; 9868 int n_res, j; 9869 Py_UCS4 c, mapped[3]; 9870 9871 c = PyUnicode_READ(kind, data, 0); 9872 n_res = _PyUnicode_ToTitleFull(c, mapped); 9873 for (j = 0; j < n_res; j++) { 9874 *maxchar = Py_MAX(*maxchar, mapped[j]); 9875 res[k++] = mapped[j]; 9876 } 9877 for (i = 1; i < length; i++) { 9878 c = PyUnicode_READ(kind, data, i); 9879 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9880 for (j = 0; j < n_res; j++) { 9881 *maxchar = Py_MAX(*maxchar, mapped[j]); 9882 res[k++] = mapped[j]; 9883 } 9884 } 9885 return k; 9886 } 9887 9888 static Py_ssize_t do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9889 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9890 Py_ssize_t i, k = 0; 9891 9892 for (i = 0; i < length; i++) { 9893 Py_UCS4 c = 
PyUnicode_READ(kind, data, i), mapped[3]; 9894 int n_res, j; 9895 if (Py_UNICODE_ISUPPER(c)) { 9896 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9897 } 9898 else if (Py_UNICODE_ISLOWER(c)) { 9899 n_res = _PyUnicode_ToUpperFull(c, mapped); 9900 } 9901 else { 9902 n_res = 1; 9903 mapped[0] = c; 9904 } 9905 for (j = 0; j < n_res; j++) { 9906 *maxchar = Py_MAX(*maxchar, mapped[j]); 9907 res[k++] = mapped[j]; 9908 } 9909 } 9910 return k; 9911 } 9912 9913 static Py_ssize_t do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9914 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, 9915 Py_UCS4 *maxchar, int lower) 9916 { 9917 Py_ssize_t i, k = 0; 9918 9919 for (i = 0; i < length; i++) { 9920 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9921 int n_res, j; 9922 if (lower) 9923 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9924 else 9925 n_res = _PyUnicode_ToUpperFull(c, mapped); 9926 for (j = 0; j < n_res; j++) { 9927 *maxchar = Py_MAX(*maxchar, mapped[j]); 9928 res[k++] = mapped[j]; 9929 } 9930 } 9931 return k; 9932 } 9933 9934 static Py_ssize_t do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9935 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9936 { 9937 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9938 } 9939 9940 static Py_ssize_t do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9941 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9942 { 9943 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9944 } 9945 9946 static Py_ssize_t do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9947 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9948 { 9949 Py_ssize_t i, k = 0; 9950 9951 for (i = 0; i < length; i++) { 
9952 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9953 Py_UCS4 mapped[3]; 9954 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9955 for (j = 0; j < n_res; j++) { 9956 *maxchar = Py_MAX(*maxchar, mapped[j]); 9957 res[k++] = mapped[j]; 9958 } 9959 } 9960 return k; 9961 } 9962 9963 static Py_ssize_t do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9964 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9965 { 9966 Py_ssize_t i, k = 0; 9967 int previous_is_cased; 9968 9969 previous_is_cased = 0; 9970 for (i = 0; i < length; i++) { 9971 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9972 Py_UCS4 mapped[3]; 9973 int n_res, j; 9974 9975 if (previous_is_cased) 9976 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9977 else 9978 n_res = _PyUnicode_ToTitleFull(c, mapped); 9979 9980 for (j = 0; j < n_res; j++) { 9981 *maxchar = Py_MAX(*maxchar, mapped[j]); 9982 res[k++] = mapped[j]; 9983 } 9984 9985 previous_is_cased = _PyUnicode_IsCased(c); 9986 } 9987 return k; 9988 } 9989 9990 static PyObject * case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9991 case_operation(PyObject *self, 9992 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9993 { 9994 PyObject *res = NULL; 9995 Py_ssize_t length, newlength = 0; 9996 int kind, outkind; 9997 const void *data; 9998 void *outdata; 9999 Py_UCS4 maxchar = 0, *tmp, *tmpend; 10000 10001 assert(PyUnicode_IS_READY(self)); 10002 10003 kind = PyUnicode_KIND(self); 10004 data = PyUnicode_DATA(self); 10005 length = PyUnicode_GET_LENGTH(self); 10006 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) { 10007 PyErr_SetString(PyExc_OverflowError, "string is too long"); 10008 return NULL; 10009 } 10010 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length); 10011 if (tmp == NULL) 10012 return PyErr_NoMemory(); 10013 newlength = perform(kind, data, length, tmp, &maxchar); 10014 res = 
PyUnicode_New(newlength, maxchar); 10015 if (res == NULL) 10016 goto leave; 10017 tmpend = tmp + newlength; 10018 outdata = PyUnicode_DATA(res); 10019 outkind = PyUnicode_KIND(res); 10020 switch (outkind) { 10021 case PyUnicode_1BYTE_KIND: 10022 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 10023 break; 10024 case PyUnicode_2BYTE_KIND: 10025 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 10026 break; 10027 case PyUnicode_4BYTE_KIND: 10028 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 10029 break; 10030 default: 10031 Py_UNREACHABLE(); 10032 } 10033 leave: 10034 PyMem_Free(tmp); 10035 return res; 10036 } 10037 10038 PyObject * PyUnicode_Join(PyObject * separator,PyObject * seq)10039 PyUnicode_Join(PyObject *separator, PyObject *seq) 10040 { 10041 PyObject *res; 10042 PyObject *fseq; 10043 Py_ssize_t seqlen; 10044 PyObject **items; 10045 10046 fseq = PySequence_Fast(seq, "can only join an iterable"); 10047 if (fseq == NULL) { 10048 return NULL; 10049 } 10050 10051 /* NOTE: the following code can't call back into Python code, 10052 * so we are sure that fseq won't be mutated. 10053 */ 10054 10055 items = PySequence_Fast_ITEMS(fseq); 10056 seqlen = PySequence_Fast_GET_SIZE(fseq); 10057 res = _PyUnicode_JoinArray(separator, items, seqlen); 10058 Py_DECREF(fseq); 10059 return res; 10060 } 10061 10062 PyObject * _PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)10063 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen) 10064 { 10065 PyObject *res = NULL; /* the result */ 10066 PyObject *sep = NULL; 10067 Py_ssize_t seplen; 10068 PyObject *item; 10069 Py_ssize_t sz, i, res_offset; 10070 Py_UCS4 maxchar; 10071 Py_UCS4 item_maxchar; 10072 int use_memcpy; 10073 unsigned char *res_data = NULL, *sep_data = NULL; 10074 PyObject *last_obj; 10075 unsigned int kind = 0; 10076 10077 /* If empty sequence, return u"". 
/* Join the `seqlen` objects in `items` into one string, inserting
   `separator` between consecutive items.

   separator: a str, or NULL to use a single blank space.  It is only
              examined when there are at least two items.
   items:     array of objects; each must be a str (or subclass),
              otherwise TypeError is raised naming the offending index.
   seqlen:    number of entries in `items`.

   Returns a new reference, or NULL with an exception set.  An empty array
   yields the empty string; a single exact-str item is returned as-is
   (with its reference count incremented). */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A single str-subclass item: the separator is never used, so it
           contributes neither length nor maxchar. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* Seed the kind comparison below with the separator, so the first
           item is compared against it. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all argument are strings.
     */
    sz = 0;
    /* Builds with Py_DEBUG never take the raw memcpy path. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every item except the first is preceded by one separator. */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Reject the addition before performing it so sz cannot exceed
           PY_SSIZE_T_MAX. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* The memcpy path survives only while each object has the same
           kind as the one before it (separator first, then the items). */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        /* The result's kind is used as the per-character width for every
           raw copy below. */
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                          sep_data,
                          kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                          PyUnicode_DATA(item),
                          kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        /* The write cursor must land exactly at the end of the buffer. */
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* Mixed kinds (or a debug build): copy through the helper, tracking
           the destination position in characters rather than bytes. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    /* sep and res may each still be NULL here, hence the X variants. */
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
*/ 10186 if (i && seplen != 0) { 10187 memcpy(res_data, 10188 sep_data, 10189 kind * seplen); 10190 res_data += kind * seplen; 10191 } 10192 10193 itemlen = PyUnicode_GET_LENGTH(item); 10194 if (itemlen != 0) { 10195 memcpy(res_data, 10196 PyUnicode_DATA(item), 10197 kind * itemlen); 10198 res_data += kind * itemlen; 10199 } 10200 } 10201 assert(res_data == PyUnicode_1BYTE_DATA(res) 10202 + kind * PyUnicode_GET_LENGTH(res)); 10203 } 10204 else { 10205 for (i = 0, res_offset = 0; i < seqlen; ++i) { 10206 Py_ssize_t itemlen; 10207 item = items[i]; 10208 10209 /* Copy item, and maybe the separator. */ 10210 if (i && seplen != 0) { 10211 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 10212 res_offset += seplen; 10213 } 10214 10215 itemlen = PyUnicode_GET_LENGTH(item); 10216 if (itemlen != 0) { 10217 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 10218 res_offset += itemlen; 10219 } 10220 } 10221 assert(res_offset == PyUnicode_GET_LENGTH(res)); 10222 } 10223 10224 Py_XDECREF(sep); 10225 assert(_PyUnicode_CheckConsistency(res, 1)); 10226 return res; 10227 10228 onError: 10229 Py_XDECREF(sep); 10230 Py_XDECREF(res); 10231 return NULL; 10232 } 10233 10234 void _PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10235 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 10236 Py_UCS4 fill_char) 10237 { 10238 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 10239 void *data = PyUnicode_DATA(unicode); 10240 assert(PyUnicode_IS_READY(unicode)); 10241 assert(unicode_modifiable(unicode)); 10242 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 10243 assert(start >= 0); 10244 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 10245 unicode_fill(kind, data, fill_char, start, length); 10246 } 10247 10248 Py_ssize_t PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10249 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, 
Py_ssize_t length, 10250 Py_UCS4 fill_char) 10251 { 10252 Py_ssize_t maxlen; 10253 10254 if (!PyUnicode_Check(unicode)) { 10255 PyErr_BadInternalCall(); 10256 return -1; 10257 } 10258 if (PyUnicode_READY(unicode) == -1) 10259 return -1; 10260 if (unicode_check_modifiable(unicode)) 10261 return -1; 10262 10263 if (start < 0) { 10264 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10265 return -1; 10266 } 10267 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 10268 PyErr_SetString(PyExc_ValueError, 10269 "fill character is bigger than " 10270 "the string maximum character"); 10271 return -1; 10272 } 10273 10274 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 10275 length = Py_MIN(maxlen, length); 10276 if (length <= 0) 10277 return 0; 10278 10279 _PyUnicode_FastFill(unicode, start, length, fill_char); 10280 return length; 10281 } 10282 10283 static PyObject * pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10284 pad(PyObject *self, 10285 Py_ssize_t left, 10286 Py_ssize_t right, 10287 Py_UCS4 fill) 10288 { 10289 PyObject *u; 10290 Py_UCS4 maxchar; 10291 int kind; 10292 void *data; 10293 10294 if (left < 0) 10295 left = 0; 10296 if (right < 0) 10297 right = 0; 10298 10299 if (left == 0 && right == 0) 10300 return unicode_result_unchanged(self); 10301 10302 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 10303 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 10304 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 10305 return NULL; 10306 } 10307 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10308 maxchar = Py_MAX(maxchar, fill); 10309 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 10310 if (!u) 10311 return NULL; 10312 10313 kind = PyUnicode_KIND(u); 10314 data = PyUnicode_DATA(u); 10315 if (left) 10316 unicode_fill(kind, data, fill, 0, left); 10317 if (right) 10318 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 10319 _PyUnicode_FastCopyCharacters(u, left, 
self, 0, _PyUnicode_LENGTH(self)); 10320 assert(_PyUnicode_CheckConsistency(u, 1)); 10321 return u; 10322 } 10323 10324 PyObject * PyUnicode_Splitlines(PyObject * string,int keepends)10325 PyUnicode_Splitlines(PyObject *string, int keepends) 10326 { 10327 PyObject *list; 10328 10329 if (ensure_unicode(string) < 0) 10330 return NULL; 10331 10332 switch (PyUnicode_KIND(string)) { 10333 case PyUnicode_1BYTE_KIND: 10334 if (PyUnicode_IS_ASCII(string)) 10335 list = asciilib_splitlines( 10336 string, PyUnicode_1BYTE_DATA(string), 10337 PyUnicode_GET_LENGTH(string), keepends); 10338 else 10339 list = ucs1lib_splitlines( 10340 string, PyUnicode_1BYTE_DATA(string), 10341 PyUnicode_GET_LENGTH(string), keepends); 10342 break; 10343 case PyUnicode_2BYTE_KIND: 10344 list = ucs2lib_splitlines( 10345 string, PyUnicode_2BYTE_DATA(string), 10346 PyUnicode_GET_LENGTH(string), keepends); 10347 break; 10348 case PyUnicode_4BYTE_KIND: 10349 list = ucs4lib_splitlines( 10350 string, PyUnicode_4BYTE_DATA(string), 10351 PyUnicode_GET_LENGTH(string), keepends); 10352 break; 10353 default: 10354 Py_UNREACHABLE(); 10355 } 10356 return list; 10357 } 10358 10359 static PyObject * split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10360 split(PyObject *self, 10361 PyObject *substring, 10362 Py_ssize_t maxcount) 10363 { 10364 int kind1, kind2; 10365 const void *buf1, *buf2; 10366 Py_ssize_t len1, len2; 10367 PyObject* out; 10368 10369 if (maxcount < 0) 10370 maxcount = PY_SSIZE_T_MAX; 10371 10372 if (PyUnicode_READY(self) == -1) 10373 return NULL; 10374 10375 if (substring == NULL) 10376 switch (PyUnicode_KIND(self)) { 10377 case PyUnicode_1BYTE_KIND: 10378 if (PyUnicode_IS_ASCII(self)) 10379 return asciilib_split_whitespace( 10380 self, PyUnicode_1BYTE_DATA(self), 10381 PyUnicode_GET_LENGTH(self), maxcount 10382 ); 10383 else 10384 return ucs1lib_split_whitespace( 10385 self, PyUnicode_1BYTE_DATA(self), 10386 PyUnicode_GET_LENGTH(self), maxcount 10387 ); 10388 case 
PyUnicode_2BYTE_KIND: 10389 return ucs2lib_split_whitespace( 10390 self, PyUnicode_2BYTE_DATA(self), 10391 PyUnicode_GET_LENGTH(self), maxcount 10392 ); 10393 case PyUnicode_4BYTE_KIND: 10394 return ucs4lib_split_whitespace( 10395 self, PyUnicode_4BYTE_DATA(self), 10396 PyUnicode_GET_LENGTH(self), maxcount 10397 ); 10398 default: 10399 Py_UNREACHABLE(); 10400 } 10401 10402 if (PyUnicode_READY(substring) == -1) 10403 return NULL; 10404 10405 kind1 = PyUnicode_KIND(self); 10406 kind2 = PyUnicode_KIND(substring); 10407 len1 = PyUnicode_GET_LENGTH(self); 10408 len2 = PyUnicode_GET_LENGTH(substring); 10409 if (kind1 < kind2 || len1 < len2) { 10410 out = PyList_New(1); 10411 if (out == NULL) 10412 return NULL; 10413 Py_INCREF(self); 10414 PyList_SET_ITEM(out, 0, self); 10415 return out; 10416 } 10417 buf1 = PyUnicode_DATA(self); 10418 buf2 = PyUnicode_DATA(substring); 10419 if (kind2 != kind1) { 10420 buf2 = unicode_askind(kind2, buf2, len2, kind1); 10421 if (!buf2) 10422 return NULL; 10423 } 10424 10425 switch (kind1) { 10426 case PyUnicode_1BYTE_KIND: 10427 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10428 out = asciilib_split( 10429 self, buf1, len1, buf2, len2, maxcount); 10430 else 10431 out = ucs1lib_split( 10432 self, buf1, len1, buf2, len2, maxcount); 10433 break; 10434 case PyUnicode_2BYTE_KIND: 10435 out = ucs2lib_split( 10436 self, buf1, len1, buf2, len2, maxcount); 10437 break; 10438 case PyUnicode_4BYTE_KIND: 10439 out = ucs4lib_split( 10440 self, buf1, len1, buf2, len2, maxcount); 10441 break; 10442 default: 10443 out = NULL; 10444 } 10445 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring))); 10446 if (kind2 != kind1) 10447 PyMem_Free((void *)buf2); 10448 return out; 10449 } 10450 10451 static PyObject * rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10452 rsplit(PyObject *self, 10453 PyObject *substring, 10454 Py_ssize_t maxcount) 10455 { 10456 int kind1, kind2; 10457 const void *buf1, *buf2; 10458 Py_ssize_t 
len1, len2; 10459 PyObject* out; 10460 10461 if (maxcount < 0) 10462 maxcount = PY_SSIZE_T_MAX; 10463 10464 if (PyUnicode_READY(self) == -1) 10465 return NULL; 10466 10467 if (substring == NULL) 10468 switch (PyUnicode_KIND(self)) { 10469 case PyUnicode_1BYTE_KIND: 10470 if (PyUnicode_IS_ASCII(self)) 10471 return asciilib_rsplit_whitespace( 10472 self, PyUnicode_1BYTE_DATA(self), 10473 PyUnicode_GET_LENGTH(self), maxcount 10474 ); 10475 else 10476 return ucs1lib_rsplit_whitespace( 10477 self, PyUnicode_1BYTE_DATA(self), 10478 PyUnicode_GET_LENGTH(self), maxcount 10479 ); 10480 case PyUnicode_2BYTE_KIND: 10481 return ucs2lib_rsplit_whitespace( 10482 self, PyUnicode_2BYTE_DATA(self), 10483 PyUnicode_GET_LENGTH(self), maxcount 10484 ); 10485 case PyUnicode_4BYTE_KIND: 10486 return ucs4lib_rsplit_whitespace( 10487 self, PyUnicode_4BYTE_DATA(self), 10488 PyUnicode_GET_LENGTH(self), maxcount 10489 ); 10490 default: 10491 Py_UNREACHABLE(); 10492 } 10493 10494 if (PyUnicode_READY(substring) == -1) 10495 return NULL; 10496 10497 kind1 = PyUnicode_KIND(self); 10498 kind2 = PyUnicode_KIND(substring); 10499 len1 = PyUnicode_GET_LENGTH(self); 10500 len2 = PyUnicode_GET_LENGTH(substring); 10501 if (kind1 < kind2 || len1 < len2) { 10502 out = PyList_New(1); 10503 if (out == NULL) 10504 return NULL; 10505 Py_INCREF(self); 10506 PyList_SET_ITEM(out, 0, self); 10507 return out; 10508 } 10509 buf1 = PyUnicode_DATA(self); 10510 buf2 = PyUnicode_DATA(substring); 10511 if (kind2 != kind1) { 10512 buf2 = unicode_askind(kind2, buf2, len2, kind1); 10513 if (!buf2) 10514 return NULL; 10515 } 10516 10517 switch (kind1) { 10518 case PyUnicode_1BYTE_KIND: 10519 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10520 out = asciilib_rsplit( 10521 self, buf1, len1, buf2, len2, maxcount); 10522 else 10523 out = ucs1lib_rsplit( 10524 self, buf1, len1, buf2, len2, maxcount); 10525 break; 10526 case PyUnicode_2BYTE_KIND: 10527 out = ucs2lib_rsplit( 10528 self, buf1, len1, buf2, len2, 
maxcount); 10529 break; 10530 case PyUnicode_4BYTE_KIND: 10531 out = ucs4lib_rsplit( 10532 self, buf1, len1, buf2, len2, maxcount); 10533 break; 10534 default: 10535 out = NULL; 10536 } 10537 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring))); 10538 if (kind2 != kind1) 10539 PyMem_Free((void *)buf2); 10540 return out; 10541 } 10542 10543 static Py_ssize_t anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10544 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1, 10545 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10546 { 10547 switch (kind) { 10548 case PyUnicode_1BYTE_KIND: 10549 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10550 return asciilib_find(buf1, len1, buf2, len2, offset); 10551 else 10552 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10553 case PyUnicode_2BYTE_KIND: 10554 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10555 case PyUnicode_4BYTE_KIND: 10556 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10557 } 10558 Py_UNREACHABLE(); 10559 } 10560 10561 static Py_ssize_t anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10562 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen, 10563 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10564 { 10565 switch (kind) { 10566 case PyUnicode_1BYTE_KIND: 10567 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10568 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10569 else 10570 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10571 case PyUnicode_2BYTE_KIND: 10572 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10573 case PyUnicode_4BYTE_KIND: 10574 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10575 } 10576 Py_UNREACHABLE(); 10577 } 10578 10579 static void 
/* Replace character `u1` with `u2` inside `u`, in place, over the range
   [pos, len(u)), forwarding `maxcount` to the kind-specific routine. */
static void
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
{
    void *data = PyUnicode_DATA(u);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(u);
    const int kind = PyUnicode_KIND(u);

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
                                      (Py_UCS1 *)data + len,
                                      u1, u2, maxcount);
        break;
    case PyUnicode_2BYTE_KIND:
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
                                      (Py_UCS2 *)data + len,
                                      u1, u2, maxcount);
        break;
    default:
        assert(kind == PyUnicode_4BYTE_KIND);
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
                                      (Py_UCS4 *)data + len,
                                      u1, u2, maxcount);
        break;
    }
}
present */ 10637 goto nothing; 10638 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10639 /* Replacing str1 with str2 may cause a maxchar reduction in the 10640 result string. */ 10641 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1); 10642 maxchar = Py_MAX(maxchar, maxchar_str2); 10643 10644 if (len1 == len2) { 10645 /* same length */ 10646 if (len1 == 0) 10647 goto nothing; 10648 if (len1 == 1) { 10649 /* replace characters */ 10650 Py_UCS4 u1, u2; 10651 Py_ssize_t pos; 10652 10653 u1 = PyUnicode_READ(kind1, buf1, 0); 10654 pos = findchar(sbuf, skind, slen, u1, 1); 10655 if (pos < 0) 10656 goto nothing; 10657 u2 = PyUnicode_READ(kind2, buf2, 0); 10658 u = PyUnicode_New(slen, maxchar); 10659 if (!u) 10660 goto error; 10661 10662 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10663 replace_1char_inplace(u, pos, u1, u2, maxcount); 10664 } 10665 else { 10666 int rkind = skind; 10667 char *res; 10668 Py_ssize_t i; 10669 10670 if (kind1 < rkind) { 10671 /* widen substring */ 10672 buf1 = unicode_askind(kind1, buf1, len1, rkind); 10673 if (!buf1) goto error; 10674 release1 = 1; 10675 } 10676 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10677 if (i < 0) 10678 goto nothing; 10679 if (rkind > kind2) { 10680 /* widen replacement */ 10681 buf2 = unicode_askind(kind2, buf2, len2, rkind); 10682 if (!buf2) goto error; 10683 release2 = 1; 10684 } 10685 else if (rkind < kind2) { 10686 /* widen self and buf1 */ 10687 rkind = kind2; 10688 if (release1) { 10689 assert(buf1 != PyUnicode_DATA(str1)); 10690 PyMem_Free((void *)buf1); 10691 buf1 = PyUnicode_DATA(str1); 10692 release1 = 0; 10693 } 10694 sbuf = unicode_askind(skind, sbuf, slen, rkind); 10695 if (!sbuf) goto error; 10696 srelease = 1; 10697 buf1 = unicode_askind(kind1, buf1, len1, rkind); 10698 if (!buf1) goto error; 10699 release1 = 1; 10700 } 10701 u = PyUnicode_New(slen, maxchar); 10702 if (!u) 10703 goto error; 10704 assert(PyUnicode_KIND(u) == rkind); 10705 res = 
PyUnicode_DATA(u); 10706 10707 memcpy(res, sbuf, rkind * slen); 10708 /* change everything in-place, starting with this one */ 10709 memcpy(res + rkind * i, 10710 buf2, 10711 rkind * len2); 10712 i += len1; 10713 10714 while ( --maxcount > 0) { 10715 i = anylib_find(rkind, self, 10716 sbuf+rkind*i, slen-i, 10717 str1, buf1, len1, i); 10718 if (i == -1) 10719 break; 10720 memcpy(res + rkind * i, 10721 buf2, 10722 rkind * len2); 10723 i += len1; 10724 } 10725 } 10726 } 10727 else { 10728 Py_ssize_t n, i, j, ires; 10729 Py_ssize_t new_size; 10730 int rkind = skind; 10731 char *res; 10732 10733 if (kind1 < rkind) { 10734 /* widen substring */ 10735 buf1 = unicode_askind(kind1, buf1, len1, rkind); 10736 if (!buf1) goto error; 10737 release1 = 1; 10738 } 10739 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10740 if (n == 0) 10741 goto nothing; 10742 if (kind2 < rkind) { 10743 /* widen replacement */ 10744 buf2 = unicode_askind(kind2, buf2, len2, rkind); 10745 if (!buf2) goto error; 10746 release2 = 1; 10747 } 10748 else if (kind2 > rkind) { 10749 /* widen self and buf1 */ 10750 rkind = kind2; 10751 sbuf = unicode_askind(skind, sbuf, slen, rkind); 10752 if (!sbuf) goto error; 10753 srelease = 1; 10754 if (release1) { 10755 assert(buf1 != PyUnicode_DATA(str1)); 10756 PyMem_Free((void *)buf1); 10757 buf1 = PyUnicode_DATA(str1); 10758 release1 = 0; 10759 } 10760 buf1 = unicode_askind(kind1, buf1, len1, rkind); 10761 if (!buf1) goto error; 10762 release1 = 1; 10763 } 10764 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10765 PyUnicode_GET_LENGTH(str1)); */ 10766 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10767 PyErr_SetString(PyExc_OverflowError, 10768 "replace string is too long"); 10769 goto error; 10770 } 10771 new_size = slen + n * (len2 - len1); 10772 if (new_size == 0) { 10773 u = unicode_new_empty(); 10774 goto done; 10775 } 10776 if (new_size > (PY_SSIZE_T_MAX / rkind)) { 10777 
PyErr_SetString(PyExc_OverflowError, 10778 "replace string is too long"); 10779 goto error; 10780 } 10781 u = PyUnicode_New(new_size, maxchar); 10782 if (!u) 10783 goto error; 10784 assert(PyUnicode_KIND(u) == rkind); 10785 res = PyUnicode_DATA(u); 10786 ires = i = 0; 10787 if (len1 > 0) { 10788 while (n-- > 0) { 10789 /* look for next match */ 10790 j = anylib_find(rkind, self, 10791 sbuf + rkind * i, slen-i, 10792 str1, buf1, len1, i); 10793 if (j == -1) 10794 break; 10795 else if (j > i) { 10796 /* copy unchanged part [i:j] */ 10797 memcpy(res + rkind * ires, 10798 sbuf + rkind * i, 10799 rkind * (j-i)); 10800 ires += j - i; 10801 } 10802 /* copy substitution string */ 10803 if (len2 > 0) { 10804 memcpy(res + rkind * ires, 10805 buf2, 10806 rkind * len2); 10807 ires += len2; 10808 } 10809 i = j + len1; 10810 } 10811 if (i < slen) 10812 /* copy tail [i:] */ 10813 memcpy(res + rkind * ires, 10814 sbuf + rkind * i, 10815 rkind * (slen-i)); 10816 } 10817 else { 10818 /* interleave */ 10819 while (n > 0) { 10820 memcpy(res + rkind * ires, 10821 buf2, 10822 rkind * len2); 10823 ires += len2; 10824 if (--n <= 0) 10825 break; 10826 memcpy(res + rkind * ires, 10827 sbuf + rkind * i, 10828 rkind); 10829 ires++; 10830 i++; 10831 } 10832 memcpy(res + rkind * ires, 10833 sbuf + rkind * i, 10834 rkind * (slen-i)); 10835 } 10836 } 10837 10838 if (mayshrink) { 10839 unicode_adjust_maxchar(&u); 10840 if (u == NULL) 10841 goto error; 10842 } 10843 10844 done: 10845 assert(srelease == (sbuf != PyUnicode_DATA(self))); 10846 assert(release1 == (buf1 != PyUnicode_DATA(str1))); 10847 assert(release2 == (buf2 != PyUnicode_DATA(str2))); 10848 if (srelease) 10849 PyMem_Free((void *)sbuf); 10850 if (release1) 10851 PyMem_Free((void *)buf1); 10852 if (release2) 10853 PyMem_Free((void *)buf2); 10854 assert(_PyUnicode_CheckConsistency(u, 1)); 10855 return u; 10856 10857 nothing: 10858 /* nothing to replace; return original string (when possible) */ 10859 assert(srelease == (sbuf != 
PyUnicode_DATA(self))); 10860 assert(release1 == (buf1 != PyUnicode_DATA(str1))); 10861 assert(release2 == (buf2 != PyUnicode_DATA(str2))); 10862 if (srelease) 10863 PyMem_Free((void *)sbuf); 10864 if (release1) 10865 PyMem_Free((void *)buf1); 10866 if (release2) 10867 PyMem_Free((void *)buf2); 10868 return unicode_result_unchanged(self); 10869 10870 error: 10871 assert(srelease == (sbuf != PyUnicode_DATA(self))); 10872 assert(release1 == (buf1 != PyUnicode_DATA(str1))); 10873 assert(release2 == (buf2 != PyUnicode_DATA(str2))); 10874 if (srelease) 10875 PyMem_Free((void *)sbuf); 10876 if (release1) 10877 PyMem_Free((void *)buf1); 10878 if (release2) 10879 PyMem_Free((void *)buf2); 10880 return NULL; 10881 } 10882 10883 /* --- Unicode Object Methods --------------------------------------------- */ 10884 10885 /*[clinic input] 10886 str.title as unicode_title 10887 10888 Return a version of the string where each word is titlecased. 10889 10890 More specifically, words start with uppercased characters and all remaining 10891 cased characters have lower case. 10892 [clinic start generated code]*/ 10893 10894 static PyObject * unicode_title_impl(PyObject * self)10895 unicode_title_impl(PyObject *self) 10896 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/ 10897 { 10898 if (PyUnicode_READY(self) == -1) 10899 return NULL; 10900 return case_operation(self, do_title); 10901 } 10902 10903 /*[clinic input] 10904 str.capitalize as unicode_capitalize 10905 10906 Return a capitalized version of the string. 10907 10908 More specifically, make the first character have upper case and the rest lower 10909 case. 
10910 [clinic start generated code]*/ 10911 10912 static PyObject * unicode_capitalize_impl(PyObject * self)10913 unicode_capitalize_impl(PyObject *self) 10914 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/ 10915 { 10916 if (PyUnicode_READY(self) == -1) 10917 return NULL; 10918 if (PyUnicode_GET_LENGTH(self) == 0) 10919 return unicode_result_unchanged(self); 10920 return case_operation(self, do_capitalize); 10921 } 10922 10923 /*[clinic input] 10924 str.casefold as unicode_casefold 10925 10926 Return a version of the string suitable for caseless comparisons. 10927 [clinic start generated code]*/ 10928 10929 static PyObject * unicode_casefold_impl(PyObject * self)10930 unicode_casefold_impl(PyObject *self) 10931 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/ 10932 { 10933 if (PyUnicode_READY(self) == -1) 10934 return NULL; 10935 if (PyUnicode_IS_ASCII(self)) 10936 return ascii_upper_or_lower(self, 1); 10937 return case_operation(self, do_casefold); 10938 } 10939 10940 10941 /* Argument converter. Accepts a single Unicode character. */ 10942 10943 static int convert_uc(PyObject * obj,void * addr)10944 convert_uc(PyObject *obj, void *addr) 10945 { 10946 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10947 10948 if (!PyUnicode_Check(obj)) { 10949 PyErr_Format(PyExc_TypeError, 10950 "The fill character must be a unicode character, " 10951 "not %.100s", Py_TYPE(obj)->tp_name); 10952 return 0; 10953 } 10954 if (PyUnicode_READY(obj) < 0) 10955 return 0; 10956 if (PyUnicode_GET_LENGTH(obj) != 1) { 10957 PyErr_SetString(PyExc_TypeError, 10958 "The fill character must be exactly one character long"); 10959 return 0; 10960 } 10961 *fillcharloc = PyUnicode_READ_CHAR(obj, 0); 10962 return 1; 10963 } 10964 10965 /*[clinic input] 10966 str.center as unicode_center 10967 10968 width: Py_ssize_t 10969 fillchar: Py_UCS4 = ' ' 10970 / 10971 10972 Return a centered string of length width. 
10973 10974 Padding is done using the specified fill character (default is a space). 10975 [clinic start generated code]*/ 10976 10977 static PyObject * unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10978 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) 10979 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/ 10980 { 10981 Py_ssize_t marg, left; 10982 10983 if (PyUnicode_READY(self) == -1) 10984 return NULL; 10985 10986 if (PyUnicode_GET_LENGTH(self) >= width) 10987 return unicode_result_unchanged(self); 10988 10989 marg = width - PyUnicode_GET_LENGTH(self); 10990 left = marg / 2 + (marg & width & 1); 10991 10992 return pad(self, left, marg - left, fillchar); 10993 } 10994 10995 /* This function assumes that str1 and str2 are readied by the caller. */ 10996 10997 static int unicode_compare(PyObject * str1,PyObject * str2)10998 unicode_compare(PyObject *str1, PyObject *str2) 10999 { 11000 #define COMPARE(TYPE1, TYPE2) \ 11001 do { \ 11002 TYPE1* p1 = (TYPE1 *)data1; \ 11003 TYPE2* p2 = (TYPE2 *)data2; \ 11004 TYPE1* end = p1 + len; \ 11005 Py_UCS4 c1, c2; \ 11006 for (; p1 != end; p1++, p2++) { \ 11007 c1 = *p1; \ 11008 c2 = *p2; \ 11009 if (c1 != c2) \ 11010 return (c1 < c2) ? 
/* Three-way comparison of two strings by code point.

   This function assumes that str1 and str2 are readied by the caller.

   Returns -1, 0 or 1.  The first min(len1, len2) characters are compared
   one by one; if they are all equal, the shorter string orders first. */

static int
unicode_compare(PyObject *str1, PyObject *str2)
{
/* Compare `len` characters of data1 (elements of TYPE1) with data2
   (elements of TYPE2).  Each pair is read into Py_UCS4 before comparing.
   NOTE: this macro returns from unicode_compare() at the first
   difference; control falls out of it only if all `len` characters match. */
#define COMPARE(TYPE1, TYPE2) \
    do { \
        TYPE1* p1 = (TYPE1 *)data1; \
        TYPE2* p2 = (TYPE2 *)data2; \
        TYPE1* end = p1 + len; \
        Py_UCS4 c1, c2; \
        for (; p1 != end; p1++, p2++) { \
            c1 = *p1; \
            c2 = *p2; \
            if (c1 != c2) \
                return (c1 < c2) ? -1 : 1; \
        } \
    } \
    while (0)

    int kind1, kind2;
    const void *data1, *data2;
    Py_ssize_t len1, len2, len;

    kind1 = PyUnicode_KIND(str1);
    kind2 = PyUnicode_KIND(str2);
    data1 = PyUnicode_DATA(str1);
    data2 = PyUnicode_DATA(str2);
    len1 = PyUnicode_GET_LENGTH(str1);
    len2 = PyUnicode_GET_LENGTH(str2);
    /* Only the common prefix is compared element-wise. */
    len = Py_MIN(len1, len2);

    /* Dispatch on the (kind1, kind2) pair so each buffer is read with its
       own element type. */
    switch(kind1) {
    case PyUnicode_1BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
        {
            /* Same 1-byte storage on both sides: compare the raw bytes. */
            int cmp = memcmp(data1, data2, len);
            /* normalize result of memcmp() into the range [-1; 1] */
            if (cmp < 0)
                return -1;
            if (cmp > 0)
                return 1;
            break;
        }
        case PyUnicode_2BYTE_KIND:
            COMPARE(Py_UCS1, Py_UCS2);
            break;
        case PyUnicode_4BYTE_KIND:
            COMPARE(Py_UCS1, Py_UCS4);
            break;
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    case PyUnicode_2BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
            COMPARE(Py_UCS2, Py_UCS1);
            break;
        case PyUnicode_2BYTE_KIND:
        {
            COMPARE(Py_UCS2, Py_UCS2);
            break;
        }
        case PyUnicode_4BYTE_KIND:
            COMPARE(Py_UCS2, Py_UCS4);
            break;
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    case PyUnicode_4BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
            COMPARE(Py_UCS4, Py_UCS1);
            break;
        case PyUnicode_2BYTE_KIND:
            COMPARE(Py_UCS4, Py_UCS2);
            break;
        case PyUnicode_4BYTE_KIND:
        {
            /* wmemcmp() is used only when it exists and wchar_t is 4 bytes
               wide; otherwise fall back to the generic loop. */
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
            /* normalize result of wmemcmp() into the range [-1; 1] */
            if (cmp < 0)
                return -1;
            if (cmp > 0)
                return 1;
#else
            COMPARE(Py_UCS4, Py_UCS4);
#endif
            break;
        }
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    default:
        Py_UNREACHABLE();
    }

    /* The common prefix is identical: the lengths decide. */
    if (len1 == len2)
        return 0;
    if (len1 < len2)
        return -1;
    else
        return 1;

#undef COMPARE
}
break; 11098 } 11099 default: 11100 Py_UNREACHABLE(); 11101 } 11102 11103 if (len1 == len2) 11104 return 0; 11105 if (len1 < len2) 11106 return -1; 11107 else 11108 return 1; 11109 11110 #undef COMPARE 11111 } 11112 11113 static int unicode_compare_eq(PyObject * str1,PyObject * str2)11114 unicode_compare_eq(PyObject *str1, PyObject *str2) 11115 { 11116 int kind; 11117 const void *data1, *data2; 11118 Py_ssize_t len; 11119 int cmp; 11120 11121 len = PyUnicode_GET_LENGTH(str1); 11122 if (PyUnicode_GET_LENGTH(str2) != len) 11123 return 0; 11124 kind = PyUnicode_KIND(str1); 11125 if (PyUnicode_KIND(str2) != kind) 11126 return 0; 11127 data1 = PyUnicode_DATA(str1); 11128 data2 = PyUnicode_DATA(str2); 11129 11130 cmp = memcmp(data1, data2, len * kind); 11131 return (cmp == 0); 11132 } 11133 11134 int _PyUnicode_Equal(PyObject * str1,PyObject * str2)11135 _PyUnicode_Equal(PyObject *str1, PyObject *str2) 11136 { 11137 assert(PyUnicode_Check(str1)); 11138 assert(PyUnicode_Check(str2)); 11139 if (str1 == str2) { 11140 return 1; 11141 } 11142 if (PyUnicode_READY(str1) || PyUnicode_READY(str2)) { 11143 return -1; 11144 } 11145 return unicode_compare_eq(str1, str2); 11146 } 11147 11148 11149 int PyUnicode_Compare(PyObject * left,PyObject * right)11150 PyUnicode_Compare(PyObject *left, PyObject *right) 11151 { 11152 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 11153 if (PyUnicode_READY(left) == -1 || 11154 PyUnicode_READY(right) == -1) 11155 return -1; 11156 11157 /* a string is equal to itself */ 11158 if (left == right) 11159 return 0; 11160 11161 return unicode_compare(left, right); 11162 } 11163 PyErr_Format(PyExc_TypeError, 11164 "Can't compare %.100s and %.100s", 11165 Py_TYPE(left)->tp_name, 11166 Py_TYPE(right)->tp_name); 11167 return -1; 11168 } 11169 11170 int PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11171 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 11172 { 11173 Py_ssize_t i; 11174 int kind; 11175 Py_UCS4 chr; 
/* Three-way comparison of the str object `uni` with the NUL-terminated C
   string `str`; the bytes of `str` are compared as unsigned values.

   Returns -1, 0 or 1.  `uni` does not have to be in the ready state: a
   non-ready string is compared through its wchar_t buffer. */
int
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
{
    Py_ssize_t i;
    int kind;
    Py_UCS4 chr;
    const unsigned char *ustr = (const unsigned char *)str;

    assert(_PyUnicode_CHECK(uni));
    if (!PyUnicode_IS_READY(uni)) {
        /* Non-ready string: only the wchar_t buffer can be read. */
        const wchar_t *ws = _PyUnicode_WSTR(uni);
        /* Compare Unicode string and source character set string */
        for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
            if (chr != ustr[i])
                return (chr < ustr[i]) ? -1 : 1;
        }
        /* This check keeps Python strings that end in '\0' from comparing equal
         to C strings identical up to that point. */
        if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
            return 1; /* uni is longer */
        if (ustr[i])
            return -1; /* str is longer */
        return 0;
    }
    kind = PyUnicode_KIND(uni);
    if (kind == PyUnicode_1BYTE_KIND) {
        /* 1-byte storage: compare the common prefix as raw bytes, then let
           the lengths decide. */
        const void *data = PyUnicode_1BYTE_DATA(uni);
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
        size_t len, len2 = strlen(str);
        int cmp;

        len = Py_MIN(len1, len2);
        cmp = memcmp(data, str, len);
        if (cmp != 0) {
            if (cmp < 0)
                return -1;
            else
                return 1;
        }
        if (len1 > len2)
            return 1; /* uni is longer */
        if (len1 < len2)
            return -1; /* str is longer */
        return 0;
    }
    else {
        /* Wider storage: walk character by character; the loop stops at
           the first NUL on either side. */
        const void *data = PyUnicode_DATA(uni);
        /* Compare Unicode string and source character set string */
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
            if (chr != (unsigned char)str[i])
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
        /* This check keeps Python strings that end in '\0' from comparing equal
         to C strings identical up to that point. */
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
            return 1; /* uni is longer */
        if (str[i])
            return -1; /* str is longer */
        return 0;
    }
}
*/ 11223 if (PyUnicode_GET_LENGTH(uni) != i || chr) 11224 return 1; /* uni is longer */ 11225 if (str[i]) 11226 return -1; /* str is longer */ 11227 return 0; 11228 } 11229 } 11230 11231 static int non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11232 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str) 11233 { 11234 size_t i, len; 11235 const wchar_t *p; 11236 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode); 11237 if (strlen(str) != len) 11238 return 0; 11239 p = _PyUnicode_WSTR(unicode); 11240 assert(p); 11241 for (i = 0; i < len; i++) { 11242 unsigned char c = (unsigned char)str[i]; 11243 if (c >= 128 || p[i] != (wchar_t)c) 11244 return 0; 11245 } 11246 return 1; 11247 } 11248 11249 int _PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11250 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) 11251 { 11252 size_t len; 11253 assert(_PyUnicode_CHECK(unicode)); 11254 assert(str); 11255 #ifndef NDEBUG 11256 for (const char *p = str; *p; p++) { 11257 assert((unsigned char)*p < 128); 11258 } 11259 #endif 11260 if (PyUnicode_READY(unicode) == -1) { 11261 /* Memory error or bad data */ 11262 PyErr_Clear(); 11263 return non_ready_unicode_equal_to_ascii_string(unicode, str); 11264 } 11265 if (!PyUnicode_IS_ASCII(unicode)) 11266 return 0; 11267 len = (size_t)PyUnicode_GET_LENGTH(unicode); 11268 return strlen(str) == len && 11269 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0; 11270 } 11271 11272 int _PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11273 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right) 11274 { 11275 PyObject *right_uni; 11276 11277 assert(_PyUnicode_CHECK(left)); 11278 assert(right->string); 11279 #ifndef NDEBUG 11280 for (const char *p = right->string; *p; p++) { 11281 assert((unsigned char)*p < 128); 11282 } 11283 #endif 11284 11285 if (PyUnicode_READY(left) == -1) { 11286 /* memory error or bad data */ 11287 PyErr_Clear(); 11288 return 
non_ready_unicode_equal_to_ascii_string(left, right->string); 11289 } 11290 11291 if (!PyUnicode_IS_ASCII(left)) 11292 return 0; 11293 11294 right_uni = _PyUnicode_FromId(right); /* borrowed */ 11295 if (right_uni == NULL) { 11296 /* memory error or bad data */ 11297 PyErr_Clear(); 11298 return _PyUnicode_EqualToASCIIString(left, right->string); 11299 } 11300 11301 if (left == right_uni) 11302 return 1; 11303 11304 if (PyUnicode_CHECK_INTERNED(left)) 11305 return 0; 11306 11307 assert(_PyUnicode_HASH(right_uni) != -1); 11308 Py_hash_t hash = _PyUnicode_HASH(left); 11309 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) { 11310 return 0; 11311 } 11312 11313 return unicode_compare_eq(left, right_uni); 11314 } 11315 11316 PyObject * PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11317 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 11318 { 11319 int result; 11320 11321 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 11322 Py_RETURN_NOTIMPLEMENTED; 11323 11324 if (PyUnicode_READY(left) == -1 || 11325 PyUnicode_READY(right) == -1) 11326 return NULL; 11327 11328 if (left == right) { 11329 switch (op) { 11330 case Py_EQ: 11331 case Py_LE: 11332 case Py_GE: 11333 /* a string is equal to itself */ 11334 Py_RETURN_TRUE; 11335 case Py_NE: 11336 case Py_LT: 11337 case Py_GT: 11338 Py_RETURN_FALSE; 11339 default: 11340 PyErr_BadArgument(); 11341 return NULL; 11342 } 11343 } 11344 else if (op == Py_EQ || op == Py_NE) { 11345 result = unicode_compare_eq(left, right); 11346 result ^= (op == Py_NE); 11347 return PyBool_FromLong(result); 11348 } 11349 else { 11350 result = unicode_compare(left, right); 11351 Py_RETURN_RICHCOMPARE(result, 0, op); 11352 } 11353 } 11354 11355 int _PyUnicode_EQ(PyObject * aa,PyObject * bb)11356 _PyUnicode_EQ(PyObject *aa, PyObject *bb) 11357 { 11358 return unicode_eq(aa, bb); 11359 } 11360 11361 int PyUnicode_Contains(PyObject * str,PyObject * substr)11362 PyUnicode_Contains(PyObject *str, PyObject *substr) 11363 
{ 11364 int kind1, kind2; 11365 const void *buf1, *buf2; 11366 Py_ssize_t len1, len2; 11367 int result; 11368 11369 if (!PyUnicode_Check(substr)) { 11370 PyErr_Format(PyExc_TypeError, 11371 "'in <string>' requires string as left operand, not %.100s", 11372 Py_TYPE(substr)->tp_name); 11373 return -1; 11374 } 11375 if (PyUnicode_READY(substr) == -1) 11376 return -1; 11377 if (ensure_unicode(str) < 0) 11378 return -1; 11379 11380 kind1 = PyUnicode_KIND(str); 11381 kind2 = PyUnicode_KIND(substr); 11382 if (kind1 < kind2) 11383 return 0; 11384 len1 = PyUnicode_GET_LENGTH(str); 11385 len2 = PyUnicode_GET_LENGTH(substr); 11386 if (len1 < len2) 11387 return 0; 11388 buf1 = PyUnicode_DATA(str); 11389 buf2 = PyUnicode_DATA(substr); 11390 if (len2 == 1) { 11391 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 11392 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1; 11393 return result; 11394 } 11395 if (kind2 != kind1) { 11396 buf2 = unicode_askind(kind2, buf2, len2, kind1); 11397 if (!buf2) 11398 return -1; 11399 } 11400 11401 switch (kind1) { 11402 case PyUnicode_1BYTE_KIND: 11403 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 11404 break; 11405 case PyUnicode_2BYTE_KIND: 11406 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 11407 break; 11408 case PyUnicode_4BYTE_KIND: 11409 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 11410 break; 11411 default: 11412 Py_UNREACHABLE(); 11413 } 11414 11415 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr))); 11416 if (kind2 != kind1) 11417 PyMem_Free((void *)buf2); 11418 11419 return result; 11420 } 11421 11422 /* Concat to string or Unicode object giving a new Unicode object. 
*/ 11423 11424 PyObject * PyUnicode_Concat(PyObject * left,PyObject * right)11425 PyUnicode_Concat(PyObject *left, PyObject *right) 11426 { 11427 PyObject *result; 11428 Py_UCS4 maxchar, maxchar2; 11429 Py_ssize_t left_len, right_len, new_len; 11430 11431 if (ensure_unicode(left) < 0) 11432 return NULL; 11433 11434 if (!PyUnicode_Check(right)) { 11435 PyErr_Format(PyExc_TypeError, 11436 "can only concatenate str (not \"%.200s\") to str", 11437 Py_TYPE(right)->tp_name); 11438 return NULL; 11439 } 11440 if (PyUnicode_READY(right) < 0) 11441 return NULL; 11442 11443 /* Shortcuts */ 11444 PyObject *empty = unicode_get_empty(); // Borrowed reference 11445 if (left == empty) { 11446 return PyUnicode_FromObject(right); 11447 } 11448 if (right == empty) { 11449 return PyUnicode_FromObject(left); 11450 } 11451 11452 left_len = PyUnicode_GET_LENGTH(left); 11453 right_len = PyUnicode_GET_LENGTH(right); 11454 if (left_len > PY_SSIZE_T_MAX - right_len) { 11455 PyErr_SetString(PyExc_OverflowError, 11456 "strings are too large to concat"); 11457 return NULL; 11458 } 11459 new_len = left_len + right_len; 11460 11461 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11462 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11463 maxchar = Py_MAX(maxchar, maxchar2); 11464 11465 /* Concat the two Unicode strings */ 11466 result = PyUnicode_New(new_len, maxchar); 11467 if (result == NULL) 11468 return NULL; 11469 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len); 11470 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len); 11471 assert(_PyUnicode_CheckConsistency(result, 1)); 11472 return result; 11473 } 11474 11475 void PyUnicode_Append(PyObject ** p_left,PyObject * right)11476 PyUnicode_Append(PyObject **p_left, PyObject *right) 11477 { 11478 PyObject *left, *res; 11479 Py_UCS4 maxchar, maxchar2; 11480 Py_ssize_t left_len, right_len, new_len; 11481 11482 if (p_left == NULL) { 11483 if (!PyErr_Occurred()) 11484 PyErr_BadInternalCall(); 11485 return; 11486 } 11487 left = 
*p_left; 11488 if (right == NULL || left == NULL 11489 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 11490 if (!PyErr_Occurred()) 11491 PyErr_BadInternalCall(); 11492 goto error; 11493 } 11494 11495 if (PyUnicode_READY(left) == -1) 11496 goto error; 11497 if (PyUnicode_READY(right) == -1) 11498 goto error; 11499 11500 /* Shortcuts */ 11501 PyObject *empty = unicode_get_empty(); // Borrowed reference 11502 if (left == empty) { 11503 Py_DECREF(left); 11504 Py_INCREF(right); 11505 *p_left = right; 11506 return; 11507 } 11508 if (right == empty) { 11509 return; 11510 } 11511 11512 left_len = PyUnicode_GET_LENGTH(left); 11513 right_len = PyUnicode_GET_LENGTH(right); 11514 if (left_len > PY_SSIZE_T_MAX - right_len) { 11515 PyErr_SetString(PyExc_OverflowError, 11516 "strings are too large to concat"); 11517 goto error; 11518 } 11519 new_len = left_len + right_len; 11520 11521 if (unicode_modifiable(left) 11522 && PyUnicode_CheckExact(right) 11523 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 11524 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 11525 to change the structure size, but characters are stored just after 11526 the structure, and so it requires to move all characters which is 11527 not so different than duplicating the string. 
*/ 11528 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 11529 { 11530 /* append inplace */ 11531 if (unicode_resize(p_left, new_len) != 0) 11532 goto error; 11533 11534 /* copy 'right' into the newly allocated area of 'left' */ 11535 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 11536 } 11537 else { 11538 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11539 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11540 maxchar = Py_MAX(maxchar, maxchar2); 11541 11542 /* Concat the two Unicode strings */ 11543 res = PyUnicode_New(new_len, maxchar); 11544 if (res == NULL) 11545 goto error; 11546 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 11547 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 11548 Py_DECREF(left); 11549 *p_left = res; 11550 } 11551 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 11552 return; 11553 11554 error: 11555 Py_CLEAR(*p_left); 11556 } 11557 11558 void PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11559 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 11560 { 11561 PyUnicode_Append(pleft, right); 11562 Py_XDECREF(right); 11563 } 11564 11565 /* 11566 Wraps stringlib_parse_args_finds() and additionally ensures that the 11567 first argument is a unicode object. 11568 */ 11569 11570 static inline int parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11571 parse_args_finds_unicode(const char * function_name, PyObject *args, 11572 PyObject **substring, 11573 Py_ssize_t *start, Py_ssize_t *end) 11574 { 11575 if(stringlib_parse_args_finds(function_name, args, substring, 11576 start, end)) { 11577 if (ensure_unicode(*substring) < 0) 11578 return 0; 11579 return 1; 11580 } 11581 return 0; 11582 } 11583 11584 PyDoc_STRVAR(count__doc__, 11585 "S.count(sub[, start[, end]]) -> int\n\ 11586 \n\ 11587 Return the number of non-overlapping occurrences of substring sub in\n\ 11588 string S[start:end]. 
Optional arguments start and end are\n\ 11589 interpreted as in slice notation."); 11590 11591 static PyObject * unicode_count(PyObject * self,PyObject * args)11592 unicode_count(PyObject *self, PyObject *args) 11593 { 11594 PyObject *substring = NULL; /* initialize to fix a compiler warning */ 11595 Py_ssize_t start = 0; 11596 Py_ssize_t end = PY_SSIZE_T_MAX; 11597 PyObject *result; 11598 int kind1, kind2; 11599 const void *buf1, *buf2; 11600 Py_ssize_t len1, len2, iresult; 11601 11602 if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) 11603 return NULL; 11604 11605 kind1 = PyUnicode_KIND(self); 11606 kind2 = PyUnicode_KIND(substring); 11607 if (kind1 < kind2) 11608 return PyLong_FromLong(0); 11609 11610 len1 = PyUnicode_GET_LENGTH(self); 11611 len2 = PyUnicode_GET_LENGTH(substring); 11612 ADJUST_INDICES(start, end, len1); 11613 if (end - start < len2) 11614 return PyLong_FromLong(0); 11615 11616 buf1 = PyUnicode_DATA(self); 11617 buf2 = PyUnicode_DATA(substring); 11618 if (kind2 != kind1) { 11619 buf2 = unicode_askind(kind2, buf2, len2, kind1); 11620 if (!buf2) 11621 return NULL; 11622 } 11623 switch (kind1) { 11624 case PyUnicode_1BYTE_KIND: 11625 iresult = ucs1lib_count( 11626 ((const Py_UCS1*)buf1) + start, end - start, 11627 buf2, len2, PY_SSIZE_T_MAX 11628 ); 11629 break; 11630 case PyUnicode_2BYTE_KIND: 11631 iresult = ucs2lib_count( 11632 ((const Py_UCS2*)buf1) + start, end - start, 11633 buf2, len2, PY_SSIZE_T_MAX 11634 ); 11635 break; 11636 case PyUnicode_4BYTE_KIND: 11637 iresult = ucs4lib_count( 11638 ((const Py_UCS4*)buf1) + start, end - start, 11639 buf2, len2, PY_SSIZE_T_MAX 11640 ); 11641 break; 11642 default: 11643 Py_UNREACHABLE(); 11644 } 11645 11646 result = PyLong_FromSsize_t(iresult); 11647 11648 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring))); 11649 if (kind2 != kind1) 11650 PyMem_Free((void *)buf2); 11651 11652 return result; 11653 } 11654 11655 /*[clinic input] 11656 str.encode as unicode_encode 11657 
11658 encoding: str(c_default="NULL") = 'utf-8' 11659 The encoding in which to encode the string. 11660 errors: str(c_default="NULL") = 'strict' 11661 The error handling scheme to use for encoding errors. 11662 The default is 'strict' meaning that encoding errors raise a 11663 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and 11664 'xmlcharrefreplace' as well as any other name registered with 11665 codecs.register_error that can handle UnicodeEncodeErrors. 11666 11667 Encode the string using the codec registered for encoding. 11668 [clinic start generated code]*/ 11669 11670 static PyObject * unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11671 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors) 11672 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/ 11673 { 11674 return PyUnicode_AsEncodedString(self, encoding, errors); 11675 } 11676 11677 /*[clinic input] 11678 str.expandtabs as unicode_expandtabs 11679 11680 tabsize: int = 8 11681 11682 Return a copy where all tab characters are expanded using spaces. 11683 11684 If tabsize is not given, a tab size of 8 characters is assumed. 
11685 [clinic start generated code]*/ 11686 11687 static PyObject * unicode_expandtabs_impl(PyObject * self,int tabsize)11688 unicode_expandtabs_impl(PyObject *self, int tabsize) 11689 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/ 11690 { 11691 Py_ssize_t i, j, line_pos, src_len, incr; 11692 Py_UCS4 ch; 11693 PyObject *u; 11694 const void *src_data; 11695 void *dest_data; 11696 int kind; 11697 int found; 11698 11699 if (PyUnicode_READY(self) == -1) 11700 return NULL; 11701 11702 /* First pass: determine size of output string */ 11703 src_len = PyUnicode_GET_LENGTH(self); 11704 i = j = line_pos = 0; 11705 kind = PyUnicode_KIND(self); 11706 src_data = PyUnicode_DATA(self); 11707 found = 0; 11708 for (; i < src_len; i++) { 11709 ch = PyUnicode_READ(kind, src_data, i); 11710 if (ch == '\t') { 11711 found = 1; 11712 if (tabsize > 0) { 11713 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11714 if (j > PY_SSIZE_T_MAX - incr) 11715 goto overflow; 11716 line_pos += incr; 11717 j += incr; 11718 } 11719 } 11720 else { 11721 if (j > PY_SSIZE_T_MAX - 1) 11722 goto overflow; 11723 line_pos++; 11724 j++; 11725 if (ch == '\n' || ch == '\r') 11726 line_pos = 0; 11727 } 11728 } 11729 if (!found) 11730 return unicode_result_unchanged(self); 11731 11732 /* Second pass: create output string and fill it */ 11733 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11734 if (!u) 11735 return NULL; 11736 dest_data = PyUnicode_DATA(u); 11737 11738 i = j = line_pos = 0; 11739 11740 for (; i < src_len; i++) { 11741 ch = PyUnicode_READ(kind, src_data, i); 11742 if (ch == '\t') { 11743 if (tabsize > 0) { 11744 incr = tabsize - (line_pos % tabsize); 11745 line_pos += incr; 11746 unicode_fill(kind, dest_data, ' ', j, incr); 11747 j += incr; 11748 } 11749 } 11750 else { 11751 line_pos++; 11752 PyUnicode_WRITE(kind, dest_data, j, ch); 11753 j++; 11754 if (ch == '\n' || ch == '\r') 11755 line_pos = 0; 11756 } 11757 } 11758 assert (j == 
PyUnicode_GET_LENGTH(u)); 11759 return unicode_result(u); 11760 11761 overflow: 11762 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11763 return NULL; 11764 } 11765 11766 PyDoc_STRVAR(find__doc__, 11767 "S.find(sub[, start[, end]]) -> int\n\ 11768 \n\ 11769 Return the lowest index in S where substring sub is found,\n\ 11770 such that sub is contained within S[start:end]. Optional\n\ 11771 arguments start and end are interpreted as in slice notation.\n\ 11772 \n\ 11773 Return -1 on failure."); 11774 11775 static PyObject * unicode_find(PyObject * self,PyObject * args)11776 unicode_find(PyObject *self, PyObject *args) 11777 { 11778 /* initialize variables to prevent gcc warning */ 11779 PyObject *substring = NULL; 11780 Py_ssize_t start = 0; 11781 Py_ssize_t end = 0; 11782 Py_ssize_t result; 11783 11784 if (!parse_args_finds_unicode("find", args, &substring, &start, &end)) 11785 return NULL; 11786 11787 if (PyUnicode_READY(self) == -1) 11788 return NULL; 11789 11790 result = any_find_slice(self, substring, start, end, 1); 11791 11792 if (result == -2) 11793 return NULL; 11794 11795 return PyLong_FromSsize_t(result); 11796 } 11797 11798 static PyObject * unicode_getitem(PyObject * self,Py_ssize_t index)11799 unicode_getitem(PyObject *self, Py_ssize_t index) 11800 { 11801 const void *data; 11802 enum PyUnicode_Kind kind; 11803 Py_UCS4 ch; 11804 11805 if (!PyUnicode_Check(self)) { 11806 PyErr_BadArgument(); 11807 return NULL; 11808 } 11809 if (PyUnicode_READY(self) == -1) { 11810 return NULL; 11811 } 11812 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11813 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11814 return NULL; 11815 } 11816 kind = PyUnicode_KIND(self); 11817 data = PyUnicode_DATA(self); 11818 ch = PyUnicode_READ(kind, data, index); 11819 return unicode_char(ch); 11820 } 11821 11822 /* Believe it or not, this produces the same value for ASCII strings 11823 as bytes_hash(). 
*/ 11824 static Py_hash_t unicode_hash(PyObject * self)11825 unicode_hash(PyObject *self) 11826 { 11827 Py_uhash_t x; /* Unsigned for defined overflow behavior. */ 11828 11829 #ifdef Py_DEBUG 11830 assert(_Py_HashSecret_Initialized); 11831 #endif 11832 if (_PyUnicode_HASH(self) != -1) 11833 return _PyUnicode_HASH(self); 11834 if (PyUnicode_READY(self) == -1) 11835 return -1; 11836 11837 x = _Py_HashBytes(PyUnicode_DATA(self), 11838 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); 11839 _PyUnicode_HASH(self) = x; 11840 return x; 11841 } 11842 11843 PyDoc_STRVAR(index__doc__, 11844 "S.index(sub[, start[, end]]) -> int\n\ 11845 \n\ 11846 Return the lowest index in S where substring sub is found,\n\ 11847 such that sub is contained within S[start:end]. Optional\n\ 11848 arguments start and end are interpreted as in slice notation.\n\ 11849 \n\ 11850 Raises ValueError when the substring is not found."); 11851 11852 static PyObject * unicode_index(PyObject * self,PyObject * args)11853 unicode_index(PyObject *self, PyObject *args) 11854 { 11855 /* initialize variables to prevent gcc warning */ 11856 Py_ssize_t result; 11857 PyObject *substring = NULL; 11858 Py_ssize_t start = 0; 11859 Py_ssize_t end = 0; 11860 11861 if (!parse_args_finds_unicode("index", args, &substring, &start, &end)) 11862 return NULL; 11863 11864 if (PyUnicode_READY(self) == -1) 11865 return NULL; 11866 11867 result = any_find_slice(self, substring, start, end, 1); 11868 11869 if (result == -2) 11870 return NULL; 11871 11872 if (result < 0) { 11873 PyErr_SetString(PyExc_ValueError, "substring not found"); 11874 return NULL; 11875 } 11876 11877 return PyLong_FromSsize_t(result); 11878 } 11879 11880 /*[clinic input] 11881 str.isascii as unicode_isascii 11882 11883 Return True if all characters in the string are ASCII, False otherwise. 11884 11885 ASCII characters have code points in the range U+0000-U+007F. 11886 Empty string is ASCII too. 
11887 [clinic start generated code]*/ 11888 11889 static PyObject * unicode_isascii_impl(PyObject * self)11890 unicode_isascii_impl(PyObject *self) 11891 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/ 11892 { 11893 if (PyUnicode_READY(self) == -1) { 11894 return NULL; 11895 } 11896 return PyBool_FromLong(PyUnicode_IS_ASCII(self)); 11897 } 11898 11899 /*[clinic input] 11900 str.islower as unicode_islower 11901 11902 Return True if the string is a lowercase string, False otherwise. 11903 11904 A string is lowercase if all cased characters in the string are lowercase and 11905 there is at least one cased character in the string. 11906 [clinic start generated code]*/ 11907 11908 static PyObject * unicode_islower_impl(PyObject * self)11909 unicode_islower_impl(PyObject *self) 11910 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/ 11911 { 11912 Py_ssize_t i, length; 11913 int kind; 11914 const void *data; 11915 int cased; 11916 11917 if (PyUnicode_READY(self) == -1) 11918 return NULL; 11919 length = PyUnicode_GET_LENGTH(self); 11920 kind = PyUnicode_KIND(self); 11921 data = PyUnicode_DATA(self); 11922 11923 /* Shortcut for single character strings */ 11924 if (length == 1) 11925 return PyBool_FromLong( 11926 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11927 11928 /* Special case for empty strings */ 11929 if (length == 0) 11930 Py_RETURN_FALSE; 11931 11932 cased = 0; 11933 for (i = 0; i < length; i++) { 11934 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11935 11936 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11937 Py_RETURN_FALSE; 11938 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11939 cased = 1; 11940 } 11941 return PyBool_FromLong(cased); 11942 } 11943 11944 /*[clinic input] 11945 str.isupper as unicode_isupper 11946 11947 Return True if the string is an uppercase string, False otherwise. 
11948 11949 A string is uppercase if all cased characters in the string are uppercase and 11950 there is at least one cased character in the string. 11951 [clinic start generated code]*/ 11952 11953 static PyObject * unicode_isupper_impl(PyObject * self)11954 unicode_isupper_impl(PyObject *self) 11955 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/ 11956 { 11957 Py_ssize_t i, length; 11958 int kind; 11959 const void *data; 11960 int cased; 11961 11962 if (PyUnicode_READY(self) == -1) 11963 return NULL; 11964 length = PyUnicode_GET_LENGTH(self); 11965 kind = PyUnicode_KIND(self); 11966 data = PyUnicode_DATA(self); 11967 11968 /* Shortcut for single character strings */ 11969 if (length == 1) 11970 return PyBool_FromLong( 11971 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11972 11973 /* Special case for empty strings */ 11974 if (length == 0) 11975 Py_RETURN_FALSE; 11976 11977 cased = 0; 11978 for (i = 0; i < length; i++) { 11979 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11980 11981 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11982 Py_RETURN_FALSE; 11983 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11984 cased = 1; 11985 } 11986 return PyBool_FromLong(cased); 11987 } 11988 11989 /*[clinic input] 11990 str.istitle as unicode_istitle 11991 11992 Return True if the string is a title-cased string, False otherwise. 11993 11994 In a title-cased string, upper- and title-case characters may only 11995 follow uncased characters and lowercase characters only cased ones. 
11996 [clinic start generated code]*/ 11997 11998 static PyObject * unicode_istitle_impl(PyObject * self)11999 unicode_istitle_impl(PyObject *self) 12000 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/ 12001 { 12002 Py_ssize_t i, length; 12003 int kind; 12004 const void *data; 12005 int cased, previous_is_cased; 12006 12007 if (PyUnicode_READY(self) == -1) 12008 return NULL; 12009 length = PyUnicode_GET_LENGTH(self); 12010 kind = PyUnicode_KIND(self); 12011 data = PyUnicode_DATA(self); 12012 12013 /* Shortcut for single character strings */ 12014 if (length == 1) { 12015 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 12016 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 12017 (Py_UNICODE_ISUPPER(ch) != 0)); 12018 } 12019 12020 /* Special case for empty strings */ 12021 if (length == 0) 12022 Py_RETURN_FALSE; 12023 12024 cased = 0; 12025 previous_is_cased = 0; 12026 for (i = 0; i < length; i++) { 12027 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12028 12029 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 12030 if (previous_is_cased) 12031 Py_RETURN_FALSE; 12032 previous_is_cased = 1; 12033 cased = 1; 12034 } 12035 else if (Py_UNICODE_ISLOWER(ch)) { 12036 if (!previous_is_cased) 12037 Py_RETURN_FALSE; 12038 previous_is_cased = 1; 12039 cased = 1; 12040 } 12041 else 12042 previous_is_cased = 0; 12043 } 12044 return PyBool_FromLong(cased); 12045 } 12046 12047 /*[clinic input] 12048 str.isspace as unicode_isspace 12049 12050 Return True if the string is a whitespace string, False otherwise. 12051 12052 A string is whitespace if all characters in the string are whitespace and there 12053 is at least one character in the string. 
12054 [clinic start generated code]*/ 12055 12056 static PyObject * unicode_isspace_impl(PyObject * self)12057 unicode_isspace_impl(PyObject *self) 12058 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/ 12059 { 12060 Py_ssize_t i, length; 12061 int kind; 12062 const void *data; 12063 12064 if (PyUnicode_READY(self) == -1) 12065 return NULL; 12066 length = PyUnicode_GET_LENGTH(self); 12067 kind = PyUnicode_KIND(self); 12068 data = PyUnicode_DATA(self); 12069 12070 /* Shortcut for single character strings */ 12071 if (length == 1) 12072 return PyBool_FromLong( 12073 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 12074 12075 /* Special case for empty strings */ 12076 if (length == 0) 12077 Py_RETURN_FALSE; 12078 12079 for (i = 0; i < length; i++) { 12080 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12081 if (!Py_UNICODE_ISSPACE(ch)) 12082 Py_RETURN_FALSE; 12083 } 12084 Py_RETURN_TRUE; 12085 } 12086 12087 /*[clinic input] 12088 str.isalpha as unicode_isalpha 12089 12090 Return True if the string is an alphabetic string, False otherwise. 12091 12092 A string is alphabetic if all characters in the string are alphabetic and there 12093 is at least one character in the string. 
12094 [clinic start generated code]*/ 12095 12096 static PyObject * unicode_isalpha_impl(PyObject * self)12097 unicode_isalpha_impl(PyObject *self) 12098 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/ 12099 { 12100 Py_ssize_t i, length; 12101 int kind; 12102 const void *data; 12103 12104 if (PyUnicode_READY(self) == -1) 12105 return NULL; 12106 length = PyUnicode_GET_LENGTH(self); 12107 kind = PyUnicode_KIND(self); 12108 data = PyUnicode_DATA(self); 12109 12110 /* Shortcut for single character strings */ 12111 if (length == 1) 12112 return PyBool_FromLong( 12113 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 12114 12115 /* Special case for empty strings */ 12116 if (length == 0) 12117 Py_RETURN_FALSE; 12118 12119 for (i = 0; i < length; i++) { 12120 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 12121 Py_RETURN_FALSE; 12122 } 12123 Py_RETURN_TRUE; 12124 } 12125 12126 /*[clinic input] 12127 str.isalnum as unicode_isalnum 12128 12129 Return True if the string is an alpha-numeric string, False otherwise. 12130 12131 A string is alpha-numeric if all characters in the string are alpha-numeric and 12132 there is at least one character in the string. 
12133 [clinic start generated code]*/ 12134 12135 static PyObject * unicode_isalnum_impl(PyObject * self)12136 unicode_isalnum_impl(PyObject *self) 12137 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/ 12138 { 12139 int kind; 12140 const void *data; 12141 Py_ssize_t len, i; 12142 12143 if (PyUnicode_READY(self) == -1) 12144 return NULL; 12145 12146 kind = PyUnicode_KIND(self); 12147 data = PyUnicode_DATA(self); 12148 len = PyUnicode_GET_LENGTH(self); 12149 12150 /* Shortcut for single character strings */ 12151 if (len == 1) { 12152 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 12153 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 12154 } 12155 12156 /* Special case for empty strings */ 12157 if (len == 0) 12158 Py_RETURN_FALSE; 12159 12160 for (i = 0; i < len; i++) { 12161 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12162 if (!Py_UNICODE_ISALNUM(ch)) 12163 Py_RETURN_FALSE; 12164 } 12165 Py_RETURN_TRUE; 12166 } 12167 12168 /*[clinic input] 12169 str.isdecimal as unicode_isdecimal 12170 12171 Return True if the string is a decimal string, False otherwise. 12172 12173 A string is a decimal string if all characters in the string are decimal and 12174 there is at least one character in the string. 
12175 [clinic start generated code]*/ 12176 12177 static PyObject * unicode_isdecimal_impl(PyObject * self)12178 unicode_isdecimal_impl(PyObject *self) 12179 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/ 12180 { 12181 Py_ssize_t i, length; 12182 int kind; 12183 const void *data; 12184 12185 if (PyUnicode_READY(self) == -1) 12186 return NULL; 12187 length = PyUnicode_GET_LENGTH(self); 12188 kind = PyUnicode_KIND(self); 12189 data = PyUnicode_DATA(self); 12190 12191 /* Shortcut for single character strings */ 12192 if (length == 1) 12193 return PyBool_FromLong( 12194 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 12195 12196 /* Special case for empty strings */ 12197 if (length == 0) 12198 Py_RETURN_FALSE; 12199 12200 for (i = 0; i < length; i++) { 12201 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 12202 Py_RETURN_FALSE; 12203 } 12204 Py_RETURN_TRUE; 12205 } 12206 12207 /*[clinic input] 12208 str.isdigit as unicode_isdigit 12209 12210 Return True if the string is a digit string, False otherwise. 12211 12212 A string is a digit string if all characters in the string are digits and there 12213 is at least one character in the string. 
12214 [clinic start generated code]*/ 12215 12216 static PyObject * unicode_isdigit_impl(PyObject * self)12217 unicode_isdigit_impl(PyObject *self) 12218 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/ 12219 { 12220 Py_ssize_t i, length; 12221 int kind; 12222 const void *data; 12223 12224 if (PyUnicode_READY(self) == -1) 12225 return NULL; 12226 length = PyUnicode_GET_LENGTH(self); 12227 kind = PyUnicode_KIND(self); 12228 data = PyUnicode_DATA(self); 12229 12230 /* Shortcut for single character strings */ 12231 if (length == 1) { 12232 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 12233 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 12234 } 12235 12236 /* Special case for empty strings */ 12237 if (length == 0) 12238 Py_RETURN_FALSE; 12239 12240 for (i = 0; i < length; i++) { 12241 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 12242 Py_RETURN_FALSE; 12243 } 12244 Py_RETURN_TRUE; 12245 } 12246 12247 /*[clinic input] 12248 str.isnumeric as unicode_isnumeric 12249 12250 Return True if the string is a numeric string, False otherwise. 12251 12252 A string is numeric if all characters in the string are numeric and there is at 12253 least one character in the string. 
12254 [clinic start generated code]*/ 12255 12256 static PyObject * unicode_isnumeric_impl(PyObject * self)12257 unicode_isnumeric_impl(PyObject *self) 12258 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/ 12259 { 12260 Py_ssize_t i, length; 12261 int kind; 12262 const void *data; 12263 12264 if (PyUnicode_READY(self) == -1) 12265 return NULL; 12266 length = PyUnicode_GET_LENGTH(self); 12267 kind = PyUnicode_KIND(self); 12268 data = PyUnicode_DATA(self); 12269 12270 /* Shortcut for single character strings */ 12271 if (length == 1) 12272 return PyBool_FromLong( 12273 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 12274 12275 /* Special case for empty strings */ 12276 if (length == 0) 12277 Py_RETURN_FALSE; 12278 12279 for (i = 0; i < length; i++) { 12280 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 12281 Py_RETURN_FALSE; 12282 } 12283 Py_RETURN_TRUE; 12284 } 12285 12286 Py_ssize_t _PyUnicode_ScanIdentifier(PyObject * self)12287 _PyUnicode_ScanIdentifier(PyObject *self) 12288 { 12289 Py_ssize_t i; 12290 if (PyUnicode_READY(self) == -1) 12291 return -1; 12292 12293 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 12294 if (len == 0) { 12295 /* an empty string is not a valid identifier */ 12296 return 0; 12297 } 12298 12299 int kind = PyUnicode_KIND(self); 12300 const void *data = PyUnicode_DATA(self); 12301 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 12302 /* PEP 3131 says that the first character must be in 12303 XID_Start and subsequent characters in XID_Continue, 12304 and for the ASCII range, the 2.x rules apply (i.e 12305 start with letters and underscore, continue with 12306 letters, digits, underscore). However, given the current 12307 definition of XID_Start and XID_Continue, it is sufficient 12308 to check just for these, except that _ must be allowed 12309 as starting an identifier. 
*/ 12310 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { 12311 return 0; 12312 } 12313 12314 for (i = 1; i < len; i++) { 12315 ch = PyUnicode_READ(kind, data, i); 12316 if (!_PyUnicode_IsXidContinue(ch)) { 12317 return i; 12318 } 12319 } 12320 return i; 12321 } 12322 12323 int PyUnicode_IsIdentifier(PyObject * self)12324 PyUnicode_IsIdentifier(PyObject *self) 12325 { 12326 if (PyUnicode_IS_READY(self)) { 12327 Py_ssize_t i = _PyUnicode_ScanIdentifier(self); 12328 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 12329 /* an empty string is not a valid identifier */ 12330 return len && i == len; 12331 } 12332 else { 12333 _Py_COMP_DIAG_PUSH 12334 _Py_COMP_DIAG_IGNORE_DEPR_DECLS 12335 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self); 12336 if (len == 0) { 12337 /* an empty string is not a valid identifier */ 12338 return 0; 12339 } 12340 12341 const wchar_t *wstr = _PyUnicode_WSTR(self); 12342 Py_UCS4 ch = wstr[i++]; 12343 #if SIZEOF_WCHAR_T == 2 12344 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) 12345 && i < len 12346 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i])) 12347 { 12348 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]); 12349 i++; 12350 } 12351 #endif 12352 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { 12353 return 0; 12354 } 12355 12356 while (i < len) { 12357 ch = wstr[i++]; 12358 #if SIZEOF_WCHAR_T == 2 12359 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) 12360 && i < len 12361 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i])) 12362 { 12363 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]); 12364 i++; 12365 } 12366 #endif 12367 if (!_PyUnicode_IsXidContinue(ch)) { 12368 return 0; 12369 } 12370 } 12371 return 1; 12372 _Py_COMP_DIAG_POP 12373 } 12374 } 12375 12376 /*[clinic input] 12377 str.isidentifier as unicode_isidentifier 12378 12379 Return True if the string is a valid Python identifier, False otherwise. 12380 12381 Call keyword.iskeyword(s) to test whether string s is a reserved identifier, 12382 such as "def" or "class". 
12383 [clinic start generated code]*/ 12384 12385 static PyObject * unicode_isidentifier_impl(PyObject * self)12386 unicode_isidentifier_impl(PyObject *self) 12387 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/ 12388 { 12389 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 12390 } 12391 12392 /*[clinic input] 12393 str.isprintable as unicode_isprintable 12394 12395 Return True if the string is printable, False otherwise. 12396 12397 A string is printable if all of its characters are considered printable in 12398 repr() or if it is empty. 12399 [clinic start generated code]*/ 12400 12401 static PyObject * unicode_isprintable_impl(PyObject * self)12402 unicode_isprintable_impl(PyObject *self) 12403 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/ 12404 { 12405 Py_ssize_t i, length; 12406 int kind; 12407 const void *data; 12408 12409 if (PyUnicode_READY(self) == -1) 12410 return NULL; 12411 length = PyUnicode_GET_LENGTH(self); 12412 kind = PyUnicode_KIND(self); 12413 data = PyUnicode_DATA(self); 12414 12415 /* Shortcut for single character strings */ 12416 if (length == 1) 12417 return PyBool_FromLong( 12418 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 12419 12420 for (i = 0; i < length; i++) { 12421 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 12422 Py_RETURN_FALSE; 12423 } 12424 } 12425 Py_RETURN_TRUE; 12426 } 12427 12428 /*[clinic input] 12429 str.join as unicode_join 12430 12431 iterable: object 12432 / 12433 12434 Concatenate any number of strings. 12435 12436 The string whose method is called is inserted in between each given string. 12437 The result is returned as a new string. 
12438 12439 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs' 12440 [clinic start generated code]*/ 12441 12442 static PyObject * unicode_join(PyObject * self,PyObject * iterable)12443 unicode_join(PyObject *self, PyObject *iterable) 12444 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/ 12445 { 12446 return PyUnicode_Join(self, iterable); 12447 } 12448 12449 static Py_ssize_t unicode_length(PyObject * self)12450 unicode_length(PyObject *self) 12451 { 12452 if (PyUnicode_READY(self) == -1) 12453 return -1; 12454 return PyUnicode_GET_LENGTH(self); 12455 } 12456 12457 /*[clinic input] 12458 str.ljust as unicode_ljust 12459 12460 width: Py_ssize_t 12461 fillchar: Py_UCS4 = ' ' 12462 / 12463 12464 Return a left-justified string of length width. 12465 12466 Padding is done using the specified fill character (default is a space). 12467 [clinic start generated code]*/ 12468 12469 static PyObject * unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12470 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) 12471 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/ 12472 { 12473 if (PyUnicode_READY(self) == -1) 12474 return NULL; 12475 12476 if (PyUnicode_GET_LENGTH(self) >= width) 12477 return unicode_result_unchanged(self); 12478 12479 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 12480 } 12481 12482 /*[clinic input] 12483 str.lower as unicode_lower 12484 12485 Return a copy of the string converted to lowercase. 
12486 [clinic start generated code]*/ 12487 12488 static PyObject * unicode_lower_impl(PyObject * self)12489 unicode_lower_impl(PyObject *self) 12490 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/ 12491 { 12492 if (PyUnicode_READY(self) == -1) 12493 return NULL; 12494 if (PyUnicode_IS_ASCII(self)) 12495 return ascii_upper_or_lower(self, 1); 12496 return case_operation(self, do_lower); 12497 } 12498 12499 #define LEFTSTRIP 0 12500 #define RIGHTSTRIP 1 12501 #define BOTHSTRIP 2 12502 12503 /* Arrays indexed by above */ 12504 static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"}; 12505 12506 #define STRIPNAME(i) (stripfuncnames[i]) 12507 12508 /* externally visible for str.strip(unicode) */ 12509 PyObject * _PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12510 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 12511 { 12512 const void *data; 12513 int kind; 12514 Py_ssize_t i, j, len; 12515 BLOOM_MASK sepmask; 12516 Py_ssize_t seplen; 12517 12518 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 12519 return NULL; 12520 12521 kind = PyUnicode_KIND(self); 12522 data = PyUnicode_DATA(self); 12523 len = PyUnicode_GET_LENGTH(self); 12524 seplen = PyUnicode_GET_LENGTH(sepobj); 12525 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 12526 PyUnicode_DATA(sepobj), 12527 seplen); 12528 12529 i = 0; 12530 if (striptype != RIGHTSTRIP) { 12531 while (i < len) { 12532 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12533 if (!BLOOM(sepmask, ch)) 12534 break; 12535 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12536 break; 12537 i++; 12538 } 12539 } 12540 12541 j = len; 12542 if (striptype != LEFTSTRIP) { 12543 j--; 12544 while (j >= i) { 12545 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12546 if (!BLOOM(sepmask, ch)) 12547 break; 12548 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12549 break; 12550 j--; 12551 } 12552 12553 j++; 12554 } 12555 12556 return 
PyUnicode_Substring(self, i, j); 12557 } 12558 12559 PyObject* PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12560 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 12561 { 12562 const unsigned char *data; 12563 int kind; 12564 Py_ssize_t length; 12565 12566 if (PyUnicode_READY(self) == -1) 12567 return NULL; 12568 12569 length = PyUnicode_GET_LENGTH(self); 12570 end = Py_MIN(end, length); 12571 12572 if (start == 0 && end == length) 12573 return unicode_result_unchanged(self); 12574 12575 if (start < 0 || end < 0) { 12576 PyErr_SetString(PyExc_IndexError, "string index out of range"); 12577 return NULL; 12578 } 12579 if (start >= length || end < start) 12580 _Py_RETURN_UNICODE_EMPTY(); 12581 12582 length = end - start; 12583 if (PyUnicode_IS_ASCII(self)) { 12584 data = PyUnicode_1BYTE_DATA(self); 12585 return _PyUnicode_FromASCII((const char*)(data + start), length); 12586 } 12587 else { 12588 kind = PyUnicode_KIND(self); 12589 data = PyUnicode_1BYTE_DATA(self); 12590 return PyUnicode_FromKindAndData(kind, 12591 data + kind * start, 12592 length); 12593 } 12594 } 12595 12596 static PyObject * do_strip(PyObject * self,int striptype)12597 do_strip(PyObject *self, int striptype) 12598 { 12599 Py_ssize_t len, i, j; 12600 12601 if (PyUnicode_READY(self) == -1) 12602 return NULL; 12603 12604 len = PyUnicode_GET_LENGTH(self); 12605 12606 if (PyUnicode_IS_ASCII(self)) { 12607 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 12608 12609 i = 0; 12610 if (striptype != RIGHTSTRIP) { 12611 while (i < len) { 12612 Py_UCS1 ch = data[i]; 12613 if (!_Py_ascii_whitespace[ch]) 12614 break; 12615 i++; 12616 } 12617 } 12618 12619 j = len; 12620 if (striptype != LEFTSTRIP) { 12621 j--; 12622 while (j >= i) { 12623 Py_UCS1 ch = data[j]; 12624 if (!_Py_ascii_whitespace[ch]) 12625 break; 12626 j--; 12627 } 12628 j++; 12629 } 12630 } 12631 else { 12632 int kind = PyUnicode_KIND(self); 12633 const void *data = PyUnicode_DATA(self); 12634 12635 i = 
0; 12636 if (striptype != RIGHTSTRIP) { 12637 while (i < len) { 12638 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12639 if (!Py_UNICODE_ISSPACE(ch)) 12640 break; 12641 i++; 12642 } 12643 } 12644 12645 j = len; 12646 if (striptype != LEFTSTRIP) { 12647 j--; 12648 while (j >= i) { 12649 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12650 if (!Py_UNICODE_ISSPACE(ch)) 12651 break; 12652 j--; 12653 } 12654 j++; 12655 } 12656 } 12657 12658 return PyUnicode_Substring(self, i, j); 12659 } 12660 12661 12662 static PyObject * do_argstrip(PyObject * self,int striptype,PyObject * sep)12663 do_argstrip(PyObject *self, int striptype, PyObject *sep) 12664 { 12665 if (sep != Py_None) { 12666 if (PyUnicode_Check(sep)) 12667 return _PyUnicode_XStrip(self, striptype, sep); 12668 else { 12669 PyErr_Format(PyExc_TypeError, 12670 "%s arg must be None or str", 12671 STRIPNAME(striptype)); 12672 return NULL; 12673 } 12674 } 12675 12676 return do_strip(self, striptype); 12677 } 12678 12679 12680 /*[clinic input] 12681 str.strip as unicode_strip 12682 12683 chars: object = None 12684 / 12685 12686 Return a copy of the string with leading and trailing whitespace removed. 12687 12688 If chars is given and not None, remove characters in chars instead. 12689 [clinic start generated code]*/ 12690 12691 static PyObject * unicode_strip_impl(PyObject * self,PyObject * chars)12692 unicode_strip_impl(PyObject *self, PyObject *chars) 12693 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/ 12694 { 12695 return do_argstrip(self, BOTHSTRIP, chars); 12696 } 12697 12698 12699 /*[clinic input] 12700 str.lstrip as unicode_lstrip 12701 12702 chars: object = None 12703 / 12704 12705 Return a copy of the string with leading whitespace removed. 12706 12707 If chars is given and not None, remove characters in chars instead. 
12708 [clinic start generated code]*/ 12709 12710 static PyObject * unicode_lstrip_impl(PyObject * self,PyObject * chars)12711 unicode_lstrip_impl(PyObject *self, PyObject *chars) 12712 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/ 12713 { 12714 return do_argstrip(self, LEFTSTRIP, chars); 12715 } 12716 12717 12718 /*[clinic input] 12719 str.rstrip as unicode_rstrip 12720 12721 chars: object = None 12722 / 12723 12724 Return a copy of the string with trailing whitespace removed. 12725 12726 If chars is given and not None, remove characters in chars instead. 12727 [clinic start generated code]*/ 12728 12729 static PyObject * unicode_rstrip_impl(PyObject * self,PyObject * chars)12730 unicode_rstrip_impl(PyObject *self, PyObject *chars) 12731 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/ 12732 { 12733 return do_argstrip(self, RIGHTSTRIP, chars); 12734 } 12735 12736 12737 static PyObject* unicode_repeat(PyObject * str,Py_ssize_t len)12738 unicode_repeat(PyObject *str, Py_ssize_t len) 12739 { 12740 PyObject *u; 12741 Py_ssize_t nchars, n; 12742 12743 if (len < 1) 12744 _Py_RETURN_UNICODE_EMPTY(); 12745 12746 /* no repeat, return original string */ 12747 if (len == 1) 12748 return unicode_result_unchanged(str); 12749 12750 if (PyUnicode_READY(str) == -1) 12751 return NULL; 12752 12753 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 12754 PyErr_SetString(PyExc_OverflowError, 12755 "repeated string is too long"); 12756 return NULL; 12757 } 12758 nchars = len * PyUnicode_GET_LENGTH(str); 12759 12760 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 12761 if (!u) 12762 return NULL; 12763 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 12764 12765 if (PyUnicode_GET_LENGTH(str) == 1) { 12766 int kind = PyUnicode_KIND(str); 12767 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 12768 if (kind == PyUnicode_1BYTE_KIND) { 12769 void *to = PyUnicode_DATA(u); 12770 memset(to, 
(unsigned char)fill_char, len); 12771 } 12772 else if (kind == PyUnicode_2BYTE_KIND) { 12773 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 12774 for (n = 0; n < len; ++n) 12775 ucs2[n] = fill_char; 12776 } else { 12777 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 12778 assert(kind == PyUnicode_4BYTE_KIND); 12779 for (n = 0; n < len; ++n) 12780 ucs4[n] = fill_char; 12781 } 12782 } 12783 else { 12784 Py_ssize_t char_size = PyUnicode_KIND(str); 12785 char *to = (char *) PyUnicode_DATA(u); 12786 _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str), 12787 PyUnicode_GET_LENGTH(str) * char_size); 12788 } 12789 12790 assert(_PyUnicode_CheckConsistency(u, 1)); 12791 return u; 12792 } 12793 12794 PyObject * PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12795 PyUnicode_Replace(PyObject *str, 12796 PyObject *substr, 12797 PyObject *replstr, 12798 Py_ssize_t maxcount) 12799 { 12800 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 || 12801 ensure_unicode(replstr) < 0) 12802 return NULL; 12803 return replace(str, substr, replstr, maxcount); 12804 } 12805 12806 /*[clinic input] 12807 str.replace as unicode_replace 12808 12809 old: unicode 12810 new: unicode 12811 count: Py_ssize_t = -1 12812 Maximum number of occurrences to replace. 12813 -1 (the default value) means replace all occurrences. 12814 / 12815 12816 Return a copy with all occurrences of substring old replaced by new. 12817 12818 If the optional argument count is given, only the first count occurrences are 12819 replaced. 
12820 [clinic start generated code]*/ 12821 12822 static PyObject * unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12823 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new, 12824 Py_ssize_t count) 12825 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/ 12826 { 12827 if (PyUnicode_READY(self) == -1) 12828 return NULL; 12829 return replace(self, old, new, count); 12830 } 12831 12832 /*[clinic input] 12833 str.removeprefix as unicode_removeprefix 12834 12835 prefix: unicode 12836 / 12837 12838 Return a str with the given prefix string removed if present. 12839 12840 If the string starts with the prefix string, return string[len(prefix):]. 12841 Otherwise, return a copy of the original string. 12842 [clinic start generated code]*/ 12843 12844 static PyObject * unicode_removeprefix_impl(PyObject * self,PyObject * prefix)12845 unicode_removeprefix_impl(PyObject *self, PyObject *prefix) 12846 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/ 12847 { 12848 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1); 12849 if (match == -1) { 12850 return NULL; 12851 } 12852 if (match) { 12853 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix), 12854 PyUnicode_GET_LENGTH(self)); 12855 } 12856 return unicode_result_unchanged(self); 12857 } 12858 12859 /*[clinic input] 12860 str.removesuffix as unicode_removesuffix 12861 12862 suffix: unicode 12863 / 12864 12865 Return a str with the given suffix string removed if present. 12866 12867 If the string ends with the suffix string and that suffix is not empty, 12868 return string[:-len(suffix)]. Otherwise, return a copy of the original 12869 string. 
12870 [clinic start generated code]*/ 12871 12872 static PyObject * unicode_removesuffix_impl(PyObject * self,PyObject * suffix)12873 unicode_removesuffix_impl(PyObject *self, PyObject *suffix) 12874 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/ 12875 { 12876 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1); 12877 if (match == -1) { 12878 return NULL; 12879 } 12880 if (match) { 12881 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self) 12882 - PyUnicode_GET_LENGTH(suffix)); 12883 } 12884 return unicode_result_unchanged(self); 12885 } 12886 12887 static PyObject * unicode_repr(PyObject * unicode)12888 unicode_repr(PyObject *unicode) 12889 { 12890 PyObject *repr; 12891 Py_ssize_t isize; 12892 Py_ssize_t osize, squote, dquote, i, o; 12893 Py_UCS4 max, quote; 12894 int ikind, okind, unchanged; 12895 const void *idata; 12896 void *odata; 12897 12898 if (PyUnicode_READY(unicode) == -1) 12899 return NULL; 12900 12901 isize = PyUnicode_GET_LENGTH(unicode); 12902 idata = PyUnicode_DATA(unicode); 12903 12904 /* Compute length of output, quote characters, and 12905 maximum character */ 12906 osize = 0; 12907 max = 127; 12908 squote = dquote = 0; 12909 ikind = PyUnicode_KIND(unicode); 12910 for (i = 0; i < isize; i++) { 12911 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12912 Py_ssize_t incr = 1; 12913 switch (ch) { 12914 case '\'': squote++; break; 12915 case '"': dquote++; break; 12916 case '\\': case '\t': case '\r': case '\n': 12917 incr = 2; 12918 break; 12919 default: 12920 /* Fast-path ASCII */ 12921 if (ch < ' ' || ch == 0x7f) 12922 incr = 4; /* \xHH */ 12923 else if (ch < 0x7f) 12924 ; 12925 else if (Py_UNICODE_ISPRINTABLE(ch)) 12926 max = ch > max ? 
ch : max; 12927 else if (ch < 0x100) 12928 incr = 4; /* \xHH */ 12929 else if (ch < 0x10000) 12930 incr = 6; /* \uHHHH */ 12931 else 12932 incr = 10; /* \uHHHHHHHH */ 12933 } 12934 if (osize > PY_SSIZE_T_MAX - incr) { 12935 PyErr_SetString(PyExc_OverflowError, 12936 "string is too long to generate repr"); 12937 return NULL; 12938 } 12939 osize += incr; 12940 } 12941 12942 quote = '\''; 12943 unchanged = (osize == isize); 12944 if (squote) { 12945 unchanged = 0; 12946 if (dquote) 12947 /* Both squote and dquote present. Use squote, 12948 and escape them */ 12949 osize += squote; 12950 else 12951 quote = '"'; 12952 } 12953 osize += 2; /* quotes */ 12954 12955 repr = PyUnicode_New(osize, max); 12956 if (repr == NULL) 12957 return NULL; 12958 okind = PyUnicode_KIND(repr); 12959 odata = PyUnicode_DATA(repr); 12960 12961 PyUnicode_WRITE(okind, odata, 0, quote); 12962 PyUnicode_WRITE(okind, odata, osize-1, quote); 12963 if (unchanged) { 12964 _PyUnicode_FastCopyCharacters(repr, 1, 12965 unicode, 0, 12966 isize); 12967 } 12968 else { 12969 for (i = 0, o = 1; i < isize; i++) { 12970 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12971 12972 /* Escape quotes and backslashes */ 12973 if ((ch == quote) || (ch == '\\')) { 12974 PyUnicode_WRITE(okind, odata, o++, '\\'); 12975 PyUnicode_WRITE(okind, odata, o++, ch); 12976 continue; 12977 } 12978 12979 /* Map special whitespace to '\t', \n', '\r' */ 12980 if (ch == '\t') { 12981 PyUnicode_WRITE(okind, odata, o++, '\\'); 12982 PyUnicode_WRITE(okind, odata, o++, 't'); 12983 } 12984 else if (ch == '\n') { 12985 PyUnicode_WRITE(okind, odata, o++, '\\'); 12986 PyUnicode_WRITE(okind, odata, o++, 'n'); 12987 } 12988 else if (ch == '\r') { 12989 PyUnicode_WRITE(okind, odata, o++, '\\'); 12990 PyUnicode_WRITE(okind, odata, o++, 'r'); 12991 } 12992 12993 /* Map non-printable US ASCII to '\xhh' */ 12994 else if (ch < ' ' || ch == 0x7F) { 12995 PyUnicode_WRITE(okind, odata, o++, '\\'); 12996 PyUnicode_WRITE(okind, odata, o++, 'x'); 12997 
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12998 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12999 } 13000 13001 /* Copy ASCII characters as-is */ 13002 else if (ch < 0x7F) { 13003 PyUnicode_WRITE(okind, odata, o++, ch); 13004 } 13005 13006 /* Non-ASCII characters */ 13007 else { 13008 /* Map Unicode whitespace and control characters 13009 (categories Z* and C* except ASCII space) 13010 */ 13011 if (!Py_UNICODE_ISPRINTABLE(ch)) { 13012 PyUnicode_WRITE(okind, odata, o++, '\\'); 13013 /* Map 8-bit characters to '\xhh' */ 13014 if (ch <= 0xff) { 13015 PyUnicode_WRITE(okind, odata, o++, 'x'); 13016 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 13017 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 13018 } 13019 /* Map 16-bit characters to '\uxxxx' */ 13020 else if (ch <= 0xffff) { 13021 PyUnicode_WRITE(okind, odata, o++, 'u'); 13022 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 13023 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 13024 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 13025 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 13026 } 13027 /* Map 21-bit characters to '\U00xxxxxx' */ 13028 else { 13029 PyUnicode_WRITE(okind, odata, o++, 'U'); 13030 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 13031 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 13032 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 13033 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 13034 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 13035 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 13036 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 13037 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 13038 } 13039 } 13040 /* Copy characters as-is */ 13041 else { 13042 PyUnicode_WRITE(okind, 
odata, o++, ch); 13043 } 13044 } 13045 } 13046 } 13047 /* Closing quote already added at the beginning */ 13048 assert(_PyUnicode_CheckConsistency(repr, 1)); 13049 return repr; 13050 } 13051 13052 PyDoc_STRVAR(rfind__doc__, 13053 "S.rfind(sub[, start[, end]]) -> int\n\ 13054 \n\ 13055 Return the highest index in S where substring sub is found,\n\ 13056 such that sub is contained within S[start:end]. Optional\n\ 13057 arguments start and end are interpreted as in slice notation.\n\ 13058 \n\ 13059 Return -1 on failure."); 13060 13061 static PyObject * unicode_rfind(PyObject * self,PyObject * args)13062 unicode_rfind(PyObject *self, PyObject *args) 13063 { 13064 /* initialize variables to prevent gcc warning */ 13065 PyObject *substring = NULL; 13066 Py_ssize_t start = 0; 13067 Py_ssize_t end = 0; 13068 Py_ssize_t result; 13069 13070 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end)) 13071 return NULL; 13072 13073 if (PyUnicode_READY(self) == -1) 13074 return NULL; 13075 13076 result = any_find_slice(self, substring, start, end, -1); 13077 13078 if (result == -2) 13079 return NULL; 13080 13081 return PyLong_FromSsize_t(result); 13082 } 13083 13084 PyDoc_STRVAR(rindex__doc__, 13085 "S.rindex(sub[, start[, end]]) -> int\n\ 13086 \n\ 13087 Return the highest index in S where substring sub is found,\n\ 13088 such that sub is contained within S[start:end]. 
Optional\n\ 13089 arguments start and end are interpreted as in slice notation.\n\ 13090 \n\ 13091 Raises ValueError when the substring is not found."); 13092 13093 static PyObject * unicode_rindex(PyObject * self,PyObject * args)13094 unicode_rindex(PyObject *self, PyObject *args) 13095 { 13096 /* initialize variables to prevent gcc warning */ 13097 PyObject *substring = NULL; 13098 Py_ssize_t start = 0; 13099 Py_ssize_t end = 0; 13100 Py_ssize_t result; 13101 13102 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end)) 13103 return NULL; 13104 13105 if (PyUnicode_READY(self) == -1) 13106 return NULL; 13107 13108 result = any_find_slice(self, substring, start, end, -1); 13109 13110 if (result == -2) 13111 return NULL; 13112 13113 if (result < 0) { 13114 PyErr_SetString(PyExc_ValueError, "substring not found"); 13115 return NULL; 13116 } 13117 13118 return PyLong_FromSsize_t(result); 13119 } 13120 13121 /*[clinic input] 13122 str.rjust as unicode_rjust 13123 13124 width: Py_ssize_t 13125 fillchar: Py_UCS4 = ' ' 13126 / 13127 13128 Return a right-justified string of length width. 13129 13130 Padding is done using the specified fill character (default is a space). 
13131 [clinic start generated code]*/ 13132 13133 static PyObject * unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13134 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) 13135 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/ 13136 { 13137 if (PyUnicode_READY(self) == -1) 13138 return NULL; 13139 13140 if (PyUnicode_GET_LENGTH(self) >= width) 13141 return unicode_result_unchanged(self); 13142 13143 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 13144 } 13145 13146 PyObject * PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13147 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 13148 { 13149 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) 13150 return NULL; 13151 13152 return split(s, sep, maxsplit); 13153 } 13154 13155 /*[clinic input] 13156 str.split as unicode_split 13157 13158 sep: object = None 13159 The separator used to split the string. 13160 13161 When set to None (the default value), will split on any whitespace 13162 character (including \\n \\r \\t \\f and spaces) and will discard 13163 empty strings from the result. 13164 maxsplit: Py_ssize_t = -1 13165 Maximum number of splits (starting from the left). 13166 -1 (the default value) means no limit. 13167 13168 Return a list of the substrings in the string, using sep as the separator string. 13169 13170 Note, str.split() is mainly useful for data that has been intentionally 13171 delimited. With natural text that includes punctuation, consider using 13172 the regular expression module. 
13173 13174 [clinic start generated code]*/ 13175 13176 static PyObject * unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13177 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) 13178 /*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/ 13179 { 13180 if (sep == Py_None) 13181 return split(self, NULL, maxsplit); 13182 if (PyUnicode_Check(sep)) 13183 return split(self, sep, maxsplit); 13184 13185 PyErr_Format(PyExc_TypeError, 13186 "must be str or None, not %.100s", 13187 Py_TYPE(sep)->tp_name); 13188 return NULL; 13189 } 13190 13191 PyObject * PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13192 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj) 13193 { 13194 PyObject* out; 13195 int kind1, kind2; 13196 const void *buf1, *buf2; 13197 Py_ssize_t len1, len2; 13198 13199 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) 13200 return NULL; 13201 13202 kind1 = PyUnicode_KIND(str_obj); 13203 kind2 = PyUnicode_KIND(sep_obj); 13204 len1 = PyUnicode_GET_LENGTH(str_obj); 13205 len2 = PyUnicode_GET_LENGTH(sep_obj); 13206 if (kind1 < kind2 || len1 < len2) { 13207 PyObject *empty = unicode_get_empty(); // Borrowed reference 13208 return PyTuple_Pack(3, str_obj, empty, empty); 13209 } 13210 buf1 = PyUnicode_DATA(str_obj); 13211 buf2 = PyUnicode_DATA(sep_obj); 13212 if (kind2 != kind1) { 13213 buf2 = unicode_askind(kind2, buf2, len2, kind1); 13214 if (!buf2) 13215 return NULL; 13216 } 13217 13218 switch (kind1) { 13219 case PyUnicode_1BYTE_KIND: 13220 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 13221 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 13222 else 13223 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 13224 break; 13225 case PyUnicode_2BYTE_KIND: 13226 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 13227 break; 13228 case PyUnicode_4BYTE_KIND: 13229 out = ucs4lib_partition(str_obj, buf1, len1, 
sep_obj, buf2, len2); 13230 break; 13231 default: 13232 Py_UNREACHABLE(); 13233 } 13234 13235 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj))); 13236 if (kind2 != kind1) 13237 PyMem_Free((void *)buf2); 13238 13239 return out; 13240 } 13241 13242 13243 PyObject * PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13244 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj) 13245 { 13246 PyObject* out; 13247 int kind1, kind2; 13248 const void *buf1, *buf2; 13249 Py_ssize_t len1, len2; 13250 13251 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) 13252 return NULL; 13253 13254 kind1 = PyUnicode_KIND(str_obj); 13255 kind2 = PyUnicode_KIND(sep_obj); 13256 len1 = PyUnicode_GET_LENGTH(str_obj); 13257 len2 = PyUnicode_GET_LENGTH(sep_obj); 13258 if (kind1 < kind2 || len1 < len2) { 13259 PyObject *empty = unicode_get_empty(); // Borrowed reference 13260 return PyTuple_Pack(3, empty, empty, str_obj); 13261 } 13262 buf1 = PyUnicode_DATA(str_obj); 13263 buf2 = PyUnicode_DATA(sep_obj); 13264 if (kind2 != kind1) { 13265 buf2 = unicode_askind(kind2, buf2, len2, kind1); 13266 if (!buf2) 13267 return NULL; 13268 } 13269 13270 switch (kind1) { 13271 case PyUnicode_1BYTE_KIND: 13272 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 13273 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 13274 else 13275 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 13276 break; 13277 case PyUnicode_2BYTE_KIND: 13278 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 13279 break; 13280 case PyUnicode_4BYTE_KIND: 13281 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 13282 break; 13283 default: 13284 Py_UNREACHABLE(); 13285 } 13286 13287 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj))); 13288 if (kind2 != kind1) 13289 PyMem_Free((void *)buf2); 13290 13291 return out; 13292 } 13293 13294 /*[clinic input] 13295 str.partition as unicode_partition 13296 13297 sep: 
object 13298 / 13299 13300 Partition the string into three parts using the given separator. 13301 13302 This will search for the separator in the string. If the separator is found, 13303 returns a 3-tuple containing the part before the separator, the separator 13304 itself, and the part after it. 13305 13306 If the separator is not found, returns a 3-tuple containing the original string 13307 and two empty strings. 13308 [clinic start generated code]*/ 13309 13310 static PyObject * unicode_partition(PyObject * self,PyObject * sep)13311 unicode_partition(PyObject *self, PyObject *sep) 13312 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/ 13313 { 13314 return PyUnicode_Partition(self, sep); 13315 } 13316 13317 /*[clinic input] 13318 str.rpartition as unicode_rpartition = str.partition 13319 13320 Partition the string into three parts using the given separator. 13321 13322 This will search for the separator in the string, starting at the end. If 13323 the separator is found, returns a 3-tuple containing the part before the 13324 separator, the separator itself, and the part after it. 13325 13326 If the separator is not found, returns a 3-tuple containing two empty strings 13327 and the original string. 
13328 [clinic start generated code]*/ 13329 13330 static PyObject * unicode_rpartition(PyObject * self,PyObject * sep)13331 unicode_rpartition(PyObject *self, PyObject *sep) 13332 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/ 13333 { 13334 return PyUnicode_RPartition(self, sep); 13335 } 13336 13337 PyObject * PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13338 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 13339 { 13340 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) 13341 return NULL; 13342 13343 return rsplit(s, sep, maxsplit); 13344 } 13345 13346 /*[clinic input] 13347 str.rsplit as unicode_rsplit = str.split 13348 13349 Return a list of the substrings in the string, using sep as the separator string. 13350 13351 Splitting starts at the end of the string and works to the front. 13352 [clinic start generated code]*/ 13353 13354 static PyObject * unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13355 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) 13356 /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/ 13357 { 13358 if (sep == Py_None) 13359 return rsplit(self, NULL, maxsplit); 13360 if (PyUnicode_Check(sep)) 13361 return rsplit(self, sep, maxsplit); 13362 13363 PyErr_Format(PyExc_TypeError, 13364 "must be str or None, not %.100s", 13365 Py_TYPE(sep)->tp_name); 13366 return NULL; 13367 } 13368 13369 /*[clinic input] 13370 str.splitlines as unicode_splitlines 13371 13372 keepends: bool(accept={int}) = False 13373 13374 Return a list of the lines in the string, breaking at line boundaries. 13375 13376 Line breaks are not included in the resulting list unless keepends is given and 13377 true. 
13378 [clinic start generated code]*/ 13379 13380 static PyObject * unicode_splitlines_impl(PyObject * self,int keepends)13381 unicode_splitlines_impl(PyObject *self, int keepends) 13382 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/ 13383 { 13384 return PyUnicode_Splitlines(self, keepends); 13385 } 13386 13387 static unicode_str(PyObject * self)13388 PyObject *unicode_str(PyObject *self) 13389 { 13390 return unicode_result_unchanged(self); 13391 } 13392 13393 /*[clinic input] 13394 str.swapcase as unicode_swapcase 13395 13396 Convert uppercase characters to lowercase and lowercase characters to uppercase. 13397 [clinic start generated code]*/ 13398 13399 static PyObject * unicode_swapcase_impl(PyObject * self)13400 unicode_swapcase_impl(PyObject *self) 13401 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/ 13402 { 13403 if (PyUnicode_READY(self) == -1) 13404 return NULL; 13405 return case_operation(self, do_swapcase); 13406 } 13407 13408 /*[clinic input] 13409 13410 @staticmethod 13411 str.maketrans as unicode_maketrans 13412 13413 x: object 13414 13415 y: unicode=NULL 13416 13417 z: unicode=NULL 13418 13419 / 13420 13421 Return a translation table usable for str.translate(). 13422 13423 If there is only one argument, it must be a dictionary mapping Unicode 13424 ordinals (integers) or characters to Unicode ordinals, strings or None. 13425 Character keys will be then converted to ordinals. 13426 If there are two arguments, they must be strings of equal length, and 13427 in the resulting dictionary, each character in x will be mapped to the 13428 character at the same position in y. If there is a third argument, it 13429 must be a string, whose characters will be mapped to None in the result. 
13430 [clinic start generated code]*/ 13431 13432 static PyObject * unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13433 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z) 13434 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/ 13435 { 13436 PyObject *new = NULL, *key, *value; 13437 Py_ssize_t i = 0; 13438 int res; 13439 13440 new = PyDict_New(); 13441 if (!new) 13442 return NULL; 13443 if (y != NULL) { 13444 int x_kind, y_kind, z_kind; 13445 const void *x_data, *y_data, *z_data; 13446 13447 /* x must be a string too, of equal length */ 13448 if (!PyUnicode_Check(x)) { 13449 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 13450 "be a string if there is a second argument"); 13451 goto err; 13452 } 13453 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 13454 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 13455 "arguments must have equal length"); 13456 goto err; 13457 } 13458 /* create entries for translating chars in x to those in y */ 13459 x_kind = PyUnicode_KIND(x); 13460 y_kind = PyUnicode_KIND(y); 13461 x_data = PyUnicode_DATA(x); 13462 y_data = PyUnicode_DATA(y); 13463 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 13464 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 13465 if (!key) 13466 goto err; 13467 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 13468 if (!value) { 13469 Py_DECREF(key); 13470 goto err; 13471 } 13472 res = PyDict_SetItem(new, key, value); 13473 Py_DECREF(key); 13474 Py_DECREF(value); 13475 if (res < 0) 13476 goto err; 13477 } 13478 /* create entries for deleting chars in z */ 13479 if (z != NULL) { 13480 z_kind = PyUnicode_KIND(z); 13481 z_data = PyUnicode_DATA(z); 13482 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 13483 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 13484 if (!key) 13485 goto err; 13486 res = PyDict_SetItem(new, key, Py_None); 13487 Py_DECREF(key); 13488 if (res < 0) 13489 goto err; 13490 
} 13491 } 13492 } else { 13493 int kind; 13494 const void *data; 13495 13496 /* x must be a dict */ 13497 if (!PyDict_CheckExact(x)) { 13498 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 13499 "to maketrans it must be a dict"); 13500 goto err; 13501 } 13502 /* copy entries into the new dict, converting string keys to int keys */ 13503 while (PyDict_Next(x, &i, &key, &value)) { 13504 if (PyUnicode_Check(key)) { 13505 /* convert string keys to integer keys */ 13506 PyObject *newkey; 13507 if (PyUnicode_GET_LENGTH(key) != 1) { 13508 PyErr_SetString(PyExc_ValueError, "string keys in translate " 13509 "table must be of length 1"); 13510 goto err; 13511 } 13512 kind = PyUnicode_KIND(key); 13513 data = PyUnicode_DATA(key); 13514 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 13515 if (!newkey) 13516 goto err; 13517 res = PyDict_SetItem(new, newkey, value); 13518 Py_DECREF(newkey); 13519 if (res < 0) 13520 goto err; 13521 } else if (PyLong_Check(key)) { 13522 /* just keep integer keys */ 13523 if (PyDict_SetItem(new, key, value) < 0) 13524 goto err; 13525 } else { 13526 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 13527 "be strings or integers"); 13528 goto err; 13529 } 13530 } 13531 } 13532 return new; 13533 err: 13534 Py_DECREF(new); 13535 return NULL; 13536 } 13537 13538 /*[clinic input] 13539 str.translate as unicode_translate 13540 13541 table: object 13542 Translation table, which must be a mapping of Unicode ordinals to 13543 Unicode ordinals, strings, or None. 13544 / 13545 13546 Replace each character in the string using the given translation table. 13547 13548 The table must implement lookup/indexing via __getitem__, for instance a 13549 dictionary or list. If this operation raises LookupError, the character is 13550 left untouched. Characters mapped to None are deleted. 
13551 [clinic start generated code]*/ 13552 13553 static PyObject * unicode_translate(PyObject * self,PyObject * table)13554 unicode_translate(PyObject *self, PyObject *table) 13555 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/ 13556 { 13557 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 13558 } 13559 13560 /*[clinic input] 13561 str.upper as unicode_upper 13562 13563 Return a copy of the string converted to uppercase. 13564 [clinic start generated code]*/ 13565 13566 static PyObject * unicode_upper_impl(PyObject * self)13567 unicode_upper_impl(PyObject *self) 13568 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/ 13569 { 13570 if (PyUnicode_READY(self) == -1) 13571 return NULL; 13572 if (PyUnicode_IS_ASCII(self)) 13573 return ascii_upper_or_lower(self, 0); 13574 return case_operation(self, do_upper); 13575 } 13576 13577 /*[clinic input] 13578 str.zfill as unicode_zfill 13579 13580 width: Py_ssize_t 13581 / 13582 13583 Pad a numeric string with zeros on the left, to fill a field of the given width. 13584 13585 The string is never truncated. 
13586 [clinic start generated code]*/ 13587 13588 static PyObject * unicode_zfill_impl(PyObject * self,Py_ssize_t width)13589 unicode_zfill_impl(PyObject *self, Py_ssize_t width) 13590 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/ 13591 { 13592 Py_ssize_t fill; 13593 PyObject *u; 13594 int kind; 13595 const void *data; 13596 Py_UCS4 chr; 13597 13598 if (PyUnicode_READY(self) == -1) 13599 return NULL; 13600 13601 if (PyUnicode_GET_LENGTH(self) >= width) 13602 return unicode_result_unchanged(self); 13603 13604 fill = width - PyUnicode_GET_LENGTH(self); 13605 13606 u = pad(self, fill, 0, '0'); 13607 13608 if (u == NULL) 13609 return NULL; 13610 13611 kind = PyUnicode_KIND(u); 13612 data = PyUnicode_DATA(u); 13613 chr = PyUnicode_READ(kind, data, fill); 13614 13615 if (chr == '+' || chr == '-') { 13616 /* move sign to beginning of string */ 13617 PyUnicode_WRITE(kind, data, 0, chr); 13618 PyUnicode_WRITE(kind, data, fill, '0'); 13619 } 13620 13621 assert(_PyUnicode_CheckConsistency(u, 1)); 13622 return u; 13623 } 13624 13625 PyDoc_STRVAR(startswith__doc__, 13626 "S.startswith(prefix[, start[, end]]) -> bool\n\ 13627 \n\ 13628 Return True if S starts with the specified prefix, False otherwise.\n\ 13629 With optional start, test S beginning at that position.\n\ 13630 With optional end, stop comparing S at that position.\n\ 13631 prefix can also be a tuple of strings to try."); 13632 13633 static PyObject * unicode_startswith(PyObject * self,PyObject * args)13634 unicode_startswith(PyObject *self, 13635 PyObject *args) 13636 { 13637 PyObject *subobj; 13638 PyObject *substring; 13639 Py_ssize_t start = 0; 13640 Py_ssize_t end = PY_SSIZE_T_MAX; 13641 int result; 13642 13643 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 13644 return NULL; 13645 if (PyTuple_Check(subobj)) { 13646 Py_ssize_t i; 13647 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13648 substring = PyTuple_GET_ITEM(subobj, i); 13649 if 
(!PyUnicode_Check(substring)) { 13650 PyErr_Format(PyExc_TypeError, 13651 "tuple for startswith must only contain str, " 13652 "not %.100s", 13653 Py_TYPE(substring)->tp_name); 13654 return NULL; 13655 } 13656 result = tailmatch(self, substring, start, end, -1); 13657 if (result == -1) 13658 return NULL; 13659 if (result) { 13660 Py_RETURN_TRUE; 13661 } 13662 } 13663 /* nothing matched */ 13664 Py_RETURN_FALSE; 13665 } 13666 if (!PyUnicode_Check(subobj)) { 13667 PyErr_Format(PyExc_TypeError, 13668 "startswith first arg must be str or " 13669 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); 13670 return NULL; 13671 } 13672 result = tailmatch(self, subobj, start, end, -1); 13673 if (result == -1) 13674 return NULL; 13675 return PyBool_FromLong(result); 13676 } 13677 13678 13679 PyDoc_STRVAR(endswith__doc__, 13680 "S.endswith(suffix[, start[, end]]) -> bool\n\ 13681 \n\ 13682 Return True if S ends with the specified suffix, False otherwise.\n\ 13683 With optional start, test S beginning at that position.\n\ 13684 With optional end, stop comparing S at that position.\n\ 13685 suffix can also be a tuple of strings to try."); 13686 13687 static PyObject * unicode_endswith(PyObject * self,PyObject * args)13688 unicode_endswith(PyObject *self, 13689 PyObject *args) 13690 { 13691 PyObject *subobj; 13692 PyObject *substring; 13693 Py_ssize_t start = 0; 13694 Py_ssize_t end = PY_SSIZE_T_MAX; 13695 int result; 13696 13697 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 13698 return NULL; 13699 if (PyTuple_Check(subobj)) { 13700 Py_ssize_t i; 13701 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13702 substring = PyTuple_GET_ITEM(subobj, i); 13703 if (!PyUnicode_Check(substring)) { 13704 PyErr_Format(PyExc_TypeError, 13705 "tuple for endswith must only contain str, " 13706 "not %.100s", 13707 Py_TYPE(substring)->tp_name); 13708 return NULL; 13709 } 13710 result = tailmatch(self, substring, start, end, +1); 13711 if (result == -1) 13712 return 
NULL; 13713 if (result) { 13714 Py_RETURN_TRUE; 13715 } 13716 } 13717 Py_RETURN_FALSE; 13718 } 13719 if (!PyUnicode_Check(subobj)) { 13720 PyErr_Format(PyExc_TypeError, 13721 "endswith first arg must be str or " 13722 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); 13723 return NULL; 13724 } 13725 result = tailmatch(self, subobj, start, end, +1); 13726 if (result == -1) 13727 return NULL; 13728 return PyBool_FromLong(result); 13729 } 13730 13731 static inline void _PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13732 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 13733 { 13734 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 13735 writer->data = PyUnicode_DATA(writer->buffer); 13736 13737 if (!writer->readonly) { 13738 writer->kind = PyUnicode_KIND(writer->buffer); 13739 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 13740 } 13741 else { 13742 /* use a value smaller than PyUnicode_1BYTE_KIND() so 13743 _PyUnicodeWriter_PrepareKind() will copy the buffer. */ 13744 writer->kind = PyUnicode_WCHAR_KIND; 13745 assert(writer->kind <= PyUnicode_1BYTE_KIND); 13746 13747 /* Copy-on-write mode: set buffer size to 0 so 13748 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on 13749 * next write. */ 13750 writer->size = 0; 13751 } 13752 } 13753 13754 void _PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13755 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) 13756 { 13757 memset(writer, 0, sizeof(*writer)); 13758 13759 /* ASCII is the bare minimum */ 13760 writer->min_char = 127; 13761 13762 /* use a value smaller than PyUnicode_1BYTE_KIND() so 13763 _PyUnicodeWriter_PrepareKind() will copy the buffer. 
*/ 13764 writer->kind = PyUnicode_WCHAR_KIND; 13765 assert(writer->kind <= PyUnicode_1BYTE_KIND); 13766 } 13767 13768 // Initialize _PyUnicodeWriter with initial buffer 13769 static inline void _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)13770 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) 13771 { 13772 memset(writer, 0, sizeof(*writer)); 13773 writer->buffer = buffer; 13774 _PyUnicodeWriter_Update(writer); 13775 writer->min_length = writer->size; 13776 } 13777 13778 int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13779 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 13780 Py_ssize_t length, Py_UCS4 maxchar) 13781 { 13782 Py_ssize_t newlen; 13783 PyObject *newbuffer; 13784 13785 assert(maxchar <= MAX_UNICODE); 13786 13787 /* ensure that the _PyUnicodeWriter_Prepare macro was used */ 13788 assert((maxchar > writer->maxchar && length >= 0) 13789 || length > 0); 13790 13791 if (length > PY_SSIZE_T_MAX - writer->pos) { 13792 PyErr_NoMemory(); 13793 return -1; 13794 } 13795 newlen = writer->pos + length; 13796 13797 maxchar = Py_MAX(maxchar, writer->min_char); 13798 13799 if (writer->buffer == NULL) { 13800 assert(!writer->readonly); 13801 if (writer->overallocate 13802 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13803 /* overallocate to limit the number of realloc() */ 13804 newlen += newlen / OVERALLOCATE_FACTOR; 13805 } 13806 if (newlen < writer->min_length) 13807 newlen = writer->min_length; 13808 13809 writer->buffer = PyUnicode_New(newlen, maxchar); 13810 if (writer->buffer == NULL) 13811 return -1; 13812 } 13813 else if (newlen > writer->size) { 13814 if (writer->overallocate 13815 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13816 /* overallocate to limit the number of realloc() */ 13817 newlen += newlen / OVERALLOCATE_FACTOR; 13818 } 13819 if (newlen < writer->min_length) 13820 newlen = 
writer->min_length; 13821 13822 if (maxchar > writer->maxchar || writer->readonly) { 13823 /* resize + widen */ 13824 maxchar = Py_MAX(maxchar, writer->maxchar); 13825 newbuffer = PyUnicode_New(newlen, maxchar); 13826 if (newbuffer == NULL) 13827 return -1; 13828 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13829 writer->buffer, 0, writer->pos); 13830 Py_DECREF(writer->buffer); 13831 writer->readonly = 0; 13832 } 13833 else { 13834 newbuffer = resize_compact(writer->buffer, newlen); 13835 if (newbuffer == NULL) 13836 return -1; 13837 } 13838 writer->buffer = newbuffer; 13839 } 13840 else if (maxchar > writer->maxchar) { 13841 assert(!writer->readonly); 13842 newbuffer = PyUnicode_New(writer->size, maxchar); 13843 if (newbuffer == NULL) 13844 return -1; 13845 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13846 writer->buffer, 0, writer->pos); 13847 Py_SETREF(writer->buffer, newbuffer); 13848 } 13849 _PyUnicodeWriter_Update(writer); 13850 return 0; 13851 13852 #undef OVERALLOCATE_FACTOR 13853 } 13854 13855 int _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13856 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, 13857 enum PyUnicode_Kind kind) 13858 { 13859 Py_UCS4 maxchar; 13860 13861 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ 13862 assert(writer->kind < kind); 13863 13864 switch (kind) 13865 { 13866 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; 13867 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; 13868 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break; 13869 default: 13870 Py_UNREACHABLE(); 13871 } 13872 13873 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); 13874 } 13875 13876 static inline int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13877 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) 13878 { 13879 assert(ch <= MAX_UNICODE); 13880 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) 13881 return -1; 13882 
PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13883 writer->pos++; 13884 return 0; 13885 } 13886 13887 int _PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13888 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) 13889 { 13890 return _PyUnicodeWriter_WriteCharInline(writer, ch); 13891 } 13892 13893 int _PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13894 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 13895 { 13896 Py_UCS4 maxchar; 13897 Py_ssize_t len; 13898 13899 if (PyUnicode_READY(str) == -1) 13900 return -1; 13901 len = PyUnicode_GET_LENGTH(str); 13902 if (len == 0) 13903 return 0; 13904 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 13905 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 13906 if (writer->buffer == NULL && !writer->overallocate) { 13907 assert(_PyUnicode_CheckConsistency(str, 1)); 13908 writer->readonly = 1; 13909 Py_INCREF(str); 13910 writer->buffer = str; 13911 _PyUnicodeWriter_Update(writer); 13912 writer->pos += len; 13913 return 0; 13914 } 13915 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 13916 return -1; 13917 } 13918 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13919 str, 0, len); 13920 writer->pos += len; 13921 return 0; 13922 } 13923 13924 int _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13925 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, 13926 Py_ssize_t start, Py_ssize_t end) 13927 { 13928 Py_UCS4 maxchar; 13929 Py_ssize_t len; 13930 13931 if (PyUnicode_READY(str) == -1) 13932 return -1; 13933 13934 assert(0 <= start); 13935 assert(end <= PyUnicode_GET_LENGTH(str)); 13936 assert(start <= end); 13937 13938 if (end == 0) 13939 return 0; 13940 13941 if (start == 0 && end == PyUnicode_GET_LENGTH(str)) 13942 return _PyUnicodeWriter_WriteStr(writer, str); 13943 13944 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 
13945 maxchar = _PyUnicode_FindMaxChar(str, start, end); 13946 else 13947 maxchar = writer->maxchar; 13948 len = end - start; 13949 13950 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) 13951 return -1; 13952 13953 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13954 str, start, len); 13955 writer->pos += len; 13956 return 0; 13957 } 13958 13959 int _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13960 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 13961 const char *ascii, Py_ssize_t len) 13962 { 13963 if (len == -1) 13964 len = strlen(ascii); 13965 13966 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128); 13967 13968 if (writer->buffer == NULL && !writer->overallocate) { 13969 PyObject *str; 13970 13971 str = _PyUnicode_FromASCII(ascii, len); 13972 if (str == NULL) 13973 return -1; 13974 13975 writer->readonly = 1; 13976 writer->buffer = str; 13977 _PyUnicodeWriter_Update(writer); 13978 writer->pos += len; 13979 return 0; 13980 } 13981 13982 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 13983 return -1; 13984 13985 switch (writer->kind) 13986 { 13987 case PyUnicode_1BYTE_KIND: 13988 { 13989 const Py_UCS1 *str = (const Py_UCS1 *)ascii; 13990 Py_UCS1 *data = writer->data; 13991 13992 memcpy(data + writer->pos, str, len); 13993 break; 13994 } 13995 case PyUnicode_2BYTE_KIND: 13996 { 13997 _PyUnicode_CONVERT_BYTES( 13998 Py_UCS1, Py_UCS2, 13999 ascii, ascii + len, 14000 (Py_UCS2 *)writer->data + writer->pos); 14001 break; 14002 } 14003 case PyUnicode_4BYTE_KIND: 14004 { 14005 _PyUnicode_CONVERT_BYTES( 14006 Py_UCS1, Py_UCS4, 14007 ascii, ascii + len, 14008 (Py_UCS4 *)writer->data + writer->pos); 14009 break; 14010 } 14011 default: 14012 Py_UNREACHABLE(); 14013 } 14014 14015 writer->pos += len; 14016 return 0; 14017 } 14018 14019 int _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14020 
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 14021 const char *str, Py_ssize_t len) 14022 { 14023 Py_UCS4 maxchar; 14024 14025 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len); 14026 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 14027 return -1; 14028 unicode_write_cstr(writer->buffer, writer->pos, str, len); 14029 writer->pos += len; 14030 return 0; 14031 } 14032 14033 PyObject * _PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14034 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 14035 { 14036 PyObject *str; 14037 14038 if (writer->pos == 0) { 14039 Py_CLEAR(writer->buffer); 14040 _Py_RETURN_UNICODE_EMPTY(); 14041 } 14042 14043 str = writer->buffer; 14044 writer->buffer = NULL; 14045 14046 if (writer->readonly) { 14047 assert(PyUnicode_GET_LENGTH(str) == writer->pos); 14048 return str; 14049 } 14050 14051 if (PyUnicode_GET_LENGTH(str) != writer->pos) { 14052 PyObject *str2; 14053 str2 = resize_compact(str, writer->pos); 14054 if (str2 == NULL) { 14055 Py_DECREF(str); 14056 return NULL; 14057 } 14058 str = str2; 14059 } 14060 14061 assert(_PyUnicode_CheckConsistency(str, 1)); 14062 return unicode_result_ready(str); 14063 } 14064 14065 void _PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)14066 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 14067 { 14068 Py_CLEAR(writer->buffer); 14069 } 14070 14071 #include "stringlib/unicode_format.h" 14072 14073 PyDoc_STRVAR(format__doc__, 14074 "S.format(*args, **kwargs) -> str\n\ 14075 \n\ 14076 Return a formatted version of S, using substitutions from args and kwargs.\n\ 14077 The substitutions are identified by braces ('{' and '}')."); 14078 14079 PyDoc_STRVAR(format_map__doc__, 14080 "S.format_map(mapping) -> str\n\ 14081 \n\ 14082 Return a formatted version of S, using substitutions from mapping.\n\ 14083 The substitutions are identified by braces ('{' and '}')."); 14084 14085 /*[clinic input] 14086 str.__format__ as unicode___format__ 14087 14088 
format_spec: unicode 14089 / 14090 14091 Return a formatted version of the string as described by format_spec. 14092 [clinic start generated code]*/ 14093 14094 static PyObject * unicode___format___impl(PyObject * self,PyObject * format_spec)14095 unicode___format___impl(PyObject *self, PyObject *format_spec) 14096 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/ 14097 { 14098 _PyUnicodeWriter writer; 14099 int ret; 14100 14101 if (PyUnicode_READY(self) == -1) 14102 return NULL; 14103 _PyUnicodeWriter_Init(&writer); 14104 ret = _PyUnicode_FormatAdvancedWriter(&writer, 14105 self, format_spec, 0, 14106 PyUnicode_GET_LENGTH(format_spec)); 14107 if (ret == -1) { 14108 _PyUnicodeWriter_Dealloc(&writer); 14109 return NULL; 14110 } 14111 return _PyUnicodeWriter_Finish(&writer); 14112 } 14113 14114 /*[clinic input] 14115 str.__sizeof__ as unicode_sizeof 14116 14117 Return the size of the string in memory, in bytes. 14118 [clinic start generated code]*/ 14119 14120 static PyObject * unicode_sizeof_impl(PyObject * self)14121 unicode_sizeof_impl(PyObject *self) 14122 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/ 14123 { 14124 Py_ssize_t size; 14125 14126 /* If it's a compact object, account for base structure + 14127 character data. */ 14128 if (PyUnicode_IS_COMPACT_ASCII(self)) 14129 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1; 14130 else if (PyUnicode_IS_COMPACT(self)) 14131 size = sizeof(PyCompactUnicodeObject) + 14132 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self); 14133 else { 14134 /* If it is a two-block object, account for base object, and 14135 for character block if present. */ 14136 size = sizeof(PyUnicodeObject); 14137 if (_PyUnicode_DATA_ANY(self)) 14138 size += (PyUnicode_GET_LENGTH(self) + 1) * 14139 PyUnicode_KIND(self); 14140 } 14141 /* If the wstr pointer is present, account for it unless it is shared 14142 with the data pointer. Check if the data is not shared. 
*/ 14143 if (_PyUnicode_HAS_WSTR_MEMORY(self)) 14144 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t); 14145 if (_PyUnicode_HAS_UTF8_MEMORY(self)) 14146 size += PyUnicode_UTF8_LENGTH(self) + 1; 14147 14148 return PyLong_FromSsize_t(size); 14149 } 14150 14151 static PyObject * unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14152 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored)) 14153 { 14154 PyObject *copy = _PyUnicode_Copy(v); 14155 if (!copy) 14156 return NULL; 14157 return Py_BuildValue("(N)", copy); 14158 } 14159 14160 static PyMethodDef unicode_methods[] = { 14161 UNICODE_ENCODE_METHODDEF 14162 UNICODE_REPLACE_METHODDEF 14163 UNICODE_SPLIT_METHODDEF 14164 UNICODE_RSPLIT_METHODDEF 14165 UNICODE_JOIN_METHODDEF 14166 UNICODE_CAPITALIZE_METHODDEF 14167 UNICODE_CASEFOLD_METHODDEF 14168 UNICODE_TITLE_METHODDEF 14169 UNICODE_CENTER_METHODDEF 14170 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 14171 UNICODE_EXPANDTABS_METHODDEF 14172 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 14173 UNICODE_PARTITION_METHODDEF 14174 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 14175 UNICODE_LJUST_METHODDEF 14176 UNICODE_LOWER_METHODDEF 14177 UNICODE_LSTRIP_METHODDEF 14178 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 14179 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 14180 UNICODE_RJUST_METHODDEF 14181 UNICODE_RSTRIP_METHODDEF 14182 UNICODE_RPARTITION_METHODDEF 14183 UNICODE_SPLITLINES_METHODDEF 14184 UNICODE_STRIP_METHODDEF 14185 UNICODE_SWAPCASE_METHODDEF 14186 UNICODE_TRANSLATE_METHODDEF 14187 UNICODE_UPPER_METHODDEF 14188 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 14189 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 14190 UNICODE_REMOVEPREFIX_METHODDEF 14191 UNICODE_REMOVESUFFIX_METHODDEF 14192 UNICODE_ISASCII_METHODDEF 14193 UNICODE_ISLOWER_METHODDEF 14194 
UNICODE_ISUPPER_METHODDEF 14195 UNICODE_ISTITLE_METHODDEF 14196 UNICODE_ISSPACE_METHODDEF 14197 UNICODE_ISDECIMAL_METHODDEF 14198 UNICODE_ISDIGIT_METHODDEF 14199 UNICODE_ISNUMERIC_METHODDEF 14200 UNICODE_ISALPHA_METHODDEF 14201 UNICODE_ISALNUM_METHODDEF 14202 UNICODE_ISIDENTIFIER_METHODDEF 14203 UNICODE_ISPRINTABLE_METHODDEF 14204 UNICODE_ZFILL_METHODDEF 14205 {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__}, 14206 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 14207 UNICODE___FORMAT___METHODDEF 14208 UNICODE_MAKETRANS_METHODDEF 14209 UNICODE_SIZEOF_METHODDEF 14210 {"__getnewargs__", unicode_getnewargs, METH_NOARGS}, 14211 {NULL, NULL} 14212 }; 14213 14214 static PyObject * unicode_mod(PyObject * v,PyObject * w)14215 unicode_mod(PyObject *v, PyObject *w) 14216 { 14217 if (!PyUnicode_Check(v)) 14218 Py_RETURN_NOTIMPLEMENTED; 14219 return PyUnicode_Format(v, w); 14220 } 14221 14222 static PyNumberMethods unicode_as_number = { 14223 0, /*nb_add*/ 14224 0, /*nb_subtract*/ 14225 0, /*nb_multiply*/ 14226 unicode_mod, /*nb_remainder*/ 14227 }; 14228 14229 static PySequenceMethods unicode_as_sequence = { 14230 (lenfunc) unicode_length, /* sq_length */ 14231 PyUnicode_Concat, /* sq_concat */ 14232 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 14233 (ssizeargfunc) unicode_getitem, /* sq_item */ 14234 0, /* sq_slice */ 14235 0, /* sq_ass_item */ 14236 0, /* sq_ass_slice */ 14237 PyUnicode_Contains, /* sq_contains */ 14238 }; 14239 14240 static PyObject* unicode_subscript(PyObject * self,PyObject * item)14241 unicode_subscript(PyObject* self, PyObject* item) 14242 { 14243 if (PyUnicode_READY(self) == -1) 14244 return NULL; 14245 14246 if (_PyIndex_Check(item)) { 14247 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 14248 if (i == -1 && PyErr_Occurred()) 14249 return NULL; 14250 if (i < 0) 14251 i += PyUnicode_GET_LENGTH(self); 14252 return unicode_getitem(self, i); 14253 } else if 
(PySlice_Check(item)) { 14254 Py_ssize_t start, stop, step, slicelength, i; 14255 size_t cur; 14256 PyObject *result; 14257 const void *src_data; 14258 void *dest_data; 14259 int src_kind, dest_kind; 14260 Py_UCS4 ch, max_char, kind_limit; 14261 14262 if (PySlice_Unpack(item, &start, &stop, &step) < 0) { 14263 return NULL; 14264 } 14265 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self), 14266 &start, &stop, step); 14267 14268 if (slicelength <= 0) { 14269 _Py_RETURN_UNICODE_EMPTY(); 14270 } else if (start == 0 && step == 1 && 14271 slicelength == PyUnicode_GET_LENGTH(self)) { 14272 return unicode_result_unchanged(self); 14273 } else if (step == 1) { 14274 return PyUnicode_Substring(self, 14275 start, start + slicelength); 14276 } 14277 /* General case */ 14278 src_kind = PyUnicode_KIND(self); 14279 src_data = PyUnicode_DATA(self); 14280 if (!PyUnicode_IS_ASCII(self)) { 14281 kind_limit = kind_maxchar_limit(src_kind); 14282 max_char = 0; 14283 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 14284 ch = PyUnicode_READ(src_kind, src_data, cur); 14285 if (ch > max_char) { 14286 max_char = ch; 14287 if (max_char >= kind_limit) 14288 break; 14289 } 14290 } 14291 } 14292 else 14293 max_char = 127; 14294 result = PyUnicode_New(slicelength, max_char); 14295 if (result == NULL) 14296 return NULL; 14297 dest_kind = PyUnicode_KIND(result); 14298 dest_data = PyUnicode_DATA(result); 14299 14300 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 14301 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 14302 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 14303 } 14304 assert(_PyUnicode_CheckConsistency(result, 1)); 14305 return result; 14306 } else { 14307 PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'", 14308 Py_TYPE(item)->tp_name); 14309 return NULL; 14310 } 14311 } 14312 14313 static PyMappingMethods unicode_as_mapping = { 14314 (lenfunc)unicode_length, /* mp_length */ 14315 (binaryfunc)unicode_subscript, /* 
mp_subscript */ 14316 (objobjargproc)0, /* mp_ass_subscript */ 14317 }; 14318 14319 14320 /* Helpers for PyUnicode_Format() */ 14321 14322 struct unicode_formatter_t { 14323 PyObject *args; 14324 int args_owned; 14325 Py_ssize_t arglen, argidx; 14326 PyObject *dict; 14327 14328 enum PyUnicode_Kind fmtkind; 14329 Py_ssize_t fmtcnt, fmtpos; 14330 const void *fmtdata; 14331 PyObject *fmtstr; 14332 14333 _PyUnicodeWriter writer; 14334 }; 14335 14336 struct unicode_format_arg_t { 14337 Py_UCS4 ch; 14338 int flags; 14339 Py_ssize_t width; 14340 int prec; 14341 int sign; 14342 }; 14343 14344 static PyObject * unicode_format_getnextarg(struct unicode_formatter_t * ctx)14345 unicode_format_getnextarg(struct unicode_formatter_t *ctx) 14346 { 14347 Py_ssize_t argidx = ctx->argidx; 14348 14349 if (argidx < ctx->arglen) { 14350 ctx->argidx++; 14351 if (ctx->arglen < 0) 14352 return ctx->args; 14353 else 14354 return PyTuple_GetItem(ctx->args, argidx); 14355 } 14356 PyErr_SetString(PyExc_TypeError, 14357 "not enough arguments for format string"); 14358 return NULL; 14359 } 14360 14361 /* Returns a new reference to a PyUnicode object, or NULL on failure. */ 14362 14363 /* Format a float into the writer if the writer is not NULL, or into *p_output 14364 otherwise. 14365 14366 Return 0 on success, raise an exception and return -1 on error. 
*/
static int
formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
            PyObject **p_output,
            _PyUnicodeWriter *writer)
{
    char *p;
    double x;
    Py_ssize_t len;
    int prec;
    int dtoa_flags = 0;
    int status = 0;

    x = PyFloat_AsDouble(v);
    if (x == -1.0 && PyErr_Occurred())
        return -1;

    /* A negative precision means "not given": fall back to 6. */
    prec = arg->prec;
    if (prec < 0)
        prec = 6;

    if (arg->flags & F_ALT)
        dtoa_flags |= Py_DTSF_ALT;
    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
    if (p == NULL)
        return -1;
    len = strlen(p);
    if (writer) {
        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0)
            status = -1;
    }
    else {
        /* Report a failed conversion through the return value as well, so
           that a 0 result always means *p_output holds a string. */
        *p_output = _PyUnicode_FromASCII(p, len);
        if (*p_output == NULL)
            status = -1;
    }
    /* p is released with PyMem_Free() on every path once it is non-NULL */
    PyMem_Free(p);
    return status;
}

/* formatlong() emulates the format codes d, u, o, x and X, and
 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
 * Python's regular ints.
 * Return value: a new PyUnicodeObject*, or NULL if error.
 * The output string is of the form
 * "-"? ("0x" | "0X")? digit+
 * "0x"/"0X" are present only for x and X conversions, with F_ALT
 * set in flags. The case of hex digits will be correct,
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 * val object to be converted
 * flags bitmask of format flags; only F_ALT is looked at
 * prec minimum number of digits; 0-fill on left if needed
 * type a character in [duoxX]; u acts the same as d
 *
 * CAUTION: o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
14421 */ 14422 PyObject * _PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)14423 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) 14424 { 14425 PyObject *result = NULL; 14426 char *buf; 14427 Py_ssize_t i; 14428 int sign; /* 1 if '-', else 0 */ 14429 int len; /* number of characters */ 14430 Py_ssize_t llen; 14431 int numdigits; /* len == numnondigits + numdigits */ 14432 int numnondigits = 0; 14433 14434 /* Avoid exceeding SSIZE_T_MAX */ 14435 if (prec > INT_MAX-3) { 14436 PyErr_SetString(PyExc_OverflowError, 14437 "precision too large"); 14438 return NULL; 14439 } 14440 14441 assert(PyLong_Check(val)); 14442 14443 switch (type) { 14444 default: 14445 Py_UNREACHABLE(); 14446 case 'd': 14447 case 'i': 14448 case 'u': 14449 /* int and int subclasses should print numerically when a numeric */ 14450 /* format code is used (see issue18780) */ 14451 result = PyNumber_ToBase(val, 10); 14452 break; 14453 case 'o': 14454 numnondigits = 2; 14455 result = PyNumber_ToBase(val, 8); 14456 break; 14457 case 'x': 14458 case 'X': 14459 numnondigits = 2; 14460 result = PyNumber_ToBase(val, 16); 14461 break; 14462 } 14463 if (!result) 14464 return NULL; 14465 14466 assert(unicode_modifiable(result)); 14467 assert(PyUnicode_IS_READY(result)); 14468 assert(PyUnicode_IS_ASCII(result)); 14469 14470 /* To modify the string in-place, there can only be one reference. 
*/ 14471 if (Py_REFCNT(result) != 1) { 14472 Py_DECREF(result); 14473 PyErr_BadInternalCall(); 14474 return NULL; 14475 } 14476 buf = PyUnicode_DATA(result); 14477 llen = PyUnicode_GET_LENGTH(result); 14478 if (llen > INT_MAX) { 14479 Py_DECREF(result); 14480 PyErr_SetString(PyExc_ValueError, 14481 "string too large in _PyUnicode_FormatLong"); 14482 return NULL; 14483 } 14484 len = (int)llen; 14485 sign = buf[0] == '-'; 14486 numnondigits += sign; 14487 numdigits = len - numnondigits; 14488 assert(numdigits > 0); 14489 14490 /* Get rid of base marker unless F_ALT */ 14491 if (((alt) == 0 && 14492 (type == 'o' || type == 'x' || type == 'X'))) { 14493 assert(buf[sign] == '0'); 14494 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 14495 buf[sign+1] == 'o'); 14496 numnondigits -= 2; 14497 buf += 2; 14498 len -= 2; 14499 if (sign) 14500 buf[0] = '-'; 14501 assert(len == numnondigits + numdigits); 14502 assert(numdigits > 0); 14503 } 14504 14505 /* Fill with leading zeroes to meet minimum width. */ 14506 if (prec > numdigits) { 14507 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 14508 numnondigits + prec); 14509 char *b1; 14510 if (!r1) { 14511 Py_DECREF(result); 14512 return NULL; 14513 } 14514 b1 = PyBytes_AS_STRING(r1); 14515 for (i = 0; i < numnondigits; ++i) 14516 *b1++ = *buf++; 14517 for (i = 0; i < prec - numdigits; i++) 14518 *b1++ = '0'; 14519 for (i = 0; i < numdigits; i++) 14520 *b1++ = *buf++; 14521 *b1 = '\0'; 14522 Py_DECREF(result); 14523 result = r1; 14524 buf = PyBytes_AS_STRING(result); 14525 len = numnondigits + prec; 14526 } 14527 14528 /* Fix up case for hex conversions. */ 14529 if (type == 'X') { 14530 /* Need to convert all lower case letters to upper case. 14531 and need to convert 0x to 0X (and -0x to -0X). 
*/ 14532 for (i = 0; i < len; i++) 14533 if (buf[i] >= 'a' && buf[i] <= 'x') 14534 buf[i] -= 'a'-'A'; 14535 } 14536 if (!PyUnicode_Check(result) 14537 || buf != PyUnicode_DATA(result)) { 14538 PyObject *unicode; 14539 unicode = _PyUnicode_FromASCII(buf, len); 14540 Py_DECREF(result); 14541 result = unicode; 14542 } 14543 else if (len != PyUnicode_GET_LENGTH(result)) { 14544 if (PyUnicode_Resize(&result, len) < 0) 14545 Py_CLEAR(result); 14546 } 14547 return result; 14548 } 14549 14550 /* Format an integer or a float as an integer. 14551 * Return 1 if the number has been formatted into the writer, 14552 * 0 if the number has been formatted into *p_output 14553 * -1 and raise an exception on error */ 14554 static int mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14555 mainformatlong(PyObject *v, 14556 struct unicode_format_arg_t *arg, 14557 PyObject **p_output, 14558 _PyUnicodeWriter *writer) 14559 { 14560 PyObject *iobj, *res; 14561 char type = (char)arg->ch; 14562 14563 if (!PyNumber_Check(v)) 14564 goto wrongtype; 14565 14566 /* make sure number is a type of integer for o, x, and X */ 14567 if (!PyLong_Check(v)) { 14568 if (type == 'o' || type == 'x' || type == 'X') { 14569 iobj = _PyNumber_Index(v); 14570 } 14571 else { 14572 iobj = PyNumber_Long(v); 14573 } 14574 if (iobj == NULL ) { 14575 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14576 goto wrongtype; 14577 return -1; 14578 } 14579 assert(PyLong_Check(iobj)); 14580 } 14581 else { 14582 iobj = v; 14583 Py_INCREF(iobj); 14584 } 14585 14586 if (PyLong_CheckExact(v) 14587 && arg->width == -1 && arg->prec == -1 14588 && !(arg->flags & (F_SIGN | F_BLANK)) 14589 && type != 'X') 14590 { 14591 /* Fast path */ 14592 int alternate = arg->flags & F_ALT; 14593 int base; 14594 14595 switch(type) 14596 { 14597 default: 14598 Py_UNREACHABLE(); 14599 case 'd': 14600 case 'i': 14601 case 'u': 14602 base = 10; 14603 break; 14604 case 'o': 14605 base = 8; 14606 
break; 14607 case 'x': 14608 case 'X': 14609 base = 16; 14610 break; 14611 } 14612 14613 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 14614 Py_DECREF(iobj); 14615 return -1; 14616 } 14617 Py_DECREF(iobj); 14618 return 1; 14619 } 14620 14621 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type); 14622 Py_DECREF(iobj); 14623 if (res == NULL) 14624 return -1; 14625 *p_output = res; 14626 return 0; 14627 14628 wrongtype: 14629 switch(type) 14630 { 14631 case 'o': 14632 case 'x': 14633 case 'X': 14634 PyErr_Format(PyExc_TypeError, 14635 "%%%c format: an integer is required, " 14636 "not %.200s", 14637 type, Py_TYPE(v)->tp_name); 14638 break; 14639 default: 14640 PyErr_Format(PyExc_TypeError, 14641 "%%%c format: a real number is required, " 14642 "not %.200s", 14643 type, Py_TYPE(v)->tp_name); 14644 break; 14645 } 14646 return -1; 14647 } 14648 14649 static Py_UCS4 formatchar(PyObject * v)14650 formatchar(PyObject *v) 14651 { 14652 /* presume that the buffer is at least 3 characters long */ 14653 if (PyUnicode_Check(v)) { 14654 if (PyUnicode_GET_LENGTH(v) == 1) { 14655 return PyUnicode_READ_CHAR(v, 0); 14656 } 14657 goto onError; 14658 } 14659 else { 14660 int overflow; 14661 long x = PyLong_AsLongAndOverflow(v, &overflow); 14662 if (x == -1 && PyErr_Occurred()) { 14663 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 14664 goto onError; 14665 } 14666 return (Py_UCS4) -1; 14667 } 14668 14669 if (x < 0 || x > MAX_UNICODE) { 14670 /* this includes an overflow in converting to C long */ 14671 PyErr_SetString(PyExc_OverflowError, 14672 "%c arg not in range(0x110000)"); 14673 return (Py_UCS4) -1; 14674 } 14675 14676 return (Py_UCS4) x; 14677 } 14678 14679 onError: 14680 PyErr_SetString(PyExc_TypeError, 14681 "%c requires int or char"); 14682 return (Py_UCS4) -1; 14683 } 14684 14685 /* Parse options of an argument: flags, width, precision. 14686 Handle also "%(name)" syntax. 14687 14688 Return 0 if the argument has been formatted into arg->str. 
14689 Return 1 if the argument has been written into ctx->writer, 14690 Raise an exception and return -1 on error. */ 14691 static int unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14692 unicode_format_arg_parse(struct unicode_formatter_t *ctx, 14693 struct unicode_format_arg_t *arg) 14694 { 14695 #define FORMAT_READ(ctx) \ 14696 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 14697 14698 PyObject *v; 14699 14700 if (arg->ch == '(') { 14701 /* Get argument value from a dictionary. Example: "%(name)s". */ 14702 Py_ssize_t keystart; 14703 Py_ssize_t keylen; 14704 PyObject *key; 14705 int pcount = 1; 14706 14707 if (ctx->dict == NULL) { 14708 PyErr_SetString(PyExc_TypeError, 14709 "format requires a mapping"); 14710 return -1; 14711 } 14712 ++ctx->fmtpos; 14713 --ctx->fmtcnt; 14714 keystart = ctx->fmtpos; 14715 /* Skip over balanced parentheses */ 14716 while (pcount > 0 && --ctx->fmtcnt >= 0) { 14717 arg->ch = FORMAT_READ(ctx); 14718 if (arg->ch == ')') 14719 --pcount; 14720 else if (arg->ch == '(') 14721 ++pcount; 14722 ctx->fmtpos++; 14723 } 14724 keylen = ctx->fmtpos - keystart - 1; 14725 if (ctx->fmtcnt < 0 || pcount > 0) { 14726 PyErr_SetString(PyExc_ValueError, 14727 "incomplete format key"); 14728 return -1; 14729 } 14730 key = PyUnicode_Substring(ctx->fmtstr, 14731 keystart, keystart + keylen); 14732 if (key == NULL) 14733 return -1; 14734 if (ctx->args_owned) { 14735 ctx->args_owned = 0; 14736 Py_DECREF(ctx->args); 14737 } 14738 ctx->args = PyObject_GetItem(ctx->dict, key); 14739 Py_DECREF(key); 14740 if (ctx->args == NULL) 14741 return -1; 14742 ctx->args_owned = 1; 14743 ctx->arglen = -1; 14744 ctx->argidx = -2; 14745 } 14746 14747 /* Parse flags. Example: "%+i" => flags=F_SIGN. 
*/ 14748 while (--ctx->fmtcnt >= 0) { 14749 arg->ch = FORMAT_READ(ctx); 14750 ctx->fmtpos++; 14751 switch (arg->ch) { 14752 case '-': arg->flags |= F_LJUST; continue; 14753 case '+': arg->flags |= F_SIGN; continue; 14754 case ' ': arg->flags |= F_BLANK; continue; 14755 case '#': arg->flags |= F_ALT; continue; 14756 case '0': arg->flags |= F_ZERO; continue; 14757 } 14758 break; 14759 } 14760 14761 /* Parse width. Example: "%10s" => width=10 */ 14762 if (arg->ch == '*') { 14763 v = unicode_format_getnextarg(ctx); 14764 if (v == NULL) 14765 return -1; 14766 if (!PyLong_Check(v)) { 14767 PyErr_SetString(PyExc_TypeError, 14768 "* wants int"); 14769 return -1; 14770 } 14771 arg->width = PyLong_AsSsize_t(v); 14772 if (arg->width == -1 && PyErr_Occurred()) 14773 return -1; 14774 if (arg->width < 0) { 14775 arg->flags |= F_LJUST; 14776 arg->width = -arg->width; 14777 } 14778 if (--ctx->fmtcnt >= 0) { 14779 arg->ch = FORMAT_READ(ctx); 14780 ctx->fmtpos++; 14781 } 14782 } 14783 else if (arg->ch >= '0' && arg->ch <= '9') { 14784 arg->width = arg->ch - '0'; 14785 while (--ctx->fmtcnt >= 0) { 14786 arg->ch = FORMAT_READ(ctx); 14787 ctx->fmtpos++; 14788 if (arg->ch < '0' || arg->ch > '9') 14789 break; 14790 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 14791 mixing signed and unsigned comparison. Since arg->ch is between 14792 '0' and '9', casting to int is safe. */ 14793 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 14794 PyErr_SetString(PyExc_ValueError, 14795 "width too big"); 14796 return -1; 14797 } 14798 arg->width = arg->width*10 + (arg->ch - '0'); 14799 } 14800 } 14801 14802 /* Parse precision. 
Example: "%.3f" => prec=3 */ 14803 if (arg->ch == '.') { 14804 arg->prec = 0; 14805 if (--ctx->fmtcnt >= 0) { 14806 arg->ch = FORMAT_READ(ctx); 14807 ctx->fmtpos++; 14808 } 14809 if (arg->ch == '*') { 14810 v = unicode_format_getnextarg(ctx); 14811 if (v == NULL) 14812 return -1; 14813 if (!PyLong_Check(v)) { 14814 PyErr_SetString(PyExc_TypeError, 14815 "* wants int"); 14816 return -1; 14817 } 14818 arg->prec = _PyLong_AsInt(v); 14819 if (arg->prec == -1 && PyErr_Occurred()) 14820 return -1; 14821 if (arg->prec < 0) 14822 arg->prec = 0; 14823 if (--ctx->fmtcnt >= 0) { 14824 arg->ch = FORMAT_READ(ctx); 14825 ctx->fmtpos++; 14826 } 14827 } 14828 else if (arg->ch >= '0' && arg->ch <= '9') { 14829 arg->prec = arg->ch - '0'; 14830 while (--ctx->fmtcnt >= 0) { 14831 arg->ch = FORMAT_READ(ctx); 14832 ctx->fmtpos++; 14833 if (arg->ch < '0' || arg->ch > '9') 14834 break; 14835 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 14836 PyErr_SetString(PyExc_ValueError, 14837 "precision too big"); 14838 return -1; 14839 } 14840 arg->prec = arg->prec*10 + (arg->ch - '0'); 14841 } 14842 } 14843 } 14844 14845 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 14846 if (ctx->fmtcnt >= 0) { 14847 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 14848 if (--ctx->fmtcnt >= 0) { 14849 arg->ch = FORMAT_READ(ctx); 14850 ctx->fmtpos++; 14851 } 14852 } 14853 } 14854 if (ctx->fmtcnt < 0) { 14855 PyErr_SetString(PyExc_ValueError, 14856 "incomplete format"); 14857 return -1; 14858 } 14859 return 0; 14860 14861 #undef FORMAT_READ 14862 } 14863 14864 /* Format one argument. Supported conversion specifiers: 14865 14866 - "s", "r", "a": any type 14867 - "i", "d", "u": int or float 14868 - "o", "x", "X": int 14869 - "e", "E", "f", "F", "g", "G": float 14870 - "c": int or str (1 character) 14871 14872 When possible, the output is written directly into the Unicode writer 14873 (ctx->writer). A string is created when padding is required. 
14874 14875 Return 0 if the argument has been formatted into *p_str, 14876 1 if the argument has been written into ctx->writer, 14877 -1 on error. */ 14878 static int unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14879 unicode_format_arg_format(struct unicode_formatter_t *ctx, 14880 struct unicode_format_arg_t *arg, 14881 PyObject **p_str) 14882 { 14883 PyObject *v; 14884 _PyUnicodeWriter *writer = &ctx->writer; 14885 14886 if (ctx->fmtcnt == 0) 14887 ctx->writer.overallocate = 0; 14888 14889 v = unicode_format_getnextarg(ctx); 14890 if (v == NULL) 14891 return -1; 14892 14893 14894 switch (arg->ch) { 14895 case 's': 14896 case 'r': 14897 case 'a': 14898 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { 14899 /* Fast path */ 14900 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) 14901 return -1; 14902 return 1; 14903 } 14904 14905 if (PyUnicode_CheckExact(v) && arg->ch == 's') { 14906 *p_str = v; 14907 Py_INCREF(*p_str); 14908 } 14909 else { 14910 if (arg->ch == 's') 14911 *p_str = PyObject_Str(v); 14912 else if (arg->ch == 'r') 14913 *p_str = PyObject_Repr(v); 14914 else 14915 *p_str = PyObject_ASCII(v); 14916 } 14917 break; 14918 14919 case 'i': 14920 case 'd': 14921 case 'u': 14922 case 'o': 14923 case 'x': 14924 case 'X': 14925 { 14926 int ret = mainformatlong(v, arg, p_str, writer); 14927 if (ret != 0) 14928 return ret; 14929 arg->sign = 1; 14930 break; 14931 } 14932 14933 case 'e': 14934 case 'E': 14935 case 'f': 14936 case 'F': 14937 case 'g': 14938 case 'G': 14939 if (arg->width == -1 && arg->prec == -1 14940 && !(arg->flags & (F_SIGN | F_BLANK))) 14941 { 14942 /* Fast path */ 14943 if (formatfloat(v, arg, NULL, writer) == -1) 14944 return -1; 14945 return 1; 14946 } 14947 14948 arg->sign = 1; 14949 if (formatfloat(v, arg, p_str, NULL) == -1) 14950 return -1; 14951 break; 14952 14953 case 'c': 14954 { 14955 Py_UCS4 ch = formatchar(v); 14956 if (ch == 
(Py_UCS4) -1) 14957 return -1; 14958 if (arg->width == -1 && arg->prec == -1) { 14959 /* Fast path */ 14960 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) 14961 return -1; 14962 return 1; 14963 } 14964 *p_str = PyUnicode_FromOrdinal(ch); 14965 break; 14966 } 14967 14968 default: 14969 PyErr_Format(PyExc_ValueError, 14970 "unsupported format character '%c' (0x%x) " 14971 "at index %zd", 14972 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', 14973 (int)arg->ch, 14974 ctx->fmtpos - 1); 14975 return -1; 14976 } 14977 if (*p_str == NULL) 14978 return -1; 14979 assert (PyUnicode_Check(*p_str)); 14980 return 0; 14981 } 14982 14983 static int unicode_format_arg_output(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject * str)14984 unicode_format_arg_output(struct unicode_formatter_t *ctx, 14985 struct unicode_format_arg_t *arg, 14986 PyObject *str) 14987 { 14988 Py_ssize_t len; 14989 enum PyUnicode_Kind kind; 14990 const void *pbuf; 14991 Py_ssize_t pindex; 14992 Py_UCS4 signchar; 14993 Py_ssize_t buflen; 14994 Py_UCS4 maxchar; 14995 Py_ssize_t sublen; 14996 _PyUnicodeWriter *writer = &ctx->writer; 14997 Py_UCS4 fill; 14998 14999 fill = ' '; 15000 if (arg->sign && arg->flags & F_ZERO) 15001 fill = '0'; 15002 15003 if (PyUnicode_READY(str) == -1) 15004 return -1; 15005 15006 len = PyUnicode_GET_LENGTH(str); 15007 if ((arg->width == -1 || arg->width <= len) 15008 && (arg->prec == -1 || arg->prec >= len) 15009 && !(arg->flags & (F_SIGN | F_BLANK))) 15010 { 15011 /* Fast path */ 15012 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) 15013 return -1; 15014 return 0; 15015 } 15016 15017 /* Truncate the string for "s", "r" and "a" formats 15018 if the precision is set */ 15019 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { 15020 if (arg->prec >= 0 && len > arg->prec) 15021 len = arg->prec; 15022 } 15023 15024 /* Adjust sign and width */ 15025 kind = PyUnicode_KIND(str); 15026 pbuf = PyUnicode_DATA(str); 15027 pindex = 0; 15028 
signchar = '\0'; 15029 if (arg->sign) { 15030 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 15031 if (ch == '-' || ch == '+') { 15032 signchar = ch; 15033 len--; 15034 pindex++; 15035 } 15036 else if (arg->flags & F_SIGN) 15037 signchar = '+'; 15038 else if (arg->flags & F_BLANK) 15039 signchar = ' '; 15040 else 15041 arg->sign = 0; 15042 } 15043 if (arg->width < len) 15044 arg->width = len; 15045 15046 /* Prepare the writer */ 15047 maxchar = writer->maxchar; 15048 if (!(arg->flags & F_LJUST)) { 15049 if (arg->sign) { 15050 if ((arg->width-1) > len) 15051 maxchar = Py_MAX(maxchar, fill); 15052 } 15053 else { 15054 if (arg->width > len) 15055 maxchar = Py_MAX(maxchar, fill); 15056 } 15057 } 15058 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { 15059 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); 15060 maxchar = Py_MAX(maxchar, strmaxchar); 15061 } 15062 15063 buflen = arg->width; 15064 if (arg->sign && len == arg->width) 15065 buflen++; 15066 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) 15067 return -1; 15068 15069 /* Write the sign if needed */ 15070 if (arg->sign) { 15071 if (fill != ' ') { 15072 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 15073 writer->pos += 1; 15074 } 15075 if (arg->width > len) 15076 arg->width--; 15077 } 15078 15079 /* Write the numeric prefix for "x", "X" and "o" formats 15080 if the alternate form is used. 15081 For example, write "0x" for the "%#x" format. 
*/ 15082 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 15083 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 15084 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); 15085 if (fill != ' ') { 15086 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 15087 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 15088 writer->pos += 2; 15089 pindex += 2; 15090 } 15091 arg->width -= 2; 15092 if (arg->width < 0) 15093 arg->width = 0; 15094 len -= 2; 15095 } 15096 15097 /* Pad left with the fill character if needed */ 15098 if (arg->width > len && !(arg->flags & F_LJUST)) { 15099 sublen = arg->width - len; 15100 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen); 15101 writer->pos += sublen; 15102 arg->width = len; 15103 } 15104 15105 /* If padding with spaces: write sign if needed and/or numeric prefix if 15106 the alternate form is used */ 15107 if (fill == ' ') { 15108 if (arg->sign) { 15109 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 15110 writer->pos += 1; 15111 } 15112 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 15113 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 15114 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); 15115 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 15116 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 15117 writer->pos += 2; 15118 pindex += 2; 15119 } 15120 } 15121 15122 /* Write characters */ 15123 if (len) { 15124 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 15125 str, pindex, len); 15126 writer->pos += len; 15127 } 15128 15129 /* Pad right with the fill character if needed */ 15130 if (arg->width > len) { 15131 sublen = arg->width - len; 15132 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen); 15133 writer->pos += sublen; 15134 } 15135 return 0; 15136 } 15137 15138 /* Helper of PyUnicode_Format(): format one 
arg. 15139 Return 0 on success, raise an exception and return -1 on error. */ 15140 static int unicode_format_arg(struct unicode_formatter_t * ctx)15141 unicode_format_arg(struct unicode_formatter_t *ctx) 15142 { 15143 struct unicode_format_arg_t arg; 15144 PyObject *str; 15145 int ret; 15146 15147 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); 15148 if (arg.ch == '%') { 15149 ctx->fmtpos++; 15150 ctx->fmtcnt--; 15151 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0) 15152 return -1; 15153 return 0; 15154 } 15155 arg.flags = 0; 15156 arg.width = -1; 15157 arg.prec = -1; 15158 arg.sign = 0; 15159 str = NULL; 15160 15161 ret = unicode_format_arg_parse(ctx, &arg); 15162 if (ret == -1) 15163 return -1; 15164 15165 ret = unicode_format_arg_format(ctx, &arg, &str); 15166 if (ret == -1) 15167 return -1; 15168 15169 if (ret != 1) { 15170 ret = unicode_format_arg_output(ctx, &arg, str); 15171 Py_DECREF(str); 15172 if (ret == -1) 15173 return -1; 15174 } 15175 15176 if (ctx->dict && (ctx->argidx < ctx->arglen)) { 15177 PyErr_SetString(PyExc_TypeError, 15178 "not all arguments converted during string formatting"); 15179 return -1; 15180 } 15181 return 0; 15182 } 15183 15184 PyObject * PyUnicode_Format(PyObject * format,PyObject * args)15185 PyUnicode_Format(PyObject *format, PyObject *args) 15186 { 15187 struct unicode_formatter_t ctx; 15188 15189 if (format == NULL || args == NULL) { 15190 PyErr_BadInternalCall(); 15191 return NULL; 15192 } 15193 15194 if (ensure_unicode(format) < 0) 15195 return NULL; 15196 15197 ctx.fmtstr = format; 15198 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 15199 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 15200 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 15201 ctx.fmtpos = 0; 15202 15203 _PyUnicodeWriter_Init(&ctx.writer); 15204 ctx.writer.min_length = ctx.fmtcnt + 100; 15205 ctx.writer.overallocate = 1; 15206 15207 if (PyTuple_Check(args)) { 15208 ctx.arglen = PyTuple_Size(args); 15209 ctx.argidx = 0; 15210 } 15211 
else { 15212 ctx.arglen = -1; 15213 ctx.argidx = -2; 15214 } 15215 ctx.args_owned = 0; 15216 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) 15217 ctx.dict = args; 15218 else 15219 ctx.dict = NULL; 15220 ctx.args = args; 15221 15222 while (--ctx.fmtcnt >= 0) { 15223 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 15224 Py_ssize_t nonfmtpos; 15225 15226 nonfmtpos = ctx.fmtpos++; 15227 while (ctx.fmtcnt >= 0 && 15228 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 15229 ctx.fmtpos++; 15230 ctx.fmtcnt--; 15231 } 15232 if (ctx.fmtcnt < 0) { 15233 ctx.fmtpos--; 15234 ctx.writer.overallocate = 0; 15235 } 15236 15237 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, 15238 nonfmtpos, ctx.fmtpos) < 0) 15239 goto onError; 15240 } 15241 else { 15242 ctx.fmtpos++; 15243 if (unicode_format_arg(&ctx) == -1) 15244 goto onError; 15245 } 15246 } 15247 15248 if (ctx.argidx < ctx.arglen && !ctx.dict) { 15249 PyErr_SetString(PyExc_TypeError, 15250 "not all arguments converted during string formatting"); 15251 goto onError; 15252 } 15253 15254 if (ctx.args_owned) { 15255 Py_DECREF(ctx.args); 15256 } 15257 return _PyUnicodeWriter_Finish(&ctx.writer); 15258 15259 onError: 15260 _PyUnicodeWriter_Dealloc(&ctx.writer); 15261 if (ctx.args_owned) { 15262 Py_DECREF(ctx.args); 15263 } 15264 return NULL; 15265 } 15266 15267 static PyObject * 15268 unicode_subtype_new(PyTypeObject *type, PyObject *unicode); 15269 15270 /*[clinic input] 15271 @classmethod 15272 str.__new__ as unicode_new 15273 15274 object as x: object = NULL 15275 encoding: str = NULL 15276 errors: str = NULL 15277 15278 [clinic start generated code]*/ 15279 15280 static PyObject * unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)15281 unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding, 15282 const char *errors) 15283 /*[clinic end generated code: output=fc72d4878b0b57e9 
input=e81255e5676d174e]*/ 15284 { 15285 PyObject *unicode; 15286 if (x == NULL) { 15287 unicode = unicode_new_empty(); 15288 } 15289 else if (encoding == NULL && errors == NULL) { 15290 unicode = PyObject_Str(x); 15291 } 15292 else { 15293 unicode = PyUnicode_FromEncodedObject(x, encoding, errors); 15294 } 15295 15296 if (unicode != NULL && type != &PyUnicode_Type) { 15297 Py_SETREF(unicode, unicode_subtype_new(type, unicode)); 15298 } 15299 return unicode; 15300 } 15301 15302 static PyObject * unicode_subtype_new(PyTypeObject * type,PyObject * unicode)15303 unicode_subtype_new(PyTypeObject *type, PyObject *unicode) 15304 { 15305 PyObject *self; 15306 Py_ssize_t length, char_size; 15307 int share_wstr, share_utf8; 15308 unsigned int kind; 15309 void *data; 15310 15311 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 15312 assert(_PyUnicode_CHECK(unicode)); 15313 if (PyUnicode_READY(unicode) == -1) { 15314 return NULL; 15315 } 15316 15317 self = type->tp_alloc(type, 0); 15318 if (self == NULL) { 15319 return NULL; 15320 } 15321 kind = PyUnicode_KIND(unicode); 15322 length = PyUnicode_GET_LENGTH(unicode); 15323 15324 _PyUnicode_LENGTH(self) = length; 15325 #ifdef Py_DEBUG 15326 _PyUnicode_HASH(self) = -1; 15327 #else 15328 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 15329 #endif 15330 _PyUnicode_STATE(self).interned = 0; 15331 _PyUnicode_STATE(self).kind = kind; 15332 _PyUnicode_STATE(self).compact = 0; 15333 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 15334 _PyUnicode_STATE(self).ready = 1; 15335 _PyUnicode_WSTR(self) = NULL; 15336 _PyUnicode_UTF8_LENGTH(self) = 0; 15337 _PyUnicode_UTF8(self) = NULL; 15338 _PyUnicode_WSTR_LENGTH(self) = 0; 15339 _PyUnicode_DATA_ANY(self) = NULL; 15340 15341 share_utf8 = 0; 15342 share_wstr = 0; 15343 if (kind == PyUnicode_1BYTE_KIND) { 15344 char_size = 1; 15345 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 15346 share_utf8 = 1; 15347 } 15348 else if (kind == PyUnicode_2BYTE_KIND) { 15349 char_size = 2; 
15350 if (sizeof(wchar_t) == 2) 15351 share_wstr = 1; 15352 } 15353 else { 15354 assert(kind == PyUnicode_4BYTE_KIND); 15355 char_size = 4; 15356 if (sizeof(wchar_t) == 4) 15357 share_wstr = 1; 15358 } 15359 15360 /* Ensure we won't overflow the length. */ 15361 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 15362 PyErr_NoMemory(); 15363 goto onError; 15364 } 15365 data = PyObject_Malloc((length + 1) * char_size); 15366 if (data == NULL) { 15367 PyErr_NoMemory(); 15368 goto onError; 15369 } 15370 15371 _PyUnicode_DATA_ANY(self) = data; 15372 if (share_utf8) { 15373 _PyUnicode_UTF8_LENGTH(self) = length; 15374 _PyUnicode_UTF8(self) = data; 15375 } 15376 if (share_wstr) { 15377 _PyUnicode_WSTR_LENGTH(self) = length; 15378 _PyUnicode_WSTR(self) = (wchar_t *)data; 15379 } 15380 15381 memcpy(data, PyUnicode_DATA(unicode), 15382 kind * (length + 1)); 15383 assert(_PyUnicode_CheckConsistency(self, 1)); 15384 #ifdef Py_DEBUG 15385 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 15386 #endif 15387 return self; 15388 15389 onError: 15390 Py_DECREF(self); 15391 return NULL; 15392 } 15393 15394 void _PyUnicode_ExactDealloc(PyObject * op)15395 _PyUnicode_ExactDealloc(PyObject *op) 15396 { 15397 assert(PyUnicode_CheckExact(op)); 15398 unicode_dealloc(op); 15399 } 15400 15401 PyDoc_STRVAR(unicode_doc, 15402 "str(object='') -> str\n\ 15403 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 15404 \n\ 15405 Create a new string object from the given object. 
If encoding or\n\ 15406 errors is specified, then the object must expose a data buffer\n\ 15407 that will be decoded using the given encoding and error handler.\n\ 15408 Otherwise, returns the result of object.__str__() (if defined)\n\ 15409 or repr(object).\n\ 15410 encoding defaults to sys.getdefaultencoding().\n\ 15411 errors defaults to 'strict'."); 15412 15413 static PyObject *unicode_iter(PyObject *seq); 15414 15415 PyTypeObject PyUnicode_Type = { 15416 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15417 "str", /* tp_name */ 15418 sizeof(PyUnicodeObject), /* tp_basicsize */ 15419 0, /* tp_itemsize */ 15420 /* Slots */ 15421 (destructor)unicode_dealloc, /* tp_dealloc */ 15422 0, /* tp_vectorcall_offset */ 15423 0, /* tp_getattr */ 15424 0, /* tp_setattr */ 15425 0, /* tp_as_async */ 15426 unicode_repr, /* tp_repr */ 15427 &unicode_as_number, /* tp_as_number */ 15428 &unicode_as_sequence, /* tp_as_sequence */ 15429 &unicode_as_mapping, /* tp_as_mapping */ 15430 (hashfunc) unicode_hash, /* tp_hash*/ 15431 0, /* tp_call*/ 15432 (reprfunc) unicode_str, /* tp_str */ 15433 PyObject_GenericGetAttr, /* tp_getattro */ 15434 0, /* tp_setattro */ 15435 0, /* tp_as_buffer */ 15436 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 15437 Py_TPFLAGS_UNICODE_SUBCLASS | 15438 _Py_TPFLAGS_MATCH_SELF, /* tp_flags */ 15439 unicode_doc, /* tp_doc */ 15440 0, /* tp_traverse */ 15441 0, /* tp_clear */ 15442 PyUnicode_RichCompare, /* tp_richcompare */ 15443 0, /* tp_weaklistoffset */ 15444 unicode_iter, /* tp_iter */ 15445 0, /* tp_iternext */ 15446 unicode_methods, /* tp_methods */ 15447 0, /* tp_members */ 15448 0, /* tp_getset */ 15449 0, /* tp_base */ 15450 0, /* tp_dict */ 15451 0, /* tp_descr_get */ 15452 0, /* tp_descr_set */ 15453 0, /* tp_dictoffset */ 15454 0, /* tp_init */ 15455 0, /* tp_alloc */ 15456 unicode_new, /* tp_new */ 15457 PyObject_Del, /* tp_free */ 15458 }; 15459 15460 /* Initialize the Unicode implementation */ 15461 15462 void _PyUnicode_InitState(PyInterpreterState * 
interp)15463 _PyUnicode_InitState(PyInterpreterState *interp) 15464 { 15465 if (!_Py_IsMainInterpreter(interp)) { 15466 return; 15467 } 15468 15469 /* initialize the linebreak bloom filter */ 15470 const Py_UCS2 linebreak[] = { 15471 0x000A, /* LINE FEED */ 15472 0x000D, /* CARRIAGE RETURN */ 15473 0x001C, /* FILE SEPARATOR */ 15474 0x001D, /* GROUP SEPARATOR */ 15475 0x001E, /* RECORD SEPARATOR */ 15476 0x0085, /* NEXT LINE */ 15477 0x2028, /* LINE SEPARATOR */ 15478 0x2029, /* PARAGRAPH SEPARATOR */ 15479 }; 15480 bloom_linebreak = make_bloom_mask( 15481 PyUnicode_2BYTE_KIND, linebreak, 15482 Py_ARRAY_LENGTH(linebreak)); 15483 } 15484 15485 15486 PyStatus _PyUnicode_InitGlobalObjects(PyInterpreterState * interp)15487 _PyUnicode_InitGlobalObjects(PyInterpreterState *interp) 15488 { 15489 if (!_Py_IsMainInterpreter(interp)) { 15490 return _PyStatus_OK(); 15491 } 15492 15493 #ifdef Py_DEBUG 15494 assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1)); 15495 15496 for (int i = 0; i < 256; i++) { 15497 assert(_PyUnicode_CheckConsistency(LATIN1(i), 1)); 15498 } 15499 #endif 15500 15501 return _PyStatus_OK(); 15502 } 15503 15504 15505 PyStatus _PyUnicode_InitTypes(PyInterpreterState * interp)15506 _PyUnicode_InitTypes(PyInterpreterState *interp) 15507 { 15508 if (!_Py_IsMainInterpreter(interp)) { 15509 return _PyStatus_OK(); 15510 } 15511 15512 if (PyType_Ready(&EncodingMapType) < 0) { 15513 goto error; 15514 } 15515 if (PyType_Ready(&PyFieldNameIter_Type) < 0) { 15516 goto error; 15517 } 15518 if (PyType_Ready(&PyFormatterIter_Type) < 0) { 15519 goto error; 15520 } 15521 return _PyStatus_OK(); 15522 15523 error: 15524 return _PyStatus_ERR("Can't initialize unicode types"); 15525 } 15526 15527 15528 void PyUnicode_InternInPlace(PyObject ** p)15529 PyUnicode_InternInPlace(PyObject **p) 15530 { 15531 PyObject *s = *p; 15532 #ifdef Py_DEBUG 15533 assert(s != NULL); 15534 assert(_PyUnicode_CHECK(s)); 15535 #else 15536 if (s == NULL || !PyUnicode_Check(s)) { 15537 return; 
15538 } 15539 #endif 15540 15541 /* If it's a subclass, we don't really know what putting 15542 it in the interned dict might do. */ 15543 if (!PyUnicode_CheckExact(s)) { 15544 return; 15545 } 15546 15547 if (PyUnicode_CHECK_INTERNED(s)) { 15548 return; 15549 } 15550 15551 if (PyUnicode_READY(s) == -1) { 15552 PyErr_Clear(); 15553 return; 15554 } 15555 15556 if (interned == NULL) { 15557 interned = PyDict_New(); 15558 if (interned == NULL) { 15559 PyErr_Clear(); /* Don't leave an exception */ 15560 return; 15561 } 15562 } 15563 15564 PyObject *t = PyDict_SetDefault(interned, s, s); 15565 if (t == NULL) { 15566 PyErr_Clear(); 15567 return; 15568 } 15569 15570 if (t != s) { 15571 Py_INCREF(t); 15572 Py_SETREF(*p, t); 15573 return; 15574 } 15575 15576 /* The two references in interned dict (key and value) are not counted by 15577 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of 15578 this. */ 15579 Py_SET_REFCNT(s, Py_REFCNT(s) - 2); 15580 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 15581 } 15582 15583 void PyUnicode_InternImmortal(PyObject ** p)15584 PyUnicode_InternImmortal(PyObject **p) 15585 { 15586 if (PyErr_WarnEx(PyExc_DeprecationWarning, 15587 "PyUnicode_InternImmortal() is deprecated; " 15588 "use PyUnicode_InternInPlace() instead", 1) < 0) 15589 { 15590 // The function has no return value, the exception cannot 15591 // be reported to the caller, so just log it. 
15592 PyErr_WriteUnraisable(NULL); 15593 } 15594 15595 PyUnicode_InternInPlace(p); 15596 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 15597 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 15598 Py_INCREF(*p); 15599 } 15600 } 15601 15602 PyObject * PyUnicode_InternFromString(const char * cp)15603 PyUnicode_InternFromString(const char *cp) 15604 { 15605 PyObject *s = PyUnicode_FromString(cp); 15606 if (s == NULL) 15607 return NULL; 15608 PyUnicode_InternInPlace(&s); 15609 return s; 15610 } 15611 15612 15613 void _PyUnicode_ClearInterned(PyInterpreterState * interp)15614 _PyUnicode_ClearInterned(PyInterpreterState *interp) 15615 { 15616 if (!_Py_IsMainInterpreter(interp)) { 15617 // interned dict is shared by all interpreters 15618 return; 15619 } 15620 15621 if (interned == NULL) { 15622 return; 15623 } 15624 assert(PyDict_CheckExact(interned)); 15625 15626 /* Interned unicode strings are not forcibly deallocated; rather, we give 15627 them their stolen references back, and then clear and DECREF the 15628 interned dict. */ 15629 15630 #ifdef INTERNED_STATS 15631 fprintf(stderr, "releasing %zd interned strings\n", 15632 PyDict_GET_SIZE(interned)); 15633 15634 Py_ssize_t immortal_size = 0, mortal_size = 0; 15635 #endif 15636 Py_ssize_t pos = 0; 15637 PyObject *s, *ignored_value; 15638 while (PyDict_Next(interned, &pos, &s, &ignored_value)) { 15639 assert(PyUnicode_IS_READY(s)); 15640 15641 switch (PyUnicode_CHECK_INTERNED(s)) { 15642 case SSTATE_INTERNED_IMMORTAL: 15643 Py_SET_REFCNT(s, Py_REFCNT(s) + 1); 15644 #ifdef INTERNED_STATS 15645 immortal_size += PyUnicode_GET_LENGTH(s); 15646 #endif 15647 break; 15648 case SSTATE_INTERNED_MORTAL: 15649 // Restore the two references (key and value) ignored 15650 // by PyUnicode_InternInPlace(). 
15651 Py_SET_REFCNT(s, Py_REFCNT(s) + 2); 15652 #ifdef INTERNED_STATS 15653 mortal_size += PyUnicode_GET_LENGTH(s); 15654 #endif 15655 break; 15656 case SSTATE_NOT_INTERNED: 15657 /* fall through */ 15658 default: 15659 Py_UNREACHABLE(); 15660 } 15661 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 15662 } 15663 #ifdef INTERNED_STATS 15664 fprintf(stderr, 15665 "total size of all interned strings: %zd/%zd mortal/immortal\n", 15666 mortal_size, immortal_size); 15667 #endif 15668 15669 PyDict_Clear(interned); 15670 Py_CLEAR(interned); 15671 } 15672 15673 15674 /********************* Unicode Iterator **************************/ 15675 15676 typedef struct { 15677 PyObject_HEAD 15678 Py_ssize_t it_index; 15679 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 15680 } unicodeiterobject; 15681 15682 static void unicodeiter_dealloc(unicodeiterobject * it)15683 unicodeiter_dealloc(unicodeiterobject *it) 15684 { 15685 _PyObject_GC_UNTRACK(it); 15686 Py_XDECREF(it->it_seq); 15687 PyObject_GC_Del(it); 15688 } 15689 15690 static int unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15691 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 15692 { 15693 Py_VISIT(it->it_seq); 15694 return 0; 15695 } 15696 15697 static PyObject * unicodeiter_next(unicodeiterobject * it)15698 unicodeiter_next(unicodeiterobject *it) 15699 { 15700 PyObject *seq; 15701 15702 assert(it != NULL); 15703 seq = it->it_seq; 15704 if (seq == NULL) 15705 return NULL; 15706 assert(_PyUnicode_CHECK(seq)); 15707 15708 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 15709 int kind = PyUnicode_KIND(seq); 15710 const void *data = PyUnicode_DATA(seq); 15711 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 15712 it->it_index++; 15713 return unicode_char(chr); 15714 } 15715 15716 it->it_seq = NULL; 15717 Py_DECREF(seq); 15718 return NULL; 15719 } 15720 15721 static PyObject * unicode_ascii_iter_next(unicodeiterobject * it)15722 
unicode_ascii_iter_next(unicodeiterobject *it) 15723 { 15724 assert(it != NULL); 15725 PyObject *seq = it->it_seq; 15726 if (seq == NULL) { 15727 return NULL; 15728 } 15729 assert(_PyUnicode_CHECK(seq)); 15730 assert(PyUnicode_IS_COMPACT_ASCII(seq)); 15731 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 15732 const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1)); 15733 Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND, 15734 data, it->it_index); 15735 it->it_index++; 15736 PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr]; 15737 return Py_NewRef(item); 15738 } 15739 it->it_seq = NULL; 15740 Py_DECREF(seq); 15741 return NULL; 15742 } 15743 15744 static PyObject * unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15745 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored)) 15746 { 15747 Py_ssize_t len = 0; 15748 if (it->it_seq) 15749 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 15750 return PyLong_FromSsize_t(len); 15751 } 15752 15753 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 15754 15755 static PyObject * unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15756 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored)) 15757 { 15758 PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter)); 15759 15760 /* _PyEval_GetBuiltin can invoke arbitrary code, 15761 * call must be before access of iterator pointers. 
15762 * see issue #101765 */ 15763 15764 if (it->it_seq != NULL) { 15765 return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index); 15766 } else { 15767 PyObject *u = (PyObject *)_PyUnicode_New(0); 15768 if (u == NULL) { 15769 Py_XDECREF(iter); 15770 return NULL; 15771 } 15772 return Py_BuildValue("N(N)", iter, u); 15773 } 15774 } 15775 15776 PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 15777 15778 static PyObject * unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15779 unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 15780 { 15781 Py_ssize_t index = PyLong_AsSsize_t(state); 15782 if (index == -1 && PyErr_Occurred()) 15783 return NULL; 15784 if (it->it_seq != NULL) { 15785 if (index < 0) 15786 index = 0; 15787 else if (index > PyUnicode_GET_LENGTH(it->it_seq)) 15788 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */ 15789 it->it_index = index; 15790 } 15791 Py_RETURN_NONE; 15792 } 15793 15794 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 15795 15796 static PyMethodDef unicodeiter_methods[] = { 15797 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 15798 length_hint_doc}, 15799 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 15800 reduce_doc}, 15801 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 15802 setstate_doc}, 15803 {NULL, NULL} /* sentinel */ 15804 }; 15805 15806 PyTypeObject PyUnicodeIter_Type = { 15807 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15808 "str_iterator", /* tp_name */ 15809 sizeof(unicodeiterobject), /* tp_basicsize */ 15810 0, /* tp_itemsize */ 15811 /* methods */ 15812 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 15813 0, /* tp_vectorcall_offset */ 15814 0, /* tp_getattr */ 15815 0, /* tp_setattr */ 15816 0, /* tp_as_async */ 15817 0, /* tp_repr */ 15818 0, /* tp_as_number */ 15819 0, /* tp_as_sequence */ 15820 0, /* tp_as_mapping */ 15821 0, /* tp_hash */ 15822 0, /* tp_call */ 15823 0, /* tp_str */ 15824 
PyObject_GenericGetAttr, /* tp_getattro */ 15825 0, /* tp_setattro */ 15826 0, /* tp_as_buffer */ 15827 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 15828 0, /* tp_doc */ 15829 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 15830 0, /* tp_clear */ 15831 0, /* tp_richcompare */ 15832 0, /* tp_weaklistoffset */ 15833 PyObject_SelfIter, /* tp_iter */ 15834 (iternextfunc)unicodeiter_next, /* tp_iternext */ 15835 unicodeiter_methods, /* tp_methods */ 15836 0, 15837 }; 15838 15839 PyTypeObject _PyUnicodeASCIIIter_Type = { 15840 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15841 .tp_name = "str_ascii_iterator", 15842 .tp_basicsize = sizeof(unicodeiterobject), 15843 .tp_dealloc = (destructor)unicodeiter_dealloc, 15844 .tp_getattro = PyObject_GenericGetAttr, 15845 .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, 15846 .tp_traverse = (traverseproc)unicodeiter_traverse, 15847 .tp_iter = PyObject_SelfIter, 15848 .tp_iternext = (iternextfunc)unicode_ascii_iter_next, 15849 .tp_methods = unicodeiter_methods, 15850 }; 15851 15852 static PyObject * unicode_iter(PyObject * seq)15853 unicode_iter(PyObject *seq) 15854 { 15855 unicodeiterobject *it; 15856 15857 if (!PyUnicode_Check(seq)) { 15858 PyErr_BadInternalCall(); 15859 return NULL; 15860 } 15861 if (PyUnicode_READY(seq) == -1) 15862 return NULL; 15863 if (PyUnicode_IS_COMPACT_ASCII(seq)) { 15864 it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type); 15865 } 15866 else { 15867 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 15868 } 15869 if (it == NULL) 15870 return NULL; 15871 it->it_index = 0; 15872 Py_INCREF(seq); 15873 it->it_seq = seq; 15874 _PyObject_GC_TRACK(it); 15875 return (PyObject *)it; 15876 } 15877 15878 static int encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15879 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name) 15880 { 15881 int res; 15882 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT); 15883 if (res == -2) { 15884 
PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name); 15885 return -1; 15886 } 15887 if (res < 0) { 15888 PyErr_NoMemory(); 15889 return -1; 15890 } 15891 return 0; 15892 } 15893 15894 15895 static int config_get_codec_name(wchar_t ** config_encoding)15896 config_get_codec_name(wchar_t **config_encoding) 15897 { 15898 char *encoding; 15899 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) { 15900 return -1; 15901 } 15902 15903 PyObject *name_obj = NULL; 15904 PyObject *codec = _PyCodec_Lookup(encoding); 15905 PyMem_RawFree(encoding); 15906 15907 if (!codec) 15908 goto error; 15909 15910 name_obj = PyObject_GetAttrString(codec, "name"); 15911 Py_CLEAR(codec); 15912 if (!name_obj) { 15913 goto error; 15914 } 15915 15916 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL); 15917 Py_DECREF(name_obj); 15918 if (wname == NULL) { 15919 goto error; 15920 } 15921 15922 wchar_t *raw_wname = _PyMem_RawWcsdup(wname); 15923 if (raw_wname == NULL) { 15924 PyMem_Free(wname); 15925 PyErr_NoMemory(); 15926 goto error; 15927 } 15928 15929 PyMem_RawFree(*config_encoding); 15930 *config_encoding = raw_wname; 15931 15932 PyMem_Free(wname); 15933 return 0; 15934 15935 error: 15936 Py_XDECREF(codec); 15937 Py_XDECREF(name_obj); 15938 return -1; 15939 } 15940 15941 15942 static PyStatus init_stdio_encoding(PyInterpreterState * interp)15943 init_stdio_encoding(PyInterpreterState *interp) 15944 { 15945 /* Update the stdio encoding to the normalized Python codec name. 
*/ 15946 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp); 15947 if (config_get_codec_name(&config->stdio_encoding) < 0) { 15948 return _PyStatus_ERR("failed to get the Python codec name " 15949 "of the stdio encoding"); 15950 } 15951 return _PyStatus_OK(); 15952 } 15953 15954 15955 static int init_fs_codec(PyInterpreterState * interp)15956 init_fs_codec(PyInterpreterState *interp) 15957 { 15958 const PyConfig *config = _PyInterpreterState_GetConfig(interp); 15959 15960 _Py_error_handler error_handler; 15961 error_handler = get_error_handler_wide(config->filesystem_errors); 15962 if (error_handler == _Py_ERROR_UNKNOWN) { 15963 PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler"); 15964 return -1; 15965 } 15966 15967 char *encoding, *errors; 15968 if (encode_wstr_utf8(config->filesystem_encoding, 15969 &encoding, 15970 "filesystem_encoding") < 0) { 15971 return -1; 15972 } 15973 15974 if (encode_wstr_utf8(config->filesystem_errors, 15975 &errors, 15976 "filesystem_errors") < 0) { 15977 PyMem_RawFree(encoding); 15978 return -1; 15979 } 15980 15981 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec; 15982 PyMem_RawFree(fs_codec->encoding); 15983 fs_codec->encoding = encoding; 15984 /* encoding has been normalized by init_fs_encoding() */ 15985 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0); 15986 PyMem_RawFree(fs_codec->errors); 15987 fs_codec->errors = errors; 15988 fs_codec->error_handler = error_handler; 15989 15990 #ifdef _Py_FORCE_UTF8_FS_ENCODING 15991 assert(fs_codec->utf8 == 1); 15992 #endif 15993 15994 /* At this point, PyUnicode_EncodeFSDefault() and 15995 PyUnicode_DecodeFSDefault() can now use the Python codec rather than 15996 the C implementation of the filesystem encoding. */ 15997 15998 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors 15999 global configuration variables. 
*/ 16000 if (_Py_SetFileSystemEncoding(fs_codec->encoding, 16001 fs_codec->errors) < 0) { 16002 PyErr_NoMemory(); 16003 return -1; 16004 } 16005 return 0; 16006 } 16007 16008 16009 static PyStatus init_fs_encoding(PyThreadState * tstate)16010 init_fs_encoding(PyThreadState *tstate) 16011 { 16012 PyInterpreterState *interp = tstate->interp; 16013 16014 /* Update the filesystem encoding to the normalized Python codec name. 16015 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii" 16016 (Python codec name). */ 16017 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp); 16018 if (config_get_codec_name(&config->filesystem_encoding) < 0) { 16019 _Py_DumpPathConfig(tstate); 16020 return _PyStatus_ERR("failed to get the Python codec " 16021 "of the filesystem encoding"); 16022 } 16023 16024 if (init_fs_codec(interp) < 0) { 16025 return _PyStatus_ERR("cannot initialize filesystem codec"); 16026 } 16027 return _PyStatus_OK(); 16028 } 16029 16030 16031 PyStatus _PyUnicode_InitEncodings(PyThreadState * tstate)16032 _PyUnicode_InitEncodings(PyThreadState *tstate) 16033 { 16034 PyStatus status = init_fs_encoding(tstate); 16035 if (_PyStatus_EXCEPTION(status)) { 16036 return status; 16037 } 16038 16039 return init_stdio_encoding(tstate->interp); 16040 } 16041 16042 16043 static void _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16044 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec) 16045 { 16046 PyMem_RawFree(fs_codec->encoding); 16047 fs_codec->encoding = NULL; 16048 fs_codec->utf8 = 0; 16049 PyMem_RawFree(fs_codec->errors); 16050 fs_codec->errors = NULL; 16051 fs_codec->error_handler = _Py_ERROR_UNKNOWN; 16052 } 16053 16054 16055 #ifdef MS_WINDOWS 16056 int _PyUnicode_EnableLegacyWindowsFSEncoding(void)16057 _PyUnicode_EnableLegacyWindowsFSEncoding(void) 16058 { 16059 PyInterpreterState *interp = _PyInterpreterState_GET(); 16060 PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp); 16061 16062 /* 
Set the filesystem encoding to mbcs/replace (PEP 529) */ 16063 wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs"); 16064 wchar_t *errors = _PyMem_RawWcsdup(L"replace"); 16065 if (encoding == NULL || errors == NULL) { 16066 PyMem_RawFree(encoding); 16067 PyMem_RawFree(errors); 16068 PyErr_NoMemory(); 16069 return -1; 16070 } 16071 16072 PyMem_RawFree(config->filesystem_encoding); 16073 config->filesystem_encoding = encoding; 16074 PyMem_RawFree(config->filesystem_errors); 16075 config->filesystem_errors = errors; 16076 16077 return init_fs_codec(interp); 16078 } 16079 #endif 16080 16081 16082 #ifdef Py_DEBUG 16083 static inline int unicode_is_finalizing(void)16084 unicode_is_finalizing(void) 16085 { 16086 return (interned == NULL); 16087 } 16088 #endif 16089 16090 16091 void _PyUnicode_FiniTypes(PyInterpreterState * interp)16092 _PyUnicode_FiniTypes(PyInterpreterState *interp) 16093 { 16094 if (!_Py_IsMainInterpreter(interp)) { 16095 return; 16096 } 16097 16098 _PyStaticType_Dealloc(&EncodingMapType); 16099 _PyStaticType_Dealloc(&PyFieldNameIter_Type); 16100 _PyStaticType_Dealloc(&PyFormatterIter_Type); 16101 } 16102 16103 unicode_static_dealloc(PyObject * op)16104 static void unicode_static_dealloc(PyObject *op) 16105 { 16106 PyASCIIObject *ascii = _PyASCIIObject_CAST(op); 16107 16108 assert(ascii->state.compact); 16109 16110 if (ascii->state.ascii) { 16111 if (ascii->wstr) { 16112 PyObject_Free(ascii->wstr); 16113 ascii->wstr = NULL; 16114 } 16115 } 16116 else { 16117 PyCompactUnicodeObject* compact = (PyCompactUnicodeObject*)op; 16118 void* data = (void*)(compact + 1); 16119 if (ascii->wstr && ascii->wstr != data) { 16120 PyObject_Free(ascii->wstr); 16121 ascii->wstr = NULL; 16122 compact->wstr_length = 0; 16123 } 16124 if (compact->utf8) { 16125 PyObject_Free(compact->utf8); 16126 compact->utf8 = NULL; 16127 compact->utf8_length = 0; 16128 } 16129 } 16130 } 16131 16132 16133 void _PyUnicode_Fini(PyInterpreterState * interp)16134 _PyUnicode_Fini(PyInterpreterState 
*interp) 16135 { 16136 struct _Py_unicode_state *state = &interp->unicode; 16137 16138 if (_Py_IsMainInterpreter(interp)) { 16139 // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini() 16140 assert(interned == NULL); 16141 // bpo-47182: force a unicodedata CAPI capsule re-import on 16142 // subsequent initialization of main interpreter. 16143 ucnhash_capi = NULL; 16144 } 16145 16146 _PyUnicode_FiniEncodings(&state->fs_codec); 16147 16148 unicode_clear_identifiers(state); 16149 16150 // Clear the single character singletons 16151 for (int i = 0; i < 128; i++) { 16152 unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]); 16153 } 16154 for (int i = 0; i < 128; i++) { 16155 unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]); 16156 } 16157 } 16158 16159 16160 void _PyStaticUnicode_Dealloc(PyObject * op)16161 _PyStaticUnicode_Dealloc(PyObject *op) 16162 { 16163 unicode_static_dealloc(op); 16164 } 16165 16166 16167 /* A _string module, to export formatter_parser and formatter_field_name_split 16168 to the string.Formatter class implemented in Python. */ 16169 16170 static PyMethodDef _string_methods[] = { 16171 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 16172 METH_O, PyDoc_STR("split the argument as a field name")}, 16173 {"formatter_parser", (PyCFunction) formatter_parser, 16174 METH_O, PyDoc_STR("parse the argument as a format string")}, 16175 {NULL, NULL} 16176 }; 16177 16178 static struct PyModuleDef _string_module = { 16179 PyModuleDef_HEAD_INIT, 16180 .m_name = "_string", 16181 .m_doc = PyDoc_STR("string helper module"), 16182 .m_size = 0, 16183 .m_methods = _string_methods, 16184 }; 16185 16186 PyMODINIT_FUNC PyInit__string(void)16187 PyInit__string(void) 16188 { 16189 return PyModuleDef_Init(&_string_module); 16190 } 16191 16192 16193 #ifdef __cplusplus 16194 } 16195 #endif 16196