1""" Tests for the unicodedata module. 2 3 Written by Marc-Andre Lemburg ([email protected]). 4 5 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 6 7""" 8 9import hashlib 10from http.client import HTTPException 11import sys 12import unicodedata 13import unittest 14from test.support import (open_urlresource, requires_resource, script_helper, 15 cpython_only, check_disallow_instantiation, 16 ResourceDenied) 17 18 19class UnicodeMethodsTest(unittest.TestCase): 20 21 # update this, if the database changes 22 expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326' 23 24 @requires_resource('cpu') 25 def test_method_checksum(self): 26 h = hashlib.sha1() 27 for i in range(sys.maxunicode + 1): 28 char = chr(i) 29 data = [ 30 # Predicates (single char) 31 "01"[char.isalnum()], 32 "01"[char.isalpha()], 33 "01"[char.isdecimal()], 34 "01"[char.isdigit()], 35 "01"[char.islower()], 36 "01"[char.isnumeric()], 37 "01"[char.isspace()], 38 "01"[char.istitle()], 39 "01"[char.isupper()], 40 41 # Predicates (multiple chars) 42 "01"[(char + 'abc').isalnum()], 43 "01"[(char + 'abc').isalpha()], 44 "01"[(char + '123').isdecimal()], 45 "01"[(char + '123').isdigit()], 46 "01"[(char + 'abc').islower()], 47 "01"[(char + '123').isnumeric()], 48 "01"[(char + ' \t').isspace()], 49 "01"[(char + 'abc').istitle()], 50 "01"[(char + 'ABC').isupper()], 51 52 # Mappings (single char) 53 char.lower(), 54 char.upper(), 55 char.title(), 56 57 # Mappings (multiple chars) 58 (char + 'abc').lower(), 59 (char + 'ABC').upper(), 60 (char + 'abc').title(), 61 (char + 'ABC').title(), 62 63 ] 64 h.update(''.join(data).encode('utf-8', 'surrogatepass')) 65 result = h.hexdigest() 66 self.assertEqual(result, self.expectedchecksum) 67 68class UnicodeDatabaseTest(unittest.TestCase): 69 db = unicodedata 70 71class UnicodeFunctionsTest(UnicodeDatabaseTest): 72 73 # Update this if the database changes. Make sure to do a full rebuild 74 # (e.g. 'make distclean && make') to get the correct checksum. 75 expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370' 76 77 @requires_resource('cpu') 78 def test_function_checksum(self): 79 data = [] 80 h = hashlib.sha1() 81 82 for i in range(sys.maxunicode + 1): 83 char = chr(i) 84 data = [ 85 # Properties 86 format(self.db.digit(char, -1), '.12g'), 87 format(self.db.numeric(char, -1), '.12g'), 88 format(self.db.decimal(char, -1), '.12g'), 89 self.db.category(char), 90 self.db.bidirectional(char), 91 self.db.decomposition(char), 92 str(self.db.mirrored(char)), 93 str(self.db.combining(char)), 94 ] 95 h.update(''.join(data).encode("ascii")) 96 result = h.hexdigest() 97 self.assertEqual(result, self.expectedchecksum) 98 99 @requires_resource('cpu') 100 def test_name_inverse_lookup(self): 101 for i in range(sys.maxunicode + 1): 102 char = chr(i) 103 if looked_name := self.db.name(char, None): 104 self.assertEqual(self.db.lookup(looked_name), char) 105 106 def test_digit(self): 107 self.assertEqual(self.db.digit('A', None), None) 108 self.assertEqual(self.db.digit('9'), 9) 109 self.assertEqual(self.db.digit('\u215b', None), None) 110 self.assertEqual(self.db.digit('\u2468'), 9) 111 self.assertEqual(self.db.digit('\U00020000', None), None) 112 self.assertEqual(self.db.digit('\U0001D7FD'), 7) 113 114 self.assertRaises(TypeError, self.db.digit) 115 self.assertRaises(TypeError, self.db.digit, 'xx') 116 self.assertRaises(ValueError, self.db.digit, 'x') 117 118 def test_numeric(self): 119 self.assertEqual(self.db.numeric('A',None), None) 120 self.assertEqual(self.db.numeric('9'), 9) 121 self.assertEqual(self.db.numeric('\u215b'), 0.125) 122 self.assertEqual(self.db.numeric('\u2468'), 9.0) 123 self.assertEqual(self.db.numeric('\ua627'), 7.0) 124 self.assertEqual(self.db.numeric('\U00020000', None), None) 125 self.assertEqual(self.db.numeric('\U0001012A'), 9000) 126 127 self.assertRaises(TypeError, self.db.numeric) 128 self.assertRaises(TypeError, self.db.numeric, 'xx') 129 self.assertRaises(ValueError, self.db.numeric, 'x') 130 131 def test_decimal(self): 132 self.assertEqual(self.db.decimal('A',None), None) 133 self.assertEqual(self.db.decimal('9'), 9) 134 self.assertEqual(self.db.decimal('\u215b', None), None) 135 self.assertEqual(self.db.decimal('\u2468', None), None) 136 self.assertEqual(self.db.decimal('\U00020000', None), None) 137 self.assertEqual(self.db.decimal('\U0001D7FD'), 7) 138 139 self.assertRaises(TypeError, self.db.decimal) 140 self.assertRaises(TypeError, self.db.decimal, 'xx') 141 self.assertRaises(ValueError, self.db.decimal, 'x') 142 143 def test_category(self): 144 self.assertEqual(self.db.category('\uFFFE'), 'Cn') 145 self.assertEqual(self.db.category('a'), 'Ll') 146 self.assertEqual(self.db.category('A'), 'Lu') 147 self.assertEqual(self.db.category('\U00020000'), 'Lo') 148 self.assertEqual(self.db.category('\U0001012A'), 'No') 149 150 self.assertRaises(TypeError, self.db.category) 151 self.assertRaises(TypeError, self.db.category, 'xx') 152 153 def test_bidirectional(self): 154 self.assertEqual(self.db.bidirectional('\uFFFE'), '') 155 self.assertEqual(self.db.bidirectional(' '), 'WS') 156 self.assertEqual(self.db.bidirectional('A'), 'L') 157 self.assertEqual(self.db.bidirectional('\U00020000'), 'L') 158 159 self.assertRaises(TypeError, self.db.bidirectional) 160 self.assertRaises(TypeError, self.db.bidirectional, 'xx') 161 162 def test_decomposition(self): 163 self.assertEqual(self.db.decomposition('\uFFFE'),'') 164 self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034') 165 166 self.assertRaises(TypeError, self.db.decomposition) 167 self.assertRaises(TypeError, self.db.decomposition, 'xx') 168 169 def test_mirrored(self): 170 self.assertEqual(self.db.mirrored('\uFFFE'), 0) 171 self.assertEqual(self.db.mirrored('a'), 0) 172 self.assertEqual(self.db.mirrored('\u2201'), 1) 173 self.assertEqual(self.db.mirrored('\U00020000'), 0) 174 175 self.assertRaises(TypeError, self.db.mirrored) 176 self.assertRaises(TypeError, self.db.mirrored, 'xx') 177 178 def test_combining(self): 179 self.assertEqual(self.db.combining('\uFFFE'), 0) 180 self.assertEqual(self.db.combining('a'), 0) 181 self.assertEqual(self.db.combining('\u20e1'), 230) 182 self.assertEqual(self.db.combining('\U00020000'), 0) 183 184 self.assertRaises(TypeError, self.db.combining) 185 self.assertRaises(TypeError, self.db.combining, 'xx') 186 187 def test_pr29(self): 188 # https://www.unicode.org/review/pr-29.html 189 # See issues #1054943 and #10254. 190 composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161", 191 'Li\u030dt-s\u1e73\u0301', 192 '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c' 193 + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917', 194 '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c' 195 + '\u0938\u094d\u0924\u093e\u0928') 196 for text in composed: 197 self.assertEqual(self.db.normalize('NFC', text), text) 198 199 def test_issue10254(self): 200 # Crash reported in #10254 201 a = 'C\u0338' * 20 + 'C\u0327' 202 b = 'C\u0338' * 20 + '\xC7' 203 self.assertEqual(self.db.normalize('NFC', a), b) 204 205 def test_issue29456(self): 206 # Fix #29456 207 u1176_str_a = '\u1100\u1176\u11a8' 208 u1176_str_b = '\u1100\u1176\u11a8' 209 u11a7_str_a = '\u1100\u1175\u11a7' 210 u11a7_str_b = '\uae30\u11a7' 211 u11c3_str_a = '\u1100\u1175\u11c3' 212 u11c3_str_b = '\uae30\u11c3' 213 self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b) 214 self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) 215 self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) 216 217 def test_east_asian_width(self): 218 eaw = self.db.east_asian_width 219 self.assertRaises(TypeError, eaw, b'a') 220 self.assertRaises(TypeError, eaw, bytearray()) 221 self.assertRaises(TypeError, eaw, '') 222 self.assertRaises(TypeError, eaw, 'ra') 223 self.assertEqual(eaw('\x1e'), 'N') 224 self.assertEqual(eaw('\x20'), 'Na') 225 self.assertEqual(eaw('\uC894'), 'W') 226 self.assertEqual(eaw('\uFF66'), 'H') 227 self.assertEqual(eaw('\uFF1F'), 'F') 228 self.assertEqual(eaw('\u2010'), 'A') 229 self.assertEqual(eaw('\U00020000'), 'W') 230 231 def test_east_asian_width_9_0_changes(self): 232 self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N') 233 self.assertEqual(self.db.east_asian_width('\u231a'), 'W') 234 235class UnicodeMiscTest(UnicodeDatabaseTest): 236 237 @cpython_only 238 def test_disallow_instantiation(self): 239 # Ensure that the type disallows instantiation (bpo-43916) 240 check_disallow_instantiation(self, unicodedata.UCD) 241 242 def test_failed_import_during_compiling(self): 243 # Issue 4367 244 # Decoding \N escapes requires the unicodedata module. If it can't be 245 # imported, we shouldn't segfault. 246 247 # This program should raise a SyntaxError in the eval. 248 code = "import sys;" \ 249 "sys.modules['unicodedata'] = None;" \ 250 """eval("'\\\\N{SOFT HYPHEN}'")""" 251 # We use a separate process because the unicodedata module may already 252 # have been loaded in this process. 253 result = script_helper.assert_python_failure("-c", code) 254 error = "SyntaxError: (unicode error) \\N escapes not supported " \ 255 "(can't load unicodedata module)" 256 self.assertIn(error, result.err.decode("ascii")) 257 258 def test_decimal_numeric_consistent(self): 259 # Test that decimal and numeric are consistent, 260 # i.e. if a character has a decimal value, 261 # its numeric value should be the same. 262 count = 0 263 for i in range(0x10000): 264 c = chr(i) 265 dec = self.db.decimal(c, -1) 266 if dec != -1: 267 self.assertEqual(dec, self.db.numeric(c)) 268 count += 1 269 self.assertTrue(count >= 10) # should have tested at least the ASCII digits 270 271 def test_digit_numeric_consistent(self): 272 # Test that digit and numeric are consistent, 273 # i.e. if a character has a digit value, 274 # its numeric value should be the same. 275 count = 0 276 for i in range(0x10000): 277 c = chr(i) 278 dec = self.db.digit(c, -1) 279 if dec != -1: 280 self.assertEqual(dec, self.db.numeric(c)) 281 count += 1 282 self.assertTrue(count >= 10) # should have tested at least the ASCII digits 283 284 def test_bug_1704793(self): 285 self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346') 286 287 def test_ucd_510(self): 288 import unicodedata 289 # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0 290 self.assertTrue(unicodedata.mirrored("\u0f3a")) 291 self.assertTrue(not unicodedata.ucd_3_2_0.mirrored("\u0f3a")) 292 # Also, we now have two ways of representing 293 # the upper-case mapping: as delta, or as absolute value 294 self.assertTrue("a".upper()=='A') 295 self.assertTrue("\u1d79".upper()=='\ua77d') 296 self.assertTrue(".".upper()=='.') 297 298 def test_bug_5828(self): 299 self.assertEqual("\u1d79".lower(), "\u1d79") 300 # Only U+0000 should have U+0000 as its upper/lower/titlecase variant 301 self.assertEqual( 302 [ 303 c for c in range(sys.maxunicode+1) 304 if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title() 305 ], 306 [0] 307 ) 308 309 def test_bug_4971(self): 310 # LETTER DZ WITH CARON: DZ, Dz, dz 311 self.assertEqual("\u01c4".title(), "\u01c5") 312 self.assertEqual("\u01c5".title(), "\u01c5") 313 self.assertEqual("\u01c6".title(), "\u01c5") 314 315 def test_linebreak_7643(self): 316 for i in range(0x10000): 317 lines = (chr(i) + 'A').splitlines() 318 if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85, 319 0x1c, 0x1d, 0x1e, 0x2028, 0x2029): 320 self.assertEqual(len(lines), 2, 321 r"\u%.4x should be a linebreak" % i) 322 else: 323 self.assertEqual(len(lines), 1, 324 r"\u%.4x should not be a linebreak" % i) 325 326class NormalizationTest(unittest.TestCase): 327 @staticmethod 328 def check_version(testfile): 329 hdr = testfile.readline() 330 return unicodedata.unidata_version in hdr 331 332 @staticmethod 333 def unistr(data): 334 data = [int(x, 16) for x in data.split(" ")] 335 return "".join([chr(x) for x in data]) 336 337 @requires_resource('network') 338 def test_normalization(self): 339 TESTDATAFILE = "NormalizationTest.txt" 340 TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}" 341 342 # Hit the exception early 343 try: 344 testdata = open_urlresource(TESTDATAURL, encoding="utf-8", 345 check=self.check_version) 346 except PermissionError: 347 self.skipTest(f"Permission error when downloading {TESTDATAURL} " 348 f"into the test data directory") 349 except (OSError, HTTPException) as exc: 350 self.skipTest(f"Failed to download {TESTDATAURL}: {exc}") 351 352 with testdata: 353 self.run_normalization_tests(testdata) 354 355 def run_normalization_tests(self, testdata): 356 part = None 357 part1_data = {} 358 359 def NFC(str): 360 return unicodedata.normalize("NFC", str) 361 362 def NFKC(str): 363 return unicodedata.normalize("NFKC", str) 364 365 def NFD(str): 366 return unicodedata.normalize("NFD", str) 367 368 def NFKD(str): 369 return unicodedata.normalize("NFKD", str) 370 371 for line in testdata: 372 if '#' in line: 373 line = line.split('#')[0] 374 line = line.strip() 375 if not line: 376 continue 377 if line.startswith("@Part"): 378 part = line.split()[0] 379 continue 380 c1,c2,c3,c4,c5 = [self.unistr(x) for x in line.split(';')[:-1]] 381 382 # Perform tests 383 self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line) 384 self.assertTrue(c4 == NFC(c4) == NFC(c5), line) 385 self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line) 386 self.assertTrue(c5 == NFD(c4) == NFD(c5), line) 387 self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \ 388 NFKC(c3) == NFKC(c4) == NFKC(c5), 389 line) 390 self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \ 391 NFKD(c3) == NFKD(c4) == NFKD(c5), 392 line) 393 394 self.assertTrue(unicodedata.is_normalized("NFC", c2)) 395 self.assertTrue(unicodedata.is_normalized("NFC", c4)) 396 397 self.assertTrue(unicodedata.is_normalized("NFD", c3)) 398 self.assertTrue(unicodedata.is_normalized("NFD", c5)) 399 400 self.assertTrue(unicodedata.is_normalized("NFKC", c4)) 401 self.assertTrue(unicodedata.is_normalized("NFKD", c5)) 402 403 # Record part 1 data 404 if part == "@Part1": 405 part1_data[c1] = 1 406 407 # Perform tests for all other data 408 for c in range(sys.maxunicode+1): 409 X = chr(c) 410 if X in part1_data: 411 continue 412 self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c) 413 414 def test_edge_cases(self): 415 self.assertRaises(TypeError, unicodedata.normalize) 416 self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx') 417 self.assertEqual(unicodedata.normalize('NFKC', ''), '') 418 419 def test_bug_834676(self): 420 # Check for bug 834676 421 unicodedata.normalize('NFC', '\ud55c\uae00') 422 423 424if __name__ == "__main__": 425 unittest.main() 426