1""" Tests for the unicodedata module.
2
    Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""
8
9import hashlib
10from http.client import HTTPException
11import sys
12import unicodedata
13import unittest
14from test.support import (open_urlresource, requires_resource, script_helper,
15                          cpython_only, check_disallow_instantiation,
16                          ResourceDenied)
17
18
19class UnicodeMethodsTest(unittest.TestCase):
20
21    # update this, if the database changes
22    expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326'
23
24    @requires_resource('cpu')
25    def test_method_checksum(self):
26        h = hashlib.sha1()
27        for i in range(sys.maxunicode + 1):
28            char = chr(i)
29            data = [
30                # Predicates (single char)
31                "01"[char.isalnum()],
32                "01"[char.isalpha()],
33                "01"[char.isdecimal()],
34                "01"[char.isdigit()],
35                "01"[char.islower()],
36                "01"[char.isnumeric()],
37                "01"[char.isspace()],
38                "01"[char.istitle()],
39                "01"[char.isupper()],
40
41                # Predicates (multiple chars)
42                "01"[(char + 'abc').isalnum()],
43                "01"[(char + 'abc').isalpha()],
44                "01"[(char + '123').isdecimal()],
45                "01"[(char + '123').isdigit()],
46                "01"[(char + 'abc').islower()],
47                "01"[(char + '123').isnumeric()],
48                "01"[(char + ' \t').isspace()],
49                "01"[(char + 'abc').istitle()],
50                "01"[(char + 'ABC').isupper()],
51
52                # Mappings (single char)
53                char.lower(),
54                char.upper(),
55                char.title(),
56
57                # Mappings (multiple chars)
58                (char + 'abc').lower(),
59                (char + 'ABC').upper(),
60                (char + 'abc').title(),
61                (char + 'ABC').title(),
62
63                ]
64            h.update(''.join(data).encode('utf-8', 'surrogatepass'))
65        result = h.hexdigest()
66        self.assertEqual(result, self.expectedchecksum)
67
class UnicodeDatabaseTest(unittest.TestCase):
    """Base class that exposes the database under test as ``db``."""

    # Tests access the database through ``self.db`` rather than naming the
    # module directly.
    db = unicodedata
70
class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the character-property functions exposed by ``self.db``."""

    # Update this if the database changes. Make sure to do a full rebuild
    # (e.g. 'make distclean && make') to get the correct checksum.
    expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370'

    @requires_resource('cpu')
    def test_function_checksum(self):
        """Hash the property values of every code point.

        The SHA-1 hex digest of all formatted property values must equal
        ``expectedchecksum``.
        """
        h = hashlib.sha1()

        for i in range(sys.maxunicode + 1):
            char = chr(i)
            data = [
                # Properties
                format(self.db.digit(char, -1), '.12g'),
                format(self.db.numeric(char, -1), '.12g'),
                format(self.db.decimal(char, -1), '.12g'),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data).encode("ascii"))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    @requires_resource('cpu')
    def test_name_inverse_lookup(self):
        """lookup() must map every name returned by name() back to its char."""
        for i in range(sys.maxunicode + 1):
            char = chr(i)
            if looked_name := self.db.name(char, None):
                self.assertEqual(self.db.lookup(looked_name), char)

    def test_digit(self):
        """digit() returns the value, the given default, or raises."""
        self.assertIsNone(self.db.digit('A', None))
        self.assertEqual(self.db.digit('9'), 9)
        self.assertIsNone(self.db.digit('\u215b', None))
        self.assertEqual(self.db.digit('\u2468'), 9)
        self.assertIsNone(self.db.digit('\U00020000', None))
        self.assertEqual(self.db.digit('\U0001D7FD'), 7)

        # Missing argument, multi-character string, and no value/no default.
        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, 'xx')
        self.assertRaises(ValueError, self.db.digit, 'x')

    def test_numeric(self):
        """numeric() returns the value, the given default, or raises."""
        self.assertIsNone(self.db.numeric('A', None))
        self.assertEqual(self.db.numeric('9'), 9)
        self.assertEqual(self.db.numeric('\u215b'), 0.125)
        self.assertEqual(self.db.numeric('\u2468'), 9.0)
        self.assertEqual(self.db.numeric('\ua627'), 7.0)
        self.assertIsNone(self.db.numeric('\U00020000', None))
        self.assertEqual(self.db.numeric('\U0001012A'), 9000)

        # Missing argument, multi-character string, and no value/no default.
        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, 'xx')
        self.assertRaises(ValueError, self.db.numeric, 'x')

    def test_decimal(self):
        """decimal() returns the value, the given default, or raises."""
        self.assertIsNone(self.db.decimal('A', None))
        self.assertEqual(self.db.decimal('9'), 9)
        self.assertIsNone(self.db.decimal('\u215b', None))
        self.assertIsNone(self.db.decimal('\u2468', None))
        self.assertIsNone(self.db.decimal('\U00020000', None))
        self.assertEqual(self.db.decimal('\U0001D7FD'), 7)

        # Missing argument, multi-character string, and no value/no default.
        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, 'xx')
        self.assertRaises(ValueError, self.db.decimal, 'x')

    def test_category(self):
        """category() returns the expected general category strings."""
        self.assertEqual(self.db.category('\uFFFE'), 'Cn')
        self.assertEqual(self.db.category('a'), 'Ll')
        self.assertEqual(self.db.category('A'), 'Lu')
        self.assertEqual(self.db.category('\U00020000'), 'Lo')
        self.assertEqual(self.db.category('\U0001012A'), 'No')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, 'xx')

    def test_bidirectional(self):
        """bidirectional() returns the expected class strings."""
        self.assertEqual(self.db.bidirectional('\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(' '), 'WS')
        self.assertEqual(self.db.bidirectional('A'), 'L')
        self.assertEqual(self.db.bidirectional('\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, 'xx')

    def test_decomposition(self):
        """decomposition() returns the mapping string, or '' if there is none."""
        self.assertEqual(self.db.decomposition('\uFFFE'), '')
        self.assertEqual(self.db.decomposition('\u00bc'),
                         '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, 'xx')

    def test_mirrored(self):
        """mirrored() returns 1 for mirrored characters and 0 otherwise."""
        self.assertEqual(self.db.mirrored('\uFFFE'), 0)
        self.assertEqual(self.db.mirrored('a'), 0)
        self.assertEqual(self.db.mirrored('\u2201'), 1)
        self.assertEqual(self.db.mirrored('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, 'xx')

    def test_combining(self):
        """combining() returns the canonical combining class as an int."""
        self.assertEqual(self.db.combining('\uFFFE'), 0)
        self.assertEqual(self.db.combining('a'), 0)
        self.assertEqual(self.db.combining('\u20e1'), 230)
        self.assertEqual(self.db.combining('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, 'xx')

    def test_pr29(self):
        """NFC must leave each of these sequences unchanged."""
        # https://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
                    'Li\u030dt-s\u1e73\u0301',
                    '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + '\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        """NFC of a long run of combining sequences (crash in #10254)."""
        # Crash reported in #10254
        a = 'C\u0338' * 20  + 'C\u0327'
        b = 'C\u0338' * 20  + '\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_issue29456(self):
        """Regression test for issue #29456 (NFC normalization)."""
        # Fix #29456
        # The first pair is intentionally identical: NFC must return that
        # input unchanged.
        u1176_str_a = '\u1100\u1176\u11a8'
        u1176_str_b = '\u1100\u1176\u11a8'
        u11a7_str_a = '\u1100\u1175\u11a7'
        u11a7_str_b = '\uae30\u11a7'
        u11c3_str_a = '\u1100\u1175\u11c3'
        u11c3_str_b = '\uae30\u11c3'
        self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
        self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
        self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

    def test_east_asian_width(self):
        """east_asian_width() rejects bad input and returns width codes."""
        eaw = self.db.east_asian_width
        # Only a str of length one is accepted.
        self.assertRaises(TypeError, eaw, b'a')
        self.assertRaises(TypeError, eaw, bytearray())
        self.assertRaises(TypeError, eaw, '')
        self.assertRaises(TypeError, eaw, 'ra')
        self.assertEqual(eaw('\x1e'), 'N')
        self.assertEqual(eaw('\x20'), 'Na')
        self.assertEqual(eaw('\uC894'), 'W')
        self.assertEqual(eaw('\uFF66'), 'H')
        self.assertEqual(eaw('\uFF1F'), 'F')
        self.assertEqual(eaw('\u2010'), 'A')
        self.assertEqual(eaw('\U00020000'), 'W')

    def test_east_asian_width_9_0_changes(self):
        """U+231A is 'N' in the 3.2.0 database but 'W' in the current one."""
        self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
        self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
234
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Assorted regression and consistency tests for the Unicode database."""

    @cpython_only
    def test_disallow_instantiation(self):
        """The UCD type must not be instantiable from Python code."""
        # Ensure that the type disallows instantiation (bpo-43916)
        check_disallow_instantiation(self, unicodedata.UCD)

    def test_failed_import_during_compiling(self):
        """A blocked unicodedata import yields SyntaxError for \\N escapes."""
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            """eval("'\\\\N{SOFT HYPHEN}'")"""
        # We use a separate process because the unicodedata module may already
        # have been loaded in this process.
        result = script_helper.assert_python_failure("-c", code)
        error = "SyntaxError: (unicode error) \\N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, result.err.decode("ascii"))

    def test_decimal_numeric_consistent(self):
        """Every BMP character with a decimal value has the same numeric value."""
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        """Every BMP character with a digit value has the same numeric value."""
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        """lookup() resolves the name of a character outside the BMP."""
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')

    def test_ucd_510(self):
        """Check properties that differ between UCD 3.2.0 and UCD 5.1.0."""
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored("\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored("\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual("a".upper(), 'A')
        self.assertEqual("\u1d79".upper(), '\ua77d')
        self.assertEqual(".".upper(), '.')

    def test_bug_5828(self):
        """Case mappings must never produce a NUL except for U+0000 itself."""
        self.assertEqual("\u1d79".lower(), "\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        """All three case forms of the DZ digraph title-case to U+01C5."""
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual("\u01c4".title(), "\u01c5")
        self.assertEqual("\u01c5".title(), "\u01c5")
        self.assertEqual("\u01c6".title(), "\u01c5")

    def test_linebreak_7643(self):
        """Only the listed BMP code points are line breaks for splitlines()."""
        for i in range(0x10000):
            lines = (chr(i) + 'A').splitlines()
            if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                     0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)
325
class NormalizationTest(unittest.TestCase):
    """Tests for unicodedata.normalize() and unicodedata.is_normalized()."""

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* names our UCD version.

        Reads (and therefore consumes) one line from *testfile* and checks
        whether ``unicodedata.unidata_version`` occurs in it.
        """
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @staticmethod
    def unistr(data):
        """Convert space-separated hexadecimal code points into a string.

        Raises ValueError if a field is not a valid hexadecimal number.
        """
        return "".join(chr(int(x, 16)) for x in data.split(" "))

    @requires_resource('network')
    def test_normalization(self):
        """Download NormalizationTest.txt and run the conformance checks.

        The test is skipped if the data file cannot be downloaded or stored.
        """
        TESTDATAFILE = "NormalizationTest.txt"
        TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check every data line of *testdata*, then all remaining characters.

        *testdata* is an iterable of text lines.  Each data line holds
        ';'-separated fields c1..c5 of hexadecimal code points, checked as
        c2 == NFC(c1), c3 == NFD(c1), c4 == NFKC(c1) and c5 == NFKD(c1).
        Text after '#' is ignored and lines starting with "@Part" select the
        current part.
        """
        part = None
        # c1 values seen in part 1; all other characters are checked to be
        # invariant under every normalization form in the final loop.
        part1_data = set()

        def NFC(text):
            return unicodedata.normalize("NFC", text)

        def NFKC(text):
            return unicodedata.normalize("NFKC", text)

        def NFD(text):
            return unicodedata.normalize("NFD", text)

        def NFKD(text):
            return unicodedata.normalize("NFKD", text)

        for line in testdata:
            # Drop the trailing comment, if any, and surrounding whitespace.
            line = line.split('#', 1)[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            # The last element of the split (whatever follows the final ';')
            # is discarded.
            c1, c2, c3, c4, c5 = [self.unistr(x) for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
            self.assertTrue(c4 ==  NFC(c4) ==  NFC(c5), line)
            self.assertTrue(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
            self.assertTrue(c5 ==  NFD(c4) ==  NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            # Pass the data line as the message so a failure identifies it.
            self.assertTrue(unicodedata.is_normalized("NFC", c2), line)
            self.assertTrue(unicodedata.is_normalized("NFC", c4), line)

            self.assertTrue(unicodedata.is_normalized("NFD", c3), line)
            self.assertTrue(unicodedata.is_normalized("NFD", c5), line)

            self.assertTrue(unicodedata.is_normalized("NFKC", c4), line)
            self.assertTrue(unicodedata.is_normalized("NFKD", c5), line)

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode+1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_edge_cases(self):
        """normalize() rejects bad arguments and accepts the empty string."""
        self.assertRaises(TypeError, unicodedata.normalize)
        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        """Normalizing this input must simply not fail (bug 834676)."""
        # Check for bug 834676
        unicodedata.normalize('NFC', '\ud55c\uae00')
422
423
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
426