//! Character inclusion in binary, General_Category value, or Script value Unicode sets.
2 //!
3 //! We rely on dead code elimination to remove the tables that aren't needed.
4 
5 #![allow(bad_style)]
6 #![allow(clippy::all)]
7 
8 use alloc::boxed::Box;
9 
// Declares the private table module `$tables` and, for every listed property
// identifier, a public `fn IDENT(char) -> bool` that asks the item of the same
// name inside that module whether it contains the character. The identifiers
// are also exposed, stringified and in listing order, through the public
// static `$names`.
macro_rules! property_functions {
    ($tables:ident, $names:ident, [$($property:ident,)*]) => {
        #[allow(unused)]
        mod $tables;

        pub static $names: &[&str] = &[$(stringify!($property),)*];

        // Usage: `unicode::ALPHABETIC('a')`
        $(
            pub fn $property(ch: char) -> bool {
                self::$tables::$property.contains_char(ch)
            }
        )*
    };
}
26 
// Front end for `property_functions!`: takes any number of
// `mod <module>; static <NAMES> = [...];` groups and forwards each group.
// Two list syntaxes are understood; every group of a single invocation has to
// use the same one, because each arm matches the whole input.
macro_rules! char_property_functions {
    // Lists that spell out the property identifiers directly: `[FOO, BAR,]`.
    ($(
        mod $tables:ident;
        static $names:ident = [$($property:ident,)*];
    )*) => {
        $(property_functions!($tables, $names, [$($property,)*]);)*
    };
    // Lists of `("Name", IDENT),` pairs, as copied from the `BY_NAME` values
    // generated by `ucd-generate`. Only the identifier of each pair is used.
    ($(
        mod $tables:ident;
        static $names:ident = [$(($_label:tt, $property:ident),)*];
    )*) => {
        $(property_functions!($tables, $names, [$($property,)*]);)*
    };
}
51 
52 char_property_functions! {
53     mod binary;
54     static BINARY_PROPERTY_NAMES = [
55         // ASCII_HEX_DIGIT, // let this one be stripped out -- the full trie is wasteful for ASCII
56         ALPHABETIC, BIDI_CONTROL, CASE_IGNORABLE, CASED, CHANGES_WHEN_CASEFOLDED,
57         CHANGES_WHEN_CASEMAPPED, CHANGES_WHEN_LOWERCASED, CHANGES_WHEN_TITLECASED,
58         CHANGES_WHEN_UPPERCASED, DASH, DEFAULT_IGNORABLE_CODE_POINT, DEPRECATED, DIACRITIC,
59         EMOJI, EMOJI_COMPONENT, EMOJI_MODIFIER, EMOJI_MODIFIER_BASE, EMOJI_PRESENTATION, EXTENDED_PICTOGRAPHIC,
60         EXTENDER, GRAPHEME_BASE, GRAPHEME_EXTEND, GRAPHEME_LINK, HEX_DIGIT, HYPHEN,
61         IDS_BINARY_OPERATOR, IDS_TRINARY_OPERATOR, ID_CONTINUE, ID_START, IDEOGRAPHIC, JOIN_CONTROL,
62         LOGICAL_ORDER_EXCEPTION, LOWERCASE, MATH, NONCHARACTER_CODE_POINT, OTHER_ALPHABETIC,
63         OTHER_DEFAULT_IGNORABLE_CODE_POINT, OTHER_GRAPHEME_EXTEND, OTHER_ID_CONTINUE,
64         OTHER_ID_START, OTHER_LOWERCASE, OTHER_MATH, OTHER_UPPERCASE, PATTERN_SYNTAX,
65         PATTERN_WHITE_SPACE, PREPENDED_CONCATENATION_MARK, QUOTATION_MARK, RADICAL,
66         REGIONAL_INDICATOR, SENTENCE_TERMINAL, SOFT_DOTTED, TERMINAL_PUNCTUATION, UNIFIED_IDEOGRAPH,
67         UPPERCASE, VARIATION_SELECTOR, WHITE_SPACE, XID_CONTINUE, XID_START,
68     ];
69 }
70 
71 char_property_functions! {
72     mod category;
73     // Copy from category::BY_NAME
74     static CATEGORY_PROPERTY_NAMES = [
75         ("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION),
76         ("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL),
77         ("Currency_Symbol", CURRENCY_SYMBOL),
78         ("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER),
79         ("Enclosing_Mark", ENCLOSING_MARK),
80         ("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT),
81         ("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER),
82         ("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR),
83         ("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK),
84         ("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER),
85         ("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK),
86         ("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION),
87         ("Other", OTHER), ("Other_Letter", OTHER_LETTER),
88         ("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION),
89         ("Other_Symbol", OTHER_SYMBOL),
90         ("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE),
91         ("Punctuation", PUNCTUATION), ("Separator", SEPARATOR),
92         ("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK),
93         ("Surrogate", SURROGATE), ("Symbol", SYMBOL),
94         ("Titlecase_Letter", TITLECASE_LETTER), ("Unassigned", UNASSIGNED),
95         ("Uppercase_Letter", UPPERCASE_LETTER),
96     ];
97 
98     mod script;
99     // Copy from script::BY_NAME
100     static SCRIPT_PROPERTY_NAMES = [
101         ("Adlam", ADLAM),
102         ("Ahom", AHOM),
103         ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS),
104         ("Arabic", ARABIC),
105         ("Armenian", ARMENIAN),
106         ("Avestan", AVESTAN),
107         ("Balinese", BALINESE),
108         ("Bamum", BAMUM),
109         ("Bassa_Vah", BASSA_VAH),
110         ("Batak", BATAK),
111         ("Bengali", BENGALI),
112         ("Bhaiksuki", BHAIKSUKI),
113         ("Bopomofo", BOPOMOFO),
114         ("Brahmi", BRAHMI),
115         ("Braille", BRAILLE),
116         ("Buginese", BUGINESE),
117         ("Buhid", BUHID),
118         ("Canadian_Aboriginal", CANADIAN_ABORIGINAL),
119         ("Carian", CARIAN),
120         ("Caucasian_Albanian", CAUCASIAN_ALBANIAN),
121         ("Chakma", CHAKMA),
122         ("Cham", CHAM),
123         ("Cherokee", CHEROKEE),
124         ("Chorasmian", CHORASMIAN),
125         ("Common", COMMON),
126         ("Coptic", COPTIC),
127         ("Cuneiform", CUNEIFORM),
128         ("Cypriot", CYPRIOT),
129         ("Cypro_Minoan", CYPRO_MINOAN),
130         ("Cyrillic", CYRILLIC),
131         ("Deseret", DESERET),
132         ("Devanagari", DEVANAGARI),
133         ("Dives_Akuru", DIVES_AKURU),
134         ("Dogra", DOGRA),
135         ("Duployan", DUPLOYAN),
136         ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS),
137         ("Elbasan", ELBASAN),
138         ("Elymaic", ELYMAIC),
139         ("Ethiopic", ETHIOPIC),
140         ("Georgian", GEORGIAN),
141         ("Glagolitic", GLAGOLITIC),
142         ("Gothic", GOTHIC),
143         ("Grantha", GRANTHA),
144         ("Greek", GREEK),
145         ("Gujarati", GUJARATI),
146         ("Gunjala_Gondi", GUNJALA_GONDI),
147         ("Gurmukhi", GURMUKHI),
148         ("Han", HAN),
149         ("Hangul", HANGUL),
150         ("Hanifi_Rohingya", HANIFI_ROHINGYA),
151         ("Hanunoo", HANUNOO),
152         ("Hatran", HATRAN),
153         ("Hebrew", HEBREW),
154         ("Hiragana", HIRAGANA),
155         ("Imperial_Aramaic", IMPERIAL_ARAMAIC),
156         ("Inherited", INHERITED),
157         ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI),
158         ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN),
159         ("Javanese", JAVANESE),
160         ("Kaithi", KAITHI),
161         ("Kannada", KANNADA),
162         ("Katakana", KATAKANA),
163         ("Kawi", KAWI),
164         ("Kayah_Li", KAYAH_LI),
165         ("Kharoshthi", KHAROSHTHI),
166         ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT),
167         ("Khmer", KHMER),
168         ("Khojki", KHOJKI),
169         ("Khudawadi", KHUDAWADI),
170         ("Lao", LAO),
171         ("Latin", LATIN),
172         ("Lepcha", LEPCHA),
173         ("Limbu", LIMBU),
174         ("Linear_A", LINEAR_A),
175         ("Linear_B", LINEAR_B),
176         ("Lisu", LISU),
177         ("Lycian", LYCIAN),
178         ("Lydian", LYDIAN),
179         ("Mahajani", MAHAJANI),
180         ("Makasar", MAKASAR),
181         ("Malayalam", MALAYALAM),
182         ("Mandaic", MANDAIC),
183         ("Manichaean", MANICHAEAN),
184         ("Marchen", MARCHEN),
185         ("Masaram_Gondi", MASARAM_GONDI),
186         ("Medefaidrin", MEDEFAIDRIN),
187         ("Meetei_Mayek", MEETEI_MAYEK),
188         ("Mende_Kikakui", MENDE_KIKAKUI),
189         ("Meroitic_Cursive", MEROITIC_CURSIVE),
190         ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS),
191         ("Miao", MIAO),
192         ("Modi", MODI),
193         ("Mongolian", MONGOLIAN),
194         ("Mro", MRO),
195         ("Multani", MULTANI),
196         ("Myanmar", MYANMAR),
197         ("Nabataean", NABATAEAN),
198         ("Nag_Mundari", NAG_MUNDARI),
199         ("Nandinagari", NANDINAGARI),
200         ("New_Tai_Lue", NEW_TAI_LUE),
201         ("Newa", NEWA),
202         ("Nko", NKO),
203         ("Nushu", NUSHU),
204         ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG),
205         ("Ogham", OGHAM),
206         ("Ol_Chiki", OL_CHIKI),
207         ("Old_Hungarian", OLD_HUNGARIAN),
208         ("Old_Italic", OLD_ITALIC),
209         ("Old_North_Arabian", OLD_NORTH_ARABIAN),
210         ("Old_Permic", OLD_PERMIC),
211         ("Old_Persian", OLD_PERSIAN),
212         ("Old_Sogdian", OLD_SOGDIAN),
213         ("Old_South_Arabian", OLD_SOUTH_ARABIAN),
214         ("Old_Turkic", OLD_TURKIC),
215         ("Old_Uyghur", OLD_UYGHUR),
216         ("Oriya", ORIYA),
217         ("Osage", OSAGE),
218         ("Osmanya", OSMANYA),
219         ("Pahawh_Hmong", PAHAWH_HMONG),
220         ("Palmyrene", PALMYRENE),
221         ("Pau_Cin_Hau", PAU_CIN_HAU),
222         ("Phags_Pa", PHAGS_PA),
223         ("Phoenician", PHOENICIAN),
224         ("Psalter_Pahlavi", PSALTER_PAHLAVI),
225         ("Rejang", REJANG),
226         ("Runic", RUNIC),
227         ("Samaritan", SAMARITAN),
228         ("Saurashtra", SAURASHTRA),
229         ("Sharada", SHARADA),
230         ("Shavian", SHAVIAN),
231         ("Siddham", SIDDHAM),
232         ("SignWriting", SIGNWRITING),
233         ("Sinhala", SINHALA),
234         ("Sogdian", SOGDIAN),
235         ("Sora_Sompeng", SORA_SOMPENG),
236         ("Soyombo", SOYOMBO),
237         ("Sundanese", SUNDANESE),
238         ("Syloti_Nagri", SYLOTI_NAGRI),
239         ("Syriac", SYRIAC),
240         ("Tagalog", TAGALOG),
241         ("Tagbanwa", TAGBANWA),
242         ("Tai_Le", TAI_LE),
243         ("Tai_Tham", TAI_THAM),
244         ("Tai_Viet", TAI_VIET),
245         ("Takri", TAKRI),
246         ("Tamil", TAMIL),
247         ("Tangsa", TANGSA),
248         ("Tangut", TANGUT),
249         ("Telugu", TELUGU),
250         ("Thaana", THAANA),
251         ("Thai", THAI),
252         ("Tibetan", TIBETAN),
253         ("Tifinagh", TIFINAGH),
254         ("Tirhuta", TIRHUTA),
255         ("Toto", TOTO),
256         ("Ugaritic", UGARITIC),
257         ("Vai", VAI),
258         ("Vithkuqi", VITHKUQI),
259         ("Wancho", WANCHO),
260         ("Warang_Citi", WARANG_CITI),
261         ("Yezidi", YEZIDI),
262         ("Yi", YI),
263         ("Zanabazar_Square", ZANABAZAR_SQUARE),
264     ];
265 }
266 
267 /// Return all available unicode property names
unicode_property_names() -> Box<dyn Iterator<Item = &'static str>>268 pub fn unicode_property_names() -> Box<dyn Iterator<Item = &'static str>> {
269     Box::new(
270         BINARY_PROPERTY_NAMES
271             .iter()
272             .map(|name| *name)
273             .chain(CATEGORY_PROPERTY_NAMES.iter().map(|name| *name))
274             .chain(SCRIPT_PROPERTY_NAMES.iter().map(|name| *name)),
275     )
276 }
277 
by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>>278 pub fn by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>> {
279     for property in binary::BY_NAME {
280         if name == property.0.to_uppercase() {
281             return Some(Box::new(move |c| property.1.contains_char(c)));
282         }
283     }
284 
285     for property in category::BY_NAME {
286         if name == property.0.to_uppercase() {
287             return Some(Box::new(move |c| property.1.contains_char(c)));
288         }
289     }
290 
291     for property in script::BY_NAME {
292         if name == property.0.to_uppercase() {
293             return Some(Box::new(move |c| property.1.contains_char(c)));
294         }
295     }
296 
297     None
298 }
299