xref: /aosp_15_r20/external/icu/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu;
4 
5 import static com.google.common.base.CharMatcher.whitespace;
6 import static com.google.common.base.Preconditions.checkArgument;
7 import static com.google.common.base.Preconditions.checkNotNull;
8 import static com.google.common.base.Preconditions.checkState;
9 import static com.google.common.collect.ImmutableMap.toImmutableMap;
10 import static java.util.function.Function.identity;
11 import static org.unicode.cldr.api.AttributeKey.keyOf;
12 import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
13 
14 import java.util.Arrays;
15 import java.util.HashMap;
16 import java.util.List;
17 import java.util.Map;
18 import java.util.Objects;
19 import java.util.Optional;
20 import java.util.Set;
21 import java.util.function.Function;
22 import java.util.regex.Matcher;
23 import java.util.regex.Pattern;
24 import java.util.stream.Stream;
25 
26 import org.unicode.cldr.api.AttributeKey;
27 import org.unicode.cldr.api.CldrDataSupplier;
28 import org.unicode.cldr.api.CldrDataType;
29 import org.unicode.cldr.api.PathMatcher;
30 
31 import com.google.common.base.Ascii;
32 import com.google.common.base.Splitter;
33 import com.google.common.base.Strings;
34 import com.google.common.collect.HashBasedTable;
35 import com.google.common.collect.ImmutableMap;
36 import com.google.common.collect.ImmutableSet;
37 import com.google.common.collect.ImmutableTable;
38 import com.google.common.collect.Sets;
39 import com.google.common.collect.Table;
40 
41 /**
42  * Auxiliary APIs for processing locale IDs and other supplemental data needed by business logic
43  * in some mapper classes.
44  *
45  * When a {@link SupplementalData} instance is used in a mapper class, it is imperative that it is
46  * built using the same underlying CLDR data. The only reason mapper classes do not create their
47  * own instances directly is the relative cost of processing all the supplemental data each time.
48  */
49 // TODO: This should be moved into the API and leverage some of the existing utility functions.
50 public final class SupplementalData {
51     // Special IDs which are not supported via CLDR, but for which synthetic data is injected.
52     // The "TRADITIONAL" variants are here because their calendar differs from the non-variant
53     // locale. However CLDR cannot represent this currently because calendar defaults are in
54     // supplemental data (rather than locale data) and are keyed only on territory.
55     private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
56         ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");
57 
58     private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}");
59 
60     private static final PathMatcher ALIAS =
61         PathMatcher.of("//supplementalData/metadata/alias/*[@type=*]");
62 
63     private static final PathMatcher PARENT_LOCALE =
64         PathMatcher.of("//supplementalData/parentLocales/parentLocale[@parent=*]");
65     private static final AttributeKey COMPONENT = keyOf("parentLocales", "component");
66     private static final AttributeKey PARENT = keyOf("parentLocale", "parent");
67     private static final AttributeKey LOCALES = keyOf("parentLocale", "locales");
68 
69     private static final PathMatcher CALENDER_PREFERENCE =
70         PathMatcher.of("//supplementalData/calendarPreferenceData/calendarPreference[@territories=*]");
71     private static final AttributeKey CALENDER_TERRITORIES =
72         keyOf("calendarPreference", "territories");
73     private static final AttributeKey CALENDER_ORDERING =
74         keyOf("calendarPreference", "ordering");
75 
76     private static final PathMatcher LIKELY_SUBTAGS =
77         PathMatcher.of("//supplementalData/likelySubtags/likelySubtag[@from=*]");
78     private static final AttributeKey SUBTAG_FROM = keyOf("likelySubtag", "from");
79     private static final AttributeKey SUBTAG_TO = keyOf("likelySubtag", "to");
80 
81     private static final Splitter LIST_SPLITTER =
82         Splitter.on(whitespace()).omitEmptyStrings();
83 
84     // Aliases come in three flavours. Note that the TERRITORY aliases map to a _list_ rather than
85     // a single value (it's structurally always a list, but only territory aliases have a need for
86     // more than one value).
87     private enum Alias {
88         LANGUAGE, SCRIPT, TERRITORY;
89 
90         private static final ImmutableMap<String, Alias> TYPE_MAP =
91             Arrays.stream(values())
92                 .collect(toImmutableMap(a -> Ascii.toLowerCase(a.name()) + "Alias", identity()));
93 
94         private final String elementName = Ascii.toLowerCase(name()) + "Alias";
95         final AttributeKey typeKey = AttributeKey.keyOf(elementName, "type");
96         final AttributeKey replacementKey = AttributeKey.keyOf(elementName, "replacement");
97 
forElementName(String name)98         static Optional<Alias> forElementName(String name) {
99             return Optional.ofNullable(TYPE_MAP.get(name));
100         }
101     }
102 
103     /**
104      * Creates a supplemental data API instance from the given CLDR data supplier.
105      *
106      * @param src the CLDR data supplier.
107      * @return the supplemental data API.
108      */
create(CldrDataSupplier src)109     public static SupplementalData create(CldrDataSupplier src) {
110         Table<Alias, String, String> aliasTable = HashBasedTable.create();
111         Map<String, String> parentLocaleMap = new HashMap<>();
112         Map<String, String> defaultCalendarMap = new HashMap<>();
113         Map<String, String> likelySubtagMap = new HashMap<>();
114 
115         src.getDataForType(CldrDataType.SUPPLEMENTAL).accept(
116             ARBITRARY,
117             v -> {
118                 if (ALIAS.matches(v.getPath())) {
119                     // Territory alias replacements can be a list of values (e.g. when countries
120                     // break up). We use the first (geo-politically most significant) value. This
121                     // doesn't happen for languages or scripts, but could in theory.
122                     Alias.forElementName(v.getPath().getName()).ifPresent(
123                         alias -> aliasTable.put(
124                             alias,
125                             alias.typeKey.valueFrom(v),
126                             alias.replacementKey.valueFrom(v)));
127                 } else if (PARENT_LOCALE.matches(v.getPath()) && !COMPONENT.optionalValueFrom(v).isPresent()) {
128                     // CLDR-16253 added component-specific parents, which we ignore for now.
129                     // TODO(ICU-22289): Handle these properly.
130                     String p = PARENT.valueFrom(v);
131                     LOCALES.listOfValuesFrom(v).forEach(c -> parentLocaleMap.put(c, p));
132                 } else if (CALENDER_PREFERENCE.matches(v.getPath())) {
133                     String c = CALENDER_ORDERING.listOfValuesFrom(v).get(0);
134                     CALENDER_TERRITORIES.listOfValuesFrom(v).forEach(t -> defaultCalendarMap.put(t, c));
135                 } else if (LIKELY_SUBTAGS.matches(v.getPath())) {
136                     likelySubtagMap.put(SUBTAG_FROM.valueFrom(v), SUBTAG_TO.valueFrom(v));
137                 }
138             });
139 
140         Set<String> availableIds = Sets.union(src.getAvailableLocaleIds(), PHANTOM_LOCALE_IDS);
141         return new SupplementalData(
142             availableIds, aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap);
143     }
144 
145     // A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU
146     // data generation. Because this is mutable, it is thoroughly unsuitable for general use.
147     private static final class LocaleId {
148         // From: https://unicode.org/reports/tr35/#Identifiers
149         // Locale ID is:
150         //   (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)*
151         //
152         // However in CLDR data, there's always a language (even if it's "und"), and never more
153         // than one variant, so this can be simplified to:
154         //   <language>(_<script>)?(_<region>)?(_<variant>)?
155         //
156         // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw").
157         //   Note that the specification allows for languages 5-8 characters long, but in reality
158         //   this has never occurred yet, so it's ignored in this code.
159         //
160         // * Script is 4-letter Xxxx script identifier (e.g. "Latn").
161         //   The specification permits any casing for script subtags, but since all the data uses
162         //   the capitalized "Xxxx" form, that's what this code expects.
163         //
164         // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric
165         //   identifier (e.g. "001").
166         //
167         // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting
168         //   with a digit (this avoids any ambiguity with script subtags). However because ICU
169         //   violates this rule by using "TRADITIONAL" (11-letters) the length restriction is
170         //   merely "longer than 5".
171         //
172         // Finaly, CLDR data only uses an '_' as the separator, whereas the specification allows
173         // for either '-' or '_').
174         //
175         // The regex for unambiguously capturing the parts of a locale ID from the CLDR data is:
176         private static final Pattern LOCALE_ID =
177             Pattern.compile("([a-z]{2,3})"
178                 + "(?:_([A-Z][a-z]{3}))?"
179                 + "(?:_([A-Z]{2}|[0-9]{3}))?"
180                 + "(?:_([a-zA-Z]{5,}|[0-9][a-zA-Z0-9]{3}))?");
181 
parse(String localeId)182         static LocaleId parse(String localeId) {
183             Matcher m = LOCALE_ID.matcher(checkNotNull(localeId, "locale ID cannot be null"));
184             checkArgument(m.matches(), "invalid locale ID: %s", localeId);
185             return of(m.group(1), m.group(2), m.group(3)).setVariant(m.group(4));
186         }
187 
of(String language, String script, String region)188         static LocaleId of(String language, String script, String region) {
189             return new LocaleId().setLanguage(language).setScript(script).setRegion(region);
190         }
191 
192         // Only the language subtag is non-nullable.
193         private String languageSubtag;
194         private String scriptSubtag;
195         private String regionSubtag;
196         private String variantSubtag;
197 
getLanguage()198         String getLanguage() {
199             return languageSubtag;
200         }
201 
getScript()202         String getScript() {
203             return scriptSubtag;
204         }
205 
getRegion()206         String getRegion() {
207             return regionSubtag;
208         }
209 
getVariant()210         String getVariant() {
211             return variantSubtag;
212         }
213 
setLanguage(String languageSubtag)214         LocaleId setLanguage(String languageSubtag) {
215             checkNotNull(languageSubtag, "language subtag must not be null");
216             checkArgument(!languageSubtag.isEmpty(), "language subtag must not be empty");
217             this.languageSubtag = languageSubtag;
218             return this;
219         }
220 
setScript(String scriptSubtag)221         LocaleId setScript(String scriptSubtag) {
222             this.scriptSubtag = Strings.emptyToNull(scriptSubtag);
223             return this;
224         }
225 
setRegion(String regionSubtag)226         LocaleId setRegion(String regionSubtag) {
227             this.regionSubtag = Strings.emptyToNull(regionSubtag);
228             return this;
229         }
230 
setVariant(String variantSubtag)231         LocaleId setVariant(String variantSubtag) {
232             this.variantSubtag = Strings.emptyToNull(variantSubtag);
233             return this;
234         }
235 
toString()236         @Override public String toString() {
237             StringBuilder id = new StringBuilder(languageSubtag);
238             if (scriptSubtag != null) {
239                 id.append("_").append(scriptSubtag);
240             }
241             if (regionSubtag != null) {
242                 id.append("_").append(regionSubtag);
243             }
244             if (variantSubtag != null) {
245                 id.append("_").append(variantSubtag);
246             }
247             return id.toString();
248         }
249 
equals(Object o)250         @Override public boolean equals(Object o) {
251             if (!(o instanceof LocaleId)) {
252                 return false;
253             }
254             LocaleId other = (LocaleId) o;
255             return Objects.equals(languageSubtag, other.languageSubtag)
256                 && Objects.equals(scriptSubtag, other.scriptSubtag)
257                 && Objects.equals(regionSubtag, other.regionSubtag)
258                 && Objects.equals(variantSubtag, other.variantSubtag);
259         }
260 
hashCode()261         @Override public int hashCode() {
262             return Objects.hash(languageSubtag, scriptSubtag, regionSubtag, variantSubtag);
263         }
264     }
265 
266     private final ImmutableSet<String> availableIds;
267     private final ImmutableTable<Alias, String, String> aliasTable;
268     private final ImmutableMap<String, String> parentLocaleMap;
269     private final ImmutableMap<String, String> defaultCalendarMap;
270     private final ImmutableMap<String, String> likelySubtagMap;
271 
SupplementalData( Set<String> availableIds, Table<Alias, String, String> aliasTable, Map<String, String> parentLocaleMap, Map<String, String> defaultCalendarMap, Map<String, String> likelySubtagMap)272     private SupplementalData(
273         Set<String> availableIds,
274         Table<Alias, String, String> aliasTable,
275         Map<String, String> parentLocaleMap,
276         Map<String, String> defaultCalendarMap,
277         Map<String, String> likelySubtagMap) {
278 
279         this.availableIds = ImmutableSet.copyOf(availableIds);
280         this.aliasTable = ImmutableTable.copyOf(aliasTable);
281         this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap);
282         this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap);
283         this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap);
284     }
285 
getAvailableLocaleIds()286     public ImmutableSet<String> getAvailableLocaleIds() {
287         return availableIds;
288     }
289 
290     /**
291      * Returns the "maximized" form of a given locale ID, by adding likely subtags where possible.
292      */
maximize(String localeId)293     public Optional<String> maximize(String localeId) {
294         return addLikelySubtags(localeId).map(Object::toString);
295     }
296 
297     /**
298      * Returns the locale ID with any deprecated elements replaced. This is an
299      * implementation of the algorithm specified in
300      * <a href="http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers">the LDML
301      * specification</a> but without any "minimizing" of the final result (as happens for
302      * canonicalization in the CLDR tools).
303      */
replaceDeprecatedTags(String localeId)304     public String replaceDeprecatedTags(String localeId) {
305         if (localeId.equals("root")) {
306             return localeId;
307         }
308         LocaleId id = LocaleId.parse(localeId);
309 
310         // ---- LDML Specification ----
311         // If the region subtag matches the type attribute of a territoryAlias element in
312         // Supplemental Data, replace the region subtag with the replacement value, as follows:
313         //
314         // * If there is a single territory in the replacement, use it.
315         // * If there are multiple territories:
316         //   * Look up the most likely territory for the base language code (and script, if there
317         //     is one).
318         //   * If that likely territory is in the list, use it.
319         //   * Otherwise, use the first territory in the list.
320         // ----
321         // However there is a footnote that says:
322         //   Formally, replacement of multiple territories uses Section 4.3 Likely Subtags.
323         //   However, there are a small number of cases of multiple territories, so the mappings
324         //   can be precomputed. This results in a faster lookup with a very small subset of the
325         //   likely subtags data.
326         //
327         // Note that (contrary to the order implied by the LDML specification) this step is
328         // performed _before_ the language alias lookup. This is to allow ID such as "sr_YU" to
329         // work, where "YU" should be replaced with "RS" and _then_ "sr_RS" is expanded to
330         // "sr_Cryl_RS" by the language alias lookup. In the other order, you just get "sr_RS" out.
331         //
332         // TODO: Can we simplify this my just using "addLikelySubtags()" when region is missing?
333         if (id.getRegion() != null) {
334             String replacementRegions = aliasTable.get(Alias.TERRITORY, id.getRegion());
335             if (replacementRegions != null) {
336                 List<String> regions = LIST_SPLITTER.splitToList(replacementRegions);
337                 checkArgument(!regions.isEmpty(), "invalid empty region list for %s", localeId);
338                 if (regions.size() == 1) {
339                     id.setRegion(regions.get(0));
340                 } else {
341                     LocaleId key = LocaleId.of(id.getLanguage(), id.getScript(), null);
342                     String likelyId = likelySubtagMap.get(key.toString());
343                     if (likelyId == null) {
344                         likelyId = likelySubtagMap.get(key.setScript(null).toString());
345                     }
346                     String likelyRegion =
347                         likelyId != null ? LocaleId.parse(likelyId).getRegion() : null;
348                     if (regions.contains(likelyRegion)) {
349                         id.setRegion(likelyRegion);
350                     } else {
351                         id.setRegion(regions.get(0));
352                     }
353                 }
354             }
355         }
356 
357         // While it's not mentioned in the LDML specification, there is data in the alias table for
358         // replacement scripts (currently it contains exactly one entry with one value). Because
359         // its not clear if this is intended to only be single values or a list (and how to handle
360         // it if it were a list), there's a hard check to ensure it's only ever a single value.
361         if (id.getScript() != null) {
362             String replacementScript = aliasTable.get(Alias.SCRIPT, id.getScript());
363             if (replacementScript != null) {
364                 checkArgument(whitespace().matchesNoneOf(replacementScript),
365                     "unexpected list of replacement scripts: %s", replacementScript);
366                 id.setScript(replacementScript);
367             }
368         }
369 
370         // ---- LDML Specification ----
371         // If the language subtag matches the type attribute of a languageAlias element in
372         // Supplemental Data, replace the language subtag with the replacement value.
373         //
374         // If there are additional subtags in the replacement value, add them to the result, but
375         // only if there is no corresponding subtag already in the tag.
376         // ----
377         // Contrary to the precise wording of the specification, we don't just check the language
378         // subtag, since language aliases can contain script and even region information. Instead
379         // we check the alias table using the same order as defined in subtag maximizing:
380         //
381         // <language>_<script>_<region>
382         // <language>_<region>
383         // <language>_<script>
384         // <language>
385         //
386         // There is no need to check for "und" however since that's not aliased anything, but since
387         // it shares the same code it's harmless to do.
388         resolveLocaleId(id, s -> aliasTable.get(Alias.LANGUAGE, s))
389             .ifPresent(resolvedId -> {
390                 id.setLanguage(checkNotNull(resolvedId.getLanguage(),
391                      "missing language subtag in language alias: %s", resolvedId));
392                 if (id.getScript() == null) {
393                     id.setScript(resolvedId.getScript());
394                 }
395                 if (id.getRegion() == null) {
396                     id.setRegion(resolvedId.getRegion());
397                 }
398                 if (id.getVariant() == null) {
399                     id.setVariant(resolvedId.getVariant());
400                 }
401             });
402         return id.toString();
403     }
404 
405     /**
406      * Returns a suitable default calendar for a given locale if it's different from the default
407      * calendar inferred by the locale's parent.
408      *
409      * <p>Note that since the default calendar data is keyed from territory (region subtag) rather
410      * than the complete locale ID, it is impossible to encode some real life cases (e.g. the fact
411      * that "ja_JP_TRADITIONAL" has a different default calendar to "ja_JP"). This is currently
412      * handled with hard-code special casing, but should probably be data driven eventually.
413      */
getDefaultCalendar(String localeId)414     public Optional<String> getDefaultCalendar(String localeId) {
415         Optional<String> calendar = getSpecialCaseCalendar(localeId);
416         if (calendar.isPresent()) {
417             return calendar;
418         }
419         String t = territoryOf(localeId);
420         calendar = Optional.ofNullable(defaultCalendarMap.get(t));
421         if (!calendar.isPresent()) {
422             return Optional.empty();
423         }
424         String rootCalendar = defaultCalendarMap.get("001");
425         checkState(!rootCalendar.isEmpty(), "missing root calendar");
426         if (localeId.equals("root")) {
427             return Optional.of(rootCalendar);
428         }
429         // All locales reach "root" eventually, and that maps to territory "001" which
430         // we already know has a value, so this loop *must* exit.
431         String parentCalendar;
432         do {
433             localeId = getParent(localeId);
434             String territory = territoryOf(localeId);
435             parentCalendar = defaultCalendarMap.get(territory);
436         } while (parentCalendar == null);
437         return parentCalendar.equals(calendar.get()) ? Optional.empty() : calendar;
438     }
439 
440     // Hack to work around the limitation that CLDR data cannot represent default calendars that
441     // change because of non-territory information. Since this is limited to exactly two cases at
442     // the moment, and is unlikely to be expanded, it's being done directly in code.
getSpecialCaseCalendar(String localeId)443     private Optional<String> getSpecialCaseCalendar(String localeId) {
444         Optional<String> maximized = maximize(localeId);
445         if (maximized.isPresent()) {
446             switch (maximized.get()) {
447             case "ja_Jpan_JP_TRADITIONAL":
448                 return Optional.of("japanese");
449             case "th_Thai_TH_TRADITIONAL":
450                 return Optional.of("buddhist");
451             }
452         }
453         return Optional.empty();
454     }
455 
456     /**
457      * Returns the parent of a non-root locale ID. This is more complex than simple truncation for
458      * two reasons:
459      * <ul>
460      *     <li>There may be an explicit parent locale ID specified in the CLDR data.
461      *     <li>Removal of non-default script subtags makes the parent locale "root" (unless there
462      *         was an explicit parent specified).
463      * </ul>
464      * Note that all valid locale ID parent "chains" must end up at "root" eventually.
465      *
466      * For example (showing parent "chains"):
467      * <ul>
468      *     <li>{@code en_GB} --> {@code en_001} --> {@code en} --> {@code root}
469      *     <li>{@code en_Cyrl_RU} --> {@code en_Cyrl} --> {@code root}
470      * </ul>
471      *
472      * @throws IllegalArgumentException if the given locale ID is invalid or "root".
473      */
getParent(String localeId)474     public String getParent(String localeId) {
475         checkState(!localeId.equals("root"), "cannot ask for parent of 'root' locale");
476         // We probably want to fully canonicalize here. But in the absence of that we
477         // at least need to do the following canonicalization:
478         if (localeId.equals("no_NO_NY")) {
479             localeId = "nn_NO";
480         }
481         // Always defer to an explicit parent locale set in the CLDR data.
482         Optional<String> explicitParent = getExplicitParentLocaleOf(localeId);
483         if (explicitParent.isPresent()) {
484             return explicitParent.get();
485         }
486         // Now look for the start of the last ID "part" in order to truncate.
487         int lastPartSeperatorIndex = localeId.lastIndexOf('_');
488         // The parent of a base language ID (e.g. "en" or "fr") is always "root".
489         if (lastPartSeperatorIndex == -1) {
490             return "root";
491         }
492         String parentId = localeId.substring(0, lastPartSeperatorIndex);
493 
494         // However, if the script of the locale is what's being truncated and it's NOT the default
495         // script for the language, return "root" as the parent rather than truncating.
496         String lastPart = localeId.substring(lastPartSeperatorIndex + 1);
497         if (SCRIPT_SUBTAG.matcher(lastPart).matches() && !lastPart.equals(scriptOf(parentId))) {
498             return "root";
499         }
500         return !parentId.isEmpty() ? parentId : "root";
501     }
502 
503     /**
504      * Returns the explicit parent of a locale ID if specified in the CLDR data.
505      *
506      * Note that this method will not return a value for most locale IDs, since they do not have
507      * an explicit parent set. If you just want "normal" parent of a locale ID, use {@link
508      * #getParent(String)}.
509      */
getExplicitParentLocaleOf(String localeId)510     public Optional<String> getExplicitParentLocaleOf(String localeId) {
511         return Optional.ofNullable(parentLocaleMap.get(localeId));
512     }
513 
territoryOf(String localeId)514     private String territoryOf(String localeId) {
515         return localeId.equals("root")
516             ? "001"
517             : addLikelySubtags(localeId).map(LocaleId::getRegion).orElse("ZZ");
518     }
519 
scriptOf(String localeId)520     private String scriptOf(String localeId) {
521         return addLikelySubtags(localeId).map(LocaleId::getScript).orElse("Zzzz");
522     }
523 
524     // From: https://unicode.org/reports/tr35/#Likely_Subtags
525     //
526     // Add Likely Subtags
527     // ------------------
528     // Given a source locale X, to return a locale Y where the empty subtags have been filled in
529     // by the most likely subtags. A subtag is called empty if it is a missing script or region
530     // subtag, or it is a base language subtag with the value "und".
531     //
532     // Canonicalize
533     // ------------
534     // Make sure the input locale is in canonical form ...
535     // ...
536     // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
537     //
538     // Note that this implementation does not need to handle
539     // legacy language tags (marked as “Type: grandfathered” in BCP 47).
addLikelySubtags(String localeId)540     private Optional<LocaleId> addLikelySubtags(String localeId) {
541         if (localeId.equals("root")) {
542             return Optional.empty();
543         }
544 
545         LocaleId id = LocaleId.parse(localeId);
546         // ---- LDML Specification ----
547         // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
548         if ("Zzzz".equals(id.getScript())) {
549             id.setScript(null);
550         }
551         if ("ZZ".equals(id.getRegion())) {
552             id.setRegion(null);
553         }
554         // ---- LDML Specification ----
555         // A subtag is called empty if it is a missing script or region subtag, or it is a base
556         // language subtag with the value "und"
557         if (!id.getLanguage().equals("und") && id.getScript() != null && id.getRegion() != null) {
558             // We are already canonical, so just return.
559             return Optional.of(id);
560         }
561         Optional<LocaleId> optTags = resolveLocaleId(id, likelySubtagMap::get);
562         if (!optTags.isPresent()) {
563             return Optional.empty();
564         }
565         LocaleId subtags = optTags.get();
566         checkArgument(!subtags.getLanguage().equals("und"), "invalid subtags: %s", subtags);
567         // Replace "missing" elements in the original ID with likely subtags.
568         if (id.getLanguage().equals("und")) {
569             id.setLanguage(subtags.getLanguage());
570         }
571         if (id.getScript() == null) {
572             id.setScript(checkNotNull(subtags.getScript()));
573         }
574         if (id.getRegion() == null) {
575             id.setRegion(checkNotNull(subtags.getRegion()));
576         }
577         // Language is not "und" and both script and region subtags are set!
578         return Optional.of(id);
579     }
580 
581     // From: https://unicode.org/reports/tr35/#Likely_Subtags
582     //
583     // Lookup
584     // ------
585     // Lookup each of the following in order, and stop on the first match:
586     // <language>_<script>_<region>
587     // <language>_<region>
588     // <language>_<script>
589     // <language>
590     // "und"_<script>
resolveLocaleId(LocaleId id, Function<String, String> fn)591     private Optional<LocaleId> resolveLocaleId(LocaleId id, Function<String, String> fn) {
592         String lang = id.getLanguage();
593         String script = id.getScript();
594         String region = id.getRegion();
595         Stream<LocaleId> candidateIds = Stream.of(
596             LocaleId.of(lang, script, region),
597             LocaleId.of(lang, null, region),
598             LocaleId.of(lang, script, null),
599             LocaleId.of(lang, null, null));
600         // Only add "und"_<script> if there's a script, otherwise you end up maximizing "und" on
601         // its own ("en_Latn_US") which is not intended.
602         if (script != null) {
603             candidateIds = Stream.concat(candidateIds, Stream.of(LocaleId.of("und", script, null)));
604         }
605         return candidateIds
606             // Remove duplicate IDs (keeps the first one encountered).
607             .distinct()
608             .map(Object::toString)
609             .map(fn)
610             .filter(Objects::nonNull)
611             .findFirst()
612             .map(LocaleId::parse);
613     }
614 }
615