1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu; 4 5 import static com.google.common.base.CharMatcher.whitespace; 6 import static com.google.common.base.Preconditions.checkArgument; 7 import static com.google.common.base.Preconditions.checkNotNull; 8 import static com.google.common.base.Preconditions.checkState; 9 import static com.google.common.collect.ImmutableMap.toImmutableMap; 10 import static java.util.function.Function.identity; 11 import static org.unicode.cldr.api.AttributeKey.keyOf; 12 import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY; 13 14 import java.util.Arrays; 15 import java.util.HashMap; 16 import java.util.List; 17 import java.util.Map; 18 import java.util.Objects; 19 import java.util.Optional; 20 import java.util.Set; 21 import java.util.function.Function; 22 import java.util.regex.Matcher; 23 import java.util.regex.Pattern; 24 import java.util.stream.Stream; 25 26 import org.unicode.cldr.api.AttributeKey; 27 import org.unicode.cldr.api.CldrDataSupplier; 28 import org.unicode.cldr.api.CldrDataType; 29 import org.unicode.cldr.api.PathMatcher; 30 31 import com.google.common.base.Ascii; 32 import com.google.common.base.Splitter; 33 import com.google.common.base.Strings; 34 import com.google.common.collect.HashBasedTable; 35 import com.google.common.collect.ImmutableMap; 36 import com.google.common.collect.ImmutableSet; 37 import com.google.common.collect.ImmutableTable; 38 import com.google.common.collect.Sets; 39 import com.google.common.collect.Table; 40 41 /** 42 * Auxiliary APIs for processing locale IDs and other supplemental data needed by business logic 43 * in some mapper classes. 44 * 45 * When a {@link SupplementalData} instance is used in a mapper class, it is imperative that it is 46 * build using the same underlying CLDR data. The only reason mapper classes do not create their 47 * own instances directly is the relative cost of processing all the supplemental data each time. 48 */ 49 // TODO: This should be moved into the API and leverage some of the existing utility functions. 50 public final class SupplementalData { 51 // Special IDs which are not supported via CLDR, but for which synthetic data is injected. 52 // The "TRADITIONAL" variants are here because their calendar differs from the non-variant 53 // locale. However CLDR cannot represent this currently because calendar defaults are in 54 // supplemental data (rather than locale data) and are keyed only on territory. 55 private static final ImmutableSet<String> PHANTOM_LOCALE_IDS = 56 ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL"); 57 58 private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}"); 59 60 private static final PathMatcher ALIAS = 61 PathMatcher.of("//supplementalData/metadata/alias/*[@type=*]"); 62 63 private static final PathMatcher PARENT_LOCALE = 64 PathMatcher.of("//supplementalData/parentLocales/parentLocale[@parent=*]"); 65 private static final AttributeKey COMPONENT = keyOf("parentLocales", "component"); 66 private static final AttributeKey PARENT = keyOf("parentLocale", "parent"); 67 private static final AttributeKey LOCALES = keyOf("parentLocale", "locales"); 68 69 private static final PathMatcher CALENDER_PREFERENCE = 70 PathMatcher.of("//supplementalData/calendarPreferenceData/calendarPreference[@territories=*]"); 71 private static final AttributeKey CALENDER_TERRITORIES = 72 keyOf("calendarPreference", "territories"); 73 private static final AttributeKey CALENDER_ORDERING = 74 keyOf("calendarPreference", "ordering"); 75 76 private static final PathMatcher LIKELY_SUBTAGS = 77 PathMatcher.of("//supplementalData/likelySubtags/likelySubtag[@from=*]"); 78 private static final AttributeKey SUBTAG_FROM = keyOf("likelySubtag", "from"); 79 private static final AttributeKey SUBTAG_TO = keyOf("likelySubtag", "to"); 80 81 private static final Splitter LIST_SPLITTER = 82 Splitter.on(whitespace()).omitEmptyStrings(); 83 84 // Aliases come in three flavours. Note that the TERRITORY aliases map to a _list_ rather than 85 // a single value (it's structurally always a list, but only territory aliases have a need for 86 // more than one value). 87 private enum Alias { 88 LANGUAGE, SCRIPT, TERRITORY; 89 90 private static final ImmutableMap<String, Alias> TYPE_MAP = 91 Arrays.stream(values()) 92 .collect(toImmutableMap(a -> Ascii.toLowerCase(a.name()) + "Alias", identity())); 93 94 private final String elementName = Ascii.toLowerCase(name()) + "Alias"; 95 final AttributeKey typeKey = AttributeKey.keyOf(elementName, "type"); 96 final AttributeKey replacementKey = AttributeKey.keyOf(elementName, "replacement"); 97 forElementName(String name)98 static Optional<Alias> forElementName(String name) { 99 return Optional.ofNullable(TYPE_MAP.get(name)); 100 } 101 } 102 103 /** 104 * Creates a supplemental data API instance from the given CLDR data supplier. 105 * 106 * @param src the CLDR data supplier. 107 * @return the supplemental data API. 108 */ create(CldrDataSupplier src)109 public static SupplementalData create(CldrDataSupplier src) { 110 Table<Alias, String, String> aliasTable = HashBasedTable.create(); 111 Map<String, String> parentLocaleMap = new HashMap<>(); 112 Map<String, String> defaultCalendarMap = new HashMap<>(); 113 Map<String, String> likelySubtagMap = new HashMap<>(); 114 115 src.getDataForType(CldrDataType.SUPPLEMENTAL).accept( 116 ARBITRARY, 117 v -> { 118 if (ALIAS.matches(v.getPath())) { 119 // Territory alias replacements can be a list of values (e.g. when countries 120 // break up). We use the first (geo-politically most significant) value. This 121 // doesn't happen for languages or scripts, but could in theory. 122 Alias.forElementName(v.getPath().getName()).ifPresent( 123 alias -> aliasTable.put( 124 alias, 125 alias.typeKey.valueFrom(v), 126 alias.replacementKey.valueFrom(v))); 127 } else if (PARENT_LOCALE.matches(v.getPath()) && !COMPONENT.optionalValueFrom(v).isPresent()) { 128 // CLDR-16253 added component-specific parents, which we ignore for now. 129 // TODO(ICU-22289): Handle these properly. 130 String p = PARENT.valueFrom(v); 131 LOCALES.listOfValuesFrom(v).forEach(c -> parentLocaleMap.put(c, p)); 132 } else if (CALENDER_PREFERENCE.matches(v.getPath())) { 133 String c = CALENDER_ORDERING.listOfValuesFrom(v).get(0); 134 CALENDER_TERRITORIES.listOfValuesFrom(v).forEach(t -> defaultCalendarMap.put(t, c)); 135 } else if (LIKELY_SUBTAGS.matches(v.getPath())) { 136 likelySubtagMap.put(SUBTAG_FROM.valueFrom(v), SUBTAG_TO.valueFrom(v)); 137 } 138 }); 139 140 Set<String> availableIds = Sets.union(src.getAvailableLocaleIds(), PHANTOM_LOCALE_IDS); 141 return new SupplementalData( 142 availableIds, aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap); 143 } 144 145 // A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU 146 // data generation. Because this is mutable, it is thoroughly unsuitable for general use. 147 private static final class LocaleId { 148 // From: https://unicode.org/reports/tr35/#Identifiers 149 // Locale ID is: 150 // (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)* 151 // 152 // However in CLDR data, there's always a language (even if it's "und"), and never more 153 // than one variant, so this can be simplified to: 154 // <language>(_<script>)?(_<region>)?(_<variant>)? 155 // 156 // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw"). 157 // Note that the specification allows for languages 5-8 characters long, but in reality 158 // this has never occurred yet, so it's ignored in this code. 159 // 160 // * Script is 4-letter Xxxx script identifier (e.g. "Latn"). 161 // The specification permits any casing for script subtags, but since all the data uses 162 // the capitalized "Xxxx" form, that's what this code expects. 163 // 164 // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric 165 // identifier (e.g. "001"). 166 // 167 // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting 168 // with a digit (this avoids any ambiguity with script subtags). However because ICU 169 // violates this rule by using "TRADITIONAL" (11-letters) the length restriction is 170 // merely "longer than 5". 171 // 172 // Finaly, CLDR data only uses an '_' as the separator, whereas the specification allows 173 // for either '-' or '_'). 174 // 175 // The regex for unambiguously capturing the parts of a locale ID from the CLDR data is: 176 private static final Pattern LOCALE_ID = 177 Pattern.compile("([a-z]{2,3})" 178 + "(?:_([A-Z][a-z]{3}))?" 179 + "(?:_([A-Z]{2}|[0-9]{3}))?" 180 + "(?:_([a-zA-Z]{5,}|[0-9][a-zA-Z0-9]{3}))?"); 181 parse(String localeId)182 static LocaleId parse(String localeId) { 183 Matcher m = LOCALE_ID.matcher(checkNotNull(localeId, "locale ID cannot be null")); 184 checkArgument(m.matches(), "invalid locale ID: %s", localeId); 185 return of(m.group(1), m.group(2), m.group(3)).setVariant(m.group(4)); 186 } 187 of(String language, String script, String region)188 static LocaleId of(String language, String script, String region) { 189 return new LocaleId().setLanguage(language).setScript(script).setRegion(region); 190 } 191 192 // Only the language subtag is non-nullable. 193 private String languageSubtag; 194 private String scriptSubtag; 195 private String regionSubtag; 196 private String variantSubtag; 197 getLanguage()198 String getLanguage() { 199 return languageSubtag; 200 } 201 getScript()202 String getScript() { 203 return scriptSubtag; 204 } 205 getRegion()206 String getRegion() { 207 return regionSubtag; 208 } 209 getVariant()210 String getVariant() { 211 return variantSubtag; 212 } 213 setLanguage(String languageSubtag)214 LocaleId setLanguage(String languageSubtag) { 215 checkNotNull(languageSubtag, "language subtag must not be null"); 216 checkArgument(!languageSubtag.isEmpty(), "language subtag must not be empty"); 217 this.languageSubtag = languageSubtag; 218 return this; 219 } 220 setScript(String scriptSubtag)221 LocaleId setScript(String scriptSubtag) { 222 this.scriptSubtag = Strings.emptyToNull(scriptSubtag); 223 return this; 224 } 225 setRegion(String regionSubtag)226 LocaleId setRegion(String regionSubtag) { 227 this.regionSubtag = Strings.emptyToNull(regionSubtag); 228 return this; 229 } 230 setVariant(String variantSubtag)231 LocaleId setVariant(String variantSubtag) { 232 this.variantSubtag = Strings.emptyToNull(variantSubtag); 233 return this; 234 } 235 toString()236 @Override public String toString() { 237 StringBuilder id = new StringBuilder(languageSubtag); 238 if (scriptSubtag != null) { 239 id.append("_").append(scriptSubtag); 240 } 241 if (regionSubtag != null) { 242 id.append("_").append(regionSubtag); 243 } 244 if (variantSubtag != null) { 245 id.append("_").append(variantSubtag); 246 } 247 return id.toString(); 248 } 249 equals(Object o)250 @Override public boolean equals(Object o) { 251 if (!(o instanceof LocaleId)) { 252 return false; 253 } 254 LocaleId other = (LocaleId) o; 255 return Objects.equals(languageSubtag, other.languageSubtag) 256 && Objects.equals(scriptSubtag, other.scriptSubtag) 257 && Objects.equals(regionSubtag, other.regionSubtag) 258 && Objects.equals(variantSubtag, other.variantSubtag); 259 } 260 hashCode()261 @Override public int hashCode() { 262 return Objects.hash(languageSubtag, scriptSubtag, regionSubtag, variantSubtag); 263 } 264 } 265 266 private final ImmutableSet<String> availableIds; 267 private final ImmutableTable<Alias, String, String> aliasTable; 268 private final ImmutableMap<String, String> parentLocaleMap; 269 private final ImmutableMap<String, String> defaultCalendarMap; 270 private final ImmutableMap<String, String> likelySubtagMap; 271 SupplementalData( Set<String> availableIds, Table<Alias, String, String> aliasTable, Map<String, String> parentLocaleMap, Map<String, String> defaultCalendarMap, Map<String, String> likelySubtagMap)272 private SupplementalData( 273 Set<String> availableIds, 274 Table<Alias, String, String> aliasTable, 275 Map<String, String> parentLocaleMap, 276 Map<String, String> defaultCalendarMap, 277 Map<String, String> likelySubtagMap) { 278 279 this.availableIds = ImmutableSet.copyOf(availableIds); 280 this.aliasTable = ImmutableTable.copyOf(aliasTable); 281 this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap); 282 this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap); 283 this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap); 284 } 285 getAvailableLocaleIds()286 public ImmutableSet<String> getAvailableLocaleIds() { 287 return availableIds; 288 } 289 290 /** 291 * Returns the "maximized" form of a given locale ID, by adding likely subtags where possible. 292 */ maximize(String localeId)293 public Optional<String> maximize(String localeId) { 294 return addLikelySubtags(localeId).map(Object::toString); 295 } 296 297 /** 298 * Returns the locale ID with any deprecated elements replaced. This is an 299 * implementation of the algorithm specified in 300 * <a href="http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers">the LDML 301 * specification</a> but without any "minimizing" of the final result (as happens for 302 * canonicalization in the CLDR tools). 303 */ replaceDeprecatedTags(String localeId)304 public String replaceDeprecatedTags(String localeId) { 305 if (localeId.equals("root")) { 306 return localeId; 307 } 308 LocaleId id = LocaleId.parse(localeId); 309 310 // ---- LDML Specification ---- 311 // If the region subtag matches the type attribute of a territoryAlias element in 312 // Supplemental Data, replace the region subtag with the replacement value, as follows: 313 // 314 // * If there is a single territory in the replacement, use it. 315 // * If there are multiple territories: 316 // * Look up the most likely territory for the base language code (and script, if there 317 // is one). 318 // * If that likely territory is in the list, use it. 319 // * Otherwise, use the first territory in the list. 320 // ---- 321 // However there is a footnote that says: 322 // Formally, replacement of multiple territories uses Section 4.3 Likely Subtags. 323 // However, there are a small number of cases of multiple territories, so the mappings 324 // can be precomputed. This results in a faster lookup with a very small subset of the 325 // likely subtags data. 326 // 327 // Note that (contrary to the order implied by the LDML specification) this step is 328 // performed _before_ the language alias lookup. This is to allow ID such as "sr_YU" to 329 // work, where "YU" should be replaced with "RS" and _then_ "sr_RS" is expanded to 330 // "sr_Cryl_RS" by the language alias lookup. In the other order, you just get "sr_RS" out. 331 // 332 // TODO: Can we simplify this my just using "addLikelySubtags()" when region is missing? 333 if (id.getRegion() != null) { 334 String replacementRegions = aliasTable.get(Alias.TERRITORY, id.getRegion()); 335 if (replacementRegions != null) { 336 List<String> regions = LIST_SPLITTER.splitToList(replacementRegions); 337 checkArgument(!regions.isEmpty(), "invalid empty region list for %s", localeId); 338 if (regions.size() == 1) { 339 id.setRegion(regions.get(0)); 340 } else { 341 LocaleId key = LocaleId.of(id.getLanguage(), id.getScript(), null); 342 String likelyId = likelySubtagMap.get(key.toString()); 343 if (likelyId == null) { 344 likelyId = likelySubtagMap.get(key.setScript(null).toString()); 345 } 346 String likelyRegion = 347 likelyId != null ? LocaleId.parse(likelyId).getRegion() : null; 348 if (regions.contains(likelyRegion)) { 349 id.setRegion(likelyRegion); 350 } else { 351 id.setRegion(regions.get(0)); 352 } 353 } 354 } 355 } 356 357 // While it's not mentioned in the LDML specification, there is data in the alias table for 358 // replacement scripts (currently it contains exactly one entry with one value). Because 359 // its not clear if this is intended to only be single values or a list (and how to handle 360 // it if it were a list), there's a hard check to ensure it's only ever a single value. 361 if (id.getScript() != null) { 362 String replacementScript = aliasTable.get(Alias.SCRIPT, id.getScript()); 363 if (replacementScript != null) { 364 checkArgument(whitespace().matchesNoneOf(replacementScript), 365 "unexpected list of replacement scripts: %s", replacementScript); 366 id.setScript(replacementScript); 367 } 368 } 369 370 // ---- LDML Specification ---- 371 // If the language subtag matches the type attribute of a languageAlias element in 372 // Supplemental Data, replace the language subtag with the replacement value. 373 // 374 // If there are additional subtags in the replacement value, add them to the result, but 375 // only if there is no corresponding subtag already in the tag. 376 // ---- 377 // Contrary to the precise wording of the specification, we don't just check the language 378 // subtag, since language aliases can contain script and even region information. Instead 379 // we check the alias table using the same order as defined in subtag maximizing: 380 // 381 // <language>_<script>_<region> 382 // <language>_<region> 383 // <language>_<script> 384 // <language> 385 // 386 // There is no need to check for "und" however since that's not aliased anything, but since 387 // it shares the same code it's harmless to do. 388 resolveLocaleId(id, s -> aliasTable.get(Alias.LANGUAGE, s)) 389 .ifPresent(resolvedId -> { 390 id.setLanguage(checkNotNull(resolvedId.getLanguage(), 391 "missing language subtag in language alias: %s", resolvedId)); 392 if (id.getScript() == null) { 393 id.setScript(resolvedId.getScript()); 394 } 395 if (id.getRegion() == null) { 396 id.setRegion(resolvedId.getRegion()); 397 } 398 if (id.getVariant() == null) { 399 id.setVariant(resolvedId.getVariant()); 400 } 401 }); 402 return id.toString(); 403 } 404 405 /** 406 * Returns a suitable default calendar for a given locale if it's different from the default 407 * calendar inferred by the locale's parent. 408 * 409 * <p>Note that since the default calendar data is keyed from territory (region subtag) rather 410 * than the complete locale ID, it is impossible to encode some real life cases (e.g. the fact 411 * that "ja_JP_TRADITIONAL" has a different default calendar to "ja_JP"). This is currently 412 * handled with hard-code special casing, but should probably be data driven eventually. 413 */ getDefaultCalendar(String localeId)414 public Optional<String> getDefaultCalendar(String localeId) { 415 Optional<String> calendar = getSpecialCaseCalendar(localeId); 416 if (calendar.isPresent()) { 417 return calendar; 418 } 419 String t = territoryOf(localeId); 420 calendar = Optional.ofNullable(defaultCalendarMap.get(t)); 421 if (!calendar.isPresent()) { 422 return Optional.empty(); 423 } 424 String rootCalendar = defaultCalendarMap.get("001"); 425 checkState(!rootCalendar.isEmpty(), "missing root calendar"); 426 if (localeId.equals("root")) { 427 return Optional.of(rootCalendar); 428 } 429 // All locales reach "root" eventually, and that maps to territory "001" which 430 // we already know has a value, so this loop *must* exit. 431 String parentCalendar; 432 do { 433 localeId = getParent(localeId); 434 String territory = territoryOf(localeId); 435 parentCalendar = defaultCalendarMap.get(territory); 436 } while (parentCalendar == null); 437 return parentCalendar.equals(calendar.get()) ? Optional.empty() : calendar; 438 } 439 440 // Hack to work around the limitation that CLDR data cannot represent default calendars that 441 // change because of non-territory information. Since this is limited to exactly two cases at 442 // the moment, and is unlikely to be expanded, it's being done directly in code. getSpecialCaseCalendar(String localeId)443 private Optional<String> getSpecialCaseCalendar(String localeId) { 444 Optional<String> maximized = maximize(localeId); 445 if (maximized.isPresent()) { 446 switch (maximized.get()) { 447 case "ja_Jpan_JP_TRADITIONAL": 448 return Optional.of("japanese"); 449 case "th_Thai_TH_TRADITIONAL": 450 return Optional.of("buddhist"); 451 } 452 } 453 return Optional.empty(); 454 } 455 456 /** 457 * Returns the parent of a non-root locale ID. This is more complex than simple truncation for 458 * two reasons: 459 * <ul> 460 * <li>There may be an explicit parent locale ID specified in the CLDR data. 461 * <li>Removal of non-default script subtags makes the parent locale "root" (unless there 462 * was an explicit parent specified). 463 * </ul> 464 * Note that all valid locale ID parent "chains" must end up at "root" eventually. 465 * 466 * For example (showing parent "chains"): 467 * <ul> 468 * <li>{@code en_GB} --> {@code en_001} --> {@code en} --> {@code root} 469 * <li>{@code en_Cyrl_RU} --> {@code en_Cyrl} --> {@code root} 470 * </ul> 471 * 472 * @throws IllegalArgumentException if the given locale ID is invalid or "root". 473 */ getParent(String localeId)474 public String getParent(String localeId) { 475 checkState(!localeId.equals("root"), "cannot ask for parent of 'root' locale"); 476 // We probably want to fully canonicalize here. But in the absence of that we 477 // at least need to do the following canonicalization: 478 if (localeId.equals("no_NO_NY")) { 479 localeId = "nn_NO"; 480 } 481 // Always defer to an explicit parent locale set in the CLDR data. 482 Optional<String> explicitParent = getExplicitParentLocaleOf(localeId); 483 if (explicitParent.isPresent()) { 484 return explicitParent.get(); 485 } 486 // Now look for the start of the last ID "part" in order to truncate. 487 int lastPartSeperatorIndex = localeId.lastIndexOf('_'); 488 // The parent of a base language ID (e.g. "en" or "fr") is always "root". 489 if (lastPartSeperatorIndex == -1) { 490 return "root"; 491 } 492 String parentId = localeId.substring(0, lastPartSeperatorIndex); 493 494 // However, if the script of the locale is what's being truncated and it's NOT the default 495 // script for the language, return "root" as the parent rather than truncating. 496 String lastPart = localeId.substring(lastPartSeperatorIndex + 1); 497 if (SCRIPT_SUBTAG.matcher(lastPart).matches() && !lastPart.equals(scriptOf(parentId))) { 498 return "root"; 499 } 500 return !parentId.isEmpty() ? parentId : "root"; 501 } 502 503 /** 504 * Returns the explicit parent of a locale ID if specified in the CLDR data. 505 * 506 * Note that this method will not return a value for most locale IDs, since they do not have 507 * an explicit parent set. If you just want "normal" parent of a locale ID, use {@link 508 * #getParent(String)}. 509 */ getExplicitParentLocaleOf(String localeId)510 public Optional<String> getExplicitParentLocaleOf(String localeId) { 511 return Optional.ofNullable(parentLocaleMap.get(localeId)); 512 } 513 territoryOf(String localeId)514 private String territoryOf(String localeId) { 515 return localeId.equals("root") 516 ? "001" 517 : addLikelySubtags(localeId).map(LocaleId::getRegion).orElse("ZZ"); 518 } 519 scriptOf(String localeId)520 private String scriptOf(String localeId) { 521 return addLikelySubtags(localeId).map(LocaleId::getScript).orElse("Zzzz"); 522 } 523 524 // From: https://unicode.org/reports/tr35/#Likely_Subtags 525 // 526 // Add Likely Subtags 527 // ------------------ 528 // Given a source locale X, to return a locale Y where the empty subtags have been filled in 529 // by the most likely subtags. A subtag is called empty if it is a missing script or region 530 // subtag, or it is a base language subtag with the value "und". 531 // 532 // Canonicalize 533 // ------------ 534 // Make sure the input locale is in canonical form ... 535 // ... 536 // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur. 537 // 538 // Note that this implementation does not need to handle 539 // legacy language tags (marked as “Type: grandfathered” in BCP 47). addLikelySubtags(String localeId)540 private Optional<LocaleId> addLikelySubtags(String localeId) { 541 if (localeId.equals("root")) { 542 return Optional.empty(); 543 } 544 545 LocaleId id = LocaleId.parse(localeId); 546 // ---- LDML Specification ---- 547 // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur. 548 if ("Zzzz".equals(id.getScript())) { 549 id.setScript(null); 550 } 551 if ("ZZ".equals(id.getRegion())) { 552 id.setRegion(null); 553 } 554 // ---- LDML Specification ---- 555 // A subtag is called empty if it is a missing script or region subtag, or it is a base 556 // language subtag with the value "und" 557 if (!id.getLanguage().equals("und") && id.getScript() != null && id.getRegion() != null) { 558 // We are already canonical, so just return. 559 return Optional.of(id); 560 } 561 Optional<LocaleId> optTags = resolveLocaleId(id, likelySubtagMap::get); 562 if (!optTags.isPresent()) { 563 return Optional.empty(); 564 } 565 LocaleId subtags = optTags.get(); 566 checkArgument(!subtags.getLanguage().equals("und"), "invalid subtags: %s", subtags); 567 // Replace "missing" elements in the original ID with likely subtags. 568 if (id.getLanguage().equals("und")) { 569 id.setLanguage(subtags.getLanguage()); 570 } 571 if (id.getScript() == null) { 572 id.setScript(checkNotNull(subtags.getScript())); 573 } 574 if (id.getRegion() == null) { 575 id.setRegion(checkNotNull(subtags.getRegion())); 576 } 577 // Language is not "und" and both script and region subtags are set! 578 return Optional.of(id); 579 } 580 581 // From: https://unicode.org/reports/tr35/#Likely_Subtags 582 // 583 // Lookup 584 // ------ 585 // Lookup each of the following in order, and stop on the first match: 586 // <language>_<script>_<region> 587 // <language>_<region> 588 // <language>_<script> 589 // <language> 590 // "und"_<script> resolveLocaleId(LocaleId id, Function<String, String> fn)591 private Optional<LocaleId> resolveLocaleId(LocaleId id, Function<String, String> fn) { 592 String lang = id.getLanguage(); 593 String script = id.getScript(); 594 String region = id.getRegion(); 595 Stream<LocaleId> candidateIds = Stream.of( 596 LocaleId.of(lang, script, region), 597 LocaleId.of(lang, null, region), 598 LocaleId.of(lang, script, null), 599 LocaleId.of(lang, null, null)); 600 // Only add "und"_<script> if there's a script, otherwise you end up maximizing "und" on 601 // its own ("en_Latn_US") which is not intended. 602 if (script != null) { 603 candidateIds = Stream.concat(candidateIds, Stream.of(LocaleId.of("und", script, null))); 604 } 605 return candidateIds 606 // Remove duplicate IDs (keeps the first one encountered). 607 .distinct() 608 .map(Object::toString) 609 .map(fn) 610 .filter(Objects::nonNull) 611 .findFirst() 612 .map(LocaleId::parse); 613 } 614 } 615