1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu.regex; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.base.Preconditions.checkNotNull; 7 import static com.google.common.collect.ImmutableList.toImmutableList; 8 import static com.google.common.collect.ImmutableListMultimap.toImmutableListMultimap; 9 import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap; 10 import static java.util.function.Function.identity; 11 12 import java.io.PrintWriter; 13 import java.io.StringWriter; 14 import java.util.Arrays; 15 import java.util.LinkedHashSet; 16 import java.util.List; 17 import java.util.Optional; 18 import java.util.Set; 19 import java.util.function.BiFunction; 20 import java.util.function.Function; 21 import java.util.regex.Pattern; 22 23 import org.unicode.cldr.api.CldrDataType; 24 import org.unicode.cldr.api.CldrPath; 25 import org.unicode.cldr.api.CldrValue; 26 import org.unicode.icu.tool.cldrtoicu.PathValueTransformer; 27 import org.unicode.icu.tool.cldrtoicu.RbPath; 28 29 import com.google.common.collect.ImmutableList; 30 import com.google.common.collect.ImmutableListMultimap; 31 import com.google.common.collect.ImmutableSetMultimap; 32 33 /** 34 * Path/value transformer configured by {@code ldml2icu_xxx.txt} mapping and configuration files. 35 * See {@code ldml2icu_readme.txt} for details on the configuration file format and 36 * {@link PathValueTransformer} for the public API description and usage. 37 * 38 * <p>This class is thread safe. 39 */ 40 // TODO: Rewrite the readme to match current behaviour and describe edge cases properly. 41 public final class RegexTransformer extends PathValueTransformer { 42 /** 43 * Returns a new transformer based on transformation rules defined in the given configuration 44 * file contents, and using the specified functions for resolving ICU values. 45 */ fromConfigLines( List<String> lines, NamedFunction... functions)46 public static PathValueTransformer fromConfigLines( 47 List<String> lines, NamedFunction... functions) { 48 return new RegexTransformer(RuleParser.parseConfig(lines, Arrays.asList(functions))); 49 } 50 51 // Map of path prefixes grouped by DTD type (for early efficient filtering of paths). 52 private final ImmutableSetMultimap<CldrDataType, String> prefixMap; 53 // Transformation rules loading from the configuration file, grouped by path prefix. 54 private final ImmutableListMultimap<String, Rule> rulesMap; 55 // Functions which can generate a fallback value from a given resource bundle path. 56 private final ImmutableList<BiFunction<RbPath, DynamicVars, Optional<Result>>> fallbackFunctions; 57 // Records the total set of rules, removing them as they are matched. Used for reporting any 58 // unused rules for debugging purposes. 59 private final Set<Rule> unusedRules = new LinkedHashSet<>(); 60 RegexTransformer(List<Rule> rules)61 private RegexTransformer(List<Rule> rules) { 62 this.prefixMap = 63 rules.stream().collect(toImmutableSetMultimap(Rule::getDataType, Rule::getPathPrefix)); 64 this.rulesMap = 65 rules.stream().collect(toImmutableListMultimap(Rule::getPathPrefix, identity())); 66 this.fallbackFunctions = 67 rules.stream().flatMap(Rule::getFallbackFunctions).collect(toImmutableList()); 68 // Add all rules first and remove as they are matched. 69 this.unusedRules.addAll(rules); 70 } 71 72 @Override transform(CldrValue value)73 public ImmutableList<Result> transform(CldrValue value) { 74 return transform(value, p -> null); 75 } 76 77 @Override transform(CldrValue value, DynamicVars varLookupFn)78 public ImmutableList<Result> transform(CldrValue value, DynamicVars varLookupFn) { 79 // This early rejection of non-matching paths, combined with "bucketing" the rules by path 80 // path prefix for easy lookup dramatically reduces the transformation time. 81 String pathPrefix = getPathPrefix(value); 82 if (!prefixMap.get(value.getDataType()).contains(pathPrefix)) { 83 return ImmutableList.of(); 84 } 85 // Even though this is just derived from the value, resolve it here and pass it into each 86 // rule to avoid recalculating the same thing every time. 87 String fullXPath = getFullXPathWithoutSortIndices(value); 88 // Bucketing the rules by the path prefix means that each incoming value is only tested 89 // against likely matches. This reduces the number of tests per value by about 10x. 90 for (Rule rule : rulesMap.get(pathPrefix)) { 91 // We break after the first matching rule, since there is an implicit assumption 92 // that no paths will match more than one rule. 93 // TODO: Add a debug mode that checks that only one rule matches any given CLDR path. 94 ImmutableList<Result> results = rule.transform(value, fullXPath, varLookupFn); 95 if (!results.isEmpty()) { 96 unusedRules.remove(rule); 97 return results; 98 } 99 } 100 return ImmutableList.of(); 101 } 102 103 // All "leaf" paths must have at least two elements, so we can find the "prefix" which is 104 // the first element after the DTD root. This corresponds to the value extracted via 105 // PATH_SPEC_PREFIX in the parser. getPathPrefix(CldrValue value)106 private static String getPathPrefix(CldrValue value) { 107 CldrPath prefix = value.getPath(); 108 checkArgument(prefix.getLength() >= 2, "unexpectedly short path: %s", prefix); 109 while (prefix.getLength() > 2) { 110 prefix = prefix.getParent(); 111 } 112 return prefix.getName(); 113 } 114 115 // A regex to capture any sort-indices in the full path string (which must be removed). 116 private static final Pattern SORT_INDEX = Pattern.compile("(/\\w+)#[0-9]+"); 117 118 // Note that the full path we get here contains the "sort index" suffix for ORDERED 119 // elements. This means that some element names are "foo#N" where N is the sort index. 120 // Since the regex transformer works around "ordered elements" in a completely different 121 // way and doesn't have them in the regular expressions, we can just remove them. getFullXPathWithoutSortIndices(CldrValue v)122 private static String getFullXPathWithoutSortIndices(CldrValue v) { 123 String fullPath = v.getFullPath(); 124 for (CldrPath p = v.getPath(); p != null; p = p.getParent()) { 125 if (p.getSortIndex() != -1) { 126 // Only do expensive regex stuff if there's an "ordered" element with a sort index. 127 return SORT_INDEX.matcher(fullPath).replaceAll("$1"); 128 } 129 } 130 // No path parts have a sort index, so the original full path string is safe to return. 131 return fullPath; 132 } 133 134 @Override getFallbackResultsFor(RbPath rbPath, DynamicVars varLookupFn)135 public ImmutableList<Result> getFallbackResultsFor(RbPath rbPath, DynamicVars varLookupFn) { 136 return fallbackFunctions.stream() 137 .map(f -> f.apply(rbPath, varLookupFn)) 138 .filter(Optional::isPresent) 139 .map(Optional::get) 140 .collect(toImmutableList()); 141 } 142 toString()143 @Override public String toString() { 144 StringWriter buf = new StringWriter(); 145 PrintWriter out = new PrintWriter(buf); 146 out.println(getClass().getName() + "{"); 147 out.println(" Rules: " + rulesMap.size()); 148 if (!unusedRules.isEmpty()) { 149 out.println(" Unused Rules:"); 150 unusedRules.forEach( 151 r -> out.format(" [line=%3d] %s\n", r.getLineNumber(), r.getXpathSpec())); 152 } 153 out.println('}'); 154 out.flush(); 155 return buf.toString(); 156 } 157 158 // Package use helper for substituting single-character place-holders like '$N' or '%X'. substitute(String s, char token, Function<Character, String> replaceFn)159 static String substitute(String s, char token, Function<Character, String> replaceFn) { 160 if (s.indexOf(token) == -1) { 161 return s; 162 } 163 StringBuilder out = new StringBuilder(); 164 int i = 0; 165 for (int j = s.indexOf(token); j != -1; i = j + 2, j = s.indexOf(token, i)) { 166 char varChar = s.charAt(j + 1); 167 String replacement = 168 checkNotNull(replaceFn.apply(varChar), "no such variable %s%s", token, varChar); 169 out.append(s, i, j).append(replacement); 170 } 171 return out.append(s.substring(i)).toString(); 172 } 173 } 174