1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu.regex;
4 
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableListMultimap.toImmutableListMultimap;
import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap;
import static java.util.function.Function.identity;

import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.regex.Pattern;

import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
import org.unicode.icu.tool.cldrtoicu.RbPath;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ImmutableSetMultimap;
32 
33 /**
34  * Path/value transformer configured by {@code ldml2icu_xxx.txt} mapping and configuration files.
35  * See {@code ldml2icu_readme.txt} for details on the configuration file format and
36  * {@link PathValueTransformer} for the public API description and usage.
37  *
38  * <p>This class is thread safe.
39  */
40 // TODO: Rewrite the readme to match current behaviour and describe edge cases properly.
41 public final class RegexTransformer extends PathValueTransformer {
42     /**
43      * Returns a new transformer based on transformation rules defined in the given configuration
44      * file contents, and using the specified functions for resolving ICU values.
45      */
fromConfigLines( List<String> lines, NamedFunction... functions)46     public static PathValueTransformer fromConfigLines(
47         List<String> lines, NamedFunction... functions) {
48         return new RegexTransformer(RuleParser.parseConfig(lines, Arrays.asList(functions)));
49     }
50 
51     // Map of path prefixes grouped by DTD type (for early efficient filtering of paths).
52     private final ImmutableSetMultimap<CldrDataType, String> prefixMap;
53     // Transformation rules loading from the configuration file, grouped by path prefix.
54     private final ImmutableListMultimap<String, Rule> rulesMap;
55     // Functions which can generate a fallback value from a given resource bundle path.
56     private final ImmutableList<BiFunction<RbPath, DynamicVars, Optional<Result>>> fallbackFunctions;
57     // Records the total set of rules, removing them as they are matched. Used for reporting any
58     // unused rules for debugging purposes.
59     private final Set<Rule> unusedRules = new LinkedHashSet<>();
60 
RegexTransformer(List<Rule> rules)61     private RegexTransformer(List<Rule> rules) {
62         this.prefixMap =
63             rules.stream().collect(toImmutableSetMultimap(Rule::getDataType, Rule::getPathPrefix));
64         this.rulesMap =
65             rules.stream().collect(toImmutableListMultimap(Rule::getPathPrefix, identity()));
66         this.fallbackFunctions =
67             rules.stream().flatMap(Rule::getFallbackFunctions).collect(toImmutableList());
68         // Add all rules first and remove as they are matched.
69         this.unusedRules.addAll(rules);
70     }
71 
72     @Override
transform(CldrValue value)73     public ImmutableList<Result> transform(CldrValue value) {
74         return transform(value, p -> null);
75     }
76 
77     @Override
transform(CldrValue value, DynamicVars varLookupFn)78     public ImmutableList<Result> transform(CldrValue value, DynamicVars varLookupFn) {
79         // This early rejection of non-matching paths, combined with "bucketing" the rules by path
80         // path prefix for easy lookup dramatically reduces the transformation time.
81         String pathPrefix = getPathPrefix(value);
82         if (!prefixMap.get(value.getDataType()).contains(pathPrefix)) {
83             return ImmutableList.of();
84         }
85         // Even though this is just derived from the value, resolve it here and pass it into each
86         // rule to avoid recalculating the same thing every time.
87         String fullXPath = getFullXPathWithoutSortIndices(value);
88         // Bucketing the rules by the path prefix means that each incoming value is only tested
89         // against likely matches. This reduces the number of tests per value by about 10x.
90         for (Rule rule : rulesMap.get(pathPrefix)) {
91             // We break after the first matching rule, since there is an implicit assumption
92             // that no paths will match more than one rule.
93             // TODO: Add a debug mode that checks that only one rule matches any given CLDR path.
94             ImmutableList<Result> results = rule.transform(value, fullXPath, varLookupFn);
95             if (!results.isEmpty()) {
96                 unusedRules.remove(rule);
97                 return results;
98             }
99         }
100         return ImmutableList.of();
101     }
102 
103     // All "leaf" paths must have at least two elements, so we can find the "prefix" which is
104     // the first element after the DTD root. This corresponds to the value extracted via
105     // PATH_SPEC_PREFIX in the parser.
getPathPrefix(CldrValue value)106     private static String getPathPrefix(CldrValue value) {
107         CldrPath prefix = value.getPath();
108         checkArgument(prefix.getLength() >= 2, "unexpectedly short path: %s", prefix);
109         while (prefix.getLength() > 2) {
110             prefix = prefix.getParent();
111         }
112         return prefix.getName();
113     }
114 
115     // A regex to capture any sort-indices in the full path string (which must be removed).
116     private static final Pattern SORT_INDEX = Pattern.compile("(/\\w+)#[0-9]+");
117 
118     // Note that the full path we get here contains the "sort index" suffix for ORDERED
119     // elements. This means that some element names are "foo#N" where N is the sort index.
120     // Since the regex transformer works around "ordered elements" in a completely different
121     // way and doesn't have them in the regular expressions, we can just remove them.
getFullXPathWithoutSortIndices(CldrValue v)122     private static String getFullXPathWithoutSortIndices(CldrValue v) {
123         String fullPath = v.getFullPath();
124         for (CldrPath p = v.getPath(); p != null; p = p.getParent()) {
125             if (p.getSortIndex() != -1) {
126                 // Only do expensive regex stuff if there's an "ordered" element with a sort index.
127                 return SORT_INDEX.matcher(fullPath).replaceAll("$1");
128             }
129         }
130         // No path parts have a sort index, so the original full path string is safe to return.
131         return fullPath;
132     }
133 
134     @Override
getFallbackResultsFor(RbPath rbPath, DynamicVars varLookupFn)135     public ImmutableList<Result> getFallbackResultsFor(RbPath rbPath, DynamicVars varLookupFn) {
136         return fallbackFunctions.stream()
137             .map(f -> f.apply(rbPath, varLookupFn))
138             .filter(Optional::isPresent)
139             .map(Optional::get)
140             .collect(toImmutableList());
141     }
142 
toString()143     @Override public String toString() {
144         StringWriter buf = new StringWriter();
145         PrintWriter out = new PrintWriter(buf);
146         out.println(getClass().getName() + "{");
147         out.println("  Rules: " + rulesMap.size());
148         if (!unusedRules.isEmpty()) {
149             out.println("  Unused Rules:");
150             unusedRules.forEach(
151                 r -> out.format("    [line=%3d] %s\n", r.getLineNumber(), r.getXpathSpec()));
152         }
153         out.println('}');
154         out.flush();
155         return buf.toString();
156     }
157 
158     // Package use helper for substituting single-character place-holders like '$N' or '%X'.
substitute(String s, char token, Function<Character, String> replaceFn)159     static String substitute(String s, char token, Function<Character, String> replaceFn) {
160         if (s.indexOf(token) == -1) {
161             return s;
162         }
163         StringBuilder out = new StringBuilder();
164         int i = 0;
165         for (int j = s.indexOf(token); j != -1; i = j + 2, j = s.indexOf(token, i)) {
166             char varChar = s.charAt(j + 1);
167             String replacement =
168                 checkNotNull(replaceFn.apply(varChar), "no such variable %s%s", token, varChar);
169             out.append(s, i, j).append(replacement);
170         }
171         return out.append(s.substring(i)).toString();
172     }
173 }
174