1  // This module provides an NFA compiler using Thompson's construction
2  // algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
3  // graph as output. The NFA graph is structured in a way that permits it to be
4  // executed by a virtual machine and also used to efficiently build a DFA.
5  //
6  // The compiler deals with a slightly expanded set of NFA states that notably
7  // includes an empty node that has exactly one epsilon transition to the next
8  // state. In other words, it's a "goto" instruction if one views Thompson's NFA
9  // as a set of bytecode instructions. These goto instructions are removed in
10  // a subsequent phase before returning the NFA to the caller. The purpose of
11  // these empty nodes is that they make the construction algorithm substantially
12  // simpler to implement. We remove them before returning to the caller because
13  // they can represent substantial overhead when traversing the NFA graph
14  // (either while searching using the NFA directly or while building a DFA).
15  //
16  // In the future, it would be nice to provide a Glushkov compiler as well,
17  // as it would work well as a bit-parallel NFA for smaller regexes. But
18  // the Thompson construction is one I'm more familiar with and seems more
19  // straight-forward to deal with when it comes to large Unicode character
20  // classes.
21  //
22  // Internally, the compiler uses interior mutability to improve composition
23  // in the face of the borrow checker. In particular, we'd really like to be
24  // able to write things like this:
25  //
26  //     self.c_concat(exprs.iter().map(|e| self.c(e)))
27  //
28  // Which elegantly uses iterators to build up a sequence of compiled regex
29  // sub-expressions and then hands it off to the concatenating compiler
30  // routine. Without interior mutability, the borrow checker won't let us
31  // borrow `self` mutably both inside and outside the closure at the same
32  // time.
33  
34  use std::cell::RefCell;
35  use std::mem;
36  
37  use regex_syntax::hir::{self, Hir, HirKind};
38  use regex_syntax::utf8::{Utf8Range, Utf8Sequences};
39  
40  use classes::ByteClassSet;
41  use error::{Error, Result};
42  use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap};
43  use nfa::range_trie::RangeTrie;
44  use nfa::{State, StateID, Transition, NFA};
45  
46  /// Config knobs for the NFA compiler. See the builder's methods for more
47  /// docs on each one.
48  #[derive(Clone, Copy, Debug)]
49  struct Config {
50      anchored: bool,
51      allow_invalid_utf8: bool,
52      reverse: bool,
53      shrink: bool,
54  }
55  
56  impl Default for Config {
default() -> Config57      fn default() -> Config {
58          Config {
59              anchored: false,
60              allow_invalid_utf8: false,
61              reverse: false,
62              shrink: true,
63          }
64      }
65  }
66  
67  /// A builder for compiling an NFA.
68  #[derive(Clone, Debug)]
69  pub struct Builder {
70      config: Config,
71  }
72  
73  impl Builder {
74      /// Create a new NFA builder with its default configuration.
new() -> Builder75      pub fn new() -> Builder {
76          Builder { config: Config::default() }
77      }
78  
79      /// Compile the given high level intermediate representation of a regular
80      /// expression into an NFA.
81      ///
82      /// If there was a problem building the NFA, then an error is returned.
83      /// For example, if the regex uses unsupported features (such as zero-width
84      /// assertions), then an error is returned.
build(&self, expr: &Hir) -> Result<NFA>85      pub fn build(&self, expr: &Hir) -> Result<NFA> {
86          let mut nfa = NFA::always_match();
87          self.build_with(&mut Compiler::new(), &mut nfa, expr)?;
88          Ok(nfa)
89      }
90  
91      /// Compile the given high level intermediate representation of a regular
92      /// expression into the NFA given using the given compiler. Callers may
93      /// prefer this over `build` if they would like to reuse allocations while
94      /// compiling many regular expressions.
95      ///
96      /// On success, the given NFA is completely overwritten with the NFA
97      /// produced by the compiler.
98      ///
99      /// If there was a problem building the NFA, then an error is returned. For
100      /// example, if the regex uses unsupported features (such as zero-width
101      /// assertions), then an error is returned. When an error is returned,
102      /// the contents of `nfa` are unspecified and should not be relied upon.
103      /// However, it can still be reused in subsequent calls to this method.
build_with( &self, compiler: &mut Compiler, nfa: &mut NFA, expr: &Hir, ) -> Result<()>104      pub fn build_with(
105          &self,
106          compiler: &mut Compiler,
107          nfa: &mut NFA,
108          expr: &Hir,
109      ) -> Result<()> {
110          compiler.clear();
111          compiler.configure(self.config);
112          compiler.compile(nfa, expr)
113      }
114  
115      /// Set whether matching must be anchored at the beginning of the input.
116      ///
117      /// When enabled, a match must begin at the start of the input. When
118      /// disabled, the NFA will act as if the pattern started with a `.*?`,
119      /// which enables a match to appear anywhere.
120      ///
121      /// By default this is disabled.
anchored(&mut self, yes: bool) -> &mut Builder122      pub fn anchored(&mut self, yes: bool) -> &mut Builder {
123          self.config.anchored = yes;
124          self
125      }
126  
127      /// When enabled, the builder will permit the construction of an NFA that
128      /// may match invalid UTF-8.
129      ///
130      /// When disabled (the default), the builder is guaranteed to produce a
131      /// regex that will only ever match valid UTF-8 (otherwise, the builder
132      /// will return an error).
allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder133      pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
134          self.config.allow_invalid_utf8 = yes;
135          self
136      }
137  
138      /// Reverse the NFA.
139      ///
140      /// A NFA reversal is performed by reversing all of the concatenated
141      /// sub-expressions in the original pattern, recursively. The resulting
142      /// NFA can be used to match the pattern starting from the end of a string
143      /// instead of the beginning of a string.
144      ///
145      /// Reversing the NFA is useful for building a reverse DFA, which is most
146      /// useful for finding the start of a match.
reverse(&mut self, yes: bool) -> &mut Builder147      pub fn reverse(&mut self, yes: bool) -> &mut Builder {
148          self.config.reverse = yes;
149          self
150      }
151  
152      /// Apply best effort heuristics to shrink the NFA at the expense of more
153      /// time/memory.
154      ///
155      /// This is enabled by default. Generally speaking, if one is using an NFA
156      /// to compile DFA, then the extra time used to shrink the NFA will be
157      /// more than made up for during DFA construction (potentially by a lot).
158      /// In other words, enabling this can substantially decrease the overall
159      /// amount of time it takes to build a DFA.
160      ///
161      /// The only reason to disable this if you want to compile an NFA and start
162      /// using it as quickly as possible without needing to build a DFA.
shrink(&mut self, yes: bool) -> &mut Builder163      pub fn shrink(&mut self, yes: bool) -> &mut Builder {
164          self.config.shrink = yes;
165          self
166      }
167  }
168  
169  /// A compiler that converts a regex abstract syntax to an NFA via Thompson's
170  /// construction. Namely, this compiler permits epsilon transitions between
171  /// states.
172  ///
173  /// Users of this crate cannot use a compiler directly. Instead, all one can
174  /// do is create one and use it via the
175  /// [`Builder::build_with`](struct.Builder.html#method.build_with)
176  /// method. This permits callers to reuse compilers in order to amortize
177  /// allocations.
178  #[derive(Clone, Debug)]
179  pub struct Compiler {
180      /// The set of compiled NFA states. Once a state is compiled, it is
181      /// assigned a state ID equivalent to its index in this list. Subsequent
182      /// compilation can modify previous states by adding new transitions.
183      states: RefCell<Vec<CState>>,
184      /// The configuration from the builder.
185      config: Config,
186      /// State used for compiling character classes to UTF-8 byte automata.
187      /// State is not retained between character class compilations. This just
188      /// serves to amortize allocation to the extent possible.
189      utf8_state: RefCell<Utf8State>,
190      /// State used for arranging character classes in reverse into a trie.
191      trie_state: RefCell<RangeTrie>,
192      /// State used for caching common suffixes when compiling reverse UTF-8
193      /// automata (for Unicode character classes).
194      utf8_suffix: RefCell<Utf8SuffixMap>,
195      /// A map used to re-map state IDs when translating the compiler's internal
196      /// NFA state representation to the external NFA representation.
197      remap: RefCell<Vec<StateID>>,
198      /// A set of compiler internal state IDs that correspond to states that are
199      /// exclusively epsilon transitions, i.e., goto instructions, combined with
200      /// the state that they point to. This is used to record said states while
201      /// transforming the compiler's internal NFA representation to the external
202      /// form.
203      empties: RefCell<Vec<(StateID, StateID)>>,
204  }
205  
206  /// A compiler intermediate state representation for an NFA that is only used
207  /// during compilation. Once compilation is done, `CState`s are converted to
208  /// `State`s, which have a much simpler representation.
209  #[derive(Clone, Debug, Eq, PartialEq)]
210  enum CState {
211      /// An empty state whose only purpose is to forward the automaton to
212      /// another state via en epsilon transition. These are useful during
213      /// compilation but are otherwise removed at the end.
214      Empty { next: StateID },
215      /// A state that only transitions to `next` if the current input byte is
216      /// in the range `[start, end]` (inclusive on both ends).
217      Range { range: Transition },
218      /// A state with possibly many transitions, represented in a sparse
219      /// fashion. Transitions are ordered lexicographically by input range.
220      /// As such, this may only be used when every transition has equal
221      /// priority. (In practice, this is only used for encoding large UTF-8
222      /// automata.)
223      Sparse { ranges: Vec<Transition> },
224      /// An alternation such that there exists an epsilon transition to all
225      /// states in `alternates`, where matches found via earlier transitions
226      /// are preferred over later transitions.
227      Union { alternates: Vec<StateID> },
228      /// An alternation such that there exists an epsilon transition to all
229      /// states in `alternates`, where matches found via later transitions
230      /// are preferred over earlier transitions.
231      ///
232      /// This "reverse" state exists for convenience during compilation that
233      /// permits easy construction of non-greedy combinations of NFA states.
234      /// At the end of compilation, Union and UnionReverse states are merged
235      /// into one Union type of state, where the latter has its epsilon
236      /// transitions reversed to reflect the priority inversion.
237      UnionReverse { alternates: Vec<StateID> },
238      /// A match state. There is exactly one such occurrence of this state in
239      /// an NFA.
240      Match,
241  }
242  
243  /// A value that represents the result of compiling a sub-expression of a
244  /// regex's HIR. Specifically, this represents a sub-graph of the NFA that
245  /// has an initial state at `start` and a final state at `end`.
246  #[derive(Clone, Copy, Debug)]
247  pub struct ThompsonRef {
248      start: StateID,
249      end: StateID,
250  }
251  
252  impl Compiler {
253      /// Create a new compiler.
new() -> Compiler254      pub fn new() -> Compiler {
255          Compiler {
256              states: RefCell::new(vec![]),
257              config: Config::default(),
258              utf8_state: RefCell::new(Utf8State::new()),
259              trie_state: RefCell::new(RangeTrie::new()),
260              utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
261              remap: RefCell::new(vec![]),
262              empties: RefCell::new(vec![]),
263          }
264      }
265  
266      /// Clear any memory used by this compiler such that it is ready to compile
267      /// a new regex.
268      ///
269      /// It is preferrable to reuse a compiler if possible in order to reuse
270      /// allocations.
clear(&self)271      fn clear(&self) {
272          self.states.borrow_mut().clear();
273          // We don't need to clear anything else since they are cleared on
274          // their own and only when they are used.
275      }
276  
277      /// Configure this compiler from the builder's knobs.
278      ///
279      /// The compiler is always reconfigured by the builder before using it to
280      /// build an NFA.
configure(&mut self, config: Config)281      fn configure(&mut self, config: Config) {
282          self.config = config;
283      }
284  
285      /// Convert the current intermediate NFA to its final compiled form.
compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()>286      fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> {
287          nfa.anchored = self.config.anchored;
288  
289          let mut start = self.add_empty();
290          if !nfa.anchored {
291              let compiled = if self.config.allow_invalid_utf8 {
292                  self.c_unanchored_prefix_invalid_utf8()?
293              } else {
294                  self.c_unanchored_prefix_valid_utf8()?
295              };
296              self.patch(start, compiled.start);
297              start = compiled.end;
298          }
299          let compiled = self.c(&expr)?;
300          let match_id = self.add_match();
301          self.patch(start, compiled.start);
302          self.patch(compiled.end, match_id);
303          self.finish(nfa);
304          Ok(())
305      }
306  
307      /// Finishes the compilation process and populates the provide NFA with
308      /// the final graph.
finish(&self, nfa: &mut NFA)309      fn finish(&self, nfa: &mut NFA) {
310          let mut bstates = self.states.borrow_mut();
311          let mut remap = self.remap.borrow_mut();
312          remap.resize(bstates.len(), 0);
313          let mut empties = self.empties.borrow_mut();
314          empties.clear();
315  
316          // We don't reuse allocations here becuase this is what we're
317          // returning.
318          nfa.states.clear();
319          let mut byteset = ByteClassSet::new();
320  
321          // The idea here is to convert our intermediate states to their final
322          // form. The only real complexity here is the process of converting
323          // transitions, which are expressed in terms of state IDs. The new
324          // set of states will be smaller because of partial epsilon removal,
325          // so the state IDs will not be the same.
326          for (id, bstate) in bstates.iter_mut().enumerate() {
327              match *bstate {
328                  CState::Empty { next } => {
329                      // Since we're removing empty states, we need to handle
330                      // them later since we don't yet know which new state this
331                      // empty state will be mapped to.
332                      empties.push((id, next));
333                  }
334                  CState::Range { ref range } => {
335                      remap[id] = nfa.states.len();
336                      byteset.set_range(range.start, range.end);
337                      nfa.states.push(State::Range { range: range.clone() });
338                  }
339                  CState::Sparse { ref mut ranges } => {
340                      remap[id] = nfa.states.len();
341  
342                      let ranges = mem::replace(ranges, vec![]);
343                      for r in &ranges {
344                          byteset.set_range(r.start, r.end);
345                      }
346                      nfa.states.push(State::Sparse {
347                          ranges: ranges.into_boxed_slice(),
348                      });
349                  }
350                  CState::Union { ref mut alternates } => {
351                      remap[id] = nfa.states.len();
352  
353                      let alternates = mem::replace(alternates, vec![]);
354                      nfa.states.push(State::Union {
355                          alternates: alternates.into_boxed_slice(),
356                      });
357                  }
358                  CState::UnionReverse { ref mut alternates } => {
359                      remap[id] = nfa.states.len();
360  
361                      let mut alternates = mem::replace(alternates, vec![]);
362                      alternates.reverse();
363                      nfa.states.push(State::Union {
364                          alternates: alternates.into_boxed_slice(),
365                      });
366                  }
367                  CState::Match => {
368                      remap[id] = nfa.states.len();
369                      nfa.states.push(State::Match);
370                  }
371              }
372          }
373          for &(empty_id, mut empty_next) in empties.iter() {
374              // empty states can point to other empty states, forming a chain.
375              // So we must follow the chain until the end, which must end at
376              // a non-empty state, and therefore, a state that is correctly
377              // remapped. We are guaranteed to terminate because our compiler
378              // never builds a loop among empty states.
379              while let CState::Empty { next } = bstates[empty_next] {
380                  empty_next = next;
381              }
382              remap[empty_id] = remap[empty_next];
383          }
384          for state in &mut nfa.states {
385              state.remap(&remap);
386          }
387          // The compiler always begins the NFA at the first state.
388          nfa.start = remap[0];
389          nfa.byte_classes = byteset.byte_classes();
390      }
391  
c(&self, expr: &Hir) -> Result<ThompsonRef>392      fn c(&self, expr: &Hir) -> Result<ThompsonRef> {
393          match *expr.kind() {
394              HirKind::Empty => {
395                  let id = self.add_empty();
396                  Ok(ThompsonRef { start: id, end: id })
397              }
398              HirKind::Literal(hir::Literal::Unicode(ch)) => {
399                  let mut buf = [0; 4];
400                  let it = ch
401                      .encode_utf8(&mut buf)
402                      .as_bytes()
403                      .iter()
404                      .map(|&b| Ok(self.c_range(b, b)));
405                  self.c_concat(it)
406              }
407              HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)),
408              HirKind::Class(hir::Class::Bytes(ref cls)) => {
409                  self.c_byte_class(cls)
410              }
411              HirKind::Class(hir::Class::Unicode(ref cls)) => {
412                  self.c_unicode_class(cls)
413              }
414              HirKind::Repetition(ref rep) => self.c_repetition(rep),
415              HirKind::Group(ref group) => self.c(&*group.hir),
416              HirKind::Concat(ref exprs) => {
417                  self.c_concat(exprs.iter().map(|e| self.c(e)))
418              }
419              HirKind::Alternation(ref exprs) => {
420                  self.c_alternation(exprs.iter().map(|e| self.c(e)))
421              }
422              HirKind::Anchor(_) => Err(Error::unsupported_anchor()),
423              HirKind::WordBoundary(_) => Err(Error::unsupported_word()),
424          }
425      }
426  
c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> where I: DoubleEndedIterator<Item = Result<ThompsonRef>>,427      fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef>
428      where
429          I: DoubleEndedIterator<Item = Result<ThompsonRef>>,
430      {
431          let first =
432              if self.config.reverse { it.next_back() } else { it.next() };
433          let ThompsonRef { start, mut end } = match first {
434              Some(result) => result?,
435              None => return Ok(self.c_empty()),
436          };
437          loop {
438              let next =
439                  if self.config.reverse { it.next_back() } else { it.next() };
440              let compiled = match next {
441                  Some(result) => result?,
442                  None => break,
443              };
444              self.patch(end, compiled.start);
445              end = compiled.end;
446          }
447          Ok(ThompsonRef { start, end })
448      }
449  
c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> where I: Iterator<Item = Result<ThompsonRef>>,450      fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef>
451      where
452          I: Iterator<Item = Result<ThompsonRef>>,
453      {
454          let first = it.next().expect("alternations must be non-empty")?;
455          let second = match it.next() {
456              None => return Ok(first),
457              Some(result) => result?,
458          };
459  
460          let union = self.add_union();
461          let end = self.add_empty();
462          self.patch(union, first.start);
463          self.patch(first.end, end);
464          self.patch(union, second.start);
465          self.patch(second.end, end);
466          for result in it {
467              let compiled = result?;
468              self.patch(union, compiled.start);
469              self.patch(compiled.end, end);
470          }
471          Ok(ThompsonRef { start: union, end })
472      }
473  
c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef>474      fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> {
475          match rep.kind {
476              hir::RepetitionKind::ZeroOrOne => {
477                  self.c_zero_or_one(&rep.hir, rep.greedy)
478              }
479              hir::RepetitionKind::ZeroOrMore => {
480                  self.c_at_least(&rep.hir, rep.greedy, 0)
481              }
482              hir::RepetitionKind::OneOrMore => {
483                  self.c_at_least(&rep.hir, rep.greedy, 1)
484              }
485              hir::RepetitionKind::Range(ref rng) => match *rng {
486                  hir::RepetitionRange::Exactly(count) => {
487                      self.c_exactly(&rep.hir, count)
488                  }
489                  hir::RepetitionRange::AtLeast(m) => {
490                      self.c_at_least(&rep.hir, rep.greedy, m)
491                  }
492                  hir::RepetitionRange::Bounded(min, max) => {
493                      self.c_bounded(&rep.hir, rep.greedy, min, max)
494                  }
495              },
496          }
497      }
498  
c_bounded( &self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result<ThompsonRef>499      fn c_bounded(
500          &self,
501          expr: &Hir,
502          greedy: bool,
503          min: u32,
504          max: u32,
505      ) -> Result<ThompsonRef> {
506          let prefix = self.c_exactly(expr, min)?;
507          if min == max {
508              return Ok(prefix);
509          }
510  
511          // It is tempting here to compile the rest here as a concatenation
512          // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
513          // were `aaa?a?a?`. The problem here is that it leads to this program:
514          //
515          //     >000000: 61 => 01
516          //     000001: 61 => 02
517          //     000002: alt(03, 04)
518          //     000003: 61 => 04
519          //     000004: alt(05, 06)
520          //     000005: 61 => 06
521          //     000006: alt(07, 08)
522          //     000007: 61 => 08
523          //     000008: MATCH
524          //
525          // And effectively, once you hit state 2, the epsilon closure will
526          // include states 3, 5, 5, 6, 7 and 8, which is quite a bit. It is
527          // better to instead compile it like so:
528          //
529          //     >000000: 61 => 01
530          //      000001: 61 => 02
531          //      000002: alt(03, 08)
532          //      000003: 61 => 04
533          //      000004: alt(05, 08)
534          //      000005: 61 => 06
535          //      000006: alt(07, 08)
536          //      000007: 61 => 08
537          //      000008: MATCH
538          //
539          // So that the epsilon closure of state 2 is now just 3 and 8.
540          let empty = self.add_empty();
541          let mut prev_end = prefix.end;
542          for _ in min..max {
543              let union = if greedy {
544                  self.add_union()
545              } else {
546                  self.add_reverse_union()
547              };
548              let compiled = self.c(expr)?;
549              self.patch(prev_end, union);
550              self.patch(union, compiled.start);
551              self.patch(union, empty);
552              prev_end = compiled.end;
553          }
554          self.patch(prev_end, empty);
555          Ok(ThompsonRef { start: prefix.start, end: empty })
556      }
557  
c_at_least( &self, expr: &Hir, greedy: bool, n: u32, ) -> Result<ThompsonRef>558      fn c_at_least(
559          &self,
560          expr: &Hir,
561          greedy: bool,
562          n: u32,
563      ) -> Result<ThompsonRef> {
564          if n == 0 {
565              let union = if greedy {
566                  self.add_union()
567              } else {
568                  self.add_reverse_union()
569              };
570              let compiled = self.c(expr)?;
571              self.patch(union, compiled.start);
572              self.patch(compiled.end, union);
573              Ok(ThompsonRef { start: union, end: union })
574          } else if n == 1 {
575              let compiled = self.c(expr)?;
576              let union = if greedy {
577                  self.add_union()
578              } else {
579                  self.add_reverse_union()
580              };
581              self.patch(compiled.end, union);
582              self.patch(union, compiled.start);
583              Ok(ThompsonRef { start: compiled.start, end: union })
584          } else {
585              let prefix = self.c_exactly(expr, n - 1)?;
586              let last = self.c(expr)?;
587              let union = if greedy {
588                  self.add_union()
589              } else {
590                  self.add_reverse_union()
591              };
592              self.patch(prefix.end, last.start);
593              self.patch(last.end, union);
594              self.patch(union, last.start);
595              Ok(ThompsonRef { start: prefix.start, end: union })
596          }
597      }
598  
c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef>599      fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> {
600          let union =
601              if greedy { self.add_union() } else { self.add_reverse_union() };
602          let compiled = self.c(expr)?;
603          let empty = self.add_empty();
604          self.patch(union, compiled.start);
605          self.patch(union, empty);
606          self.patch(compiled.end, empty);
607          Ok(ThompsonRef { start: union, end: empty })
608      }
609  
c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef>610      fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> {
611          let it = (0..n).map(|_| self.c(expr));
612          self.c_concat(it)
613      }
614  
c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef>615      fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> {
616          let end = self.add_empty();
617          let mut trans = Vec::with_capacity(cls.ranges().len());
618          for r in cls.iter() {
619              trans.push(Transition {
620                  start: r.start(),
621                  end: r.end(),
622                  next: end,
623              });
624          }
625          Ok(ThompsonRef { start: self.add_sparse(trans), end })
626      }
627  
c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef>628      fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> {
629          // If all we have are ASCII ranges wrapped in a Unicode package, then
630          // there is zero reason to bring out the big guns. We can fit all ASCII
631          // ranges within a single sparse transition.
632          if cls.is_all_ascii() {
633              let end = self.add_empty();
634              let mut trans = Vec::with_capacity(cls.ranges().len());
635              for r in cls.iter() {
636                  assert!(r.start() <= '\x7F');
637                  assert!(r.end() <= '\x7F');
638                  trans.push(Transition {
639                      start: r.start() as u8,
640                      end: r.end() as u8,
641                      next: end,
642                  });
643              }
644              Ok(ThompsonRef { start: self.add_sparse(trans), end })
645          } else if self.config.reverse {
646              if !self.config.shrink {
647                  // When we don't want to spend the extra time shrinking, we
648                  // compile the UTF-8 automaton in reverse using something like
649                  // the "naive" approach, but will attempt to re-use common
650                  // suffixes.
651                  self.c_unicode_class_reverse_with_suffix(cls)
652              } else {
653                  // When we want to shrink our NFA for reverse UTF-8 automata,
654                  // we cannot feed UTF-8 sequences directly to the UTF-8
655                  // compiler, since the UTF-8 compiler requires all sequences
656                  // to be lexicographically sorted. Instead, we organize our
657                  // sequences into a range trie, which can then output our
658                  // sequences in the correct order. Unfortunately, building the
659                  // range trie is fairly expensive (but not nearly as expensive
660                  // as building a DFA). Hence the reason why the 'shrink' option
661                  // exists, so that this path can be toggled off.
662                  let mut trie = self.trie_state.borrow_mut();
663                  trie.clear();
664  
665                  for rng in cls.iter() {
666                      for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
667                          seq.reverse();
668                          trie.insert(seq.as_slice());
669                      }
670                  }
671                  let mut utf8_state = self.utf8_state.borrow_mut();
672                  let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
673                  trie.iter(|seq| {
674                      utf8c.add(&seq);
675                  });
676                  Ok(utf8c.finish())
677              }
678          } else {
679              // In the forward direction, we always shrink our UTF-8 automata
680              // because we can stream it right into the UTF-8 compiler. There
681              // is almost no downside (in either memory or time) to using this
682              // approach.
683              let mut utf8_state = self.utf8_state.borrow_mut();
684              let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
685              for rng in cls.iter() {
686                  for seq in Utf8Sequences::new(rng.start(), rng.end()) {
687                      utf8c.add(seq.as_slice());
688                  }
689              }
690              Ok(utf8c.finish())
691          }
692  
693          // For reference, the code below is the "naive" version of compiling a
694          // UTF-8 automaton. It is deliciously simple (and works for both the
695          // forward and reverse cases), but will unfortunately produce very
696          // large NFAs. When compiling a forward automaton, the size difference
697          // can sometimes be an order of magnitude. For example, the '\w' regex
698          // will generate about ~3000 NFA states using the naive approach below,
699          // but only 283 states when using the approach above. This is because
700          // the approach above actually compiles a *minimal* (or near minimal,
701          // because of the bounded hashmap) UTF-8 automaton.
702          //
703          // The code below is kept as a reference point in order to make it
704          // easier to understand the higher level goal here.
705          /*
706          let it = cls
707              .iter()
708              .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
709              .map(|seq| {
710                  let it = seq
711                      .as_slice()
712                      .iter()
713                      .map(|rng| Ok(self.c_range(rng.start, rng.end)));
714                  self.c_concat(it)
715              });
716          self.c_alternation(it);
717          */
718      }
719  
c_unicode_class_reverse_with_suffix( &self, cls: &hir::ClassUnicode, ) -> Result<ThompsonRef>720      fn c_unicode_class_reverse_with_suffix(
721          &self,
722          cls: &hir::ClassUnicode,
723      ) -> Result<ThompsonRef> {
724          // N.B. It would likely be better to cache common *prefixes* in the
725          // reverse direction, but it's not quite clear how to do that. The
726          // advantage of caching suffixes is that it does give us a win, and
727          // has a very small additional overhead.
728          let mut cache = self.utf8_suffix.borrow_mut();
729          cache.clear();
730  
731          let union = self.add_union();
732          let alt_end = self.add_empty();
733          for urng in cls.iter() {
734              for seq in Utf8Sequences::new(urng.start(), urng.end()) {
735                  let mut end = alt_end;
736                  for brng in seq.as_slice() {
737                      let key = Utf8SuffixKey {
738                          from: end,
739                          start: brng.start,
740                          end: brng.end,
741                      };
742                      let hash = cache.hash(&key);
743                      if let Some(id) = cache.get(&key, hash) {
744                          end = id;
745                          continue;
746                      }
747  
748                      let compiled = self.c_range(brng.start, brng.end);
749                      self.patch(compiled.end, end);
750                      end = compiled.start;
751                      cache.set(key, hash, end);
752                  }
753                  self.patch(union, end);
754              }
755          }
756          Ok(ThompsonRef { start: union, end: alt_end })
757      }
758  
c_range(&self, start: u8, end: u8) -> ThompsonRef759      fn c_range(&self, start: u8, end: u8) -> ThompsonRef {
760          let id = self.add_range(start, end);
761          ThompsonRef { start: id, end: id }
762      }
763  
c_empty(&self) -> ThompsonRef764      fn c_empty(&self) -> ThompsonRef {
765          let id = self.add_empty();
766          ThompsonRef { start: id, end: id }
767      }
768  
c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef>769      fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> {
770          self.c(&Hir::repetition(hir::Repetition {
771              kind: hir::RepetitionKind::ZeroOrMore,
772              greedy: false,
773              hir: Box::new(Hir::any(false)),
774          }))
775      }
776  
c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef>777      fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> {
778          self.c(&Hir::repetition(hir::Repetition {
779              kind: hir::RepetitionKind::ZeroOrMore,
780              greedy: false,
781              hir: Box::new(Hir::any(true)),
782          }))
783      }
784  
patch(&self, from: StateID, to: StateID)785      fn patch(&self, from: StateID, to: StateID) {
786          match self.states.borrow_mut()[from] {
787              CState::Empty { ref mut next } => {
788                  *next = to;
789              }
790              CState::Range { ref mut range } => {
791                  range.next = to;
792              }
793              CState::Sparse { .. } => {
794                  panic!("cannot patch from a sparse NFA state")
795              }
796              CState::Union { ref mut alternates } => {
797                  alternates.push(to);
798              }
799              CState::UnionReverse { ref mut alternates } => {
800                  alternates.push(to);
801              }
802              CState::Match => {}
803          }
804      }
805  
add_empty(&self) -> StateID806      fn add_empty(&self) -> StateID {
807          let id = self.states.borrow().len();
808          self.states.borrow_mut().push(CState::Empty { next: 0 });
809          id
810      }
811  
add_range(&self, start: u8, end: u8) -> StateID812      fn add_range(&self, start: u8, end: u8) -> StateID {
813          let id = self.states.borrow().len();
814          let trans = Transition { start, end, next: 0 };
815          let state = CState::Range { range: trans };
816          self.states.borrow_mut().push(state);
817          id
818      }
819  
add_sparse(&self, ranges: Vec<Transition>) -> StateID820      fn add_sparse(&self, ranges: Vec<Transition>) -> StateID {
821          if ranges.len() == 1 {
822              let id = self.states.borrow().len();
823              let state = CState::Range { range: ranges[0] };
824              self.states.borrow_mut().push(state);
825              return id;
826          }
827          let id = self.states.borrow().len();
828          let state = CState::Sparse { ranges };
829          self.states.borrow_mut().push(state);
830          id
831      }
832  
add_union(&self) -> StateID833      fn add_union(&self) -> StateID {
834          let id = self.states.borrow().len();
835          let state = CState::Union { alternates: vec![] };
836          self.states.borrow_mut().push(state);
837          id
838      }
839  
add_reverse_union(&self) -> StateID840      fn add_reverse_union(&self) -> StateID {
841          let id = self.states.borrow().len();
842          let state = CState::UnionReverse { alternates: vec![] };
843          self.states.borrow_mut().push(state);
844          id
845      }
846  
add_match(&self) -> StateID847      fn add_match(&self) -> StateID {
848          let id = self.states.borrow().len();
849          self.states.borrow_mut().push(CState::Match);
850          id
851      }
852  }
853  
854  #[derive(Debug)]
855  struct Utf8Compiler<'a> {
856      nfac: &'a Compiler,
857      state: &'a mut Utf8State,
858      target: StateID,
859  }
860  
861  #[derive(Clone, Debug)]
862  struct Utf8State {
863      compiled: Utf8BoundedMap,
864      uncompiled: Vec<Utf8Node>,
865  }
866  
867  #[derive(Clone, Debug)]
868  struct Utf8Node {
869      trans: Vec<Transition>,
870      last: Option<Utf8LastTransition>,
871  }
872  
873  #[derive(Clone, Debug)]
874  struct Utf8LastTransition {
875      start: u8,
876      end: u8,
877  }
878  
879  impl Utf8State {
new() -> Utf8State880      fn new() -> Utf8State {
881          Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] }
882      }
883  
clear(&mut self)884      fn clear(&mut self) {
885          self.compiled.clear();
886          self.uncompiled.clear();
887      }
888  }
889  
890  impl<'a> Utf8Compiler<'a> {
new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a>891      fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> {
892          let target = nfac.add_empty();
893          state.clear();
894          let mut utf8c = Utf8Compiler { nfac, state, target };
895          utf8c.add_empty();
896          utf8c
897      }
898  
finish(&mut self) -> ThompsonRef899      fn finish(&mut self) -> ThompsonRef {
900          self.compile_from(0);
901          let node = self.pop_root();
902          let start = self.compile(node);
903          ThompsonRef { start, end: self.target }
904      }
905  
add(&mut self, ranges: &[Utf8Range])906      fn add(&mut self, ranges: &[Utf8Range]) {
907          let prefix_len = ranges
908              .iter()
909              .zip(&self.state.uncompiled)
910              .take_while(|&(range, node)| {
911                  node.last.as_ref().map_or(false, |t| {
912                      (t.start, t.end) == (range.start, range.end)
913                  })
914              })
915              .count();
916          assert!(prefix_len < ranges.len());
917          self.compile_from(prefix_len);
918          self.add_suffix(&ranges[prefix_len..]);
919      }
920  
compile_from(&mut self, from: usize)921      fn compile_from(&mut self, from: usize) {
922          let mut next = self.target;
923          while from + 1 < self.state.uncompiled.len() {
924              let node = self.pop_freeze(next);
925              next = self.compile(node);
926          }
927          self.top_last_freeze(next);
928      }
929  
compile(&mut self, node: Vec<Transition>) -> StateID930      fn compile(&mut self, node: Vec<Transition>) -> StateID {
931          let hash = self.state.compiled.hash(&node);
932          if let Some(id) = self.state.compiled.get(&node, hash) {
933              return id;
934          }
935          let id = self.nfac.add_sparse(node.clone());
936          self.state.compiled.set(node, hash, id);
937          id
938      }
939  
add_suffix(&mut self, ranges: &[Utf8Range])940      fn add_suffix(&mut self, ranges: &[Utf8Range]) {
941          assert!(!ranges.is_empty());
942          let last = self
943              .state
944              .uncompiled
945              .len()
946              .checked_sub(1)
947              .expect("non-empty nodes");
948          assert!(self.state.uncompiled[last].last.is_none());
949          self.state.uncompiled[last].last = Some(Utf8LastTransition {
950              start: ranges[0].start,
951              end: ranges[0].end,
952          });
953          for r in &ranges[1..] {
954              self.state.uncompiled.push(Utf8Node {
955                  trans: vec![],
956                  last: Some(Utf8LastTransition { start: r.start, end: r.end }),
957              });
958          }
959      }
960  
add_empty(&mut self)961      fn add_empty(&mut self) {
962          self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
963      }
964  
pop_freeze(&mut self, next: StateID) -> Vec<Transition>965      fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
966          let mut uncompiled = self.state.uncompiled.pop().unwrap();
967          uncompiled.set_last_transition(next);
968          uncompiled.trans
969      }
970  
pop_root(&mut self) -> Vec<Transition>971      fn pop_root(&mut self) -> Vec<Transition> {
972          assert_eq!(self.state.uncompiled.len(), 1);
973          assert!(self.state.uncompiled[0].last.is_none());
974          self.state.uncompiled.pop().expect("non-empty nodes").trans
975      }
976  
top_last_freeze(&mut self, next: StateID)977      fn top_last_freeze(&mut self, next: StateID) {
978          let last = self
979              .state
980              .uncompiled
981              .len()
982              .checked_sub(1)
983              .expect("non-empty nodes");
984          self.state.uncompiled[last].set_last_transition(next);
985      }
986  }
987  
988  impl Utf8Node {
set_last_transition(&mut self, next: StateID)989      fn set_last_transition(&mut self, next: StateID) {
990          if let Some(last) = self.last.take() {
991              self.trans.push(Transition {
992                  start: last.start,
993                  end: last.end,
994                  next,
995              });
996          }
997      }
998  }
999  
1000  #[cfg(test)]
1001  mod tests {
1002      use regex_syntax::hir::Hir;
1003      use regex_syntax::ParserBuilder;
1004  
1005      use super::{Builder, State, StateID, Transition, NFA};
1006  
parse(pattern: &str) -> Hir1007      fn parse(pattern: &str) -> Hir {
1008          ParserBuilder::new().build().parse(pattern).unwrap()
1009      }
1010  
build(pattern: &str) -> NFA1011      fn build(pattern: &str) -> NFA {
1012          Builder::new().anchored(true).build(&parse(pattern)).unwrap()
1013      }
1014  
s_byte(byte: u8, next: StateID) -> State1015      fn s_byte(byte: u8, next: StateID) -> State {
1016          let trans = Transition { start: byte, end: byte, next };
1017          State::Range { range: trans }
1018      }
1019  
s_range(start: u8, end: u8, next: StateID) -> State1020      fn s_range(start: u8, end: u8, next: StateID) -> State {
1021          let trans = Transition { start, end, next };
1022          State::Range { range: trans }
1023      }
1024  
s_sparse(ranges: &[(u8, u8, StateID)]) -> State1025      fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State {
1026          let ranges = ranges
1027              .iter()
1028              .map(|&(start, end, next)| Transition { start, end, next })
1029              .collect();
1030          State::Sparse { ranges }
1031      }
1032  
s_union(alts: &[StateID]) -> State1033      fn s_union(alts: &[StateID]) -> State {
1034          State::Union { alternates: alts.to_vec().into_boxed_slice() }
1035      }
1036  
s_match() -> State1037      fn s_match() -> State {
1038          State::Match
1039      }
1040  
1041      #[test]
errors()1042      fn errors() {
1043          // unsupported anchors
1044          assert!(Builder::new().build(&parse(r"^")).is_err());
1045          assert!(Builder::new().build(&parse(r"$")).is_err());
1046          assert!(Builder::new().build(&parse(r"\A")).is_err());
1047          assert!(Builder::new().build(&parse(r"\z")).is_err());
1048  
1049          // unsupported word boundaries
1050          assert!(Builder::new().build(&parse(r"\b")).is_err());
1051          assert!(Builder::new().build(&parse(r"\B")).is_err());
1052          assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err());
1053      }
1054  
1055      // Test that building an unanchored NFA has an appropriate `.*?` prefix.
1056      #[test]
compile_unanchored_prefix()1057      fn compile_unanchored_prefix() {
1058          // When the machine can only match valid UTF-8.
1059          let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap();
1060          // There should be many states since the `.` in `.*?` matches any
1061          // Unicode scalar value.
1062          assert_eq!(11, nfa.len());
1063          assert_eq!(nfa.states[10], s_match());
1064          assert_eq!(nfa.states[9], s_byte(b'a', 10));
1065  
1066          // When the machine can match invalid UTF-8.
1067          let nfa = Builder::new()
1068              .anchored(false)
1069              .allow_invalid_utf8(true)
1070              .build(&parse(r"a"))
1071              .unwrap();
1072          assert_eq!(
1073              nfa.states,
1074              &[
1075                  s_union(&[2, 1]),
1076                  s_range(0, 255, 0),
1077                  s_byte(b'a', 3),
1078                  s_match(),
1079              ]
1080          );
1081      }
1082  
1083      #[test]
compile_empty()1084      fn compile_empty() {
1085          assert_eq!(build("").states, &[s_match(),]);
1086      }
1087  
1088      #[test]
compile_literal()1089      fn compile_literal() {
1090          assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]);
1091          assert_eq!(
1092              build("ab").states,
1093              &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
1094          );
1095          assert_eq!(
1096              build("☃").states,
1097              &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),]
1098          );
1099  
1100          // Check that non-UTF-8 literals work.
1101          let hir = ParserBuilder::new()
1102              .allow_invalid_utf8(true)
1103              .build()
1104              .parse(r"(?-u)\xFF")
1105              .unwrap();
1106          let nfa = Builder::new()
1107              .anchored(true)
1108              .allow_invalid_utf8(true)
1109              .build(&hir)
1110              .unwrap();
1111          assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]);
1112      }
1113  
1114      #[test]
compile_class()1115      fn compile_class() {
1116          assert_eq!(
1117              build(r"[a-z]").states,
1118              &[s_range(b'a', b'z', 1), s_match(),]
1119          );
1120          assert_eq!(
1121              build(r"[x-za-c]").states,
1122              &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()]
1123          );
1124          assert_eq!(
1125              build(r"[\u03B1-\u03B4]").states,
1126              &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()]
1127          );
1128          assert_eq!(
1129              build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
1130              &[
1131                  s_range(0xB1, 0xB4, 5),
1132                  s_range(0x99, 0x9E, 5),
1133                  s_byte(0xA4, 1),
1134                  s_byte(0x9F, 2),
1135                  s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
1136                  s_match(),
1137              ]
1138          );
1139          assert_eq!(
1140              build(r"[a-z☃]").states,
1141              &[
1142                  s_byte(0x83, 3),
1143                  s_byte(0x98, 0),
1144                  s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
1145                  s_match(),
1146              ]
1147          );
1148      }
1149  
1150      #[test]
compile_repetition()1151      fn compile_repetition() {
1152          assert_eq!(
1153              build(r"a?").states,
1154              &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),]
1155          );
1156          assert_eq!(
1157              build(r"a??").states,
1158              &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),]
1159          );
1160      }
1161  
1162      #[test]
compile_group()1163      fn compile_group() {
1164          assert_eq!(
1165              build(r"ab+").states,
1166              &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),]
1167          );
1168          assert_eq!(
1169              build(r"(ab)").states,
1170              &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
1171          );
1172          assert_eq!(
1173              build(r"(ab)+").states,
1174              &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),]
1175          );
1176      }
1177  
1178      #[test]
compile_alternation()1179      fn compile_alternation() {
1180          assert_eq!(
1181              build(r"a|b").states,
1182              &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),]
1183          );
1184          assert_eq!(
1185              build(r"|b").states,
1186              &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),]
1187          );
1188          assert_eq!(
1189              build(r"a|").states,
1190              &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),]
1191          );
1192      }
1193  }
1194