1  /*!
2  Provides routines for extracting literal prefixes and suffixes from an `Hir`.
3  */
4  
5  use std::cmp;
6  use std::fmt;
7  use std::iter;
8  use std::mem;
9  use std::ops;
10  
11  use crate::hir::{self, Hir, HirKind};
12  
13  /// A set of literal byte strings extracted from a regular expression.
14  ///
15  /// Every member of the set is a `Literal`, which is represented by a
16  /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
17  /// said to be either *complete* or *cut*. A complete literal means that
18  /// it extends until the beginning (or end) of the regular expression. In
19  /// some circumstances, this can be used to indicate a match in the regular
20  /// expression.
21  ///
22  /// A key aspect of literal extraction is knowing when to stop. It is not
23  /// feasible to blindly extract all literals from a regular expression, even if
24  /// there are finitely many. For example, the regular expression `[0-9]{10}`
25  /// has `10^10` distinct literals. For this reason, literal extraction is
26  /// bounded to some low number by default using heuristics, but the limits can
27  /// be tweaked.
28  ///
29  /// **WARNING**: Literal extraction uses stack space proportional to the size
30  /// of the `Hir` expression. At some point, this drawback will be eliminated.
31  /// To protect yourself, set a reasonable
32  /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
33  /// This is done for you by default.
34  #[derive(Clone, Eq, PartialEq)]
35  pub struct Literals {
36      lits: Vec<Literal>,
37      limit_size: usize,
38      limit_class: usize,
39  }
40  
41  /// A single member of a set of literals extracted from a regular expression.
42  ///
43  /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
44  /// and `Vec` operations are available.
45  #[derive(Clone, Eq, Ord)]
46  pub struct Literal {
47      v: Vec<u8>,
48      cut: bool,
49  }
50  
51  impl Literals {
52      /// Returns a new empty set of literals using default limits.
empty() -> Literals53      pub fn empty() -> Literals {
54          Literals { lits: vec![], limit_size: 250, limit_class: 10 }
55      }
56  
57      /// Returns a set of literal prefixes extracted from the given `Hir`.
prefixes(expr: &Hir) -> Literals58      pub fn prefixes(expr: &Hir) -> Literals {
59          let mut lits = Literals::empty();
60          lits.union_prefixes(expr);
61          lits
62      }
63  
64      /// Returns a set of literal suffixes extracted from the given `Hir`.
suffixes(expr: &Hir) -> Literals65      pub fn suffixes(expr: &Hir) -> Literals {
66          let mut lits = Literals::empty();
67          lits.union_suffixes(expr);
68          lits
69      }
70  
71      /// Get the approximate size limit (in bytes) of this set.
limit_size(&self) -> usize72      pub fn limit_size(&self) -> usize {
73          self.limit_size
74      }
75  
76      /// Set the approximate size limit (in bytes) of this set.
77      ///
78      /// If extracting a literal would put the set over this limit, then
79      /// extraction stops.
80      ///
81      /// The new limits will only apply to additions to this set. Existing
82      /// members remain unchanged, even if the set exceeds the new limit.
set_limit_size(&mut self, size: usize) -> &mut Literals83      pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
84          self.limit_size = size;
85          self
86      }
87  
88      /// Get the character class size limit for this set.
limit_class(&self) -> usize89      pub fn limit_class(&self) -> usize {
90          self.limit_class
91      }
92  
93      /// Limits the size of character(or byte) classes considered.
94      ///
95      /// A value of `0` prevents all character classes from being considered.
96      ///
97      /// This limit also applies to case insensitive literals, since each
98      /// character in the case insensitive literal is converted to a class, and
99      /// then case folded.
100      ///
101      /// The new limits will only apply to additions to this set. Existing
102      /// members remain unchanged, even if the set exceeds the new limit.
set_limit_class(&mut self, size: usize) -> &mut Literals103      pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
104          self.limit_class = size;
105          self
106      }
107  
108      /// Returns the set of literals as a slice. Its order is unspecified.
literals(&self) -> &[Literal]109      pub fn literals(&self) -> &[Literal] {
110          &self.lits
111      }
112  
113      /// Returns the length of the smallest literal.
114      ///
115      /// Returns None is there are no literals in the set.
min_len(&self) -> Option<usize>116      pub fn min_len(&self) -> Option<usize> {
117          let mut min = None;
118          for lit in &self.lits {
119              match min {
120                  None => min = Some(lit.len()),
121                  Some(m) if lit.len() < m => min = Some(lit.len()),
122                  _ => {}
123              }
124          }
125          min
126      }
127  
128      /// Returns true if all members in this set are complete.
all_complete(&self) -> bool129      pub fn all_complete(&self) -> bool {
130          !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
131      }
132  
133      /// Returns true if any member in this set is complete.
any_complete(&self) -> bool134      pub fn any_complete(&self) -> bool {
135          self.lits.iter().any(|lit| !lit.is_cut())
136      }
137  
138      /// Returns true if this set contains an empty literal.
contains_empty(&self) -> bool139      pub fn contains_empty(&self) -> bool {
140          self.lits.iter().any(|lit| lit.is_empty())
141      }
142  
143      /// Returns true if this set is empty or if all of its members is empty.
is_empty(&self) -> bool144      pub fn is_empty(&self) -> bool {
145          self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
146      }
147  
148      /// Returns a new empty set of literals using this set's limits.
to_empty(&self) -> Literals149      pub fn to_empty(&self) -> Literals {
150          let mut lits = Literals::empty();
151          lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class);
152          lits
153      }
154  
155      /// Returns the longest common prefix of all members in this set.
longest_common_prefix(&self) -> &[u8]156      pub fn longest_common_prefix(&self) -> &[u8] {
157          if self.is_empty() {
158              return &[];
159          }
160          let lit0 = &*self.lits[0];
161          let mut len = lit0.len();
162          for lit in &self.lits[1..] {
163              len = cmp::min(
164                  len,
165                  lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(),
166              );
167          }
168          &self.lits[0][..len]
169      }
170  
171      /// Returns the longest common suffix of all members in this set.
longest_common_suffix(&self) -> &[u8]172      pub fn longest_common_suffix(&self) -> &[u8] {
173          if self.is_empty() {
174              return &[];
175          }
176          let lit0 = &*self.lits[0];
177          let mut len = lit0.len();
178          for lit in &self.lits[1..] {
179              len = cmp::min(
180                  len,
181                  lit.iter()
182                      .rev()
183                      .zip(lit0.iter().rev())
184                      .take_while(|&(a, b)| a == b)
185                      .count(),
186              );
187          }
188          &self.lits[0][self.lits[0].len() - len..]
189      }
190  
191      /// Returns a new set of literals with the given number of bytes trimmed
192      /// from the suffix of each literal.
193      ///
194      /// If any literal would be cut out completely by trimming, then None is
195      /// returned.
196      ///
197      /// Any duplicates that are created as a result of this transformation are
198      /// removed.
trim_suffix(&self, num_bytes: usize) -> Option<Literals>199      pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
200          if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
201              return None;
202          }
203          let mut new = self.to_empty();
204          for mut lit in self.lits.iter().cloned() {
205              let new_len = lit.len() - num_bytes;
206              lit.truncate(new_len);
207              lit.cut();
208              new.lits.push(lit);
209          }
210          new.lits.sort();
211          new.lits.dedup();
212          Some(new)
213      }
214  
215      /// Returns a new set of prefixes of this set of literals that are
216      /// guaranteed to be unambiguous.
217      ///
218      /// Any substring match with a member of the set is returned is guaranteed
219      /// to never overlap with a substring match of another member of the set
220      /// at the same starting position.
221      ///
222      /// Given any two members of the returned set, neither is a substring of
223      /// the other.
unambiguous_prefixes(&self) -> Literals224      pub fn unambiguous_prefixes(&self) -> Literals {
225          if self.lits.is_empty() {
226              return self.to_empty();
227          }
228          let mut old = self.lits.to_vec();
229          let mut new = self.to_empty();
230          'OUTER: while let Some(mut candidate) = old.pop() {
231              if candidate.is_empty() {
232                  continue;
233              }
234              if new.lits.is_empty() {
235                  new.lits.push(candidate);
236                  continue;
237              }
238              for lit2 in &mut new.lits {
239                  if lit2.is_empty() {
240                      continue;
241                  }
242                  if &candidate == lit2 {
243                      // If the literal is already in the set, then we can
244                      // just drop it. But make sure that cut literals are
245                      // infectious!
246                      candidate.cut = candidate.cut || lit2.cut;
247                      lit2.cut = candidate.cut;
248                      continue 'OUTER;
249                  }
250                  if candidate.len() < lit2.len() {
251                      if let Some(i) = position(&candidate, &lit2) {
252                          candidate.cut();
253                          let mut lit3 = lit2.clone();
254                          lit3.truncate(i);
255                          lit3.cut();
256                          old.push(lit3);
257                          lit2.clear();
258                      }
259                  } else if let Some(i) = position(&lit2, &candidate) {
260                      lit2.cut();
261                      let mut new_candidate = candidate.clone();
262                      new_candidate.truncate(i);
263                      new_candidate.cut();
264                      old.push(new_candidate);
265                      candidate.clear();
266                  }
267                  // Oops, the candidate is already represented in the set.
268                  if candidate.is_empty() {
269                      continue 'OUTER;
270                  }
271              }
272              new.lits.push(candidate);
273          }
274          new.lits.retain(|lit| !lit.is_empty());
275          new.lits.sort();
276          new.lits.dedup();
277          new
278      }
279  
280      /// Returns a new set of suffixes of this set of literals that are
281      /// guaranteed to be unambiguous.
282      ///
283      /// Any substring match with a member of the set is returned is guaranteed
284      /// to never overlap with a substring match of another member of the set
285      /// at the same ending position.
286      ///
287      /// Given any two members of the returned set, neither is a substring of
288      /// the other.
unambiguous_suffixes(&self) -> Literals289      pub fn unambiguous_suffixes(&self) -> Literals {
290          // This is a touch wasteful...
291          let mut lits = self.clone();
292          lits.reverse();
293          let mut unamb = lits.unambiguous_prefixes();
294          unamb.reverse();
295          unamb
296      }
297  
298      /// Unions the prefixes from the given expression to this set.
299      ///
300      /// If prefixes could not be added (for example, this set would exceed its
301      /// size limits or the set of prefixes from `expr` includes the empty
302      /// string), then false is returned.
303      ///
304      /// Note that prefix literals extracted from `expr` are said to be complete
305      /// if and only if the literal extends from the beginning of `expr` to the
306      /// end of `expr`.
union_prefixes(&mut self, expr: &Hir) -> bool307      pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
308          let mut lits = self.to_empty();
309          prefixes(expr, &mut lits);
310          !lits.is_empty() && !lits.contains_empty() && self.union(lits)
311      }
312  
313      /// Unions the suffixes from the given expression to this set.
314      ///
315      /// If suffixes could not be added (for example, this set would exceed its
316      /// size limits or the set of suffixes from `expr` includes the empty
317      /// string), then false is returned.
318      ///
319      /// Note that prefix literals extracted from `expr` are said to be complete
320      /// if and only if the literal extends from the end of `expr` to the
321      /// beginning of `expr`.
union_suffixes(&mut self, expr: &Hir) -> bool322      pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
323          let mut lits = self.to_empty();
324          suffixes(expr, &mut lits);
325          lits.reverse();
326          !lits.is_empty() && !lits.contains_empty() && self.union(lits)
327      }
328  
329      /// Unions this set with another set.
330      ///
331      /// If the union would cause the set to exceed its limits, then the union
332      /// is skipped and it returns false. Otherwise, if the union succeeds, it
333      /// returns true.
union(&mut self, lits: Literals) -> bool334      pub fn union(&mut self, lits: Literals) -> bool {
335          if self.num_bytes() + lits.num_bytes() > self.limit_size {
336              return false;
337          }
338          if lits.is_empty() {
339              self.lits.push(Literal::empty());
340          } else {
341              self.lits.extend(lits.lits);
342          }
343          true
344      }
345  
346      /// Extends this set with another set.
347      ///
348      /// The set of literals is extended via a cross product.
349      ///
350      /// If a cross product would cause this set to exceed its limits, then the
351      /// cross product is skipped and it returns false. Otherwise, if the cross
352      /// product succeeds, it returns true.
cross_product(&mut self, lits: &Literals) -> bool353      pub fn cross_product(&mut self, lits: &Literals) -> bool {
354          if lits.is_empty() {
355              return true;
356          }
357          // Check that we make sure we stay in our limits.
358          let mut size_after;
359          if self.is_empty() || !self.any_complete() {
360              size_after = self.num_bytes();
361              for lits_lit in lits.literals() {
362                  size_after += lits_lit.len();
363              }
364          } else {
365              size_after = self.lits.iter().fold(0, |accum, lit| {
366                  accum + if lit.is_cut() { lit.len() } else { 0 }
367              });
368              for lits_lit in lits.literals() {
369                  for self_lit in self.literals() {
370                      if !self_lit.is_cut() {
371                          size_after += self_lit.len() + lits_lit.len();
372                      }
373                  }
374              }
375          }
376          if size_after > self.limit_size {
377              return false;
378          }
379  
380          let mut base = self.remove_complete();
381          if base.is_empty() {
382              base = vec![Literal::empty()];
383          }
384          for lits_lit in lits.literals() {
385              for mut self_lit in base.clone() {
386                  self_lit.extend(&**lits_lit);
387                  self_lit.cut = lits_lit.cut;
388                  self.lits.push(self_lit);
389              }
390          }
391          true
392      }
393  
394      /// Extends each literal in this set with the bytes given.
395      ///
396      /// If the set is empty, then the given literal is added to the set.
397      ///
398      /// If adding any number of bytes to all members of this set causes a limit
399      /// to be exceeded, then no bytes are added and false is returned. If a
400      /// prefix of `bytes` can be fit into this set, then it is used and all
401      /// resulting literals are cut.
cross_add(&mut self, bytes: &[u8]) -> bool402      pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
403          // N.B. This could be implemented by simply calling cross_product with
404          // a literal set containing just `bytes`, but we can be smarter about
405          // taking shorter prefixes of `bytes` if they'll fit.
406          if bytes.is_empty() {
407              return true;
408          }
409          if self.lits.is_empty() {
410              let i = cmp::min(self.limit_size, bytes.len());
411              self.lits.push(Literal::new(bytes[..i].to_owned()));
412              self.lits[0].cut = i < bytes.len();
413              return !self.lits[0].is_cut();
414          }
415          let size = self.num_bytes();
416          if size + self.lits.len() >= self.limit_size {
417              return false;
418          }
419          let mut i = 1;
420          while size + (i * self.lits.len()) <= self.limit_size
421              && i < bytes.len()
422          {
423              i += 1;
424          }
425          for lit in &mut self.lits {
426              if !lit.is_cut() {
427                  lit.extend(&bytes[..i]);
428                  if i < bytes.len() {
429                      lit.cut();
430                  }
431              }
432          }
433          true
434      }
435  
436      /// Adds the given literal to this set.
437      ///
438      /// Returns false if adding this literal would cause the class to be too
439      /// big.
add(&mut self, lit: Literal) -> bool440      pub fn add(&mut self, lit: Literal) -> bool {
441          if self.num_bytes() + lit.len() > self.limit_size {
442              return false;
443          }
444          self.lits.push(lit);
445          true
446      }
447  
448      /// Extends each literal in this set with the character class given.
449      ///
450      /// Returns false if the character class was too big to add.
add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool451      pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
452          self._add_char_class(cls, false)
453      }
454  
455      /// Extends each literal in this set with the character class given,
456      /// writing the bytes of each character in reverse.
457      ///
458      /// Returns false if the character class was too big to add.
add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool459      fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
460          self._add_char_class(cls, true)
461      }
462  
_add_char_class( &mut self, cls: &hir::ClassUnicode, reverse: bool, ) -> bool463      fn _add_char_class(
464          &mut self,
465          cls: &hir::ClassUnicode,
466          reverse: bool,
467      ) -> bool {
468          use std::char;
469  
470          if self.class_exceeds_limits(cls_char_count(cls)) {
471              return false;
472          }
473          let mut base = self.remove_complete();
474          if base.is_empty() {
475              base = vec![Literal::empty()];
476          }
477          for r in cls.iter() {
478              let (s, e) = (r.start as u32, r.end as u32 + 1);
479              for c in (s..e).filter_map(char::from_u32) {
480                  for mut lit in base.clone() {
481                      let mut bytes = c.to_string().into_bytes();
482                      if reverse {
483                          bytes.reverse();
484                      }
485                      lit.extend(&bytes);
486                      self.lits.push(lit);
487                  }
488              }
489          }
490          true
491      }
492  
493      /// Extends each literal in this set with the byte class given.
494      ///
495      /// Returns false if the byte class was too big to add.
add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool496      pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
497          if self.class_exceeds_limits(cls_byte_count(cls)) {
498              return false;
499          }
500          let mut base = self.remove_complete();
501          if base.is_empty() {
502              base = vec![Literal::empty()];
503          }
504          for r in cls.iter() {
505              let (s, e) = (r.start as u32, r.end as u32 + 1);
506              for b in (s..e).map(|b| b as u8) {
507                  for mut lit in base.clone() {
508                      lit.push(b);
509                      self.lits.push(lit);
510                  }
511              }
512          }
513          true
514      }
515  
516      /// Cuts every member of this set. When a member is cut, it can never
517      /// be extended.
cut(&mut self)518      pub fn cut(&mut self) {
519          for lit in &mut self.lits {
520              lit.cut();
521          }
522      }
523  
524      /// Reverses all members in place.
reverse(&mut self)525      pub fn reverse(&mut self) {
526          for lit in &mut self.lits {
527              lit.reverse();
528          }
529      }
530  
531      /// Clears this set of all members.
clear(&mut self)532      pub fn clear(&mut self) {
533          self.lits.clear();
534      }
535  
536      /// Pops all complete literals out of this set.
remove_complete(&mut self) -> Vec<Literal>537      fn remove_complete(&mut self) -> Vec<Literal> {
538          let mut base = vec![];
539          for lit in mem::replace(&mut self.lits, vec![]) {
540              if lit.is_cut() {
541                  self.lits.push(lit);
542              } else {
543                  base.push(lit);
544              }
545          }
546          base
547      }
548  
549      /// Returns the total number of bytes in this set.
num_bytes(&self) -> usize550      fn num_bytes(&self) -> usize {
551          self.lits.iter().fold(0, |accum, lit| accum + lit.len())
552      }
553  
554      /// Returns true if a character class with the given size would cause this
555      /// set to exceed its limits.
556      ///
557      /// The size given should correspond to the number of items in the class.
class_exceeds_limits(&self, size: usize) -> bool558      fn class_exceeds_limits(&self, size: usize) -> bool {
559          if size > self.limit_class {
560              return true;
561          }
562          // This is an approximation since codepoints in a char class can encode
563          // to 1-4 bytes.
564          let new_byte_count = if self.lits.is_empty() {
565              size
566          } else {
567              self.lits.iter().fold(0, |accum, lit| {
568                  accum
569                      + if lit.is_cut() {
570                          // If the literal is cut, then we'll never add
571                          // anything to it, so don't count it.
572                          0
573                      } else {
574                          (lit.len() + 1) * size
575                      }
576              })
577          };
578          new_byte_count > self.limit_size
579      }
580  }
581  
prefixes(expr: &Hir, lits: &mut Literals)582  fn prefixes(expr: &Hir, lits: &mut Literals) {
583      match *expr.kind() {
584          HirKind::Literal(hir::Literal::Unicode(c)) => {
585              let mut buf = [0; 4];
586              lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
587          }
588          HirKind::Literal(hir::Literal::Byte(b)) => {
589              lits.cross_add(&[b]);
590          }
591          HirKind::Class(hir::Class::Unicode(ref cls)) => {
592              if !lits.add_char_class(cls) {
593                  lits.cut();
594              }
595          }
596          HirKind::Class(hir::Class::Bytes(ref cls)) => {
597              if !lits.add_byte_class(cls) {
598                  lits.cut();
599              }
600          }
601          HirKind::Group(hir::Group { ref hir, .. }) => {
602              prefixes(&**hir, lits);
603          }
604          HirKind::Repetition(ref x) => match x.kind {
605              hir::RepetitionKind::ZeroOrOne => {
606                  repeat_zero_or_one_literals(&x.hir, lits, prefixes);
607              }
608              hir::RepetitionKind::ZeroOrMore => {
609                  repeat_zero_or_more_literals(&x.hir, lits, prefixes);
610              }
611              hir::RepetitionKind::OneOrMore => {
612                  repeat_one_or_more_literals(&x.hir, lits, prefixes);
613              }
614              hir::RepetitionKind::Range(ref rng) => {
615                  let (min, max) = match *rng {
616                      hir::RepetitionRange::Exactly(m) => (m, Some(m)),
617                      hir::RepetitionRange::AtLeast(m) => (m, None),
618                      hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
619                  };
620                  repeat_range_literals(
621                      &x.hir, min, max, x.greedy, lits, prefixes,
622                  )
623              }
624          },
625          HirKind::Concat(ref es) if es.is_empty() => {}
626          HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
627          HirKind::Concat(ref es) => {
628              for e in es {
629                  if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
630                      if !lits.is_empty() {
631                          lits.cut();
632                          break;
633                      }
634                      lits.add(Literal::empty());
635                      continue;
636                  }
637                  let mut lits2 = lits.to_empty();
638                  prefixes(e, &mut lits2);
639                  if !lits.cross_product(&lits2) || !lits2.any_complete() {
640                      // If this expression couldn't yield any literal that
641                      // could be extended, then we need to quit. Since we're
642                      // short-circuiting, we also need to freeze every member.
643                      lits.cut();
644                      break;
645                  }
646              }
647          }
648          HirKind::Alternation(ref es) => {
649              alternate_literals(es, lits, prefixes);
650          }
651          _ => lits.cut(),
652      }
653  }
654  
suffixes(expr: &Hir, lits: &mut Literals)655  fn suffixes(expr: &Hir, lits: &mut Literals) {
656      match *expr.kind() {
657          HirKind::Literal(hir::Literal::Unicode(c)) => {
658              let mut buf = [0u8; 4];
659              let i = c.encode_utf8(&mut buf).len();
660              let buf = &mut buf[..i];
661              buf.reverse();
662              lits.cross_add(buf);
663          }
664          HirKind::Literal(hir::Literal::Byte(b)) => {
665              lits.cross_add(&[b]);
666          }
667          HirKind::Class(hir::Class::Unicode(ref cls)) => {
668              if !lits.add_char_class_reverse(cls) {
669                  lits.cut();
670              }
671          }
672          HirKind::Class(hir::Class::Bytes(ref cls)) => {
673              if !lits.add_byte_class(cls) {
674                  lits.cut();
675              }
676          }
677          HirKind::Group(hir::Group { ref hir, .. }) => {
678              suffixes(&**hir, lits);
679          }
680          HirKind::Repetition(ref x) => match x.kind {
681              hir::RepetitionKind::ZeroOrOne => {
682                  repeat_zero_or_one_literals(&x.hir, lits, suffixes);
683              }
684              hir::RepetitionKind::ZeroOrMore => {
685                  repeat_zero_or_more_literals(&x.hir, lits, suffixes);
686              }
687              hir::RepetitionKind::OneOrMore => {
688                  repeat_one_or_more_literals(&x.hir, lits, suffixes);
689              }
690              hir::RepetitionKind::Range(ref rng) => {
691                  let (min, max) = match *rng {
692                      hir::RepetitionRange::Exactly(m) => (m, Some(m)),
693                      hir::RepetitionRange::AtLeast(m) => (m, None),
694                      hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
695                  };
696                  repeat_range_literals(
697                      &x.hir, min, max, x.greedy, lits, suffixes,
698                  )
699              }
700          },
701          HirKind::Concat(ref es) if es.is_empty() => {}
702          HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
703          HirKind::Concat(ref es) => {
704              for e in es.iter().rev() {
705                  if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
706                      if !lits.is_empty() {
707                          lits.cut();
708                          break;
709                      }
710                      lits.add(Literal::empty());
711                      continue;
712                  }
713                  let mut lits2 = lits.to_empty();
714                  suffixes(e, &mut lits2);
715                  if !lits.cross_product(&lits2) || !lits2.any_complete() {
716                      // If this expression couldn't yield any literal that
717                      // could be extended, then we need to quit. Since we're
718                      // short-circuiting, we also need to freeze every member.
719                      lits.cut();
720                      break;
721                  }
722              }
723          }
724          HirKind::Alternation(ref es) => {
725              alternate_literals(es, lits, suffixes);
726          }
727          _ => lits.cut(),
728      }
729  }
730  
repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )731  fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
732      e: &Hir,
733      lits: &mut Literals,
734      mut f: F,
735  ) {
736      f(
737          &Hir::repetition(hir::Repetition {
738              kind: hir::RepetitionKind::ZeroOrMore,
739              // FIXME: Our literal extraction doesn't care about greediness.
740              // Which is partially why we're treating 'e?' as 'e*'. Namely,
741              // 'ab??' yields [Complete(ab), Complete(a)], but it should yield
742              // [Complete(a), Complete(ab)] because of the non-greediness.
743              greedy: true,
744              hir: Box::new(e.clone()),
745          }),
746          lits,
747      );
748  }
749  
repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )750  fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
751      e: &Hir,
752      lits: &mut Literals,
753      mut f: F,
754  ) {
755      let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
756      lits3.set_limit_size(lits.limit_size() / 2);
757      f(e, &mut lits3);
758  
759      if lits3.is_empty() || !lits2.cross_product(&lits3) {
760          lits.cut();
761          return;
762      }
763      lits2.cut();
764      lits2.add(Literal::empty());
765      if !lits.union(lits2) {
766          lits.cut();
767      }
768  }
769  
repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )770  fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
771      e: &Hir,
772      lits: &mut Literals,
773      mut f: F,
774  ) {
775      f(e, lits);
776      lits.cut();
777  }
778  
repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, min: u32, max: Option<u32>, greedy: bool, lits: &mut Literals, mut f: F, )779  fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
780      e: &Hir,
781      min: u32,
782      max: Option<u32>,
783      greedy: bool,
784      lits: &mut Literals,
785      mut f: F,
786  ) {
787      if min == 0 {
788          // This is a bit conservative. If `max` is set, then we could
789          // treat this as a finite set of alternations. For now, we
790          // just treat it as `e*`.
791          f(
792              &Hir::repetition(hir::Repetition {
793                  kind: hir::RepetitionKind::ZeroOrMore,
794                  greedy,
795                  hir: Box::new(e.clone()),
796              }),
797              lits,
798          );
799      } else {
800          if min > 0 {
801              let n = cmp::min(lits.limit_size, min as usize);
802              let es = iter::repeat(e.clone()).take(n).collect();
803              f(&Hir::concat(es), lits);
804              if n < min as usize || lits.contains_empty() {
805                  lits.cut();
806              }
807          }
808          if max.map_or(true, |max| min < max) {
809              lits.cut();
810          }
811      }
812  }
813  
alternate_literals<F: FnMut(&Hir, &mut Literals)>( es: &[Hir], lits: &mut Literals, mut f: F, )814  fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
815      es: &[Hir],
816      lits: &mut Literals,
817      mut f: F,
818  ) {
819      let mut lits2 = lits.to_empty();
820      for e in es {
821          let mut lits3 = lits.to_empty();
822          lits3.set_limit_size(lits.limit_size() / 5);
823          f(e, &mut lits3);
824          if lits3.is_empty() || !lits2.union(lits3) {
825              // If we couldn't find suffixes for *any* of the
826              // alternates, then the entire alternation has to be thrown
827              // away and any existing members must be frozen. Similarly,
828              // if the union couldn't complete, stop and freeze.
829              lits.cut();
830              return;
831          }
832      }
833      if !lits.cross_product(&lits2) {
834          lits.cut();
835      }
836  }
837  
838  impl fmt::Debug for Literals {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result839      fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
840          f.debug_struct("Literals")
841              .field("lits", &self.lits)
842              .field("limit_size", &self.limit_size)
843              .field("limit_class", &self.limit_class)
844              .finish()
845      }
846  }
847  
848  impl Literal {
849      /// Returns a new complete literal with the bytes given.
new(bytes: Vec<u8>) -> Literal850      pub fn new(bytes: Vec<u8>) -> Literal {
851          Literal { v: bytes, cut: false }
852      }
853  
854      /// Returns a new complete empty literal.
empty() -> Literal855      pub fn empty() -> Literal {
856          Literal { v: vec![], cut: false }
857      }
858  
859      /// Returns true if this literal was "cut."
is_cut(&self) -> bool860      pub fn is_cut(&self) -> bool {
861          self.cut
862      }
863  
864      /// Cuts this literal.
cut(&mut self)865      pub fn cut(&mut self) {
866          self.cut = true;
867      }
868  }
869  
870  impl PartialEq for Literal {
eq(&self, other: &Literal) -> bool871      fn eq(&self, other: &Literal) -> bool {
872          self.v == other.v
873      }
874  }
875  
876  impl PartialOrd for Literal {
partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering>877      fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
878          self.v.partial_cmp(&other.v)
879      }
880  }
881  
882  impl fmt::Debug for Literal {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result883      fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
884          if self.is_cut() {
885              write!(f, "Cut({})", escape_unicode(&self.v))
886          } else {
887              write!(f, "Complete({})", escape_unicode(&self.v))
888          }
889      }
890  }
891  
892  impl AsRef<[u8]> for Literal {
as_ref(&self) -> &[u8]893      fn as_ref(&self) -> &[u8] {
894          &self.v
895      }
896  }
897  
898  impl ops::Deref for Literal {
899      type Target = Vec<u8>;
deref(&self) -> &Vec<u8>900      fn deref(&self) -> &Vec<u8> {
901          &self.v
902      }
903  }
904  
905  impl ops::DerefMut for Literal {
deref_mut(&mut self) -> &mut Vec<u8>906      fn deref_mut(&mut self) -> &mut Vec<u8> {
907          &mut self.v
908      }
909  }
910  
position(needle: &[u8], mut haystack: &[u8]) -> Option<usize>911  fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
912      let mut i = 0;
913      while haystack.len() >= needle.len() {
914          if needle == &haystack[..needle.len()] {
915              return Some(i);
916          }
917          i += 1;
918          haystack = &haystack[1..];
919      }
920      None
921  }
922  
escape_unicode(bytes: &[u8]) -> String923  fn escape_unicode(bytes: &[u8]) -> String {
924      let show = match ::std::str::from_utf8(bytes) {
925          Ok(v) => v.to_string(),
926          Err(_) => escape_bytes(bytes),
927      };
928      let mut space_escaped = String::new();
929      for c in show.chars() {
930          if c.is_whitespace() {
931              let escaped = if c as u32 <= 0x7F {
932                  escape_byte(c as u8)
933              } else if c as u32 <= 0xFFFF {
934                  format!(r"\u{{{:04x}}}", c as u32)
935              } else {
936                  format!(r"\U{{{:08x}}}", c as u32)
937              };
938              space_escaped.push_str(&escaped);
939          } else {
940              space_escaped.push(c);
941          }
942      }
943      space_escaped
944  }
945  
escape_bytes(bytes: &[u8]) -> String946  fn escape_bytes(bytes: &[u8]) -> String {
947      let mut s = String::new();
948      for &b in bytes {
949          s.push_str(&escape_byte(b));
950      }
951      s
952  }
953  
escape_byte(byte: u8) -> String954  fn escape_byte(byte: u8) -> String {
955      use std::ascii::escape_default;
956  
957      let escaped: Vec<u8> = escape_default(byte).collect();
958      String::from_utf8_lossy(&escaped).into_owned()
959  }
960  
cls_char_count(cls: &hir::ClassUnicode) -> usize961  fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
962      cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
963          as usize
964  }
965  
cls_byte_count(cls: &hir::ClassBytes) -> usize966  fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
967      cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
968          as usize
969  }
970  
971  #[cfg(test)]
972  mod tests {
973      use std::fmt;
974  
975      use super::{escape_bytes, Literal, Literals};
976      use crate::hir::Hir;
977      use crate::ParserBuilder;
978  
979      // To make test failures easier to read.
980      #[derive(Debug, Eq, PartialEq)]
981      struct Bytes(Vec<ULiteral>);
982      #[derive(Debug, Eq, PartialEq)]
983      struct Unicode(Vec<ULiteral>);
984  
escape_lits(blits: &[Literal]) -> Vec<ULiteral>985      fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
986          let mut ulits = vec![];
987          for blit in blits {
988              ulits
989                  .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
990          }
991          ulits
992      }
993  
create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals994      fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
995          Literals {
996              lits: it.into_iter().collect(),
997              limit_size: 0,
998              limit_class: 0,
999          }
1000      }
1001  
1002      // Needs to be pub for 1.3?
1003      #[derive(Clone, Eq, PartialEq)]
1004      pub struct ULiteral {
1005          v: String,
1006          cut: bool,
1007      }
1008  
1009      impl ULiteral {
is_cut(&self) -> bool1010          fn is_cut(&self) -> bool {
1011              self.cut
1012          }
1013      }
1014  
1015      impl fmt::Debug for ULiteral {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1016          fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1017              if self.is_cut() {
1018                  write!(f, "Cut({})", self.v)
1019              } else {
1020                  write!(f, "Complete({})", self.v)
1021              }
1022          }
1023      }
1024  
1025      impl PartialEq<Literal> for ULiteral {
eq(&self, other: &Literal) -> bool1026          fn eq(&self, other: &Literal) -> bool {
1027              self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1028          }
1029      }
1030  
1031      impl PartialEq<ULiteral> for Literal {
eq(&self, other: &ULiteral) -> bool1032          fn eq(&self, other: &ULiteral) -> bool {
1033              &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1034          }
1035      }
1036  
1037      #[allow(non_snake_case)]
C(s: &'static str) -> ULiteral1038      fn C(s: &'static str) -> ULiteral {
1039          ULiteral { v: s.to_owned(), cut: true }
1040      }
1041      #[allow(non_snake_case)]
M(s: &'static str) -> ULiteral1042      fn M(s: &'static str) -> ULiteral {
1043          ULiteral { v: s.to_owned(), cut: false }
1044      }
1045  
prefixes(lits: &mut Literals, expr: &Hir)1046      fn prefixes(lits: &mut Literals, expr: &Hir) {
1047          lits.union_prefixes(expr);
1048      }
1049  
suffixes(lits: &mut Literals, expr: &Hir)1050      fn suffixes(lits: &mut Literals, expr: &Hir) {
1051          lits.union_suffixes(expr);
1052      }
1053  
1054      macro_rules! assert_lit_eq {
1055          ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1056              let expected: Vec<ULiteral> = vec![$($expected_lit),*];
1057              let lits = $got_lits;
1058              assert_eq!(
1059                  $which(expected.clone()),
1060                  $which(escape_lits(lits.literals())));
1061              assert_eq!(
1062                  !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1063                  lits.all_complete());
1064              assert_eq!(
1065                  expected.iter().any(|l| !l.is_cut()),
1066                  lits.any_complete());
1067          }};
1068      }
1069  
1070      macro_rules! test_lit {
1071          ($name:ident, $which:ident, $re:expr) => {
1072              test_lit!($name, $which, $re,);
1073          };
1074          ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1075              #[test]
1076              fn $name() {
1077                  let expr = ParserBuilder::new()
1078                      .build()
1079                      .parse($re)
1080                      .unwrap();
1081                  let lits = Literals::$which(&expr);
1082                  assert_lit_eq!(Unicode, lits, $($lit),*);
1083  
1084                  let expr = ParserBuilder::new()
1085                      .allow_invalid_utf8(true)
1086                      .unicode(false)
1087                      .build()
1088                      .parse($re)
1089                      .unwrap();
1090                  let lits = Literals::$which(&expr);
1091                  assert_lit_eq!(Bytes, lits, $($lit),*);
1092              }
1093          };
1094      }
1095  
1096      // ************************************************************************
1097      // Tests for prefix literal extraction.
1098      // ************************************************************************
1099  
1100      // Elementary tests.
1101      test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1102      test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1103      test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1104      #[cfg(feature = "unicode-case")]
1105      test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1106      test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1107      test_lit!(
1108          pfx_class2,
1109          prefixes,
1110          "(?u)[☃Ⅰ]",
1111          M("\\xe2\\x85\\xa0"),
1112          M("\\xe2\\x98\\x83")
1113      );
1114      #[cfg(feature = "unicode-case")]
1115      test_lit!(
1116          pfx_class3,
1117          prefixes,
1118          "(?ui)[☃Ⅰ]",
1119          M("\\xe2\\x85\\xa0"),
1120          M("\\xe2\\x85\\xb0"),
1121          M("\\xe2\\x98\\x83")
1122      );
1123      test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
1124      test_lit!(
1125          pfx_one_lit_casei2,
1126          prefixes,
1127          "(?i-u)abc",
1128          M("ABC"),
1129          M("aBC"),
1130          M("AbC"),
1131          M("abC"),
1132          M("ABc"),
1133          M("aBc"),
1134          M("Abc"),
1135          M("abc")
1136      );
1137      test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1138      test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1139      test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1140      test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
1141      // FIXME: This should return [M("a"), M("ab")] because of the non-greedy
1142      // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get
1143      // a cut literal.
1144      test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
1145      test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1146      test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1147      test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1148      test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1149      test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1150      test_lit!(pfx_rep_range1, prefixes, "a{0}");
1151      test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1152      test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1153      test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1154      test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1155      test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1156      test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1157  
1158      // Test regexes with concatenations.
1159      test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1160      test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1161      test_lit!(
1162          pfx_cat3,
1163          prefixes,
1164          "(?i-u)[ab]z",
1165          M("AZ"),
1166          M("BZ"),
1167          M("aZ"),
1168          M("bZ"),
1169          M("Az"),
1170          M("Bz"),
1171          M("az"),
1172          M("bz")
1173      );
1174      test_lit!(
1175          pfx_cat4,
1176          prefixes,
1177          "[ab][yz]",
1178          M("ay"),
1179          M("by"),
1180          M("az"),
1181          M("bz")
1182      );
1183      test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1184      test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1185      test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1186      test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1187      test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1188      test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1189      test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1190      test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1191      test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1192      test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1193      test_lit!(pfx_cat15, prefixes, "$a");
1194      test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1195      test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1196      test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1197      test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1198  
1199      // Test regexes with alternations.
1200      test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1201      test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1202      test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1203      test_lit!(pfx_alt4, prefixes, "a|b*");
1204      test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1205      test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
1206      test_lit!(
1207          pfx_alt7,
1208          prefixes,
1209          "(a|b)*c|(a|ab)*c",
1210          C("a"),
1211          C("b"),
1212          M("c"),
1213          C("a"),
1214          C("ab"),
1215          M("c")
1216      );
1217      test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1218  
1219      // Test regexes with empty assertions.
1220      test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1221      test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1222      test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1223      test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1224  
1225      // Make sure some curious regexes have no prefixes.
1226      test_lit!(pfx_nothing1, prefixes, ".");
1227      test_lit!(pfx_nothing2, prefixes, "(?s).");
1228      test_lit!(pfx_nothing3, prefixes, "^");
1229      test_lit!(pfx_nothing4, prefixes, "$");
1230      test_lit!(pfx_nothing6, prefixes, "(?m)$");
1231      test_lit!(pfx_nothing7, prefixes, r"\b");
1232      test_lit!(pfx_nothing8, prefixes, r"\B");
1233  
1234      // Test a few regexes that defeat any prefix literal detection.
1235      test_lit!(pfx_defeated1, prefixes, ".a");
1236      test_lit!(pfx_defeated2, prefixes, "(?s).a");
1237      test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1238      test_lit!(pfx_defeated4, prefixes, "a|.");
1239      test_lit!(pfx_defeated5, prefixes, ".|a");
1240      test_lit!(pfx_defeated6, prefixes, "a|^");
1241      test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1242      test_lit!(pfx_defeated8, prefixes, "$a");
1243      test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1244      test_lit!(pfx_defeated10, prefixes, r"\ba");
1245      test_lit!(pfx_defeated11, prefixes, r"\Ba");
1246      test_lit!(pfx_defeated12, prefixes, "^*a");
1247      test_lit!(pfx_defeated13, prefixes, "^+a");
1248  
1249      test_lit!(
1250          pfx_crazy1,
1251          prefixes,
1252          r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1253          C("Mo\\'"),
1254          C("Mu\\'"),
1255          C("Moam"),
1256          C("Muam")
1257      );
1258  
1259      // ************************************************************************
1260      // Tests for quiting prefix literal search.
1261      // ************************************************************************
1262  
1263      macro_rules! test_exhausted {
1264          ($name:ident, $which:ident, $re:expr) => {
1265              test_exhausted!($name, $which, $re,);
1266          };
1267          ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1268              #[test]
1269              fn $name() {
1270                  let expr = ParserBuilder::new()
1271                      .build()
1272                      .parse($re)
1273                      .unwrap();
1274                  let mut lits = Literals::empty();
1275                  lits.set_limit_size(20).set_limit_class(10);
1276                  $which(&mut lits, &expr);
1277                  assert_lit_eq!(Unicode, lits, $($lit),*);
1278  
1279                  let expr = ParserBuilder::new()
1280                      .allow_invalid_utf8(true)
1281                      .unicode(false)
1282                      .build()
1283                      .parse($re)
1284                      .unwrap();
1285                  let mut lits = Literals::empty();
1286                  lits.set_limit_size(20).set_limit_class(10);
1287                  $which(&mut lits, &expr);
1288                  assert_lit_eq!(Bytes, lits, $($lit),*);
1289              }
1290          };
1291      }
1292  
1293      // These test use a much lower limit than the default so that we can
1294      // write test cases of reasonable size.
1295      test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1296      test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1297      test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1298      test_exhausted!(
1299          pfx_exhausted4,
1300          prefixes,
1301          "(?i-u)foobar",
1302          C("FO"),
1303          C("fO"),
1304          C("Fo"),
1305          C("fo")
1306      );
1307      test_exhausted!(
1308          pfx_exhausted5,
1309          prefixes,
1310          "(?:ab){100}",
1311          C("abababababababababab")
1312      );
1313      test_exhausted!(
1314          pfx_exhausted6,
1315          prefixes,
1316          "(?:(?:ab){100})*cd",
1317          C("ababababab"),
1318          M("cd")
1319      );
1320      test_exhausted!(
1321          pfx_exhausted7,
1322          prefixes,
1323          "z(?:(?:ab){100})*cd",
1324          C("zababababab"),
1325          M("zcd")
1326      );
1327      test_exhausted!(
1328          pfx_exhausted8,
1329          prefixes,
1330          "aaaaaaaaaaaaaaaaaaaaz",
1331          C("aaaaaaaaaaaaaaaaaaaa")
1332      );
1333  
1334      // ************************************************************************
1335      // Tests for suffix literal extraction.
1336      // ************************************************************************
1337  
1338      // Elementary tests.
1339      test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1340      test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1341      test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1342      #[cfg(feature = "unicode-case")]
1343      test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1344      test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1345      test_lit!(
1346          sfx_class2,
1347          suffixes,
1348          "(?u)[☃Ⅰ]",
1349          M("\\xe2\\x85\\xa0"),
1350          M("\\xe2\\x98\\x83")
1351      );
1352      #[cfg(feature = "unicode-case")]
1353      test_lit!(
1354          sfx_class3,
1355          suffixes,
1356          "(?ui)[☃Ⅰ]",
1357          M("\\xe2\\x85\\xa0"),
1358          M("\\xe2\\x85\\xb0"),
1359          M("\\xe2\\x98\\x83")
1360      );
1361      test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
1362      test_lit!(
1363          sfx_one_lit_casei2,
1364          suffixes,
1365          "(?i-u)abc",
1366          M("ABC"),
1367          M("ABc"),
1368          M("AbC"),
1369          M("Abc"),
1370          M("aBC"),
1371          M("aBc"),
1372          M("abC"),
1373          M("abc")
1374      );
1375      test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1376      test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1377      test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1378      test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1379      test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1380      test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1381      test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1382      test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1383      test_lit!(sfx_rep_range1, suffixes, "a{0}");
1384      test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1385      test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1386      test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1387      test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1388      test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1389      test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1390  
1391      // Test regexes with concatenations.
1392      test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1393      test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1394      test_lit!(
1395          sfx_cat3,
1396          suffixes,
1397          "(?i-u)[ab]z",
1398          M("AZ"),
1399          M("Az"),
1400          M("BZ"),
1401          M("Bz"),
1402          M("aZ"),
1403          M("az"),
1404          M("bZ"),
1405          M("bz")
1406      );
1407      test_lit!(
1408          sfx_cat4,
1409          suffixes,
1410          "[ab][yz]",
1411          M("ay"),
1412          M("az"),
1413          M("by"),
1414          M("bz")
1415      );
1416      test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1417      test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1418      test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1419      test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1420      test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1421      test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1422      test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1423      test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1424      test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1425      test_lit!(sfx_cat14, suffixes, "a^");
1426      test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1427      test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1428      test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1429      test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1430      test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1431  
1432      // Test regexes with alternations.
1433      test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1434      test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1435      test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1436      test_lit!(sfx_alt4, suffixes, "a|b*");
1437      test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1438      test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
1439      test_lit!(
1440          sfx_alt7,
1441          suffixes,
1442          "(a|b)*c|(a|ab)*c",
1443          C("ac"),
1444          C("bc"),
1445          M("c"),
1446          C("ac"),
1447          C("abc"),
1448          M("c")
1449      );
1450      test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1451  
1452      // Test regexes with empty assertions.
1453      test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1454      test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
1455  
1456      // Make sure some curious regexes have no suffixes.
1457      test_lit!(sfx_nothing1, suffixes, ".");
1458      test_lit!(sfx_nothing2, suffixes, "(?s).");
1459      test_lit!(sfx_nothing3, suffixes, "^");
1460      test_lit!(sfx_nothing4, suffixes, "$");
1461      test_lit!(sfx_nothing6, suffixes, "(?m)$");
1462      test_lit!(sfx_nothing7, suffixes, r"\b");
1463      test_lit!(sfx_nothing8, suffixes, r"\B");
1464  
1465      // Test a few regexes that defeat any suffix literal detection.
1466      test_lit!(sfx_defeated1, suffixes, "a.");
1467      test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1468      test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1469      test_lit!(sfx_defeated4, suffixes, "a|.");
1470      test_lit!(sfx_defeated5, suffixes, ".|a");
1471      test_lit!(sfx_defeated6, suffixes, "a|^");
1472      test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1473      test_lit!(sfx_defeated8, suffixes, "a^");
1474      test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1475      test_lit!(sfx_defeated10, suffixes, r"a\b");
1476      test_lit!(sfx_defeated11, suffixes, r"a\B");
1477      test_lit!(sfx_defeated12, suffixes, "a^*");
1478      test_lit!(sfx_defeated13, suffixes, "a^+");
1479  
1480      // These test use a much lower limit than the default so that we can
1481      // write test cases of reasonable size.
1482      test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1483      test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1484      test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1485      test_exhausted!(
1486          sfx_exhausted4,
1487          suffixes,
1488          "(?i-u)foobar",
1489          C("AR"),
1490          C("Ar"),
1491          C("aR"),
1492          C("ar")
1493      );
1494      test_exhausted!(
1495          sfx_exhausted5,
1496          suffixes,
1497          "(?:ab){100}",
1498          C("abababababababababab")
1499      );
1500      test_exhausted!(
1501          sfx_exhausted6,
1502          suffixes,
1503          "cd(?:(?:ab){100})*",
1504          C("ababababab"),
1505          M("cd")
1506      );
1507      test_exhausted!(
1508          sfx_exhausted7,
1509          suffixes,
1510          "cd(?:(?:ab){100})*z",
1511          C("abababababz"),
1512          M("cdz")
1513      );
1514      test_exhausted!(
1515          sfx_exhausted8,
1516          suffixes,
1517          "zaaaaaaaaaaaaaaaaaaaa",
1518          C("aaaaaaaaaaaaaaaaaaaa")
1519      );
1520  
1521      // ************************************************************************
1522      // Tests for generating unambiguous literal sets.
1523      // ************************************************************************
1524  
1525      macro_rules! test_unamb {
1526          ($name:ident, $given:expr, $expected:expr) => {
1527              #[test]
1528              fn $name() {
1529                  let given: Vec<Literal> = $given
1530                      .into_iter()
1531                      .map(|ul| {
1532                          let cut = ul.is_cut();
1533                          Literal { v: ul.v.into_bytes(), cut: cut }
1534                      })
1535                      .collect();
1536                  let lits = create_lits(given);
1537                  let got = lits.unambiguous_prefixes();
1538                  assert_eq!($expected, escape_lits(got.literals()));
1539              }
1540          };
1541      }
1542  
1543      test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1544      test_unamb!(
1545          unambiguous2,
1546          vec![M("zaaaaaa"), M("aa")],
1547          vec![C("aa"), C("z")]
1548      );
1549      test_unamb!(
1550          unambiguous3,
1551          vec![M("Sherlock"), M("Watson")],
1552          vec![M("Sherlock"), M("Watson")]
1553      );
1554      test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1555      test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1556      test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1557      test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1558      test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1559      test_unamb!(
1560          unambiguous9,
1561          vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1562          vec![C("a"), C("b"), C("c")]
1563      );
1564      test_unamb!(
1565          unambiguous10,
1566          vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1567          vec![C("Mo"), C("Mu")]
1568      );
1569      test_unamb!(
1570          unambiguous11,
1571          vec![M("zazb"), M("azb")],
1572          vec![C("a"), C("z")]
1573      );
1574      test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1575      test_unamb!(
1576          unambiguous13,
1577          vec![M("ABCX"), M("CDAX"), M("BCX")],
1578          vec![C("A"), C("BCX"), C("CD")]
1579      );
1580      test_unamb!(
1581          unambiguous14,
1582          vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1583          vec![M("DSX"), C("I"), C("MGX"), C("MV")]
1584      );
1585      test_unamb!(
1586          unambiguous15,
1587          vec![M("IMG_"), M("MG_"), M("CIMG")],
1588          vec![C("C"), C("I"), C("MG_")]
1589      );
1590  
1591      // ************************************************************************
1592      // Tests for suffix trimming.
1593      // ************************************************************************
1594      macro_rules! test_trim {
1595          ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1596              #[test]
1597              fn $name() {
1598                  let given: Vec<Literal> = $given
1599                      .into_iter()
1600                      .map(|ul| {
1601                          let cut = ul.is_cut();
1602                          Literal { v: ul.v.into_bytes(), cut: cut }
1603                      })
1604                      .collect();
1605                  let lits = create_lits(given);
1606                  let got = lits.trim_suffix($trim).unwrap();
1607                  assert_eq!($expected, escape_lits(got.literals()));
1608              }
1609          };
1610      }
1611  
1612      test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1613      test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1614      test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1615      test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1616  
1617      // ************************************************************************
1618      // Tests for longest common prefix.
1619      // ************************************************************************
1620  
1621      macro_rules! test_lcp {
1622          ($name:ident, $given:expr, $expected:expr) => {
1623              #[test]
1624              fn $name() {
1625                  let given: Vec<Literal> = $given
1626                      .into_iter()
1627                      .map(|s: &str| Literal {
1628                          v: s.to_owned().into_bytes(),
1629                          cut: false,
1630                      })
1631                      .collect();
1632                  let lits = create_lits(given);
1633                  let got = lits.longest_common_prefix();
1634                  assert_eq!($expected, escape_bytes(got));
1635              }
1636          };
1637      }
1638  
1639      test_lcp!(lcp1, vec!["a"], "a");
1640      test_lcp!(lcp2, vec![], "");
1641      test_lcp!(lcp3, vec!["a", "b"], "");
1642      test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1643      test_lcp!(lcp5, vec!["ab", "a"], "a");
1644      test_lcp!(lcp6, vec!["a", "ab"], "a");
1645      test_lcp!(lcp7, vec!["ab", "b"], "");
1646      test_lcp!(lcp8, vec!["b", "ab"], "");
1647      test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1648      test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1649      test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1650      test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1651  
1652      // ************************************************************************
1653      // Tests for longest common suffix.
1654      // ************************************************************************
1655  
1656      macro_rules! test_lcs {
1657          ($name:ident, $given:expr, $expected:expr) => {
1658              #[test]
1659              fn $name() {
1660                  let given: Vec<Literal> = $given
1661                      .into_iter()
1662                      .map(|s: &str| Literal {
1663                          v: s.to_owned().into_bytes(),
1664                          cut: false,
1665                      })
1666                      .collect();
1667                  let lits = create_lits(given);
1668                  let got = lits.longest_common_suffix();
1669                  assert_eq!($expected, escape_bytes(got));
1670              }
1671          };
1672      }
1673  
1674      test_lcs!(lcs1, vec!["a"], "a");
1675      test_lcs!(lcs2, vec![], "");
1676      test_lcs!(lcs3, vec!["a", "b"], "");
1677      test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1678      test_lcs!(lcs5, vec!["ab", "a"], "");
1679      test_lcs!(lcs6, vec!["a", "ab"], "");
1680      test_lcs!(lcs7, vec!["ab", "b"], "b");
1681      test_lcs!(lcs8, vec!["b", "ab"], "b");
1682      test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1683      test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1684      test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1685      test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1686  }
1687