1 /*! 2 Provides routines for extracting literal prefixes and suffixes from an `Hir`. 3 */ 4 5 use std::cmp; 6 use std::fmt; 7 use std::iter; 8 use std::mem; 9 use std::ops; 10 11 use crate::hir::{self, Hir, HirKind}; 12 13 /// A set of literal byte strings extracted from a regular expression. 14 /// 15 /// Every member of the set is a `Literal`, which is represented by a 16 /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is 17 /// said to be either *complete* or *cut*. A complete literal means that 18 /// it extends until the beginning (or end) of the regular expression. In 19 /// some circumstances, this can be used to indicate a match in the regular 20 /// expression. 21 /// 22 /// A key aspect of literal extraction is knowing when to stop. It is not 23 /// feasible to blindly extract all literals from a regular expression, even if 24 /// there are finitely many. For example, the regular expression `[0-9]{10}` 25 /// has `10^10` distinct literals. For this reason, literal extraction is 26 /// bounded to some low number by default using heuristics, but the limits can 27 /// be tweaked. 28 /// 29 /// **WARNING**: Literal extraction uses stack space proportional to the size 30 /// of the `Hir` expression. At some point, this drawback will be eliminated. 31 /// To protect yourself, set a reasonable 32 /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit). 33 /// This is done for you by default. 34 #[derive(Clone, Eq, PartialEq)] 35 pub struct Literals { 36 lits: Vec<Literal>, 37 limit_size: usize, 38 limit_class: usize, 39 } 40 41 /// A single member of a set of literals extracted from a regular expression. 42 /// 43 /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice 44 /// and `Vec` operations are available. 45 #[derive(Clone, Eq, Ord)] 46 pub struct Literal { 47 v: Vec<u8>, 48 cut: bool, 49 } 50 51 impl Literals { 52 /// Returns a new empty set of literals using default limits. empty() -> Literals53 pub fn empty() -> Literals { 54 Literals { lits: vec![], limit_size: 250, limit_class: 10 } 55 } 56 57 /// Returns a set of literal prefixes extracted from the given `Hir`. prefixes(expr: &Hir) -> Literals58 pub fn prefixes(expr: &Hir) -> Literals { 59 let mut lits = Literals::empty(); 60 lits.union_prefixes(expr); 61 lits 62 } 63 64 /// Returns a set of literal suffixes extracted from the given `Hir`. suffixes(expr: &Hir) -> Literals65 pub fn suffixes(expr: &Hir) -> Literals { 66 let mut lits = Literals::empty(); 67 lits.union_suffixes(expr); 68 lits 69 } 70 71 /// Get the approximate size limit (in bytes) of this set. limit_size(&self) -> usize72 pub fn limit_size(&self) -> usize { 73 self.limit_size 74 } 75 76 /// Set the approximate size limit (in bytes) of this set. 77 /// 78 /// If extracting a literal would put the set over this limit, then 79 /// extraction stops. 80 /// 81 /// The new limits will only apply to additions to this set. Existing 82 /// members remain unchanged, even if the set exceeds the new limit. set_limit_size(&mut self, size: usize) -> &mut Literals83 pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { 84 self.limit_size = size; 85 self 86 } 87 88 /// Get the character class size limit for this set. limit_class(&self) -> usize89 pub fn limit_class(&self) -> usize { 90 self.limit_class 91 } 92 93 /// Limits the size of character(or byte) classes considered. 94 /// 95 /// A value of `0` prevents all character classes from being considered. 96 /// 97 /// This limit also applies to case insensitive literals, since each 98 /// character in the case insensitive literal is converted to a class, and 99 /// then case folded. 100 /// 101 /// The new limits will only apply to additions to this set. Existing 102 /// members remain unchanged, even if the set exceeds the new limit. set_limit_class(&mut self, size: usize) -> &mut Literals103 pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { 104 self.limit_class = size; 105 self 106 } 107 108 /// Returns the set of literals as a slice. Its order is unspecified. literals(&self) -> &[Literal]109 pub fn literals(&self) -> &[Literal] { 110 &self.lits 111 } 112 113 /// Returns the length of the smallest literal. 114 /// 115 /// Returns None is there are no literals in the set. min_len(&self) -> Option<usize>116 pub fn min_len(&self) -> Option<usize> { 117 let mut min = None; 118 for lit in &self.lits { 119 match min { 120 None => min = Some(lit.len()), 121 Some(m) if lit.len() < m => min = Some(lit.len()), 122 _ => {} 123 } 124 } 125 min 126 } 127 128 /// Returns true if all members in this set are complete. all_complete(&self) -> bool129 pub fn all_complete(&self) -> bool { 130 !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) 131 } 132 133 /// Returns true if any member in this set is complete. any_complete(&self) -> bool134 pub fn any_complete(&self) -> bool { 135 self.lits.iter().any(|lit| !lit.is_cut()) 136 } 137 138 /// Returns true if this set contains an empty literal. contains_empty(&self) -> bool139 pub fn contains_empty(&self) -> bool { 140 self.lits.iter().any(|lit| lit.is_empty()) 141 } 142 143 /// Returns true if this set is empty or if all of its members is empty. is_empty(&self) -> bool144 pub fn is_empty(&self) -> bool { 145 self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) 146 } 147 148 /// Returns a new empty set of literals using this set's limits. to_empty(&self) -> Literals149 pub fn to_empty(&self) -> Literals { 150 let mut lits = Literals::empty(); 151 lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class); 152 lits 153 } 154 155 /// Returns the longest common prefix of all members in this set. longest_common_prefix(&self) -> &[u8]156 pub fn longest_common_prefix(&self) -> &[u8] { 157 if self.is_empty() { 158 return &[]; 159 } 160 let lit0 = &*self.lits[0]; 161 let mut len = lit0.len(); 162 for lit in &self.lits[1..] { 163 len = cmp::min( 164 len, 165 lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(), 166 ); 167 } 168 &self.lits[0][..len] 169 } 170 171 /// Returns the longest common suffix of all members in this set. longest_common_suffix(&self) -> &[u8]172 pub fn longest_common_suffix(&self) -> &[u8] { 173 if self.is_empty() { 174 return &[]; 175 } 176 let lit0 = &*self.lits[0]; 177 let mut len = lit0.len(); 178 for lit in &self.lits[1..] { 179 len = cmp::min( 180 len, 181 lit.iter() 182 .rev() 183 .zip(lit0.iter().rev()) 184 .take_while(|&(a, b)| a == b) 185 .count(), 186 ); 187 } 188 &self.lits[0][self.lits[0].len() - len..] 189 } 190 191 /// Returns a new set of literals with the given number of bytes trimmed 192 /// from the suffix of each literal. 193 /// 194 /// If any literal would be cut out completely by trimming, then None is 195 /// returned. 196 /// 197 /// Any duplicates that are created as a result of this transformation are 198 /// removed. trim_suffix(&self, num_bytes: usize) -> Option<Literals>199 pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> { 200 if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { 201 return None; 202 } 203 let mut new = self.to_empty(); 204 for mut lit in self.lits.iter().cloned() { 205 let new_len = lit.len() - num_bytes; 206 lit.truncate(new_len); 207 lit.cut(); 208 new.lits.push(lit); 209 } 210 new.lits.sort(); 211 new.lits.dedup(); 212 Some(new) 213 } 214 215 /// Returns a new set of prefixes of this set of literals that are 216 /// guaranteed to be unambiguous. 217 /// 218 /// Any substring match with a member of the set is returned is guaranteed 219 /// to never overlap with a substring match of another member of the set 220 /// at the same starting position. 221 /// 222 /// Given any two members of the returned set, neither is a substring of 223 /// the other. unambiguous_prefixes(&self) -> Literals224 pub fn unambiguous_prefixes(&self) -> Literals { 225 if self.lits.is_empty() { 226 return self.to_empty(); 227 } 228 let mut old = self.lits.to_vec(); 229 let mut new = self.to_empty(); 230 'OUTER: while let Some(mut candidate) = old.pop() { 231 if candidate.is_empty() { 232 continue; 233 } 234 if new.lits.is_empty() { 235 new.lits.push(candidate); 236 continue; 237 } 238 for lit2 in &mut new.lits { 239 if lit2.is_empty() { 240 continue; 241 } 242 if &candidate == lit2 { 243 // If the literal is already in the set, then we can 244 // just drop it. But make sure that cut literals are 245 // infectious! 246 candidate.cut = candidate.cut || lit2.cut; 247 lit2.cut = candidate.cut; 248 continue 'OUTER; 249 } 250 if candidate.len() < lit2.len() { 251 if let Some(i) = position(&candidate, &lit2) { 252 candidate.cut(); 253 let mut lit3 = lit2.clone(); 254 lit3.truncate(i); 255 lit3.cut(); 256 old.push(lit3); 257 lit2.clear(); 258 } 259 } else if let Some(i) = position(&lit2, &candidate) { 260 lit2.cut(); 261 let mut new_candidate = candidate.clone(); 262 new_candidate.truncate(i); 263 new_candidate.cut(); 264 old.push(new_candidate); 265 candidate.clear(); 266 } 267 // Oops, the candidate is already represented in the set. 268 if candidate.is_empty() { 269 continue 'OUTER; 270 } 271 } 272 new.lits.push(candidate); 273 } 274 new.lits.retain(|lit| !lit.is_empty()); 275 new.lits.sort(); 276 new.lits.dedup(); 277 new 278 } 279 280 /// Returns a new set of suffixes of this set of literals that are 281 /// guaranteed to be unambiguous. 282 /// 283 /// Any substring match with a member of the set is returned is guaranteed 284 /// to never overlap with a substring match of another member of the set 285 /// at the same ending position. 286 /// 287 /// Given any two members of the returned set, neither is a substring of 288 /// the other. unambiguous_suffixes(&self) -> Literals289 pub fn unambiguous_suffixes(&self) -> Literals { 290 // This is a touch wasteful... 291 let mut lits = self.clone(); 292 lits.reverse(); 293 let mut unamb = lits.unambiguous_prefixes(); 294 unamb.reverse(); 295 unamb 296 } 297 298 /// Unions the prefixes from the given expression to this set. 299 /// 300 /// If prefixes could not be added (for example, this set would exceed its 301 /// size limits or the set of prefixes from `expr` includes the empty 302 /// string), then false is returned. 303 /// 304 /// Note that prefix literals extracted from `expr` are said to be complete 305 /// if and only if the literal extends from the beginning of `expr` to the 306 /// end of `expr`. union_prefixes(&mut self, expr: &Hir) -> bool307 pub fn union_prefixes(&mut self, expr: &Hir) -> bool { 308 let mut lits = self.to_empty(); 309 prefixes(expr, &mut lits); 310 !lits.is_empty() && !lits.contains_empty() && self.union(lits) 311 } 312 313 /// Unions the suffixes from the given expression to this set. 314 /// 315 /// If suffixes could not be added (for example, this set would exceed its 316 /// size limits or the set of suffixes from `expr` includes the empty 317 /// string), then false is returned. 318 /// 319 /// Note that prefix literals extracted from `expr` are said to be complete 320 /// if and only if the literal extends from the end of `expr` to the 321 /// beginning of `expr`. union_suffixes(&mut self, expr: &Hir) -> bool322 pub fn union_suffixes(&mut self, expr: &Hir) -> bool { 323 let mut lits = self.to_empty(); 324 suffixes(expr, &mut lits); 325 lits.reverse(); 326 !lits.is_empty() && !lits.contains_empty() && self.union(lits) 327 } 328 329 /// Unions this set with another set. 330 /// 331 /// If the union would cause the set to exceed its limits, then the union 332 /// is skipped and it returns false. Otherwise, if the union succeeds, it 333 /// returns true. union(&mut self, lits: Literals) -> bool334 pub fn union(&mut self, lits: Literals) -> bool { 335 if self.num_bytes() + lits.num_bytes() > self.limit_size { 336 return false; 337 } 338 if lits.is_empty() { 339 self.lits.push(Literal::empty()); 340 } else { 341 self.lits.extend(lits.lits); 342 } 343 true 344 } 345 346 /// Extends this set with another set. 347 /// 348 /// The set of literals is extended via a cross product. 349 /// 350 /// If a cross product would cause this set to exceed its limits, then the 351 /// cross product is skipped and it returns false. Otherwise, if the cross 352 /// product succeeds, it returns true. cross_product(&mut self, lits: &Literals) -> bool353 pub fn cross_product(&mut self, lits: &Literals) -> bool { 354 if lits.is_empty() { 355 return true; 356 } 357 // Check that we make sure we stay in our limits. 358 let mut size_after; 359 if self.is_empty() || !self.any_complete() { 360 size_after = self.num_bytes(); 361 for lits_lit in lits.literals() { 362 size_after += lits_lit.len(); 363 } 364 } else { 365 size_after = self.lits.iter().fold(0, |accum, lit| { 366 accum + if lit.is_cut() { lit.len() } else { 0 } 367 }); 368 for lits_lit in lits.literals() { 369 for self_lit in self.literals() { 370 if !self_lit.is_cut() { 371 size_after += self_lit.len() + lits_lit.len(); 372 } 373 } 374 } 375 } 376 if size_after > self.limit_size { 377 return false; 378 } 379 380 let mut base = self.remove_complete(); 381 if base.is_empty() { 382 base = vec![Literal::empty()]; 383 } 384 for lits_lit in lits.literals() { 385 for mut self_lit in base.clone() { 386 self_lit.extend(&**lits_lit); 387 self_lit.cut = lits_lit.cut; 388 self.lits.push(self_lit); 389 } 390 } 391 true 392 } 393 394 /// Extends each literal in this set with the bytes given. 395 /// 396 /// If the set is empty, then the given literal is added to the set. 397 /// 398 /// If adding any number of bytes to all members of this set causes a limit 399 /// to be exceeded, then no bytes are added and false is returned. If a 400 /// prefix of `bytes` can be fit into this set, then it is used and all 401 /// resulting literals are cut. cross_add(&mut self, bytes: &[u8]) -> bool402 pub fn cross_add(&mut self, bytes: &[u8]) -> bool { 403 // N.B. This could be implemented by simply calling cross_product with 404 // a literal set containing just `bytes`, but we can be smarter about 405 // taking shorter prefixes of `bytes` if they'll fit. 406 if bytes.is_empty() { 407 return true; 408 } 409 if self.lits.is_empty() { 410 let i = cmp::min(self.limit_size, bytes.len()); 411 self.lits.push(Literal::new(bytes[..i].to_owned())); 412 self.lits[0].cut = i < bytes.len(); 413 return !self.lits[0].is_cut(); 414 } 415 let size = self.num_bytes(); 416 if size + self.lits.len() >= self.limit_size { 417 return false; 418 } 419 let mut i = 1; 420 while size + (i * self.lits.len()) <= self.limit_size 421 && i < bytes.len() 422 { 423 i += 1; 424 } 425 for lit in &mut self.lits { 426 if !lit.is_cut() { 427 lit.extend(&bytes[..i]); 428 if i < bytes.len() { 429 lit.cut(); 430 } 431 } 432 } 433 true 434 } 435 436 /// Adds the given literal to this set. 437 /// 438 /// Returns false if adding this literal would cause the class to be too 439 /// big. add(&mut self, lit: Literal) -> bool440 pub fn add(&mut self, lit: Literal) -> bool { 441 if self.num_bytes() + lit.len() > self.limit_size { 442 return false; 443 } 444 self.lits.push(lit); 445 true 446 } 447 448 /// Extends each literal in this set with the character class given. 449 /// 450 /// Returns false if the character class was too big to add. add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool451 pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { 452 self._add_char_class(cls, false) 453 } 454 455 /// Extends each literal in this set with the character class given, 456 /// writing the bytes of each character in reverse. 457 /// 458 /// Returns false if the character class was too big to add. add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool459 fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { 460 self._add_char_class(cls, true) 461 } 462 _add_char_class( &mut self, cls: &hir::ClassUnicode, reverse: bool, ) -> bool463 fn _add_char_class( 464 &mut self, 465 cls: &hir::ClassUnicode, 466 reverse: bool, 467 ) -> bool { 468 use std::char; 469 470 if self.class_exceeds_limits(cls_char_count(cls)) { 471 return false; 472 } 473 let mut base = self.remove_complete(); 474 if base.is_empty() { 475 base = vec![Literal::empty()]; 476 } 477 for r in cls.iter() { 478 let (s, e) = (r.start as u32, r.end as u32 + 1); 479 for c in (s..e).filter_map(char::from_u32) { 480 for mut lit in base.clone() { 481 let mut bytes = c.to_string().into_bytes(); 482 if reverse { 483 bytes.reverse(); 484 } 485 lit.extend(&bytes); 486 self.lits.push(lit); 487 } 488 } 489 } 490 true 491 } 492 493 /// Extends each literal in this set with the byte class given. 494 /// 495 /// Returns false if the byte class was too big to add. add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool496 pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { 497 if self.class_exceeds_limits(cls_byte_count(cls)) { 498 return false; 499 } 500 let mut base = self.remove_complete(); 501 if base.is_empty() { 502 base = vec![Literal::empty()]; 503 } 504 for r in cls.iter() { 505 let (s, e) = (r.start as u32, r.end as u32 + 1); 506 for b in (s..e).map(|b| b as u8) { 507 for mut lit in base.clone() { 508 lit.push(b); 509 self.lits.push(lit); 510 } 511 } 512 } 513 true 514 } 515 516 /// Cuts every member of this set. When a member is cut, it can never 517 /// be extended. cut(&mut self)518 pub fn cut(&mut self) { 519 for lit in &mut self.lits { 520 lit.cut(); 521 } 522 } 523 524 /// Reverses all members in place. reverse(&mut self)525 pub fn reverse(&mut self) { 526 for lit in &mut self.lits { 527 lit.reverse(); 528 } 529 } 530 531 /// Clears this set of all members. clear(&mut self)532 pub fn clear(&mut self) { 533 self.lits.clear(); 534 } 535 536 /// Pops all complete literals out of this set. remove_complete(&mut self) -> Vec<Literal>537 fn remove_complete(&mut self) -> Vec<Literal> { 538 let mut base = vec![]; 539 for lit in mem::replace(&mut self.lits, vec![]) { 540 if lit.is_cut() { 541 self.lits.push(lit); 542 } else { 543 base.push(lit); 544 } 545 } 546 base 547 } 548 549 /// Returns the total number of bytes in this set. num_bytes(&self) -> usize550 fn num_bytes(&self) -> usize { 551 self.lits.iter().fold(0, |accum, lit| accum + lit.len()) 552 } 553 554 /// Returns true if a character class with the given size would cause this 555 /// set to exceed its limits. 556 /// 557 /// The size given should correspond to the number of items in the class. class_exceeds_limits(&self, size: usize) -> bool558 fn class_exceeds_limits(&self, size: usize) -> bool { 559 if size > self.limit_class { 560 return true; 561 } 562 // This is an approximation since codepoints in a char class can encode 563 // to 1-4 bytes. 564 let new_byte_count = if self.lits.is_empty() { 565 size 566 } else { 567 self.lits.iter().fold(0, |accum, lit| { 568 accum 569 + if lit.is_cut() { 570 // If the literal is cut, then we'll never add 571 // anything to it, so don't count it. 572 0 573 } else { 574 (lit.len() + 1) * size 575 } 576 }) 577 }; 578 new_byte_count > self.limit_size 579 } 580 } 581 prefixes(expr: &Hir, lits: &mut Literals)582 fn prefixes(expr: &Hir, lits: &mut Literals) { 583 match *expr.kind() { 584 HirKind::Literal(hir::Literal::Unicode(c)) => { 585 let mut buf = [0; 4]; 586 lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); 587 } 588 HirKind::Literal(hir::Literal::Byte(b)) => { 589 lits.cross_add(&[b]); 590 } 591 HirKind::Class(hir::Class::Unicode(ref cls)) => { 592 if !lits.add_char_class(cls) { 593 lits.cut(); 594 } 595 } 596 HirKind::Class(hir::Class::Bytes(ref cls)) => { 597 if !lits.add_byte_class(cls) { 598 lits.cut(); 599 } 600 } 601 HirKind::Group(hir::Group { ref hir, .. }) => { 602 prefixes(&**hir, lits); 603 } 604 HirKind::Repetition(ref x) => match x.kind { 605 hir::RepetitionKind::ZeroOrOne => { 606 repeat_zero_or_one_literals(&x.hir, lits, prefixes); 607 } 608 hir::RepetitionKind::ZeroOrMore => { 609 repeat_zero_or_more_literals(&x.hir, lits, prefixes); 610 } 611 hir::RepetitionKind::OneOrMore => { 612 repeat_one_or_more_literals(&x.hir, lits, prefixes); 613 } 614 hir::RepetitionKind::Range(ref rng) => { 615 let (min, max) = match *rng { 616 hir::RepetitionRange::Exactly(m) => (m, Some(m)), 617 hir::RepetitionRange::AtLeast(m) => (m, None), 618 hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), 619 }; 620 repeat_range_literals( 621 &x.hir, min, max, x.greedy, lits, prefixes, 622 ) 623 } 624 }, 625 HirKind::Concat(ref es) if es.is_empty() => {} 626 HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), 627 HirKind::Concat(ref es) => { 628 for e in es { 629 if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() { 630 if !lits.is_empty() { 631 lits.cut(); 632 break; 633 } 634 lits.add(Literal::empty()); 635 continue; 636 } 637 let mut lits2 = lits.to_empty(); 638 prefixes(e, &mut lits2); 639 if !lits.cross_product(&lits2) || !lits2.any_complete() { 640 // If this expression couldn't yield any literal that 641 // could be extended, then we need to quit. Since we're 642 // short-circuiting, we also need to freeze every member. 643 lits.cut(); 644 break; 645 } 646 } 647 } 648 HirKind::Alternation(ref es) => { 649 alternate_literals(es, lits, prefixes); 650 } 651 _ => lits.cut(), 652 } 653 } 654 suffixes(expr: &Hir, lits: &mut Literals)655 fn suffixes(expr: &Hir, lits: &mut Literals) { 656 match *expr.kind() { 657 HirKind::Literal(hir::Literal::Unicode(c)) => { 658 let mut buf = [0u8; 4]; 659 let i = c.encode_utf8(&mut buf).len(); 660 let buf = &mut buf[..i]; 661 buf.reverse(); 662 lits.cross_add(buf); 663 } 664 HirKind::Literal(hir::Literal::Byte(b)) => { 665 lits.cross_add(&[b]); 666 } 667 HirKind::Class(hir::Class::Unicode(ref cls)) => { 668 if !lits.add_char_class_reverse(cls) { 669 lits.cut(); 670 } 671 } 672 HirKind::Class(hir::Class::Bytes(ref cls)) => { 673 if !lits.add_byte_class(cls) { 674 lits.cut(); 675 } 676 } 677 HirKind::Group(hir::Group { ref hir, .. }) => { 678 suffixes(&**hir, lits); 679 } 680 HirKind::Repetition(ref x) => match x.kind { 681 hir::RepetitionKind::ZeroOrOne => { 682 repeat_zero_or_one_literals(&x.hir, lits, suffixes); 683 } 684 hir::RepetitionKind::ZeroOrMore => { 685 repeat_zero_or_more_literals(&x.hir, lits, suffixes); 686 } 687 hir::RepetitionKind::OneOrMore => { 688 repeat_one_or_more_literals(&x.hir, lits, suffixes); 689 } 690 hir::RepetitionKind::Range(ref rng) => { 691 let (min, max) = match *rng { 692 hir::RepetitionRange::Exactly(m) => (m, Some(m)), 693 hir::RepetitionRange::AtLeast(m) => (m, None), 694 hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), 695 }; 696 repeat_range_literals( 697 &x.hir, min, max, x.greedy, lits, suffixes, 698 ) 699 } 700 }, 701 HirKind::Concat(ref es) if es.is_empty() => {} 702 HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), 703 HirKind::Concat(ref es) => { 704 for e in es.iter().rev() { 705 if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() { 706 if !lits.is_empty() { 707 lits.cut(); 708 break; 709 } 710 lits.add(Literal::empty()); 711 continue; 712 } 713 let mut lits2 = lits.to_empty(); 714 suffixes(e, &mut lits2); 715 if !lits.cross_product(&lits2) || !lits2.any_complete() { 716 // If this expression couldn't yield any literal that 717 // could be extended, then we need to quit. Since we're 718 // short-circuiting, we also need to freeze every member. 719 lits.cut(); 720 break; 721 } 722 } 723 } 724 HirKind::Alternation(ref es) => { 725 alternate_literals(es, lits, suffixes); 726 } 727 _ => lits.cut(), 728 } 729 } 730 repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )731 fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( 732 e: &Hir, 733 lits: &mut Literals, 734 mut f: F, 735 ) { 736 f( 737 &Hir::repetition(hir::Repetition { 738 kind: hir::RepetitionKind::ZeroOrMore, 739 // FIXME: Our literal extraction doesn't care about greediness. 740 // Which is partially why we're treating 'e?' as 'e*'. Namely, 741 // 'ab??' yields [Complete(ab), Complete(a)], but it should yield 742 // [Complete(a), Complete(ab)] because of the non-greediness. 743 greedy: true, 744 hir: Box::new(e.clone()), 745 }), 746 lits, 747 ); 748 } 749 repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )750 fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( 751 e: &Hir, 752 lits: &mut Literals, 753 mut f: F, 754 ) { 755 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); 756 lits3.set_limit_size(lits.limit_size() / 2); 757 f(e, &mut lits3); 758 759 if lits3.is_empty() || !lits2.cross_product(&lits3) { 760 lits.cut(); 761 return; 762 } 763 lits2.cut(); 764 lits2.add(Literal::empty()); 765 if !lits.union(lits2) { 766 lits.cut(); 767 } 768 } 769 repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )770 fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( 771 e: &Hir, 772 lits: &mut Literals, 773 mut f: F, 774 ) { 775 f(e, lits); 776 lits.cut(); 777 } 778 repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, min: u32, max: Option<u32>, greedy: bool, lits: &mut Literals, mut f: F, )779 fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( 780 e: &Hir, 781 min: u32, 782 max: Option<u32>, 783 greedy: bool, 784 lits: &mut Literals, 785 mut f: F, 786 ) { 787 if min == 0 { 788 // This is a bit conservative. If `max` is set, then we could 789 // treat this as a finite set of alternations. For now, we 790 // just treat it as `e*`. 791 f( 792 &Hir::repetition(hir::Repetition { 793 kind: hir::RepetitionKind::ZeroOrMore, 794 greedy, 795 hir: Box::new(e.clone()), 796 }), 797 lits, 798 ); 799 } else { 800 if min > 0 { 801 let n = cmp::min(lits.limit_size, min as usize); 802 let es = iter::repeat(e.clone()).take(n).collect(); 803 f(&Hir::concat(es), lits); 804 if n < min as usize || lits.contains_empty() { 805 lits.cut(); 806 } 807 } 808 if max.map_or(true, |max| min < max) { 809 lits.cut(); 810 } 811 } 812 } 813 alternate_literals<F: FnMut(&Hir, &mut Literals)>( es: &[Hir], lits: &mut Literals, mut f: F, )814 fn alternate_literals<F: FnMut(&Hir, &mut Literals)>( 815 es: &[Hir], 816 lits: &mut Literals, 817 mut f: F, 818 ) { 819 let mut lits2 = lits.to_empty(); 820 for e in es { 821 let mut lits3 = lits.to_empty(); 822 lits3.set_limit_size(lits.limit_size() / 5); 823 f(e, &mut lits3); 824 if lits3.is_empty() || !lits2.union(lits3) { 825 // If we couldn't find suffixes for *any* of the 826 // alternates, then the entire alternation has to be thrown 827 // away and any existing members must be frozen. Similarly, 828 // if the union couldn't complete, stop and freeze. 829 lits.cut(); 830 return; 831 } 832 } 833 if !lits.cross_product(&lits2) { 834 lits.cut(); 835 } 836 } 837 838 impl fmt::Debug for Literals { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result839 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 840 f.debug_struct("Literals") 841 .field("lits", &self.lits) 842 .field("limit_size", &self.limit_size) 843 .field("limit_class", &self.limit_class) 844 .finish() 845 } 846 } 847 848 impl Literal { 849 /// Returns a new complete literal with the bytes given. new(bytes: Vec<u8>) -> Literal850 pub fn new(bytes: Vec<u8>) -> Literal { 851 Literal { v: bytes, cut: false } 852 } 853 854 /// Returns a new complete empty literal. empty() -> Literal855 pub fn empty() -> Literal { 856 Literal { v: vec![], cut: false } 857 } 858 859 /// Returns true if this literal was "cut." is_cut(&self) -> bool860 pub fn is_cut(&self) -> bool { 861 self.cut 862 } 863 864 /// Cuts this literal. cut(&mut self)865 pub fn cut(&mut self) { 866 self.cut = true; 867 } 868 } 869 870 impl PartialEq for Literal { eq(&self, other: &Literal) -> bool871 fn eq(&self, other: &Literal) -> bool { 872 self.v == other.v 873 } 874 } 875 876 impl PartialOrd for Literal { partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering>877 fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> { 878 self.v.partial_cmp(&other.v) 879 } 880 } 881 882 impl fmt::Debug for Literal { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result883 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 884 if self.is_cut() { 885 write!(f, "Cut({})", escape_unicode(&self.v)) 886 } else { 887 write!(f, "Complete({})", escape_unicode(&self.v)) 888 } 889 } 890 } 891 892 impl AsRef<[u8]> for Literal { as_ref(&self) -> &[u8]893 fn as_ref(&self) -> &[u8] { 894 &self.v 895 } 896 } 897 898 impl ops::Deref for Literal { 899 type Target = Vec<u8>; deref(&self) -> &Vec<u8>900 fn deref(&self) -> &Vec<u8> { 901 &self.v 902 } 903 } 904 905 impl ops::DerefMut for Literal { deref_mut(&mut self) -> &mut Vec<u8>906 fn deref_mut(&mut self) -> &mut Vec<u8> { 907 &mut self.v 908 } 909 } 910 position(needle: &[u8], mut haystack: &[u8]) -> Option<usize>911 fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> { 912 let mut i = 0; 913 while haystack.len() >= needle.len() { 914 if needle == &haystack[..needle.len()] { 915 return Some(i); 916 } 917 i += 1; 918 haystack = &haystack[1..]; 919 } 920 None 921 } 922 escape_unicode(bytes: &[u8]) -> String923 fn escape_unicode(bytes: &[u8]) -> String { 924 let show = match ::std::str::from_utf8(bytes) { 925 Ok(v) => v.to_string(), 926 Err(_) => escape_bytes(bytes), 927 }; 928 let mut space_escaped = String::new(); 929 for c in show.chars() { 930 if c.is_whitespace() { 931 let escaped = if c as u32 <= 0x7F { 932 escape_byte(c as u8) 933 } else if c as u32 <= 0xFFFF { 934 format!(r"\u{{{:04x}}}", c as u32) 935 } else { 936 format!(r"\U{{{:08x}}}", c as u32) 937 }; 938 space_escaped.push_str(&escaped); 939 } else { 940 space_escaped.push(c); 941 } 942 } 943 space_escaped 944 } 945 escape_bytes(bytes: &[u8]) -> String946 fn escape_bytes(bytes: &[u8]) -> String { 947 let mut s = String::new(); 948 for &b in bytes { 949 s.push_str(&escape_byte(b)); 950 } 951 s 952 } 953 escape_byte(byte: u8) -> String954 fn escape_byte(byte: u8) -> String { 955 use std::ascii::escape_default; 956 957 let escaped: Vec<u8> = escape_default(byte).collect(); 958 String::from_utf8_lossy(&escaped).into_owned() 959 } 960 cls_char_count(cls: &hir::ClassUnicode) -> usize961 fn cls_char_count(cls: &hir::ClassUnicode) -> usize { 962 cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() 963 as usize 964 } 965 cls_byte_count(cls: &hir::ClassBytes) -> usize966 fn cls_byte_count(cls: &hir::ClassBytes) -> usize { 967 cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() 968 as usize 969 } 970 971 #[cfg(test)] 972 mod tests { 973 use std::fmt; 974 975 use super::{escape_bytes, Literal, Literals}; 976 use crate::hir::Hir; 977 use crate::ParserBuilder; 978 979 // To make test failures easier to read. 980 #[derive(Debug, Eq, PartialEq)] 981 struct Bytes(Vec<ULiteral>); 982 #[derive(Debug, Eq, PartialEq)] 983 struct Unicode(Vec<ULiteral>); 984 escape_lits(blits: &[Literal]) -> Vec<ULiteral>985 fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> { 986 let mut ulits = vec![]; 987 for blit in blits { 988 ulits 989 .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() }); 990 } 991 ulits 992 } 993 create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals994 fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals { 995 Literals { 996 lits: it.into_iter().collect(), 997 limit_size: 0, 998 limit_class: 0, 999 } 1000 } 1001 1002 // Needs to be pub for 1.3? 1003 #[derive(Clone, Eq, PartialEq)] 1004 pub struct ULiteral { 1005 v: String, 1006 cut: bool, 1007 } 1008 1009 impl ULiteral { is_cut(&self) -> bool1010 fn is_cut(&self) -> bool { 1011 self.cut 1012 } 1013 } 1014 1015 impl fmt::Debug for ULiteral { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1016 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1017 if self.is_cut() { 1018 write!(f, "Cut({})", self.v) 1019 } else { 1020 write!(f, "Complete({})", self.v) 1021 } 1022 } 1023 } 1024 1025 impl PartialEq<Literal> for ULiteral { eq(&self, other: &Literal) -> bool1026 fn eq(&self, other: &Literal) -> bool { 1027 self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() 1028 } 1029 } 1030 1031 impl PartialEq<ULiteral> for Literal { eq(&self, other: &ULiteral) -> bool1032 fn eq(&self, other: &ULiteral) -> bool { 1033 &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() 1034 } 1035 } 1036 1037 #[allow(non_snake_case)] C(s: &'static str) -> ULiteral1038 fn C(s: &'static str) -> ULiteral { 1039 ULiteral { v: s.to_owned(), cut: true } 1040 } 1041 #[allow(non_snake_case)] M(s: &'static str) -> ULiteral1042 fn M(s: &'static str) -> ULiteral { 1043 ULiteral { v: s.to_owned(), cut: false } 1044 } 1045 prefixes(lits: &mut Literals, expr: &Hir)1046 fn prefixes(lits: &mut Literals, expr: &Hir) { 1047 lits.union_prefixes(expr); 1048 } 1049 suffixes(lits: &mut Literals, expr: &Hir)1050 fn suffixes(lits: &mut Literals, expr: &Hir) { 1051 lits.union_suffixes(expr); 1052 } 1053 1054 macro_rules! assert_lit_eq { 1055 ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ 1056 let expected: Vec<ULiteral> = vec![$($expected_lit),*]; 1057 let lits = $got_lits; 1058 assert_eq!( 1059 $which(expected.clone()), 1060 $which(escape_lits(lits.literals()))); 1061 assert_eq!( 1062 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), 1063 lits.all_complete()); 1064 assert_eq!( 1065 expected.iter().any(|l| !l.is_cut()), 1066 lits.any_complete()); 1067 }}; 1068 } 1069 1070 macro_rules! test_lit { 1071 ($name:ident, $which:ident, $re:expr) => { 1072 test_lit!($name, $which, $re,); 1073 }; 1074 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { 1075 #[test] 1076 fn $name() { 1077 let expr = ParserBuilder::new() 1078 .build() 1079 .parse($re) 1080 .unwrap(); 1081 let lits = Literals::$which(&expr); 1082 assert_lit_eq!(Unicode, lits, $($lit),*); 1083 1084 let expr = ParserBuilder::new() 1085 .allow_invalid_utf8(true) 1086 .unicode(false) 1087 .build() 1088 .parse($re) 1089 .unwrap(); 1090 let lits = Literals::$which(&expr); 1091 assert_lit_eq!(Bytes, lits, $($lit),*); 1092 } 1093 }; 1094 } 1095 1096 // ************************************************************************ 1097 // Tests for prefix literal extraction. 1098 // ************************************************************************ 1099 1100 // Elementary tests. 1101 test_lit!(pfx_one_lit1, prefixes, "a", M("a")); 1102 test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); 1103 test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); 1104 #[cfg(feature = "unicode-case")] 1105 test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); 1106 test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); 1107 test_lit!( 1108 pfx_class2, 1109 prefixes, 1110 "(?u)[â˜ƒâ… ]", 1111 M("\\xe2\\x85\\xa0"), 1112 M("\\xe2\\x98\\x83") 1113 ); 1114 #[cfg(feature = "unicode-case")] 1115 test_lit!( 1116 pfx_class3, 1117 prefixes, 1118 "(?ui)[â˜ƒâ… ]", 1119 M("\\xe2\\x85\\xa0"), 1120 M("\\xe2\\x85\\xb0"), 1121 M("\\xe2\\x98\\x83") 1122 ); 1123 test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a")); 1124 test_lit!( 1125 pfx_one_lit_casei2, 1126 prefixes, 1127 "(?i-u)abc", 1128 M("ABC"), 1129 M("aBC"), 1130 M("AbC"), 1131 M("abC"), 1132 M("ABc"), 1133 M("aBc"), 1134 M("Abc"), 1135 M("abc") 1136 ); 1137 test_lit!(pfx_group1, prefixes, "(a)", M("a")); 1138 test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); 1139 test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); 1140 test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a")); 1141 // FIXME: This should return [M("a"), M("ab")] because of the non-greedy 1142 // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get 1143 // a cut literal. 1144 test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a")); 1145 test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); 1146 test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); 1147 test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); 1148 test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); 1149 test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); 1150 test_lit!(pfx_rep_range1, prefixes, "a{0}"); 1151 test_lit!(pfx_rep_range2, prefixes, "a{0,}"); 1152 test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); 1153 test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); 1154 test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); 1155 test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); 1156 test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); 1157 1158 // Test regexes with concatenations. 1159 test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); 1160 test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); 1161 test_lit!( 1162 pfx_cat3, 1163 prefixes, 1164 "(?i-u)[ab]z", 1165 M("AZ"), 1166 M("BZ"), 1167 M("aZ"), 1168 M("bZ"), 1169 M("Az"), 1170 M("Bz"), 1171 M("az"), 1172 M("bz") 1173 ); 1174 test_lit!( 1175 pfx_cat4, 1176 prefixes, 1177 "[ab][yz]", 1178 M("ay"), 1179 M("by"), 1180 M("az"), 1181 M("bz") 1182 ); 1183 test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); 1184 test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); 1185 test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); 1186 test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); 1187 test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); 1188 test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); 1189 test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); 1190 test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); 1191 test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); 1192 test_lit!(pfx_cat14, prefixes, "a^", C("a")); 1193 test_lit!(pfx_cat15, prefixes, "$a"); 1194 test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); 1195 test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); 1196 test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); 1197 test_lit!(pfx_cat19, prefixes, "a.z", C("a")); 1198 1199 // Test regexes with alternations. 1200 test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); 1201 test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); 1202 test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); 1203 test_lit!(pfx_alt4, prefixes, "a|b*"); 1204 test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); 1205 test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); 1206 test_lit!( 1207 pfx_alt7, 1208 prefixes, 1209 "(a|b)*c|(a|ab)*c", 1210 C("a"), 1211 C("b"), 1212 M("c"), 1213 C("a"), 1214 C("ab"), 1215 M("c") 1216 ); 1217 test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); 1218 1219 // Test regexes with empty assertions. 1220 test_lit!(pfx_empty1, prefixes, "^a", M("a")); 1221 test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); 1222 test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); 1223 test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); 1224 1225 // Make sure some curious regexes have no prefixes. 1226 test_lit!(pfx_nothing1, prefixes, "."); 1227 test_lit!(pfx_nothing2, prefixes, "(?s)."); 1228 test_lit!(pfx_nothing3, prefixes, "^"); 1229 test_lit!(pfx_nothing4, prefixes, "$"); 1230 test_lit!(pfx_nothing6, prefixes, "(?m)$"); 1231 test_lit!(pfx_nothing7, prefixes, r"\b"); 1232 test_lit!(pfx_nothing8, prefixes, r"\B"); 1233 1234 // Test a few regexes that defeat any prefix literal detection. 1235 test_lit!(pfx_defeated1, prefixes, ".a"); 1236 test_lit!(pfx_defeated2, prefixes, "(?s).a"); 1237 test_lit!(pfx_defeated3, prefixes, "a*b*c*"); 1238 test_lit!(pfx_defeated4, prefixes, "a|."); 1239 test_lit!(pfx_defeated5, prefixes, ".|a"); 1240 test_lit!(pfx_defeated6, prefixes, "a|^"); 1241 test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); 1242 test_lit!(pfx_defeated8, prefixes, "$a"); 1243 test_lit!(pfx_defeated9, prefixes, "(?m)$a"); 1244 test_lit!(pfx_defeated10, prefixes, r"\ba"); 1245 test_lit!(pfx_defeated11, prefixes, r"\Ba"); 1246 test_lit!(pfx_defeated12, prefixes, "^*a"); 1247 test_lit!(pfx_defeated13, prefixes, "^+a"); 1248 1249 test_lit!( 1250 pfx_crazy1, 1251 prefixes, 1252 r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", 1253 C("Mo\\'"), 1254 C("Mu\\'"), 1255 C("Moam"), 1256 C("Muam") 1257 ); 1258 1259 // ************************************************************************ 1260 // Tests for quiting prefix literal search. 1261 // ************************************************************************ 1262 1263 macro_rules! test_exhausted { 1264 ($name:ident, $which:ident, $re:expr) => { 1265 test_exhausted!($name, $which, $re,); 1266 }; 1267 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { 1268 #[test] 1269 fn $name() { 1270 let expr = ParserBuilder::new() 1271 .build() 1272 .parse($re) 1273 .unwrap(); 1274 let mut lits = Literals::empty(); 1275 lits.set_limit_size(20).set_limit_class(10); 1276 $which(&mut lits, &expr); 1277 assert_lit_eq!(Unicode, lits, $($lit),*); 1278 1279 let expr = ParserBuilder::new() 1280 .allow_invalid_utf8(true) 1281 .unicode(false) 1282 .build() 1283 .parse($re) 1284 .unwrap(); 1285 let mut lits = Literals::empty(); 1286 lits.set_limit_size(20).set_limit_class(10); 1287 $which(&mut lits, &expr); 1288 assert_lit_eq!(Bytes, lits, $($lit),*); 1289 } 1290 }; 1291 } 1292 1293 // These test use a much lower limit than the default so that we can 1294 // write test cases of reasonable size. 1295 test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); 1296 test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); 1297 test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); 1298 test_exhausted!( 1299 pfx_exhausted4, 1300 prefixes, 1301 "(?i-u)foobar", 1302 C("FO"), 1303 C("fO"), 1304 C("Fo"), 1305 C("fo") 1306 ); 1307 test_exhausted!( 1308 pfx_exhausted5, 1309 prefixes, 1310 "(?:ab){100}", 1311 C("abababababababababab") 1312 ); 1313 test_exhausted!( 1314 pfx_exhausted6, 1315 prefixes, 1316 "(?:(?:ab){100})*cd", 1317 C("ababababab"), 1318 M("cd") 1319 ); 1320 test_exhausted!( 1321 pfx_exhausted7, 1322 prefixes, 1323 "z(?:(?:ab){100})*cd", 1324 C("zababababab"), 1325 M("zcd") 1326 ); 1327 test_exhausted!( 1328 pfx_exhausted8, 1329 prefixes, 1330 "aaaaaaaaaaaaaaaaaaaaz", 1331 C("aaaaaaaaaaaaaaaaaaaa") 1332 ); 1333 1334 // ************************************************************************ 1335 // Tests for suffix literal extraction. 1336 // ************************************************************************ 1337 1338 // Elementary tests. 1339 test_lit!(sfx_one_lit1, suffixes, "a", M("a")); 1340 test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); 1341 test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); 1342 #[cfg(feature = "unicode-case")] 1343 test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); 1344 test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); 1345 test_lit!( 1346 sfx_class2, 1347 suffixes, 1348 "(?u)[â˜ƒâ… ]", 1349 M("\\xe2\\x85\\xa0"), 1350 M("\\xe2\\x98\\x83") 1351 ); 1352 #[cfg(feature = "unicode-case")] 1353 test_lit!( 1354 sfx_class3, 1355 suffixes, 1356 "(?ui)[â˜ƒâ… ]", 1357 M("\\xe2\\x85\\xa0"), 1358 M("\\xe2\\x85\\xb0"), 1359 M("\\xe2\\x98\\x83") 1360 ); 1361 test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a")); 1362 test_lit!( 1363 sfx_one_lit_casei2, 1364 suffixes, 1365 "(?i-u)abc", 1366 M("ABC"), 1367 M("ABc"), 1368 M("AbC"), 1369 M("Abc"), 1370 M("aBC"), 1371 M("aBc"), 1372 M("abC"), 1373 M("abc") 1374 ); 1375 test_lit!(sfx_group1, suffixes, "(a)", M("a")); 1376 test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); 1377 test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); 1378 test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); 1379 test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); 1380 test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); 1381 test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); 1382 test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); 1383 test_lit!(sfx_rep_range1, suffixes, "a{0}"); 1384 test_lit!(sfx_rep_range2, suffixes, "a{0,}"); 1385 test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); 1386 test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); 1387 test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); 1388 test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); 1389 test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); 1390 1391 // Test regexes with concatenations. 1392 test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); 1393 test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); 1394 test_lit!( 1395 sfx_cat3, 1396 suffixes, 1397 "(?i-u)[ab]z", 1398 M("AZ"), 1399 M("Az"), 1400 M("BZ"), 1401 M("Bz"), 1402 M("aZ"), 1403 M("az"), 1404 M("bZ"), 1405 M("bz") 1406 ); 1407 test_lit!( 1408 sfx_cat4, 1409 suffixes, 1410 "[ab][yz]", 1411 M("ay"), 1412 M("az"), 1413 M("by"), 1414 M("bz") 1415 ); 1416 test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); 1417 test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); 1418 test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); 1419 test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); 1420 test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); 1421 test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); 1422 test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); 1423 test_lit!(sfx_cat12, suffixes, "ab+", C("b")); 1424 test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); 1425 test_lit!(sfx_cat14, suffixes, "a^"); 1426 test_lit!(sfx_cat15, suffixes, "$a", C("a")); 1427 test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); 1428 test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); 1429 test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); 1430 test_lit!(sfx_cat19, suffixes, "a.z", C("z")); 1431 1432 // Test regexes with alternations. 1433 test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); 1434 test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); 1435 test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); 1436 test_lit!(sfx_alt4, suffixes, "a|b*"); 1437 test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); 1438 test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); 1439 test_lit!( 1440 sfx_alt7, 1441 suffixes, 1442 "(a|b)*c|(a|ab)*c", 1443 C("ac"), 1444 C("bc"), 1445 M("c"), 1446 C("ac"), 1447 C("abc"), 1448 M("c") 1449 ); 1450 test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); 1451 1452 // Test regexes with empty assertions. 1453 test_lit!(sfx_empty1, suffixes, "a$", M("a")); 1454 test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); 1455 1456 // Make sure some curious regexes have no suffixes. 1457 test_lit!(sfx_nothing1, suffixes, "."); 1458 test_lit!(sfx_nothing2, suffixes, "(?s)."); 1459 test_lit!(sfx_nothing3, suffixes, "^"); 1460 test_lit!(sfx_nothing4, suffixes, "$"); 1461 test_lit!(sfx_nothing6, suffixes, "(?m)$"); 1462 test_lit!(sfx_nothing7, suffixes, r"\b"); 1463 test_lit!(sfx_nothing8, suffixes, r"\B"); 1464 1465 // Test a few regexes that defeat any suffix literal detection. 1466 test_lit!(sfx_defeated1, suffixes, "a."); 1467 test_lit!(sfx_defeated2, suffixes, "(?s)a."); 1468 test_lit!(sfx_defeated3, suffixes, "a*b*c*"); 1469 test_lit!(sfx_defeated4, suffixes, "a|."); 1470 test_lit!(sfx_defeated5, suffixes, ".|a"); 1471 test_lit!(sfx_defeated6, suffixes, "a|^"); 1472 test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); 1473 test_lit!(sfx_defeated8, suffixes, "a^"); 1474 test_lit!(sfx_defeated9, suffixes, "(?m)a$"); 1475 test_lit!(sfx_defeated10, suffixes, r"a\b"); 1476 test_lit!(sfx_defeated11, suffixes, r"a\B"); 1477 test_lit!(sfx_defeated12, suffixes, "a^*"); 1478 test_lit!(sfx_defeated13, suffixes, "a^+"); 1479 1480 // These test use a much lower limit than the default so that we can 1481 // write test cases of reasonable size. 1482 test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); 1483 test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); 1484 test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); 1485 test_exhausted!( 1486 sfx_exhausted4, 1487 suffixes, 1488 "(?i-u)foobar", 1489 C("AR"), 1490 C("Ar"), 1491 C("aR"), 1492 C("ar") 1493 ); 1494 test_exhausted!( 1495 sfx_exhausted5, 1496 suffixes, 1497 "(?:ab){100}", 1498 C("abababababababababab") 1499 ); 1500 test_exhausted!( 1501 sfx_exhausted6, 1502 suffixes, 1503 "cd(?:(?:ab){100})*", 1504 C("ababababab"), 1505 M("cd") 1506 ); 1507 test_exhausted!( 1508 sfx_exhausted7, 1509 suffixes, 1510 "cd(?:(?:ab){100})*z", 1511 C("abababababz"), 1512 M("cdz") 1513 ); 1514 test_exhausted!( 1515 sfx_exhausted8, 1516 suffixes, 1517 "zaaaaaaaaaaaaaaaaaaaa", 1518 C("aaaaaaaaaaaaaaaaaaaa") 1519 ); 1520 1521 // ************************************************************************ 1522 // Tests for generating unambiguous literal sets. 1523 // ************************************************************************ 1524 1525 macro_rules! test_unamb { 1526 ($name:ident, $given:expr, $expected:expr) => { 1527 #[test] 1528 fn $name() { 1529 let given: Vec<Literal> = $given 1530 .into_iter() 1531 .map(|ul| { 1532 let cut = ul.is_cut(); 1533 Literal { v: ul.v.into_bytes(), cut: cut } 1534 }) 1535 .collect(); 1536 let lits = create_lits(given); 1537 let got = lits.unambiguous_prefixes(); 1538 assert_eq!($expected, escape_lits(got.literals())); 1539 } 1540 }; 1541 } 1542 1543 test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); 1544 test_unamb!( 1545 unambiguous2, 1546 vec![M("zaaaaaa"), M("aa")], 1547 vec![C("aa"), C("z")] 1548 ); 1549 test_unamb!( 1550 unambiguous3, 1551 vec![M("Sherlock"), M("Watson")], 1552 vec![M("Sherlock"), M("Watson")] 1553 ); 1554 test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); 1555 test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); 1556 test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); 1557 test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); 1558 test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); 1559 test_unamb!( 1560 unambiguous9, 1561 vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], 1562 vec![C("a"), C("b"), C("c")] 1563 ); 1564 test_unamb!( 1565 unambiguous10, 1566 vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], 1567 vec![C("Mo"), C("Mu")] 1568 ); 1569 test_unamb!( 1570 unambiguous11, 1571 vec![M("zazb"), M("azb")], 1572 vec![C("a"), C("z")] 1573 ); 1574 test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); 1575 test_unamb!( 1576 unambiguous13, 1577 vec![M("ABCX"), M("CDAX"), M("BCX")], 1578 vec![C("A"), C("BCX"), C("CD")] 1579 ); 1580 test_unamb!( 1581 unambiguous14, 1582 vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], 1583 vec![M("DSX"), C("I"), C("MGX"), C("MV")] 1584 ); 1585 test_unamb!( 1586 unambiguous15, 1587 vec![M("IMG_"), M("MG_"), M("CIMG")], 1588 vec![C("C"), C("I"), C("MG_")] 1589 ); 1590 1591 // ************************************************************************ 1592 // Tests for suffix trimming. 1593 // ************************************************************************ 1594 macro_rules! test_trim { 1595 ($name:ident, $trim:expr, $given:expr, $expected:expr) => { 1596 #[test] 1597 fn $name() { 1598 let given: Vec<Literal> = $given 1599 .into_iter() 1600 .map(|ul| { 1601 let cut = ul.is_cut(); 1602 Literal { v: ul.v.into_bytes(), cut: cut } 1603 }) 1604 .collect(); 1605 let lits = create_lits(given); 1606 let got = lits.trim_suffix($trim).unwrap(); 1607 assert_eq!($expected, escape_lits(got.literals())); 1608 } 1609 }; 1610 } 1611 1612 test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); 1613 test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); 1614 test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); 1615 test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); 1616 1617 // ************************************************************************ 1618 // Tests for longest common prefix. 1619 // ************************************************************************ 1620 1621 macro_rules! test_lcp { 1622 ($name:ident, $given:expr, $expected:expr) => { 1623 #[test] 1624 fn $name() { 1625 let given: Vec<Literal> = $given 1626 .into_iter() 1627 .map(|s: &str| Literal { 1628 v: s.to_owned().into_bytes(), 1629 cut: false, 1630 }) 1631 .collect(); 1632 let lits = create_lits(given); 1633 let got = lits.longest_common_prefix(); 1634 assert_eq!($expected, escape_bytes(got)); 1635 } 1636 }; 1637 } 1638 1639 test_lcp!(lcp1, vec!["a"], "a"); 1640 test_lcp!(lcp2, vec![], ""); 1641 test_lcp!(lcp3, vec!["a", "b"], ""); 1642 test_lcp!(lcp4, vec!["ab", "ab"], "ab"); 1643 test_lcp!(lcp5, vec!["ab", "a"], "a"); 1644 test_lcp!(lcp6, vec!["a", "ab"], "a"); 1645 test_lcp!(lcp7, vec!["ab", "b"], ""); 1646 test_lcp!(lcp8, vec!["b", "ab"], ""); 1647 test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); 1648 test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); 1649 test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); 1650 test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); 1651 1652 // ************************************************************************ 1653 // Tests for longest common suffix. 1654 // ************************************************************************ 1655 1656 macro_rules! test_lcs { 1657 ($name:ident, $given:expr, $expected:expr) => { 1658 #[test] 1659 fn $name() { 1660 let given: Vec<Literal> = $given 1661 .into_iter() 1662 .map(|s: &str| Literal { 1663 v: s.to_owned().into_bytes(), 1664 cut: false, 1665 }) 1666 .collect(); 1667 let lits = create_lits(given); 1668 let got = lits.longest_common_suffix(); 1669 assert_eq!($expected, escape_bytes(got)); 1670 } 1671 }; 1672 } 1673 1674 test_lcs!(lcs1, vec!["a"], "a"); 1675 test_lcs!(lcs2, vec![], ""); 1676 test_lcs!(lcs3, vec!["a", "b"], ""); 1677 test_lcs!(lcs4, vec!["ab", "ab"], "ab"); 1678 test_lcs!(lcs5, vec!["ab", "a"], ""); 1679 test_lcs!(lcs6, vec!["a", "ab"], ""); 1680 test_lcs!(lcs7, vec!["ab", "b"], "b"); 1681 test_lcs!(lcs8, vec!["b", "ab"], "b"); 1682 test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); 1683 test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); 1684 test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); 1685 test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); 1686 } 1687