1 use std::fmt;
2 
3 use classes::ByteClasses;
4 pub use nfa::compiler::Builder;
5 
6 mod compiler;
7 mod map;
8 mod range_trie;
9 
10 /// The representation for an NFA state identifier.
11 pub type StateID = usize;
12 
13 /// A final compiled NFA.
14 ///
15 /// The states of the NFA are indexed by state IDs, which are how transitions
16 /// are expressed.
17 #[derive(Clone)]
18 pub struct NFA {
19     /// Whether this NFA can only match at the beginning of input or not.
20     ///
21     /// When true, a match should only be reported if it begins at the 0th
22     /// index of the haystack.
23     anchored: bool,
24     /// The starting state of this NFA.
25     start: StateID,
26     /// The state list. This list is guaranteed to be indexable by the starting
27     /// state ID, and it is also guaranteed to contain exactly one `Match`
28     /// state.
29     states: Vec<State>,
30     /// A mapping from any byte value to its corresponding equivalence class
31     /// identifier. Two bytes in the same equivalence class cannot discriminate
32     /// between a match or a non-match. This map can be used to shrink the
33     /// total size of a DFA's transition table with a small match-time cost.
34     ///
35     /// Note that the NFA's transitions are *not* defined in terms of these
36     /// equivalence classes. The NFA's transitions are defined on the original
37     /// byte values. For the most part, this is because they wouldn't really
38     /// help the NFA much since the NFA already uses a sparse representation
39     /// to represent transitions. Byte classes are most effective in a dense
40     /// representation.
41     byte_classes: ByteClasses,
42 }
43 
44 impl NFA {
45     /// Returns an NFA that always matches at every position.
always_match() -> NFA46     pub fn always_match() -> NFA {
47         NFA {
48             anchored: false,
49             start: 0,
50             states: vec![State::Match],
51             byte_classes: ByteClasses::empty(),
52         }
53     }
54 
55     /// Returns an NFA that never matches at any position.
never_match() -> NFA56     pub fn never_match() -> NFA {
57         NFA {
58             anchored: false,
59             start: 0,
60             states: vec![State::Fail],
61             byte_classes: ByteClasses::empty(),
62         }
63     }
64 
65     /// Returns true if and only if this NFA is anchored.
is_anchored(&self) -> bool66     pub fn is_anchored(&self) -> bool {
67         self.anchored
68     }
69 
70     /// Return the number of states in this NFA.
len(&self) -> usize71     pub fn len(&self) -> usize {
72         self.states.len()
73     }
74 
75     /// Return the ID of the initial state of this NFA.
start(&self) -> StateID76     pub fn start(&self) -> StateID {
77         self.start
78     }
79 
80     /// Return the NFA state corresponding to the given ID.
state(&self, id: StateID) -> &State81     pub fn state(&self, id: StateID) -> &State {
82         &self.states[id]
83     }
84 
85     /// Return the set of equivalence classes for this NFA. The slice returned
86     /// always has length 256 and maps each possible byte value to its
87     /// corresponding equivalence class ID (which is never more than 255).
byte_classes(&self) -> &ByteClasses88     pub fn byte_classes(&self) -> &ByteClasses {
89         &self.byte_classes
90     }
91 }
92 
93 impl fmt::Debug for NFA {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result94     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
95         for (i, state) in self.states.iter().enumerate() {
96             let status = if i == self.start { '>' } else { ' ' };
97             writeln!(f, "{}{:06}: {:?}", status, i, state)?;
98         }
99         Ok(())
100     }
101 }
102 
103 /// A state in a final compiled NFA.
104 #[derive(Clone, Eq, PartialEq)]
105 pub enum State {
106     /// A state that transitions to `next` if and only if the current input
107     /// byte is in the range `[start, end]` (inclusive).
108     ///
109     /// This is a special case of Sparse in that it encodes only one transition
110     /// (and therefore avoids the allocation).
111     Range { range: Transition },
112     /// A state with possibly many transitions, represented in a sparse
113     /// fashion. Transitions are ordered lexicographically by input range.
114     /// As such, this may only be used when every transition has equal
115     /// priority. (In practice, this is only used for encoding large UTF-8
116     /// automata.)
117     Sparse { ranges: Box<[Transition]> },
118     /// An alternation such that there exists an epsilon transition to all
119     /// states in `alternates`, where matches found via earlier transitions
120     /// are preferred over later transitions.
121     Union { alternates: Box<[StateID]> },
122     /// A fail state. When encountered, the automaton is guaranteed to never
123     /// reach a match state.
124     Fail,
125     /// A match state. There is exactly one such occurrence of this state in
126     /// an NFA.
127     Match,
128 }
129 
130 /// A transition to another state, only if the given byte falls in the
131 /// inclusive range specified.
132 #[derive(Clone, Copy, Eq, Hash, PartialEq)]
133 pub struct Transition {
134     pub start: u8,
135     pub end: u8,
136     pub next: StateID,
137 }
138 
139 impl State {
140     /// Returns true if and only if this state contains one or more epsilon
141     /// transitions.
is_epsilon(&self) -> bool142     pub fn is_epsilon(&self) -> bool {
143         match *self {
144             State::Range { .. }
145             | State::Sparse { .. }
146             | State::Fail
147             | State::Match => false,
148             State::Union { .. } => true,
149         }
150     }
151 
152     /// Remap the transitions in this state using the given map. Namely, the
153     /// given map should be indexed according to the transitions currently
154     /// in this state.
155     ///
156     /// This is used during the final phase of the NFA compiler, which turns
157     /// its intermediate NFA into the final NFA.
remap(&mut self, remap: &[StateID])158     fn remap(&mut self, remap: &[StateID]) {
159         match *self {
160             State::Range { ref mut range } => range.next = remap[range.next],
161             State::Sparse { ref mut ranges } => {
162                 for r in ranges.iter_mut() {
163                     r.next = remap[r.next];
164                 }
165             }
166             State::Union { ref mut alternates } => {
167                 for alt in alternates.iter_mut() {
168                     *alt = remap[*alt];
169                 }
170             }
171             State::Fail => {}
172             State::Match => {}
173         }
174     }
175 }
176 
177 impl fmt::Debug for State {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result178     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
179         match *self {
180             State::Range { ref range } => range.fmt(f),
181             State::Sparse { ref ranges } => {
182                 let rs = ranges
183                     .iter()
184                     .map(|t| format!("{:?}", t))
185                     .collect::<Vec<String>>()
186                     .join(", ");
187                 write!(f, "sparse({})", rs)
188             }
189             State::Union { ref alternates } => {
190                 let alts = alternates
191                     .iter()
192                     .map(|id| format!("{}", id))
193                     .collect::<Vec<String>>()
194                     .join(", ");
195                 write!(f, "alt({})", alts)
196             }
197             State::Fail => write!(f, "FAIL"),
198             State::Match => write!(f, "MATCH"),
199         }
200     }
201 }
202 
203 impl fmt::Debug for Transition {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result204     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
205         let Transition { start, end, next } = *self;
206         if self.start == self.end {
207             write!(f, "{} => {}", escape(start), next)
208         } else {
209             write!(f, "{}-{} => {}", escape(start), escape(end), next)
210         }
211     }
212 }
213 
214 /// Return the given byte as its escaped string form.
escape(b: u8) -> String215 fn escape(b: u8) -> String {
216     use std::ascii;
217 
218     String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
219 }
220 
221 #[cfg(test)]
222 mod tests {
223     use super::*;
224     use dense;
225     use dfa::DFA;
226 
227     #[test]
always_match()228     fn always_match() {
229         let nfa = NFA::always_match();
230         let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
231 
232         assert_eq!(Some(0), dfa.find_at(b"", 0));
233         assert_eq!(Some(0), dfa.find_at(b"a", 0));
234         assert_eq!(Some(1), dfa.find_at(b"a", 1));
235         assert_eq!(Some(0), dfa.find_at(b"ab", 0));
236         assert_eq!(Some(1), dfa.find_at(b"ab", 1));
237         assert_eq!(Some(2), dfa.find_at(b"ab", 2));
238     }
239 
240     #[test]
never_match()241     fn never_match() {
242         let nfa = NFA::never_match();
243         let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
244 
245         assert_eq!(None, dfa.find_at(b"", 0));
246         assert_eq!(None, dfa.find_at(b"a", 0));
247         assert_eq!(None, dfa.find_at(b"a", 1));
248         assert_eq!(None, dfa.find_at(b"ab", 0));
249         assert_eq!(None, dfa.find_at(b"ab", 1));
250         assert_eq!(None, dfa.find_at(b"ab", 2));
251     }
252 }
253