1use std::{collections::VecDeque, mem};
2
3use crate::{Error, Pattern, PatternObject, Quantifier, Result, SetPatternObject, Token};
4
5pub(crate) struct Parser {
6 tokens: VecDeque<Token>,
7 curr_tok: Token,
8 next_tok: Token,
9 capture_id: u8,
10}
11
12impl Parser {
13 pub fn parse(tokens: Vec<Token>) -> Result<Pattern> {
14 let mut parser = Self {
15 tokens: tokens.into(),
16 curr_tok: Token::Eof,
18 next_tok: Token::Eof,
19 capture_id: 1,
20 };
21 parser.next();
23 parser.next();
24 parser.parse_pattern(Token::Eof)
25 }
26
27 fn next(&mut self) {
28 mem::swap(&mut self.next_tok, &mut self.curr_tok);
30 self.next_tok = self.tokens.pop_front().unwrap_or(Token::Eof);
32 }
33
34 fn check_quantifier(&mut self, child: PatternObject) -> PatternObject {
35 match Quantifier::try_from(self.curr_tok) {
36 Ok(quantifier) => {
37 self.next();
38 PatternObject::Quantifier(quantifier, Box::new(child))
39 }
40 Err(()) => child,
41 }
42 }
43
44 fn parse_pattern(&mut self, end: Token) -> Result<Pattern> {
47 let mut objects = vec![];
48
49 while self.curr_tok != end {
50 match self.curr_tok {
51 Token::Char(_) => objects.push(self.parse_string()),
52 Token::LBrack => {
53 let set = self.parse_set()?;
54 objects.push(self.check_quantifier(PatternObject::Set(set.0, set.1)));
55 }
56 Token::LParen => objects.push(self.parse_capture()?),
57
58 Token::Start => {
59 self.next();
60 objects.push(PatternObject::Start);
61 }
62 Token::End => {
63 self.next();
64 objects.push(PatternObject::End);
65 }
66 Token::Balanced(open, close) => {
67 self.next();
68 objects.push(PatternObject::Balanced(open, close));
69 }
70 Token::Frontier => {
71 self.next();
72 let set = self.parse_set()?;
73 objects.push(PatternObject::Frontier(set.0, set.1));
74 }
75 Token::CaptureRef(id) if id <= self.capture_id => {
76 self.next();
77 objects.push(PatternObject::CaptureRef(id));
78 }
79 Token::CaptureRef(id) => return Err(Error::InvalidCaptureRef(id)),
80
81 Token::Any => {
82 self.next();
83 objects.push(self.check_quantifier(PatternObject::Any));
84 }
85 Token::Escaped(char) => {
86 self.next();
87 objects.push(self.check_quantifier(PatternObject::Escaped(char)));
88 }
89 Token::Class(class) => {
90 self.next();
91 objects.push(self.check_quantifier(PatternObject::Class(class)));
92 }
93
94 tok if tok == end => {}
95 tok => return Err(Error::UnexpectedToken(tok)),
96 }
97 }
98
99 Ok(objects)
100 }
101
102 fn parse_string(&mut self) -> PatternObject {
103 let mut string = String::new();
104
105 while let Token::Char(char) = self.curr_tok {
106 string.push(char);
107 self.next();
108
109 if Quantifier::try_from(self.next_tok).is_ok() {
110 break;
111 }
112 }
113
114 self.check_quantifier(PatternObject::String(string))
115 }
116
117 fn parse_set(&mut self) -> Result<(bool, Vec<SetPatternObject>)> {
118 self.next();
119 let mut children = vec![];
120 let inverted = self.curr_tok == Token::Inverse;
121 if inverted {
122 self.next();
123 }
124
125 while self.curr_tok != Token::RBrack {
126 match self.curr_tok {
127 Token::Class(class) => {
128 self.next();
129 children.push(SetPatternObject::Class(class));
130 }
131 Token::Char(start) if self.next_tok == Token::Char('-') => {
132 self.next();
133 self.next();
134 let end = match self.curr_tok {
135 Token::Char(end) => end,
136 tok => return Err(Error::OpenEndedRange(tok)),
137 };
138 self.next();
139
140 children.push(SetPatternObject::Range(start, end));
141 }
142 Token::Char(char) => {
143 self.next();
144 children.push(SetPatternObject::Char(char));
145 }
146 Token::Escaped(char) => {
147 self.next();
148 children.push(SetPatternObject::Escaped(char));
149 }
150 tok => return Err(Error::UnexpectedToken(tok)),
151 }
152 }
153 self.next();
154
155 Ok((inverted, children))
156 }
157
158 fn parse_capture(&mut self) -> Result<PatternObject> {
159 let id = self.capture_id;
160 self.capture_id += 1;
161
162 self.next();
163 let children = self.parse_pattern(Token::RParen)?;
164 self.next();
165
166 Ok(PatternObject::Capture(id, children))
167 }
168}
169
#[cfg(test)]
mod tests {
    use crate::Class;

    use super::*;

    /// Runs one token stream that touches every construct the parser handles
    /// and compares the result with the full expected tree.
    #[test]
    fn everything() {
        // Classes placed inside the capture group, in stream order.
        let classes = [
            Class::Letters,
            Class::Controls,
            Class::Digits,
            Class::Printable,
            Class::Lowercase,
            Class::Punctuations,
            Class::Spaces,
            Class::Uppercase,
            Class::Alphanumerics,
            Class::Hexadecimals,
            Class::ZeroByte,
            Class::NotLetters,
        ];
        // A character directly followed by a quantifier is expected to come
        // out as a quantified one-character string.
        let quantified = |quantifier: Quantifier, text: &str| {
            PatternObject::Quantifier(quantifier, Box::new(PatternObject::String(text.to_owned())))
        };

        let mut tokens = vec![Token::Start];
        tokens.extend("^charsq".chars().map(Token::Char));
        tokens.extend([
            Token::OneOrMore,
            Token::Char('w'),
            Token::ZeroOrMoreLazy,
            Token::Char('e'),
            Token::ZeroOrMore,
            Token::Char('r'),
            Token::ZeroOrOne,
            Token::Any,
            Token::Escaped('.'),
            Token::LParen,
        ]);
        tokens.extend(classes.iter().copied().map(Token::Class));
        tokens.extend([
            Token::RParen,
            Token::LBrack,
            Token::Char('a'),
            Token::Char('s'),
            Token::Char('d'),
            Token::RBrack,
            Token::Frontier,
            Token::LBrack,
            Token::Inverse,
            Token::Char('n'),
            Token::Char('o'),
            Token::Char('t'),
            Token::RBrack,
            Token::CaptureRef(1),
            Token::Balanced('{', '}'),
            Token::LBrack,
            Token::Escaped(']'),
            Token::Char('a'),
            Token::Char('-'),
            Token::Char('z'),
            Token::RBrack,
            Token::Char('$'),
            Token::End,
        ]);

        let expected = Ok(vec![
            PatternObject::Start,
            PatternObject::String("^chars".to_owned()),
            quantified(Quantifier::OneOrMore, "q"),
            quantified(Quantifier::ZeroOrMoreLazy, "w"),
            quantified(Quantifier::ZeroOrMore, "e"),
            quantified(Quantifier::ZeroOrOne, "r"),
            PatternObject::Any,
            PatternObject::Escaped('.'),
            PatternObject::Capture(
                1,
                classes.iter().copied().map(PatternObject::Class).collect(),
            ),
            PatternObject::Set(
                false,
                vec![
                    SetPatternObject::Char('a'),
                    SetPatternObject::Char('s'),
                    SetPatternObject::Char('d'),
                ],
            ),
            PatternObject::Frontier(
                true,
                vec![
                    SetPatternObject::Char('n'),
                    SetPatternObject::Char('o'),
                    SetPatternObject::Char('t'),
                ],
            ),
            PatternObject::CaptureRef(1),
            PatternObject::Balanced('{', '}'),
            PatternObject::Set(
                false,
                vec![
                    SetPatternObject::Escaped(']'),
                    SetPatternObject::Range('a', 'z'),
                ],
            ),
            PatternObject::String("$".to_owned()),
            PatternObject::End,
        ]);

        assert_eq!(Parser::parse(tokens), expected);
    }
}