lua_pattern/
parser.rs

1use std::{collections::VecDeque, mem};
2
3use crate::{Error, Pattern, PatternObject, Quantifier, Result, SetPatternObject, Token};
4
/// Parser that turns a sequence of [`Token`]s into a [`Pattern`].
///
/// It keeps a two-token window (`curr_tok` and `next_tok`) over the remaining
/// input and is driven through [`Parser::parse`].
pub(crate) struct Parser {
    /// Tokens that have not yet been loaded into the two-token window.
    tokens: VecDeque<Token>,
    /// The token currently being examined.
    curr_tok: Token,
    /// One token of lookahead; `Token::Eof` once the input is exhausted.
    next_tok: Token,
    /// Id that will be given to the next capture that is opened (starts at 1).
    capture_id: u8,
}
11
12impl Parser {
13    pub fn parse(tokens: Vec<Token>) -> Result<Pattern> {
14        let mut parser = Self {
15            tokens: tokens.into(),
16            // initialize with dummy `Eof` tokens
17            curr_tok: Token::Eof,
18            next_tok: Token::Eof,
19            capture_id: 1,
20        };
21        // advance the parser twice so that curr_tok and next_tok have correct values
22        parser.next();
23        parser.next();
24        parser.parse_pattern(Token::Eof)
25    }
26
27    fn next(&mut self) {
28        // swap next_tok and curr_tok in memory so that what was curr_tok is now next_tok
29        mem::swap(&mut self.next_tok, &mut self.curr_tok);
30        // overwrite next_tok (which is now what curr_tok was) with the next token
31        self.next_tok = self.tokens.pop_front().unwrap_or(Token::Eof);
32    }
33
34    fn check_quantifier(&mut self, child: PatternObject) -> PatternObject {
35        match Quantifier::try_from(self.curr_tok) {
36            Ok(quantifier) => {
37                self.next();
38                PatternObject::Quantifier(quantifier, Box::new(child))
39            }
40            Err(()) => child,
41        }
42    }
43
44    /////////////////////////
45
46    fn parse_pattern(&mut self, end: Token) -> Result<Pattern> {
47        let mut objects = vec![];
48
49        while self.curr_tok != end {
50            match self.curr_tok {
51                Token::Char(_) => objects.push(self.parse_string()),
52                Token::LBrack => {
53                    let set = self.parse_set()?;
54                    objects.push(self.check_quantifier(PatternObject::Set(set.0, set.1)));
55                }
56                Token::LParen => objects.push(self.parse_capture()?),
57
58                Token::Start => {
59                    self.next();
60                    objects.push(PatternObject::Start);
61                }
62                Token::End => {
63                    self.next();
64                    objects.push(PatternObject::End);
65                }
66                Token::Balanced(open, close) => {
67                    self.next();
68                    objects.push(PatternObject::Balanced(open, close));
69                }
70                Token::Frontier => {
71                    self.next();
72                    let set = self.parse_set()?;
73                    objects.push(PatternObject::Frontier(set.0, set.1));
74                }
75                Token::CaptureRef(id) if id <= self.capture_id => {
76                    self.next();
77                    objects.push(PatternObject::CaptureRef(id));
78                }
79                Token::CaptureRef(id) => return Err(Error::InvalidCaptureRef(id)),
80
81                Token::Any => {
82                    self.next();
83                    objects.push(self.check_quantifier(PatternObject::Any));
84                }
85                Token::Escaped(char) => {
86                    self.next();
87                    objects.push(self.check_quantifier(PatternObject::Escaped(char)));
88                }
89                Token::Class(class) => {
90                    self.next();
91                    objects.push(self.check_quantifier(PatternObject::Class(class)));
92                }
93
94                tok if tok == end => {}
95                tok => return Err(Error::UnexpectedToken(tok)),
96            }
97        }
98
99        Ok(objects)
100    }
101
102    fn parse_string(&mut self) -> PatternObject {
103        let mut string = String::new();
104
105        while let Token::Char(char) = self.curr_tok {
106            string.push(char);
107            self.next();
108
109            if Quantifier::try_from(self.next_tok).is_ok() {
110                break;
111            }
112        }
113
114        self.check_quantifier(PatternObject::String(string))
115    }
116
117    fn parse_set(&mut self) -> Result<(bool, Vec<SetPatternObject>)> {
118        self.next();
119        let mut children = vec![];
120        let inverted = self.curr_tok == Token::Inverse;
121        if inverted {
122            self.next();
123        }
124
125        while self.curr_tok != Token::RBrack {
126            match self.curr_tok {
127                Token::Class(class) => {
128                    self.next();
129                    children.push(SetPatternObject::Class(class));
130                }
131                Token::Char(start) if self.next_tok == Token::Char('-') => {
132                    self.next();
133                    self.next();
134                    let end = match self.curr_tok {
135                        Token::Char(end) => end,
136                        tok => return Err(Error::OpenEndedRange(tok)),
137                    };
138                    self.next();
139
140                    children.push(SetPatternObject::Range(start, end));
141                }
142                Token::Char(char) => {
143                    self.next();
144                    children.push(SetPatternObject::Char(char));
145                }
146                Token::Escaped(char) => {
147                    self.next();
148                    children.push(SetPatternObject::Escaped(char));
149                }
150                tok => return Err(Error::UnexpectedToken(tok)),
151            }
152        }
153        self.next();
154
155        Ok((inverted, children))
156    }
157
158    fn parse_capture(&mut self) -> Result<PatternObject> {
159        let id = self.capture_id;
160        self.capture_id += 1;
161
162        self.next();
163        let children = self.parse_pattern(Token::RParen)?;
164        self.next();
165
166        Ok(PatternObject::Capture(id, children))
167    }
168}
169
#[cfg(test)]
mod tests {
    use crate::Class;

    use super::*;

    /// Feeds one token stream that touches every kind of pattern item through
    /// the parser and compares the result with the expected tree.
    #[test]
    fn everything() {
        // Classes that appear, in this order, inside the single capture.
        let classes = [
            Class::Letters,
            Class::Controls,
            Class::Digits,
            Class::Printable,
            Class::Lowercase,
            Class::Punctuations,
            Class::Spaces,
            Class::Uppercase,
            Class::Alphanumerics,
            Class::Hexadecimals,
            Class::ZeroByte,
            Class::NotLetters,
        ];

        // ---- token stream ----
        let mut input = vec![Token::Start];
        input.extend("^charsq".chars().map(Token::Char));
        input.extend(vec![
            Token::OneOrMore,
            Token::Char('w'),
            Token::ZeroOrMoreLazy,
            Token::Char('e'),
            Token::ZeroOrMore,
            Token::Char('r'),
            Token::ZeroOrOne,
            Token::Any,
            Token::Escaped('.'),
            Token::LParen,
        ]);
        input.extend(classes.iter().cloned().map(Token::Class));
        input.extend(vec![Token::RParen, Token::LBrack]);
        input.extend("asd".chars().map(Token::Char));
        input.extend(vec![
            Token::RBrack,
            Token::Frontier,
            Token::LBrack,
            Token::Inverse,
        ]);
        input.extend("not".chars().map(Token::Char));
        input.extend(vec![
            Token::RBrack,
            Token::CaptureRef(1),
            Token::Balanced('{', '}'),
            Token::LBrack,
            Token::Escaped(']'),
        ]);
        input.extend("a-z".chars().map(Token::Char));
        input.extend(vec![Token::RBrack, Token::Char('$'), Token::End]);

        // ---- expected tree ----
        let literal = |text: &str| PatternObject::String(text.to_owned());
        let quantified = |quantifier, text: &str| {
            PatternObject::Quantifier(
                quantifier,
                Box::new(PatternObject::String(text.to_owned())),
            )
        };
        let set_of = |text: &str| {
            text.chars()
                .map(SetPatternObject::Char)
                .collect::<Vec<_>>()
        };

        let expected = vec![
            PatternObject::Start,
            literal("^chars"),
            quantified(Quantifier::OneOrMore, "q"),
            quantified(Quantifier::ZeroOrMoreLazy, "w"),
            quantified(Quantifier::ZeroOrMore, "e"),
            quantified(Quantifier::ZeroOrOne, "r"),
            PatternObject::Any,
            PatternObject::Escaped('.'),
            PatternObject::Capture(
                1,
                classes
                    .iter()
                    .cloned()
                    .map(PatternObject::Class)
                    .collect::<Vec<_>>(),
            ),
            PatternObject::Set(false, set_of("asd")),
            PatternObject::Frontier(true, set_of("not")),
            PatternObject::CaptureRef(1),
            PatternObject::Balanced('{', '}'),
            PatternObject::Set(
                false,
                vec![
                    SetPatternObject::Escaped(']'),
                    SetPatternObject::Range('a', 'z'),
                ],
            ),
            literal("$"),
            PatternObject::End,
        ];

        assert_eq!(Parser::parse(input), Ok(expected));
    }
}