lua_pattern/
lexer.rs

1use std::{mem, str::Chars};
2
3use crate::{Class, Error, Result, Token};
4
5pub(crate) struct Lexer<'src> {
6    reader: Chars<'src>,
7    curr_char: Option<char>,
8    next_char: Option<char>,
9    tokens: Vec<Token>,
10}
11
12impl<'src> Lexer<'src> {
13    pub fn lex(pattern: &'src str) -> Result<Vec<Token>> {
14        let mut lexer = Self {
15            reader: pattern.chars(),
16            curr_char: None,
17            next_char: None,
18            tokens: vec![],
19        };
20        // advance the lexer twice so that curr_char and next_char are populated
21        lexer.next();
22        lexer.next();
23        lexer.read_pattern()?;
24        Ok(lexer.tokens)
25    }
26
27    fn next(&mut self) {
28        // swap the current and next char so that the old next is the new current
29        mem::swap(&mut self.curr_char, &mut self.next_char);
30        self.next_char = self.reader.next();
31    }
32
33    fn next_and_push(&mut self, tok: Token) {
34        self.next();
35        self.tokens.push(tok);
36    }
37
38    ////////////////////////
39
40    fn read_pattern(&mut self) -> Result<()> {
41        if let Some('^') = self.curr_char {
42            self.next_and_push(Token::Start);
43        }
44
45        while let Some(curr) = self.curr_char {
46            match curr {
47                '(' => self.next_and_push(Token::LParen),
48                ')' => self.next_and_push(Token::RParen),
49                '$' if self.next_char.is_none() => self.next_and_push(Token::End),
50                '$' => {
51                    self.next_and_push(Token::Char('$'));
52                    self.read_quantity()?;
53                }
54                '%' => {
55                    self.next();
56                    match self.curr_char {
57                        Some('b') => {
58                            self.next();
59                            let open = self.curr_char.ok_or(Error::MissingCharsForBalanced)?;
60                            self.next();
61                            let close = self.curr_char.ok_or(Error::MissingCharsForBalanced)?;
62                            self.next_and_push(Token::Balanced(open, close));
63                        }
64                        Some('f') => {
65                            self.next();
66                            if !matches!(self.curr_char, Some('[')) {
67                                return Err(Error::MissingSetForFrontier);
68                            }
69                            self.tokens.push(Token::Frontier);
70                            self.read_set()?;
71                        }
72                        Some(
73                            group @ ('0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'),
74                        ) => self.next_and_push(Token::CaptureRef(group as u8 - b'0')),
75                        _ => {
76                            self.read_escape()?;
77                            self.read_quantity()?;
78                        }
79                    }
80                }
81                '[' => {
82                    self.read_set()?;
83                    self.read_quantity()?;
84                }
85                '.' => {
86                    self.next_and_push(Token::Any);
87                    self.read_quantity()?;
88                }
89                char => {
90                    self.next_and_push(Token::Char(char));
91                    self.read_quantity()?;
92                }
93            }
94        }
95
96        Ok(())
97    }
98
99    fn read_quantity(&mut self) -> Result<()> {
100        match self.curr_char {
101            Some('+') => self.next_and_push(Token::OneOrMore),
102            Some('-') => self.next_and_push(Token::ZeroOrMoreLazy),
103            Some('*') => self.next_and_push(Token::ZeroOrMore),
104            Some('?') => self.next_and_push(Token::ZeroOrOne),
105            _ => {}
106        }
107
108        Ok(())
109    }
110
111    fn read_escape(&mut self) -> Result<()> {
112        let curr = self.curr_char.ok_or(Error::UnfinishedEscape)?;
113
114        match Class::try_from(curr) {
115            Ok(class) => self.tokens.push(Token::Class(class)),
116            Err(()) => self.tokens.push(Token::Escaped(curr)),
117        }
118        self.next();
119
120        Ok(())
121    }
122
123    fn read_set(&mut self) -> Result<()> {
124        debug_assert_eq!(self.curr_char, Some('['));
125        self.next_and_push(Token::LBrack);
126
127        if let Some('^') = self.curr_char {
128            self.next_and_push(Token::Inverse);
129        }
130
131        loop {
132            let curr = self.curr_char.ok_or(Error::UnclosedSet)?;
133
134            if curr == '%' {
135                self.next();
136                self.read_escape()?;
137            } else {
138                self.next_and_push(Token::Char(curr));
139            }
140
141            if let Some(']') = self.curr_char {
142                break;
143            }
144        }
145
146        self.next_and_push(Token::RBrack);
147
148        Ok(())
149    }
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes one pattern that exercises every token kind and checks the full token list.
    #[test]
    fn everything() {
        let input = r"^^charsq+w-e*r?.%.(%a%c%d%g%l%p%s%u%w%x%z%A)[asd]%f[^not]%1%b{}[%]a-z]$$";

        let expected = vec![
            // `^^`: start anchor, then a literal `^`
            Token::Start,
            Token::Char('^'),
            // `chars`: plain literals
            Token::Char('c'),
            Token::Char('h'),
            Token::Char('a'),
            Token::Char('r'),
            Token::Char('s'),
            // `q+w-e*r?`: one literal per quantifier
            Token::Char('q'),
            Token::OneOrMore,
            Token::Char('w'),
            Token::ZeroOrMoreLazy,
            Token::Char('e'),
            Token::ZeroOrMore,
            Token::Char('r'),
            Token::ZeroOrOne,
            // `.%.`: any character, then an escaped dot
            Token::Any,
            Token::Escaped('.'),
            // `(%a ... %A)`: a capture holding every class used here
            Token::LParen,
            Token::Class(Class::Letters),
            Token::Class(Class::Controls),
            Token::Class(Class::Digits),
            Token::Class(Class::Printable),
            Token::Class(Class::Lowercase),
            Token::Class(Class::Punctuations),
            Token::Class(Class::Spaces),
            Token::Class(Class::Uppercase),
            Token::Class(Class::Alphanumerics),
            Token::Class(Class::Hexadecimals),
            Token::Class(Class::ZeroByte),
            Token::Class(Class::NotLetters),
            Token::RParen,
            // `[asd]`: simple set
            Token::LBrack,
            Token::Char('a'),
            Token::Char('s'),
            Token::Char('d'),
            Token::RBrack,
            // `%f[^not]`: frontier with an inverted set
            Token::Frontier,
            Token::LBrack,
            Token::Inverse,
            Token::Char('n'),
            Token::Char('o'),
            Token::Char('t'),
            Token::RBrack,
            // `%1`: capture reference
            Token::CaptureRef(1),
            // `%b{}`: balanced pair
            Token::Balanced('{', '}'),
            // `[%]a-z]`: set with an escaped `]`; the range stays as three characters
            Token::LBrack,
            Token::Escaped(']'),
            Token::Char('a'),
            Token::Char('-'),
            Token::Char('z'),
            Token::RBrack,
            // `$$`: literal `$`, then the end anchor
            Token::Char('$'),
            Token::End,
        ];

        let lexed = Lexer::lex(input);
        assert_eq!(lexed, Ok(expected));
    }
}