1use std::{mem, str::Chars};
2
3use crate::{Class, Error, Result, Token};
4
5pub(crate) struct Lexer<'src> {
6 reader: Chars<'src>,
7 curr_char: Option<char>,
8 next_char: Option<char>,
9 tokens: Vec<Token>,
10}
11
12impl<'src> Lexer<'src> {
13 pub fn lex(pattern: &'src str) -> Result<Vec<Token>> {
14 let mut lexer = Self {
15 reader: pattern.chars(),
16 curr_char: None,
17 next_char: None,
18 tokens: vec![],
19 };
20 lexer.next();
22 lexer.next();
23 lexer.read_pattern()?;
24 Ok(lexer.tokens)
25 }
26
27 fn next(&mut self) {
28 mem::swap(&mut self.curr_char, &mut self.next_char);
30 self.next_char = self.reader.next();
31 }
32
33 fn next_and_push(&mut self, tok: Token) {
34 self.next();
35 self.tokens.push(tok);
36 }
37
38 fn read_pattern(&mut self) -> Result<()> {
41 if let Some('^') = self.curr_char {
42 self.next_and_push(Token::Start);
43 }
44
45 while let Some(curr) = self.curr_char {
46 match curr {
47 '(' => self.next_and_push(Token::LParen),
48 ')' => self.next_and_push(Token::RParen),
49 '$' if self.next_char.is_none() => self.next_and_push(Token::End),
50 '$' => {
51 self.next_and_push(Token::Char('$'));
52 self.read_quantity()?;
53 }
54 '%' => {
55 self.next();
56 match self.curr_char {
57 Some('b') => {
58 self.next();
59 let open = self.curr_char.ok_or(Error::MissingCharsForBalanced)?;
60 self.next();
61 let close = self.curr_char.ok_or(Error::MissingCharsForBalanced)?;
62 self.next_and_push(Token::Balanced(open, close));
63 }
64 Some('f') => {
65 self.next();
66 if !matches!(self.curr_char, Some('[')) {
67 return Err(Error::MissingSetForFrontier);
68 }
69 self.tokens.push(Token::Frontier);
70 self.read_set()?;
71 }
72 Some(
73 group @ ('0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'),
74 ) => self.next_and_push(Token::CaptureRef(group as u8 - b'0')),
75 _ => {
76 self.read_escape()?;
77 self.read_quantity()?;
78 }
79 }
80 }
81 '[' => {
82 self.read_set()?;
83 self.read_quantity()?;
84 }
85 '.' => {
86 self.next_and_push(Token::Any);
87 self.read_quantity()?;
88 }
89 char => {
90 self.next_and_push(Token::Char(char));
91 self.read_quantity()?;
92 }
93 }
94 }
95
96 Ok(())
97 }
98
99 fn read_quantity(&mut self) -> Result<()> {
100 match self.curr_char {
101 Some('+') => self.next_and_push(Token::OneOrMore),
102 Some('-') => self.next_and_push(Token::ZeroOrMoreLazy),
103 Some('*') => self.next_and_push(Token::ZeroOrMore),
104 Some('?') => self.next_and_push(Token::ZeroOrOne),
105 _ => {}
106 }
107
108 Ok(())
109 }
110
111 fn read_escape(&mut self) -> Result<()> {
112 let curr = self.curr_char.ok_or(Error::UnfinishedEscape)?;
113
114 match Class::try_from(curr) {
115 Ok(class) => self.tokens.push(Token::Class(class)),
116 Err(()) => self.tokens.push(Token::Escaped(curr)),
117 }
118 self.next();
119
120 Ok(())
121 }
122
123 fn read_set(&mut self) -> Result<()> {
124 debug_assert_eq!(self.curr_char, Some('['));
125 self.next_and_push(Token::LBrack);
126
127 if let Some('^') = self.curr_char {
128 self.next_and_push(Token::Inverse);
129 }
130
131 loop {
132 let curr = self.curr_char.ok_or(Error::UnclosedSet)?;
133
134 if curr == '%' {
135 self.next();
136 self.read_escape()?;
137 } else {
138 self.next_and_push(Token::Char(curr));
139 }
140
141 if let Some(']') = self.curr_char {
142 break;
143 }
144 }
145
146 self.next_and_push(Token::RBrack);
147
148 Ok(())
149 }
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes a single pattern that contains every token kind the lexer can
    /// emit and checks the complete token stream.
    #[test]
    fn everything() {
        let pattern = r"^^charsq+w-e*r?.%.(%a%c%d%g%l%p%s%u%w%x%z%A)[asd]%f[^not]%1%b{}[%]a-z]$$";

        // `^^chars`: only the first `^` is an anchor, the second is a literal.
        let mut expected = vec![Token::Start];
        expected.extend("^chars".chars().map(Token::Char));

        // `q+w-e*r?`: one literal per quantifier.
        for (literal, quantifier) in [
            ('q', Token::OneOrMore),
            ('w', Token::ZeroOrMoreLazy),
            ('e', Token::ZeroOrMore),
            ('r', Token::ZeroOrOne),
        ] {
            expected.push(Token::Char(literal));
            expected.push(quantifier);
        }

        // `.%.`: wildcard followed by an escaped dot.
        expected.extend([Token::Any, Token::Escaped('.')]);

        // `(%a%c%d%g%l%p%s%u%w%x%z%A)`: every class letter inside a group.
        expected.push(Token::LParen);
        for class in [
            Class::Letters,
            Class::Controls,
            Class::Digits,
            Class::Printable,
            Class::Lowercase,
            Class::Punctuations,
            Class::Spaces,
            Class::Uppercase,
            Class::Alphanumerics,
            Class::Hexadecimals,
            Class::ZeroByte,
            Class::NotLetters,
        ] {
            expected.push(Token::Class(class));
        }
        expected.push(Token::RParen);

        // `[asd]`: plain set.
        expected.push(Token::LBrack);
        expected.extend("asd".chars().map(Token::Char));
        expected.push(Token::RBrack);

        // `%f[^not]`: frontier with an inverted set.
        expected.extend([Token::Frontier, Token::LBrack, Token::Inverse]);
        expected.extend("not".chars().map(Token::Char));
        expected.push(Token::RBrack);

        // `%1%b{}`: capture reference and balanced pair.
        expected.extend([Token::CaptureRef(1), Token::Balanced('{', '}')]);

        // `[%]a-z]`: the escaped `]` does not close the set.
        expected.extend([Token::LBrack, Token::Escaped(']')]);
        expected.extend("a-z".chars().map(Token::Char));
        expected.push(Token::RBrack);

        // `$$`: only the final `$` is an anchor.
        expected.extend([Token::Char('$'), Token::End]);

        assert_eq!(Lexer::lex(pattern), Ok(expected));
    }
}