rsexpr/
lex.rs

1use std::borrow::Cow;
2
/// A single lexical token produced by [`lex`].
///
/// Payload-carrying variants hold raw bytes rather than `str`; the lexer
/// itself never checks that the input is valid UTF-8. `'src` is the lifetime
/// of the input buffer the token borrows from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum Token<'src> {
    /// (
    LParen,
    /// )
    RParen,
    /// [
    LBrack,
    /// ]
    RBrack,

    /// Contents of a double-quoted string, without the surrounding quotes.
    ///
    /// Borrowed from the input when the string contains no backslash escape;
    /// owned (with each `\x` pair replaced by the byte `x`) otherwise.
    String(Cow<'src, [u8]>),
    /// A bare word: a run of bytes ended by whitespace, a bracket, a
    /// parenthesis, a double quote, or the end of input.
    Atom(&'src [u8]),
    /// A line comment, including the leading `;` but not the line break that
    /// ends it. Only emitted when the `comments` feature is enabled.
    #[cfg(feature = "comments")]
    Comment(&'src [u8]),
}
19
/// Bytes that are skipped between tokens and that terminate an atom.
const WHITESPACE: &[u8] = b" \t\r\n";
/// Bytes that end a `;` line comment.
const LINE_BREAKS: &[u8] = b"\r\n";
/// Non-whitespace bytes that terminate an atom: brackets, parentheses and the
/// string quote. Note that `;` is not listed, so a `;` in the middle of an
/// atom is part of that atom rather than the start of a comment.
const NOT_IN_ATOM: &[u8] = b"()[]\"";
23
24pub(crate) fn lex(input: &[u8]) -> Vec<Token<'_>> {
25    let mut index = 0;
26    let mut tokens = vec![];
27
28    while index < input.len() {
29        match input[index] {
30            byte if WHITESPACE.contains(&byte) => index += 1,
31            b';' => {
32                let _start_index = index;
33                while index < input.len() && !LINE_BREAKS.contains(&input[index]) {
34                    index += 1;
35                }
36                #[cfg(feature = "comments")]
37                tokens.push(Token::Comment(&input[_start_index..index]));
38            }
39            b'(' => {
40                tokens.push(Token::LParen);
41                index += 1;
42            }
43            b')' => {
44                tokens.push(Token::RParen);
45                index += 1;
46            }
47            b'[' => {
48                tokens.push(Token::LBrack);
49                index += 1;
50            }
51            b']' => {
52                tokens.push(Token::RBrack);
53                index += 1;
54            }
55            b'"' => tokens.push(lex_string(input, &mut index)),
56            _ => {
57                let start_index = index;
58                while index < input.len()
59                    && !WHITESPACE.contains(&input[index])
60                    && !NOT_IN_ATOM.contains(&input[index])
61                {
62                    index += 1;
63                }
64                tokens.push(Token::Atom(&input[start_index..index]));
65            }
66        }
67    }
68
69    tokens
70}
71
72fn lex_string<'src>(input: &'src [u8], index: &mut usize) -> Token<'src> {
73    *index += 1; // skip opening quote
74    let start_index = *index;
75    let mut end_index = input.len();
76    let mut requires_allocation = false;
77    let mut allocated_string = vec![];
78
79    while *index < input.len() {
80        match input[*index] {
81            b'"' => {
82                end_index = *index;
83                *index += 1;
84                break;
85            }
86            b'\\' if *index == input.len().wrapping_sub(1) => {
87                end_index = *index;
88                *index += 1;
89                break;
90            }
91            b'\\' => {
92                if !requires_allocation {
93                    allocated_string = input[start_index..*index].to_vec();
94                    requires_allocation = true;
95                }
96                allocated_string.push(input[*index + 1]);
97                *index += 2;
98            }
99            _ => {
100                if requires_allocation {
101                    allocated_string.push(input[*index]);
102                }
103                *index += 1;
104            }
105        }
106    }
107
108    match requires_allocation {
109        true => Token::String(Cow::Owned(allocated_string)),
110        false => Token::String(Cow::Borrowed(&input[start_index..end_index])),
111    }
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `lex_string` on `input` (whose first byte must be the opening
    /// quote) and checks three things: the resulting bytes, whether the result
    /// borrows or allocates (the `Cow` variant must match `expected`'s), and
    /// that the whole input was consumed.
    fn string_test(input: &str, expected: Cow<'static, [u8]>) {
        let mut index = 0;
        let actual = match lex_string(input.as_bytes(), &mut index) {
            Token::String(bytes) => bytes,
            other => panic!("expected a string token, got {:?}", other),
        };

        assert_eq!(actual, expected);
        // `Cow`'s `PartialEq` compares contents only, so check the variant
        // explicitly to catch needless allocations (or missing ones).
        assert_eq!(
            matches!(actual, Cow::Borrowed(_)),
            matches!(expected, Cow::Borrowed(_)),
            "wrong Cow variant for input {:?}",
            input
        );
        assert_eq!(index, input.len(), "input {:?} not fully consumed", input);
    }

    #[test]
    fn strings() {
        // No escapes: the token borrows straight from the input.
        string_test(r#""Hello, World!""#, Cow::Borrowed(b"Hello, World!"));
        string_test(r#""Hello, World!"#, Cow::Borrowed(b"Hello, World!"));
        // Escapes force the lexer to build an owned, unescaped copy.
        string_test(r#""\"\\""#, Cow::Owned(b"\"\\".to_vec()));
        string_test(r#""\"\"#, Cow::Owned(b"\"".to_vec()));
        // A lone trailing backslash is dropped without allocating.
        string_test(r#""a\"#, Cow::Borrowed(b"a"));
    }

    #[test]
    fn parens() {
        assert_eq!(
            lex(b"()[]"),
            vec![Token::LParen, Token::RParen, Token::LBrack, Token::RBrack]
        );
    }

    #[test]
    fn atoms() {
        assert_eq!(
            lex(b"abc def"),
            vec![Token::Atom(b"abc"), Token::Atom(b"def")],
        );
    }

    #[test]
    #[cfg(feature = "comments")]
    fn comments() {
        assert_eq!(
            lex(b"; comment\natom"),
            vec![Token::Comment(b"; comment"), Token::Atom(b"atom")]
        );
    }

    #[test]
    #[cfg(not(feature = "comments"))]
    fn comments() {
        assert_eq!(lex(b"; comment\natom"), vec![Token::Atom(b"atom")]);
    }
}