lua_pattern/
to_regex.rs

1use std::borrow::Cow;
2
3use crate::{Class, PatternObject, Quantifier, SetPatternObject};
4
5#[derive(Debug, thiserror::Error, PartialEq, Eq)]
6#[allow(clippy::enum_variant_names)]
7/// The error type for errors that can occur during conversion to regular expressions.
8/// See [`try_to_regex`] for more information.
9pub enum ToRegexError {
10    /// The input pattern includes a balanced pattern (eg. `%b{}`) which cannot be represented by
11    /// regular expressions.
12    #[error("the input pattern includes a balanced pattern (eg. `%b{{}}`) which cannot be represented by regex")]
13    BalancedUsed,
14
15    /// The input pattern includes a capture backreference (eg. `%1`), but `allow_capture_refs` was
16    /// set to `false`.
17    #[error("the input pattern includes a capture backreference, which may not be supported by some regex engines")]
18    CaptureRefUsed,
19
20    /// The input pattern includes a frontier pattern (eg. `%f[a-z]`), but `allow_lookaround` was
21    /// set to `false`.
22    #[error("the input pattern includes a frontier pattern (eg. `%f[a-z]`) which cannot be represented by regex")]
23    FrontierUsed,
24}
25
26/// Try to convert a parsed Lua pattern into a regular expression string.
27///
28/// The `allow_capture_refs` parameter specifies whether to allow backreferences to capture groups.
29/// Set this to `false` when using the output with the
30/// [`regex` crate](https://crates.io/crates/regex), or to `true` when using the
31/// [`fancy-regex` crate](https://crates.io/crates/fancy-regex).
32///
33/// # Returns
34/// The function returns a [`String`] if the conversion was successful, and a [`ToRegexError`]
35/// otherwise.
36///
37/// # Errors
38/// Converting a Lua pattern to a RegEx can fail in up to three ways.
39///
40/// 1. Lua patterns support balanced bracket matching using the `%b` operator. This is not
41///    supported by RegEx. Thus, an error will be returned if the input pattern makes use of this
42///    feature.
43/// 2. Lua patterns support references to previous capture groups. Some RegEx engines also support
44///    this feature, but not all. For this reason, uses of such backreferences will result in an
45///    error, if `allow_capture_refs` is set to `false`.
46/// 3. Lua patterns support so-called frontier patterns. Their behaviour can be emulated using
47///    lookaround, but only some RegEx engines support that. Therefore, if the input includes a
48///    frontier pattern and `allow_lookaround` is set to `false`, an error will be returned.
49///
50/// Also see [`ToRegexError`] for further information.
51pub fn try_to_regex(
52    pattern: &[PatternObject],
53    allow_capture_refs: bool,
54    allow_lookaround: bool,
55) -> Result<String, ToRegexError> {
56    from_pattern(pattern, allow_capture_refs, allow_lookaround)
57}
58
59fn from_pattern(
60    pattern: &[PatternObject],
61    allow_capture_refs: bool,
62    allow_lookaround: bool,
63) -> Result<String, ToRegexError> {
64    pattern
65        .iter()
66        .map(|obj| from_pattern_object(obj, allow_capture_refs, allow_lookaround))
67        .collect::<Result<_, _>>()
68}
69
70fn from_pattern_object(
71    object: &PatternObject,
72    allow_capture_refs: bool,
73    allow_lookaround: bool,
74) -> Result<Cow<'static, str>, ToRegexError> {
75    match object {
76        PatternObject::Balanced(_, _) => Err(ToRegexError::BalancedUsed),
77        PatternObject::Frontier(_, _) if !allow_lookaround => Err(ToRegexError::FrontierUsed),
78        PatternObject::CaptureRef(_) if !allow_capture_refs => Err(ToRegexError::CaptureRefUsed),
79
80        PatternObject::Any => Ok("[\\s\\S]".into()),
81        PatternObject::Start => Ok("^".into()),
82        PatternObject::End => Ok("$".into()),
83
84        PatternObject::String(string) => {
85            Ok(string.chars().map(from_char).collect::<String>().into())
86        }
87        PatternObject::Escaped(char) => Ok(from_char(*char).into()),
88
89        PatternObject::Quantifier(quantifier, child) => Ok(format!(
90            "{}{}",
91            from_pattern_object(child, allow_capture_refs, allow_lookaround)?,
92            from_quantifier(quantifier)
93        )
94        .into()),
95        PatternObject::Class(class) => Ok(from_class(class).into()),
96        PatternObject::CaptureRef(id) => Ok(format!("\\{id}").into()),
97        PatternObject::Capture(_, pattern) => Ok(format!(
98            "({})",
99            from_pattern(pattern, allow_capture_refs, allow_lookaround)?
100        )
101        .into()),
102        PatternObject::Set(inverted, set) => Ok(from_set(set, *inverted).into()),
103        PatternObject::Frontier(inverted, set) => Ok(format!(
104            "(?<{}{})(?{}{})",
105            if *inverted { "=" } else { "!" },
106            from_set(set, false),
107            if *inverted { "!" } else { "=" },
108            from_set(set, false),
109        )
110        .into()),
111    }
112}
113
114fn from_quantifier(quantifier: &Quantifier) -> &'static str {
115    match quantifier {
116        Quantifier::ZeroOrMore => "*",
117        Quantifier::OneOrMore => "+",
118        Quantifier::ZeroOrMoreLazy => "*?",
119        Quantifier::ZeroOrOne => "?",
120    }
121}
122
123fn from_class(class: &Class) -> &'static str {
124    match class {
125        Class::Letters => r"[a-zA-Z]",
126        Class::Controls => r"[\0-\31]",
127        Class::Digits => r"[0-9]",
128        Class::Printable => r"[\33-\126]",
129        Class::Lowercase => r"[a-z]",
130        Class::Punctuations => r##"[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]"##,
131        Class::Spaces => r"[ \t\n\v\f\r]",
132        Class::Uppercase => r"[A-Z]",
133        Class::Alphanumerics => r"[a-zA-Z0-9]",
134        Class::Hexadecimals => r"[0-9a-fA-F]",
135        Class::ZeroByte => r"\0",
136
137        Class::NotLetters => r"[^a-zA-Z]",
138        Class::NotControls => r"[^\0-\31]",
139        Class::NotDigits => r"[^0-9]",
140        Class::NotPrintable => r"[^\33-\126]",
141        Class::NotLowercase => r"[^a-z]",
142        Class::NotPunctuations => r##"[^!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]"##,
143        Class::NotSpaces => r"[^ \t\n\v\f\r]",
144        Class::NotUppercase => r"[^A-Z]",
145        Class::NotAlphanumerics => r"[^a-zA-Z0-9]",
146        Class::NotHexadecimals => r"[^0-9a-fA-F]",
147        Class::NotZeroByte => r"[^\0]",
148    }
149}
150
151fn from_set(set: &[SetPatternObject], inverse: bool) -> String {
152    format!(
153        "[{}{}]",
154        if inverse { "^" } else { "" },
155        set.iter().map(from_set_pattern_object).collect::<String>()
156    )
157}
158
159fn from_set_pattern_object(object: &SetPatternObject) -> Cow<'static, str> {
160    match object {
161        SetPatternObject::Char(char) | SetPatternObject::Escaped(char) => from_char(*char).into(),
162        SetPatternObject::Range(start, end) => {
163            format!("{}-{}", from_char(*start), from_char(*end)).into()
164        }
165        SetPatternObject::Class(class) => from_class(class).into(),
166    }
167}
168
/// Renders a single literal character, prefixing it with a backslash when it is one of the
/// characters listed in `SPECIAL_CHARS`.
fn from_char(char: char) -> String {
    const SPECIAL_CHARS: &str = "\\.()[]{}|*+?^$/";

    // Room for the character itself plus an optional one-byte backslash.
    let mut rendered = String::with_capacity(char.len_utf8() + 1);
    if SPECIAL_CHARS.contains(char) {
        rendered.push('\\');
    }
    rendered.push(char);
    rendered
}
177
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps the literal `text` in `quantifier`.
    fn quantified(quantifier: Quantifier, text: &str) -> PatternObject {
        PatternObject::Quantifier(quantifier, PatternObject::String(text.to_owned()).into())
    }

    /// Builds plain character set members, one per character of `members`.
    fn chars(members: &str) -> Vec<SetPatternObject> {
        members.chars().map(SetPatternObject::Char).collect()
    }

    /// Converts one pattern that exercises every convertible object kind and compares the
    /// result against the expected regular expression, listed fragment by fragment.
    #[test]
    fn everything() {
        let classes: Vec<PatternObject> = vec![
            Class::Letters,
            Class::Controls,
            Class::Digits,
            Class::Printable,
            Class::Lowercase,
            Class::Punctuations,
            Class::Spaces,
            Class::Uppercase,
            Class::Alphanumerics,
            Class::Hexadecimals,
            Class::ZeroByte,
            Class::NotLetters,
        ]
        .into_iter()
        .map(PatternObject::Class)
        .collect();

        let input = vec![
            PatternObject::Start,
            PatternObject::String("^chars".to_owned()),
            quantified(Quantifier::OneOrMore, "q"),
            quantified(Quantifier::ZeroOrMoreLazy, "w"),
            quantified(Quantifier::ZeroOrMore, "e"),
            quantified(Quantifier::ZeroOrOne, "r"),
            PatternObject::Any,
            PatternObject::Escaped('.'),
            PatternObject::Capture(1, classes),
            PatternObject::Set(false, chars("asd")),
            PatternObject::Set(true, chars("not")),
            PatternObject::Frontier(true, chars("not")),
            PatternObject::CaptureRef(1),
            PatternObject::Set(
                false,
                vec![
                    SetPatternObject::Escaped(']'),
                    SetPatternObject::Range('a', 'z'),
                ],
            ),
            PatternObject::String("$".to_owned()),
            PatternObject::End,
        ];

        // One entry per input object (the capture is split into its parts).
        let expected = [
            r"^",
            r"\^chars",
            r"q+w*?e*r?",
            r"[\s\S]",
            r"\.",
            r"(",
            r"[a-zA-Z][\0-\31][0-9][\33-\126][a-z]",
            r##"[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]"##,
            r"[ \t\n\v\f\r][A-Z][a-zA-Z0-9][0-9a-fA-F]\0",
            r"[^a-zA-Z]",
            r")",
            r"[asd]",
            r"[^not]",
            r"(?<=[not])(?![not])",
            r"\1",
            r"[\]a-z]",
            r"\$",
            r"$",
        ]
        .concat();

        assert_eq!(try_to_regex(&input, true, true), Ok(expected));
    }
}