lua_pattern/
lib.rs

1#![doc = include_str!("../README.md")]
2//! ## Usage
3//! - Lua patterns can be parsed to a tree with [`parse`].
4//! - Parsed patterns can be converted to regex strings with [`try_to_regex`].
5//!
6//! For example:
7//! ```
8//! use lua_pattern::{Class, PatternObject};
9//!
10//! let tree = lua_pattern::parse("%l").unwrap();
11//! assert_eq!(tree, [PatternObject::Class(Class::Lowercase)]);
12//! #[cfg(feature = "to-regex")]
13//! assert_eq!(
14//!     lua_pattern::try_to_regex(&tree, false, false).unwrap(),
15//!     "[a-z]"
16//! );
17//! ```
18#![cfg_attr(
19    feature = "docs",
20    cfg_attr(doc, doc = ::document_features::document_features!(feature_label = r#"<span class="stab portability"><code>{feature}</code></span>"#))
21)]
22#![cfg_attr(all(doc, CHANNEL_NIGHTLY), feature(doc_auto_cfg))]
23#![warn(rust_2018_idioms)]
24#![deny(missing_docs)]
25
26mod error;
27mod lexer;
28mod parser;
29
30#[cfg(feature = "to-regex")]
31mod to_regex;
32
33use std::fmt::{self, Display, Formatter};
34
35pub use error::*;
36use lexer::Lexer;
37use parser::Parser;
38
39#[cfg(feature = "to-regex")]
40pub use to_regex::*;
41
42///////////////
43// Functions //
44///////////////
45
46/// Parse the given input string as a Lua pattern.
47///
48/// # Returns
49/// This function returns a vector of [`PatternObject`]s if parsing was successful, or an [`Error`]
50/// if the pattern could not be parsed.
51///
52/// # Errors
53/// To see the possible errors, have a look at [`Error`].
54pub fn parse(pattern: impl AsRef<str>) -> Result<Pattern> {
55    Parser::parse(Lexer::lex(pattern.as_ref())?)
56}
57
58///////////
59// Types //
60///////////
61
/// A list of [`PatternObject`]s, representing an entire Lua pattern.
///
/// This is the type returned by [`parse`] and the type stored inside
/// [`PatternObject::Capture`] for the contents of a capture group.
pub type Pattern = Vec<PatternObject>;
64
/// A single object of a Lua pattern.
///
/// The type is recursive: [`PatternObject::Quantifier`] boxes the object it applies to and
/// [`PatternObject::Capture`] holds a nested [`Pattern`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PatternObject {
    /// Match any character (`.`).
    Any,
    /// Match the start of the string (`^`).
    Start,
    /// Match the end of the string (`$`).
    End,

    /// A sequence of characters to match literally (eg. `Hello, World!`).
    String(String),
    /// A [`PatternObject`] followed by a [`Quantifier`] (eg. `a?`, `.*`).
    Quantifier(Quantifier, Box<PatternObject>),
    /// An escaped character to match literally (eg. `%%`).
    Escaped(char),
    /// A [character class](Class) (eg. `%w`, `%L`).
    Class(Class),
    /// A reference to a previous capture group (eg. `%1`).
    CaptureRef(u8),
    /// A balanced pattern (eg. `%bxy`). Matches all characters starting at `x` until the
    /// corresponding `y`.
    Balanced(char, char),
    /// A frontier pattern (eg. `%f[a-z]`). Matches if the following character matches the set and
    /// the previous character does not match the set. The `bool` indicates whether the set is
    /// inverted.
    Frontier(bool, Vec<SetPatternObject>),

    /// A capture group with a numeric ID and the contained [`Pattern`] (eg. `(a)`).
    Capture(u8, Pattern),
    /// A set of [`SetPatternObject`]s (eg. `[a-z_%u]`, `[^a-z_%u]`). The `bool` specifies whether
    /// the set is inverted. If the set is _not_ inverted, it matches if any of the contained
    /// entries matches. Otherwise, it matches if none of the contained entries match.
    Set(bool, Vec<SetPatternObject>),
}
100
/// An entry of a [set](PatternObject::Set).
///
/// The same entries are also used for the set of a [frontier pattern](PatternObject::Frontier).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SetPatternObject {
    /// A character to match literally (eg. `a`).
    Char(char),
    /// An escaped character to match literally (eg. `%%`, `%]`).
    Escaped(char),
    /// A range of characters (eg. `a-z`). Matches if any character in the range matches.
    Range(char, char),
    /// A [character class](Class) (eg. `%w`, `%L`).
    Class(Class),
}
113
/// A quantifier, specifying the number of times the preceding [`PatternObject`] can occur.
///
/// A quantifier can be obtained from the corresponding [`Token`] via [`TryFrom`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Quantifier {
    /// Allow zero or more occurrences, taking the longest matching sequence (`*`).
    ZeroOrMore,
    /// Allow one or more occurrences, taking the longest matching sequence (`+`).
    OneOrMore,
    /// Allow zero or more occurrences, taking the shortest matching sequence (`-`).
    ZeroOrMoreLazy,
    /// Allow zero or one occurrences (`?`).
    ZeroOrOne,
}
126
/// A character class, matching any character contained in the class.
///
/// Each class is written as `%` followed by a letter. A lowercase letter selects the class
/// itself, the corresponding uppercase letter selects its complement (the `Not*` variants).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Class {
    /// Matches any letter; equivalent to `[a-zA-Z]` (`%a`).
    Letters,
    /// Matches any control character; equivalent to `[\0-\31]` (`%c`).
    Controls,
    /// Matches any digit; equivalent to `[0-9]` (`%d`).
    Digits,
    /// Matches any printable character except space; equivalent to `[\33-\126]` (`%g`).
    Printable,
    /// Matches any lowercase letter; equivalent to `[a-z]` (`%l`).
    Lowercase,
    /// Matches any punctuation character; equivalent to ``[!"#$%&'()*+,-./[\%]^_`{|}~]`` (`%p`).
    Punctuations,
    /// Matches any whitespace character; equivalent to `[ \t\n\v\f\r]` (`%s`).
    Spaces,
    /// Matches any uppercase letter; equivalent to `[A-Z]` (`%u`).
    Uppercase,
    /// Matches any alphanumeric character (digit or letter); equivalent to `[a-zA-Z0-9]` (`%w`).
    Alphanumerics,
    /// Matches any hexadecimal digit; equivalent to `[0-9a-fA-F]` (`%x`).
    Hexadecimals,
    /// Matches the NULL character / `0` byte; equivalent to `\0` (`%z`).
    ZeroByte,

    /// Matches any character, **except** all letters; equivalent to `[^a-zA-Z]` (`%A`).
    NotLetters,
    /// Matches any character, **except** all control characters; equivalent to `[^\0-\31]` (`%C`).
    NotControls,
    /// Matches any character, **except** all digits; equivalent to `[^0-9]` (`%D`).
    NotDigits,
    /// Matches any character, **except** all printable characters, but including space; equivalent
    /// to `[^\33-\126]` (`%G`).
    NotPrintable,
    /// Matches any character, **except** all lowercase letters; equivalent to `[^a-z]` (`%L`).
    NotLowercase,
    /// Matches any character, **except** all punctuation characters; equivalent to
    /// ``[^!"#$%&'()*+,-./[\%]^_`{|}~]`` (`%P`).
    NotPunctuations,
    /// Matches any character, **except** all whitespace characters; equivalent to `[^ \t\n\v\f\r]`
    /// (`%S`).
    NotSpaces,
    /// Matches any character, **except** all uppercase letters; equivalent to `[^A-Z]` (`%U`).
    NotUppercase,
    /// Matches any character, **except** all alphanumeric characters (digits and letters);
    /// equivalent to `[^a-zA-Z0-9]` (`%W`).
    NotAlphanumerics,
    /// Matches any character, **except** all hexadecimal digits; equivalent to `[^0-9a-fA-F]`
    /// (`%X`).
    NotHexadecimals,
    /// Matches any character, **except** the NULL character / `0` byte; equivalent to `[^\0]`
    /// (`%Z`).
    NotZeroByte,
}
182
/// A token as used by the internal lexer. Exposed to the public API for use in [`Error`]s.
///
/// The [`Display`] implementation renders each token the way it is written in a pattern
/// (eg. `%b()` for a balanced token); [`Token::Eof`] is rendered as `end of file`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token {
    /// `^`
    Start,
    /// `$`
    End,
    /// `.`
    Any,
    /// `*`
    ZeroOrMore,
    /// `+`
    OneOrMore,
    /// `-`
    ZeroOrMoreLazy,
    /// `?`
    ZeroOrOne,
    /// `^` in a set
    Inverse,

    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBrack,
    /// `]`
    RBrack,

    /// A literal character
    Char(char),
    /// An escaped character (written as `%` followed by the character)
    Escaped(char),
    /// A character class
    Class(Class),
    /// A reference to a previous capture group (written as `%` followed by the group ID)
    CaptureRef(u8),
    /// `%b` followed by the opening and the closing character
    Balanced(char, char),
    /// `%f`
    Frontier,

    /// End of file
    Eof,
}
228
229///////////////////////////
230// Trait implementations //
231///////////////////////////
232
233impl TryFrom<Token> for Quantifier {
234    type Error = ();
235
236    fn try_from(value: Token) -> std::result::Result<Self, Self::Error> {
237        match value {
238            Token::ZeroOrMore => Ok(Self::ZeroOrMore),
239            Token::OneOrMore => Ok(Self::OneOrMore),
240            Token::ZeroOrMoreLazy => Ok(Self::ZeroOrMoreLazy),
241            Token::ZeroOrOne => Ok(Self::ZeroOrOne),
242            _ => Err(()),
243        }
244    }
245}
246
247impl TryFrom<char> for Class {
248    type Error = ();
249
250    fn try_from(value: char) -> std::result::Result<Self, Self::Error> {
251        match value {
252            'a' => Ok(Self::Letters),
253            'c' => Ok(Self::Controls),
254            'd' => Ok(Self::Digits),
255            'g' => Ok(Self::Printable),
256            'l' => Ok(Self::Lowercase),
257            'p' => Ok(Self::Punctuations),
258            's' => Ok(Self::Spaces),
259            'u' => Ok(Self::Uppercase),
260            'w' => Ok(Self::Alphanumerics),
261            'x' => Ok(Self::Hexadecimals),
262            'z' => Ok(Self::ZeroByte),
263
264            'A' => Ok(Self::NotLetters),
265            'C' => Ok(Self::NotControls),
266            'D' => Ok(Self::NotDigits),
267            'G' => Ok(Self::NotPrintable),
268            'L' => Ok(Self::NotLowercase),
269            'P' => Ok(Self::NotPunctuations),
270            'S' => Ok(Self::NotSpaces),
271            'U' => Ok(Self::NotUppercase),
272            'W' => Ok(Self::NotAlphanumerics),
273            'X' => Ok(Self::NotHexadecimals),
274            'Z' => Ok(Self::NotZeroByte),
275
276            _ => Err(()),
277        }
278    }
279}
280
281impl Display for Token {
282    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
283        match self {
284            Token::Start => write!(f, "^"),
285            Token::End => write!(f, "$"),
286            Token::Any => write!(f, "."),
287            Token::ZeroOrMore => write!(f, "*"),
288            Token::OneOrMore => write!(f, "+"),
289            Token::ZeroOrMoreLazy => write!(f, "-"),
290            Token::ZeroOrOne => write!(f, "?"),
291            Token::Inverse => write!(f, "^"),
292            Token::LParen => write!(f, "("),
293            Token::RParen => write!(f, ")"),
294            Token::LBrack => write!(f, "["),
295            Token::RBrack => write!(f, "]"),
296            Token::Char(char) => write!(f, "{char}"),
297            Token::Escaped(char) => write!(f, "%{char}"),
298            Token::Class(class) => write!(f, "{class}"),
299            Token::CaptureRef(id) => write!(f, "%{id}"),
300            Token::Balanced(open, close) => write!(f, "%b{open}{close}"),
301            Token::Frontier => write!(f, "%f"),
302            Token::Eof => write!(f, "end of file"),
303        }
304    }
305}
306
307impl Display for Class {
308    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
309        match self {
310            Class::Letters => write!(f, "%a"),
311            Class::Controls => write!(f, "%c"),
312            Class::Digits => write!(f, "%d"),
313            Class::Printable => write!(f, "%g"),
314            Class::Lowercase => write!(f, "%l"),
315            Class::Punctuations => write!(f, "%p"),
316            Class::Spaces => write!(f, "%s"),
317            Class::Uppercase => write!(f, "%u"),
318            Class::Alphanumerics => write!(f, "%w"),
319            Class::Hexadecimals => write!(f, "%x"),
320            Class::ZeroByte => write!(f, "%z"),
321            Class::NotLetters => write!(f, "%A"),
322            Class::NotControls => write!(f, "%C"),
323            Class::NotDigits => write!(f, "%D"),
324            Class::NotPrintable => write!(f, "%G"),
325            Class::NotLowercase => write!(f, "%L"),
326            Class::NotPunctuations => write!(f, "%P"),
327            Class::NotSpaces => write!(f, "%S"),
328            Class::NotUppercase => write!(f, "%U"),
329            Class::NotAlphanumerics => write!(f, "%W"),
330            Class::NotHexadecimals => write!(f, "%X"),
331            Class::NotZeroByte => write!(f, "%Z"),
332        }
333    }
334}