Merge pull request #509 from Marwes/lexer_comment

feat: Allow the tokenizer to contain custom skip regexes/literals
Markus Westerlind 2020-03-03 21:03:21 +01:00 committed by GitHub
commit 723678f364
14 changed files with 7250 additions and 7065 deletions


@@ -2,6 +2,21 @@ use std::str::FromStr;
 grammar;
+match {
+    "+",
+    "-",
+    "*",
+    "/",
+    "(",
+    ")",
+    r"[0-9]+",
+    // Skip whitespace and comments
+    r"\s*" => { },
+    r"//[^\n\r]*[\n\r]*" => { }, // `// comment`
+    r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { }, // `/* comment */`
+}
 pub Expr: i32 = {
     <l:Expr> "+" <r:Factor> => l + r,
     <l:Expr> "-" <r:Factor> => l - r,


@@ -209,13 +209,13 @@ match {
} else {
r"\w+",
_
}
}
```
Here the match contains two levels; each level can have more than one
item in it. The top-level contains only `r"[0-9]+"`, which means that this
regular expression is given highest priority. The next level contains
`r"\w+"`, so that will match afterwards.
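For reference, the full two-level declaration being discussed reads as follows (reassembled here from the surrounding context, since the diff shows only its tail):

```
match {
    r"[0-9]+"
} else {
    r"\w+",
    _
}
```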
The final `_` indicates that other string literals and regular
expressions that appear elsewhere in the grammar (e.g., `"("` or
@@ -240,7 +240,7 @@ fn calculator2b() {
let result = calculator2b::TermParser::new().parse("(foo33)").unwrap();
assert_eq!(result, "Id(foo33)");
// This one will fail:
let result = calculator2b::TermParser::new().parse("(22)").unwrap();
@@ -262,7 +262,7 @@ match {
} else {
r"\w+",
_
}
}
```
This raises the interesting question of what the precedence is **within**
@@ -280,7 +280,7 @@ There is one final twist before we reach the
can also use `match` declarations to give names to regular
expressions, so that we don't have to type them directly in our
grammar. For example, maybe instead of writing `r"\w+"`, we would
prefer to write `ID`. We could do that by modifying the match declaration like
so:
```
@@ -321,6 +321,20 @@ match {
And now any reference in your grammar to `"BEGIN"` will actually match
any capitalization.
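The match declaration elided by the diff above presumably looks something like this sketch, using the regex crate's `(?i)` case-insensitivity flag (the exact patterns are an assumption):

```
match {
    r"(?i)begin" => "BEGIN",
} else {
    _
}
```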
#### Customizing skipping between tokens
If we want to support comments, we will need to skip more than just whitespace in our lexer.
To this end, `ignore patterns` can be specified.
```
match {
r"\s*" => { }, // The default whitespace skipping is disabled an `ignore pattern` is specified
r"//[^\n\r]*[\n\r]*" => { }, // Skip `// comments`
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { }, // Skip `/* comments */`
}
```
[lexer tutorial]: index.md
[calculator2b]: ../../calculator/src/calculator2b.lalrpop
[calculator3]: ../../calculator/src/calculator3.lalrpop


@@ -0,0 +1,12 @@
+grammar;
+match {
+    r"[0-9]+" => NUM,
+    r"\s*" => { },
+    r"//[^\n\r]*[\n\r]*" => { },
+    r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { },
+}
+pub(crate) Term: Vec<&'input str> = {
+    <NUM*>,
+};


@@ -143,6 +143,8 @@ lalrpop_mod!(
     dyn_argument
 );
+lalrpop_mod!(comments);
+
 pub fn use_cfg_created_parser() {
     cfg::CreatedParser::new();
 }
@@ -996,3 +998,22 @@ fn verify_lalrpop_generates_itself() {
         Use ./snap.sh to generate a new snapshot of the lrgrammar",
     );
 }
+
+#[test]
+fn comments() {
+    assert_eq!(
+        comments::TermParser::new().parse("22 3 5 13").unwrap(),
+        vec!["22", "3", "5", "13"]
+    );
+    assert_eq!(
+        comments::TermParser::new()
+            .parse(
+                "22 /* 123 */ 3 5
+        // abc
+          13 // "
+            )
+            .unwrap(),
+        vec!["22", "3", "5", "13"]
+    );
+}


@@ -10,22 +10,29 @@ impl<'a> fmt::Display for Token<'a> {
     }
 }
+struct RegexEntry {
+    regex: regex::Regex,
+    skip: bool,
+}
+
 pub struct MatcherBuilder {
     regex_set: regex::RegexSet,
-    regex_vec: Vec<regex::Regex>,
+    regex_vec: Vec<RegexEntry>,
 }
 impl MatcherBuilder {
-    pub fn new<S>(exprs: impl IntoIterator<Item = S>) -> Result<MatcherBuilder, regex::Error>
+    pub fn new<S>(
+        exprs: impl IntoIterator<Item = (S, bool)>,
+    ) -> Result<MatcherBuilder, regex::Error>
     where
         S: AsRef<str>,
     {
         let exprs = exprs.into_iter();
         let mut regex_vec = Vec::with_capacity(exprs.size_hint().0);
         let mut first_error = None;
-        let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, s| {
+        let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, (s, skip)| {
             regex_vec.push(match regex::Regex::new(s.as_ref()) {
-                Ok(regex) => regex,
+                Ok(regex) => RegexEntry { regex, skip },
                 Err(err) => {
                     first_error = Some(err);
                     return None;
@@ -62,7 +69,7 @@ pub struct Matcher<'input, 'builder, E> {
     text: &'input str,
     consumed: usize,
     regex_set: &'builder regex::RegexSet,
-    regex_vec: &'builder Vec<regex::Regex>,
+    regex_vec: &'builder Vec<RegexEntry>,
     _marker: PhantomData<fn() -> E>,
 }
@@ -70,36 +77,52 @@ impl<'input, 'builder, E> Iterator for Matcher<'input, 'builder, E> {
     type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>;
     fn next(&mut self) -> Option<Self::Item> {
-        let text = self.text.trim_start();
-        let whitespace = self.text.len() - text.len();
-        let start_offset = self.consumed + whitespace;
-        if text.is_empty() {
-            self.text = text;
-            self.consumed = start_offset;
-            None
-        } else {
-            let matches = self.regex_set.matches(text);
-            if !matches.matched_any() {
-                Some(Err(ParseError::InvalidToken {
-                    location: start_offset,
-                }))
-            } else {
-                let mut longest_match = 0;
-                let mut index = 0;
-                for i in matches.iter() {
-                    let match_ = self.regex_vec[i].find(text).unwrap();
-                    let len = match_.end();
-                    if len >= longest_match {
-                        longest_match = len;
-                        index = i;
-                    }
-                }
-                let result = &text[..longest_match];
-                let remaining = &text[longest_match..];
-                let end_offset = start_offset + longest_match;
-                self.text = remaining;
-                self.consumed = end_offset;
-                Some(Ok((start_offset, Token(index, result), end_offset)))
+        loop {
+            let text = self.text;
+            let start_offset = self.consumed;
+            if text.is_empty() {
+                self.consumed = start_offset;
+                return None;
+            } else {
+                let matches = self.regex_set.matches(text);
+                if !matches.matched_any() {
+                    return Some(Err(ParseError::InvalidToken {
+                        location: start_offset,
+                    }));
+                } else {
+                    let mut longest_match = 0;
+                    let mut index = 0;
+                    let mut skip = false;
+                    for i in matches.iter() {
+                        let entry = &self.regex_vec[i];
+                        let match_ = entry.regex.find(text).unwrap();
+                        let len = match_.end();
+                        if len >= longest_match {
+                            longest_match = len;
+                            index = i;
+                            skip = entry.skip;
+                        }
+                    }
+                    let result = &text[..longest_match];
+                    let remaining = &text[longest_match..];
+                    let end_offset = start_offset + longest_match;
+                    self.text = remaining;
+                    self.consumed = end_offset;
+                    // Skip this match (whitespace, a comment, ...) instead of
+                    // emitting a token. A zero-length skip match would never
+                    // consume input, so report it rather than loop forever.
+                    if skip {
+                        if longest_match == 0 {
+                            return Some(Err(ParseError::InvalidToken {
+                                location: start_offset,
+                            }));
+                        }
+                        continue;
+                    }
+                    return Some(Ok((start_offset, Token(index, result), end_offset)));
+                }
             }
         }
     }
 }


@@ -109,7 +109,29 @@ impl MatchItem {
 }
 pub type MatchSymbol = TerminalLiteral;
-pub type MatchMapping = TerminalString;
+#[derive(Clone, PartialEq, Eq, Ord, PartialOrd)]
+pub enum MatchMapping {
+    Terminal(TerminalString),
+    Skip,
+}
+impl Debug for MatchMapping {
+    fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
+        match self {
+            MatchMapping::Terminal(term) => write!(fmt, "{:?}", term),
+            MatchMapping::Skip => write!(fmt, "{{ }}"),
+        }
+    }
+}
+impl Display for MatchMapping {
+    fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
+        match self {
+            MatchMapping::Terminal(term) => write!(fmt, "{}", term),
+            MatchMapping::Skip => write!(fmt, "{{ }}"),
+        }
+    }
+}
 /// Intern tokens are not typed by the user: they are synthesized in
 /// the absence of an "extern" declaration with information about the
@@ -158,7 +180,7 @@ pub struct MatchEntry {
     /// NB: This field must go first, so that `PartialOrd` sorts by precedence first!
     pub precedence: usize,
     pub match_literal: TerminalLiteral,
-    pub user_name: TerminalString,
+    pub user_name: MatchMapping,
 }
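Downstream code that previously assumed every match entry names a terminal now has to branch on the two variants. A minimal sketch of the pattern (the helper function is illustrative, not part of this PR):

```rust
// Illustrative only: decide whether a match entry produces a token.
fn is_skipped(mapping: &MatchMapping) -> bool {
    match mapping {
        MatchMapping::Terminal(_) => false, // lexer emits a token under this name
        MatchMapping::Skip => true,         // lexer discards the matched text
    }
}
```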
#[derive(Clone, Debug, PartialEq, Eq)]


@@ -1,6 +1,6 @@
 //! Generates an iterator type `Matcher` that looks roughly like
-use grammar::parse_tree::InternToken;
+use grammar::parse_tree::{InternToken, MatchMapping};
 use grammar::repr::{Grammar, TerminalLiteral};
 use lexer::re;
 use rust::RustWrite;
@@ -25,35 +25,48 @@ pub fn compile<W: Write>(
     // create a vector of rust string literals with the text of each
     // regular expression
-    let regex_strings: Vec<String> = {
-        intern_token
-            .match_entries
-            .iter()
-            .map(|match_entry| match match_entry.match_literal {
-                TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
-                TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
-            })
-            .map(|regex| {
-                // make sure all regex are anchored at the beginning of the input
-                format!("^({})", regex)
-            })
-            .map(|regex_str| {
-                // create a rust string with text of the regex; the Debug impl
-                // will add quotes and escape
-                format!("{:?}", regex_str)
-            })
-            .collect()
-    };
+    let regex_strings = intern_token
+        .match_entries
+        .iter()
+        .map(|match_entry| {
+            (
+                match match_entry.match_literal {
+                    TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
+                    TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
+                },
+                match match_entry.user_name {
+                    MatchMapping::Terminal(_) => false,
+                    MatchMapping::Skip => true,
+                },
+            )
+        })
+        .map(|(regex, skip)| {
+            // make sure all regex are anchored at the beginning of the input
+            (format!("^({})", regex), skip)
+        })
+        .map(|(regex_str, skip)| {
+            // create a rust string with text of the regex; the Debug impl
+            // will add quotes and escape
+            (format!("{:?}", regex_str), skip)
+        });
-    rust!(out, "let {}strs: &[&str] = &[", prefix);
-    for literal in &regex_strings {
-        rust!(out, "{},", literal);
+    let mut contains_skip = false;
+    rust!(out, "let {}strs: &[(&str, bool)] = &[", prefix);
+    for (literal, skip) in regex_strings {
+        rust!(out, "({}, {}),", literal, skip);
+        contains_skip |= skip;
     }
+    if !contains_skip {
+        rust!(out, r#"(r"^(\s*)", true),"#);
+    }
     rust!(out, "];");
     rust!(
         out,
-        "{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs).unwrap()",
+        "{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs.iter().copied()).unwrap()",
         p = prefix
     );
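For the `comments` grammar added above, the emitted initializer then looks roughly like this (a sketch: the `__` prefix, the `__lalrpop_util` alias, and the exact escaping are assumptions; the fallback `(r"^(\s*)", true)` entry is only emitted when a grammar declares no skip patterns of its own):

```rust
let __strs: &[(&str, bool)] = &[
    ("^([0-9]+)", false),               // NUM
    ("^(\\s*)", true),                  // skipped: whitespace
    ("^(//[^\\n\\r]*[\\n\\r]*)", true), // skipped: `// ...` line comments
];
__lalrpop_util::lexer::MatcherBuilder::new(__strs.iter().copied()).unwrap()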


@@ -5,8 +5,8 @@ use collections::{map, Map};
 use grammar::consts::CFG;
 use grammar::parse_tree as pt;
 use grammar::parse_tree::{
-    read_algorithm, GrammarItem, InternToken, Lifetime, Name, NonterminalString, Path,
-    TerminalString,
+    read_algorithm, GrammarItem, InternToken, Lifetime, MatchMapping, Name, NonterminalString,
+    Path, TerminalString,
 };
 use grammar::pattern::{Pattern, PatternKind};
 use grammar::repr as r;
@@ -79,26 +79,29 @@ impl<'s> LowerState<'s> {
             })),
         };
         self.conversions
-            .extend(data.match_entries.iter().enumerate().map(
-                |(index, match_entry)| {
-                    let pattern = Pattern {
-                        span,
-                        kind: PatternKind::TupleStruct(
-                            internal_token_path.clone(),
-                            vec![
-                                Pattern {
-                                    span,
-                                    kind: PatternKind::Usize(index),
-                                },
-                                Pattern {
-                                    span,
-                                    kind: PatternKind::Choose(input_str.clone()),
-                                },
-                            ],
-                        ),
-                    };
-                    (match_entry.user_name.clone(), pattern)
+            .extend(data.match_entries.iter().enumerate().filter_map(
+                |(index, match_entry)| match &match_entry.user_name {
+                    MatchMapping::Terminal(user_name) => {
+                        let pattern = Pattern {
+                            span,
+                            kind: PatternKind::TupleStruct(
+                                internal_token_path.clone(),
+                                vec![
+                                    Pattern {
+                                        span,
+                                        kind: PatternKind::Usize(index),
+                                    },
+                                    Pattern {
+                                        span,
+                                        kind: PatternKind::Choose(input_str.clone()),
+                                    },
+                                ],
+                            ),
+                        };
+                        Some((user_name.clone(), pattern))
+                    }
+                    MatchMapping::Skip => None,
                 },
             ));
         self.intern_token = Some(data);


@@ -49,7 +49,7 @@ fn resolve_in_place(grammar: &mut Grammar) -> NormResult<()> {
         .flat_map(|match_token| &match_token.contents)
         .flat_map(|match_contents| &match_contents.items)
         .filter_map(|item| match *item {
-            MatchItem::Mapped(_, TerminalString::Bare(ref id), _) => {
+            MatchItem::Mapped(_, MatchMapping::Terminal(TerminalString::Bare(ref id)), _) => {
                 Some((item.span(), id.clone(), Def::Terminal))
             }
             _ => None,


@@ -133,7 +133,7 @@ impl MatchBlock {
             match_block.add_match_entry(
                 precedence,
                 sym.clone(),
-                TerminalString::Literal(sym.clone()),
+                MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
                 span,
             )?;
         }
@@ -162,7 +162,7 @@ impl MatchBlock {
         &mut self,
         match_group_precedence: usize,
         sym: TerminalLiteral,
-        user_name: TerminalString,
+        user_name: MatchMapping,
         span: Span,
     ) -> NormResult<()> {
         if let Some(_old_span) = self.spans.insert(sym.clone(), span) {
@@ -170,7 +170,9 @@
         }
         // NB: It's legal for multiple regex to produce same terminal.
-        self.match_user_names.insert(user_name.clone());
+        if let MatchMapping::Terminal(user_name) = &user_name {
+            self.match_user_names.insert(user_name.clone());
+        }
         self.match_entries.push(MatchEntry {
             precedence: match_group_precedence * 2 + sym.base_precedence(),
@@ -203,7 +205,7 @@
         self.match_entries.push(MatchEntry {
             precedence: sym.base_precedence(),
             match_literal: sym.clone(),
-            user_name: TerminalString::Literal(sym.clone()),
+            user_name: MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
         });
         self.spans.insert(sym, span);
@@ -328,29 +330,26 @@ fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
     // one of precedences, that are parallel with `literals`.
     let mut regexs = Vec::with_capacity(match_entries.len());
     let mut precedences = Vec::with_capacity(match_entries.len());
-    {
-        for match_entry in &match_entries {
-            precedences.push(Precedence(match_entry.precedence));
-            match match_entry.match_literal {
-                TerminalLiteral::Quoted(ref s) => {
-                    regexs.push(re::parse_literal(&s));
-                }
-                TerminalLiteral::Regex(ref s) => {
-                    match re::parse_regex(&s) {
-                        Ok(regex) => regexs.push(regex),
-                        Err(error) => {
-                            let literal_span = spans[&match_entry.match_literal];
-                            // FIXME -- take offset into account for
-                            // span; this requires knowing how many #
-                            // the user used, which we do not track
-                            return_err!(literal_span, "invalid regular expression: {}", error);
-                        }
-                    }
-                }
-            }
-        }
-        Ok(())
-    }?;
+    for match_entry in &match_entries {
+        precedences.push(Precedence(match_entry.precedence));
+        match match_entry.match_literal {
+            TerminalLiteral::Quoted(ref s) => {
+                regexs.push(re::parse_literal(&s));
+            }
+            TerminalLiteral::Regex(ref s) => {
+                match re::parse_regex(&s) {
+                    Ok(regex) => regexs.push(regex),
+                    Err(error) => {
+                        let literal_span = spans[&match_entry.match_literal];
+                        // FIXME -- take offset into account for
+                        // span; this requires knowing how many #
+                        // the user used, which we do not track
+                        return_err!(literal_span, "invalid regular expression: {}", error);
+                    }
+                }
+            }
+        }
+    }
 let dfa = match dfa::build_dfa(&regexs, &precedences) {
     Ok(dfa) => dfa,


@@ -3,8 +3,8 @@ use super::{NormError, NormResult};
 use grammar::consts::{ERROR, LOCATION};
 use grammar::parse_tree::{
-    ActionKind, Alternative, Grammar, GrammarItem, Lifetime, NonterminalData, NonterminalString,
-    Path, Span, SymbolKind, TypeParameter, TypeRef,
+    ActionKind, Alternative, Grammar, GrammarItem, Lifetime, MatchMapping, NonterminalData,
+    NonterminalString, Path, Span, SymbolKind, TypeParameter, TypeRef,
 };
 use grammar::repr::{NominalTypeRepr, TypeRepr, Types};
 use std::collections::{HashMap, HashSet};
@@ -96,7 +96,9 @@ impl<'grammar> TypeInferencer<'grammar> {
         let mut types = Types::new(&grammar.prefix, Some(loc_type), error_type, enum_type);
         for match_entry in &intern_token.match_entries {
-            types.add_term_type(match_entry.user_name.clone(), input_str.clone());
+            if let MatchMapping::Terminal(user_name) = &match_entry.user_name {
+                types.add_term_type(user_name.clone(), input_str.clone());
+            }
         }
         types


@@ -319,7 +319,10 @@ MatchItem: MatchItem = {
 MatchSymbol = QuotedLiteral;
-MatchMapping = Terminal;
+MatchMapping: MatchMapping = {
+    Terminal => MatchMapping::Terminal(<>),
+    "{" "}" => MatchMapping::Skip,
+};
 EnumToken: EnumToken =
     "enum" <lo:@L> <t:TypeRef> <hi:@R> "{"

File diff suppressed because it is too large.


@@ -22,7 +22,7 @@ mod test;
 pub enum Top {
     Grammar(Grammar),
     Pattern(Pattern<TypeRef>),
-    MatchMapping(TerminalString),
+    MatchMapping(MatchMapping),
     TypeRef(TypeRef),
     GrammarWhereClauses(Vec<WhereClause<TypeRef>>),
 }