Merge pull request #509 from Marwes/lexer_comment

feat: Allow the tokenizer to contain custom skip regexes/literals
This commit is contained in:
Markus Westerlind 2020-03-03 21:03:21 +01:00 committed by GitHub
commit 723678f364
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 7250 additions and 7065 deletions

View File

@ -2,6 +2,21 @@ use std::str::FromStr;
grammar; grammar;
match {
"+",
"-",
"*",
"/",
"(",
")",
r"[0-9]+",
// Skip whitespace and comments
r"\s*" => { },
r"//[^\n\r]*[\n\r]*" => { }, // `// comment`
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { }, // `/* comment */`
}
pub Expr: i32 = { pub Expr: i32 = {
<l:Expr> "+" <r:Factor> => l + r, <l:Expr> "+" <r:Factor> => l + r,
<l:Expr> "-" <r:Factor> => l - r, <l:Expr> "-" <r:Factor> => l - r,

View File

@ -209,13 +209,13 @@ match {
} else { } else {
r"\w+", r"\w+",
_ _
} }
``` ```
Here the match contains two levels; each level can have more than one Here the match contains two levels; each level can have more than one
item in it. The top-level contains only `r"[0-9]+"`, which means that this item in it. The top-level contains only `r"[0-9]+"`, which means that this
regular expression is given highest priority. The next level contains regular expression is given highest priority. The next level contains
`r"\w+"`, so that will match afterwards. `r"\w+"`, so that will match afterwards.
The final `_` indicates that other string literals and regular The final `_` indicates that other string literals and regular
expressions that appear elsewhere in the grammar (e.g., `"("` or expressions that appear elsewhere in the grammar (e.g., `"("` or
@ -240,7 +240,7 @@ fn calculator2b() {
let result = calculator2b::TermParser::new().parse("(foo33)").unwrap(); let result = calculator2b::TermParser::new().parse("(foo33)").unwrap();
assert_eq!(result, "Id(foo33)"); assert_eq!(result, "Id(foo33)");
// This one will fail: // This one will fail:
let result = calculator2b::TermParser::new().parse("(22)").unwrap(); let result = calculator2b::TermParser::new().parse("(22)").unwrap();
@ -262,7 +262,7 @@ match {
} else { } else {
r"\w+", r"\w+",
_ _
} }
``` ```
This raises the interesting question of what the precedence is **within** This raises the interesting question of what the precedence is **within**
@ -280,7 +280,7 @@ There is one final twist before we reach the
can also use `match` declarations to give names to regular can also use `match` declarations to give names to regular
expressions, so that we don't have to type them directly in our expressions, so that we don't have to type them directly in our
grammar. For example, maybe instead of writing `r"\w+"`, we would grammar. For example, maybe instead of writing `r"\w+"`, we would
prefer to write `ID`. We could do that by modifying the match declaration like prefer to write `ID`. We could do that by modifying the match declaration like
so: so:
``` ```
@ -321,6 +321,20 @@ match {
And now any reference in your grammar to `"BEGIN"` will actually match And now any reference in your grammar to `"BEGIN"` will actually match
any capitalization. any capitalization.
#### Customizing skipping between tokens
If we want to support comments we will need to skip more than just whitespace in our lexer.
To this end `ignore patterns` can be specified.
```
match {
r"\s*" => { }, // The default whitespace skipping is disabled when an `ignore pattern` is specified
r"//[^\n\r]*[\n\r]*" => { }, // Skip `// comments`
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { }, // Skip `/* comments */`
}
```
[lexer tutorial]: index.md [lexer tutorial]: index.md
[calculator2b]: ../../calculator/src/calculator2b.lalrpop [calculator2b]: ../../calculator/src/calculator2b.lalrpop
[calculator3]: ../../calculator/src/calculator3.lalrpop [calculator3]: ../../calculator/src/calculator3.lalrpop

View File

@ -0,0 +1,12 @@
grammar;
match {
r"[0-9]+" => NUM,
r"\s*" => { },
r"//[^\n\r]*[\n\r]*" => { },
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { },
}
pub(crate) Term: Vec<&'input str> = {
<NUM*>,
};

View File

@ -143,6 +143,8 @@ lalrpop_mod!(
dyn_argument dyn_argument
); );
lalrpop_mod!(comments);
pub fn use_cfg_created_parser() { pub fn use_cfg_created_parser() {
cfg::CreatedParser::new(); cfg::CreatedParser::new();
} }
@ -996,3 +998,22 @@ fn verify_lalrpop_generates_itself() {
Use ./snap.sh to generate a new snapshot of the lrgrammar", Use ./snap.sh to generate a new snapshot of the lrgrammar",
); );
} }
#[test]
fn comments() {
assert_eq!(
comments::TermParser::new().parse("22 3 5 13").unwrap(),
vec!["22", "3", "5", "13"]
);
assert_eq!(
comments::TermParser::new()
.parse(
"22 /* 123 */ 3 5
// abc
13 // "
)
.unwrap(),
vec!["22", "3", "5", "13"]
);
}

View File

@ -10,22 +10,29 @@ impl<'a> fmt::Display for Token<'a> {
} }
} }
struct RegexEntry {
regex: regex::Regex,
skip: bool,
}
pub struct MatcherBuilder { pub struct MatcherBuilder {
regex_set: regex::RegexSet, regex_set: regex::RegexSet,
regex_vec: Vec<regex::Regex>, regex_vec: Vec<RegexEntry>,
} }
impl MatcherBuilder { impl MatcherBuilder {
pub fn new<S>(exprs: impl IntoIterator<Item = S>) -> Result<MatcherBuilder, regex::Error> pub fn new<S>(
exprs: impl IntoIterator<Item = (S, bool)>,
) -> Result<MatcherBuilder, regex::Error>
where where
S: AsRef<str>, S: AsRef<str>,
{ {
let exprs = exprs.into_iter(); let exprs = exprs.into_iter();
let mut regex_vec = Vec::with_capacity(exprs.size_hint().0); let mut regex_vec = Vec::with_capacity(exprs.size_hint().0);
let mut first_error = None; let mut first_error = None;
let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, s| { let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, (s, skip)| {
regex_vec.push(match regex::Regex::new(s.as_ref()) { regex_vec.push(match regex::Regex::new(s.as_ref()) {
Ok(regex) => regex, Ok(regex) => RegexEntry { regex, skip },
Err(err) => { Err(err) => {
first_error = Some(err); first_error = Some(err);
return None; return None;
@ -62,7 +69,7 @@ pub struct Matcher<'input, 'builder, E> {
text: &'input str, text: &'input str,
consumed: usize, consumed: usize,
regex_set: &'builder regex::RegexSet, regex_set: &'builder regex::RegexSet,
regex_vec: &'builder Vec<regex::Regex>, regex_vec: &'builder Vec<RegexEntry>,
_marker: PhantomData<fn() -> E>, _marker: PhantomData<fn() -> E>,
} }
@ -70,36 +77,52 @@ impl<'input, 'builder, E> Iterator for Matcher<'input, 'builder, E> {
type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>; type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let text = self.text.trim_start(); loop {
let whitespace = self.text.len() - text.len(); let text = self.text;
let start_offset = self.consumed + whitespace; let start_offset = self.consumed;
if text.is_empty() { eprintln!("{:?}", text);
self.text = text; if text.is_empty() {
self.consumed = start_offset; self.consumed = start_offset;
None return None;
} else {
let matches = self.regex_set.matches(text);
if !matches.matched_any() {
Some(Err(ParseError::InvalidToken {
location: start_offset,
}))
} else { } else {
let mut longest_match = 0; let matches = self.regex_set.matches(text);
let mut index = 0; if !matches.matched_any() {
for i in matches.iter() { return Some(Err(ParseError::InvalidToken {
let match_ = self.regex_vec[i].find(text).unwrap(); location: start_offset,
let len = match_.end(); }));
if len >= longest_match { } else {
longest_match = len; let mut longest_match = 0;
index = i; let mut index = 0;
let mut skip = false;
for i in matches.iter() {
let entry = &self.regex_vec[i];
let match_ = entry.regex.find(text).unwrap();
let len = match_.end();
if len >= longest_match {
longest_match = len;
index = i;
skip = entry.skip;
}
} }
let result = &text[..longest_match];
let remaining = &text[longest_match..];
let end_offset = start_offset + longest_match;
self.text = remaining;
self.consumed = end_offset;
// Skip any whitespace matches
if skip {
if longest_match == 0 {
return Some(Err(ParseError::InvalidToken {
location: start_offset,
}));
}
continue;
}
return Some(Ok((start_offset, Token(index, result), end_offset)));
} }
let result = &text[..longest_match];
let remaining = &text[longest_match..];
let end_offset = start_offset + longest_match;
self.text = remaining;
self.consumed = end_offset;
Some(Ok((start_offset, Token(index, result), end_offset)))
} }
} }
} }

View File

@ -109,7 +109,29 @@ impl MatchItem {
} }
pub type MatchSymbol = TerminalLiteral; pub type MatchSymbol = TerminalLiteral;
pub type MatchMapping = TerminalString;
#[derive(Clone, PartialEq, Eq, Ord, PartialOrd)]
pub enum MatchMapping {
Terminal(TerminalString),
Skip,
}
impl Debug for MatchMapping {
fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
match self {
MatchMapping::Terminal(term) => write!(fmt, "{:?}", term),
MatchMapping::Skip => write!(fmt, "{{ }}"),
}
}
}
impl Display for MatchMapping {
fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
match self {
MatchMapping::Terminal(term) => write!(fmt, "{}", term),
MatchMapping::Skip => write!(fmt, "{{ }}"),
}
}
}
/// Intern tokens are not typed by the user: they are synthesized in /// Intern tokens are not typed by the user: they are synthesized in
/// the absence of an "extern" declaration with information about the /// the absence of an "extern" declaration with information about the
@ -158,7 +180,7 @@ pub struct MatchEntry {
/// NB: This field must go first, so that `PartialOrd` sorts by precedence first! /// NB: This field must go first, so that `PartialOrd` sorts by precedence first!
pub precedence: usize, pub precedence: usize,
pub match_literal: TerminalLiteral, pub match_literal: TerminalLiteral,
pub user_name: TerminalString, pub user_name: MatchMapping,
} }
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]

View File

@ -1,6 +1,6 @@
//! Generates an iterator type `Matcher` that looks roughly like //! Generates an iterator type `Matcher` that looks roughly like
use grammar::parse_tree::InternToken; use grammar::parse_tree::{InternToken, MatchMapping};
use grammar::repr::{Grammar, TerminalLiteral}; use grammar::repr::{Grammar, TerminalLiteral};
use lexer::re; use lexer::re;
use rust::RustWrite; use rust::RustWrite;
@ -25,35 +25,48 @@ pub fn compile<W: Write>(
// create a vector of rust string literals with the text of each // create a vector of rust string literals with the text of each
// regular expression // regular expression
let regex_strings: Vec<String> = { let regex_strings = intern_token
intern_token .match_entries
.match_entries .iter()
.iter() .map(|match_entry| {
.map(|match_entry| match match_entry.match_literal { (
TerminalLiteral::Quoted(ref s) => re::parse_literal(&s), match match_entry.match_literal {
TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(), TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
}) TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
.map(|regex| { },
// make sure all regex are anchored at the beginning of the input match match_entry.user_name {
format!("^({})", regex) MatchMapping::Terminal(_) => false,
}) MatchMapping::Skip => true,
.map(|regex_str| { },
// create a rust string with text of the regex; the Debug impl )
// will add quotes and escape })
format!("{:?}", regex_str) .map(|(regex, skip)| {
}) // make sure all regex are anchored at the beginning of the input
.collect() (format!("^({})", regex), skip)
}; })
.map(|(regex_str, skip)| {
// create a rust string with text of the regex; the Debug impl
// will add quotes and escape
(format!("{:?}", regex_str), skip)
});
rust!(out, "let {}strs: &[&str] = &[", prefix); let mut contains_skip = false;
for literal in &regex_strings {
rust!(out, "{},", literal); rust!(out, "let {}strs: &[(&str, bool)] = &[", prefix);
for (literal, skip) in regex_strings {
rust!(out, "({}, {}),", literal, skip);
contains_skip |= skip;
} }
if !contains_skip {
rust!(out, r#"(r"^(\s*)", true),"#);
}
rust!(out, "];"); rust!(out, "];");
rust!( rust!(
out, out,
"{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs).unwrap()", "{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs.iter().copied()).unwrap()",
p = prefix p = prefix
); );

View File

@ -5,8 +5,8 @@ use collections::{map, Map};
use grammar::consts::CFG; use grammar::consts::CFG;
use grammar::parse_tree as pt; use grammar::parse_tree as pt;
use grammar::parse_tree::{ use grammar::parse_tree::{
read_algorithm, GrammarItem, InternToken, Lifetime, Name, NonterminalString, Path, read_algorithm, GrammarItem, InternToken, Lifetime, MatchMapping, Name, NonterminalString,
TerminalString, Path, TerminalString,
}; };
use grammar::pattern::{Pattern, PatternKind}; use grammar::pattern::{Pattern, PatternKind};
use grammar::repr as r; use grammar::repr as r;
@ -79,26 +79,29 @@ impl<'s> LowerState<'s> {
})), })),
}; };
self.conversions self.conversions
.extend(data.match_entries.iter().enumerate().map( .extend(data.match_entries.iter().enumerate().filter_map(
|(index, match_entry)| { |(index, match_entry)| match &match_entry.user_name {
let pattern = Pattern { MatchMapping::Terminal(user_name) => {
span, let pattern = Pattern {
kind: PatternKind::TupleStruct( span,
internal_token_path.clone(), kind: PatternKind::TupleStruct(
vec![ internal_token_path.clone(),
Pattern { vec![
span, Pattern {
kind: PatternKind::Usize(index), span,
}, kind: PatternKind::Usize(index),
Pattern { },
span, Pattern {
kind: PatternKind::Choose(input_str.clone()), span,
}, kind: PatternKind::Choose(input_str.clone()),
], },
), ],
}; ),
};
(match_entry.user_name.clone(), pattern) Some((user_name.clone(), pattern))
}
MatchMapping::Skip => None,
}, },
)); ));
self.intern_token = Some(data); self.intern_token = Some(data);

View File

@ -49,7 +49,7 @@ fn resolve_in_place(grammar: &mut Grammar) -> NormResult<()> {
.flat_map(|match_token| &match_token.contents) .flat_map(|match_token| &match_token.contents)
.flat_map(|match_contents| &match_contents.items) .flat_map(|match_contents| &match_contents.items)
.filter_map(|item| match *item { .filter_map(|item| match *item {
MatchItem::Mapped(_, TerminalString::Bare(ref id), _) => { MatchItem::Mapped(_, MatchMapping::Terminal(TerminalString::Bare(ref id)), _) => {
Some((item.span(), id.clone(), Def::Terminal)) Some((item.span(), id.clone(), Def::Terminal))
} }
_ => None, _ => None,

View File

@ -133,7 +133,7 @@ impl MatchBlock {
match_block.add_match_entry( match_block.add_match_entry(
precedence, precedence,
sym.clone(), sym.clone(),
TerminalString::Literal(sym.clone()), MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
span, span,
)?; )?;
} }
@ -162,7 +162,7 @@ impl MatchBlock {
&mut self, &mut self,
match_group_precedence: usize, match_group_precedence: usize,
sym: TerminalLiteral, sym: TerminalLiteral,
user_name: TerminalString, user_name: MatchMapping,
span: Span, span: Span,
) -> NormResult<()> { ) -> NormResult<()> {
if let Some(_old_span) = self.spans.insert(sym.clone(), span) { if let Some(_old_span) = self.spans.insert(sym.clone(), span) {
@ -170,7 +170,9 @@ impl MatchBlock {
} }
// NB: It's legal for multiple regex to produce same terminal. // NB: It's legal for multiple regex to produce same terminal.
self.match_user_names.insert(user_name.clone()); if let MatchMapping::Terminal(user_name) = &user_name {
self.match_user_names.insert(user_name.clone());
}
self.match_entries.push(MatchEntry { self.match_entries.push(MatchEntry {
precedence: match_group_precedence * 2 + sym.base_precedence(), precedence: match_group_precedence * 2 + sym.base_precedence(),
@ -203,7 +205,7 @@ impl MatchBlock {
self.match_entries.push(MatchEntry { self.match_entries.push(MatchEntry {
precedence: sym.base_precedence(), precedence: sym.base_precedence(),
match_literal: sym.clone(), match_literal: sym.clone(),
user_name: TerminalString::Literal(sym.clone()), user_name: MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
}); });
self.spans.insert(sym, span); self.spans.insert(sym, span);
@ -328,29 +330,26 @@ fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
// one of precedences, that are parallel with `literals`. // one of precedences, that are parallel with `literals`.
let mut regexs = Vec::with_capacity(match_entries.len()); let mut regexs = Vec::with_capacity(match_entries.len());
let mut precedences = Vec::with_capacity(match_entries.len()); let mut precedences = Vec::with_capacity(match_entries.len());
{ for match_entry in &match_entries {
for match_entry in &match_entries { precedences.push(Precedence(match_entry.precedence));
precedences.push(Precedence(match_entry.precedence)); match match_entry.match_literal {
match match_entry.match_literal { TerminalLiteral::Quoted(ref s) => {
TerminalLiteral::Quoted(ref s) => { regexs.push(re::parse_literal(&s));
regexs.push(re::parse_literal(&s)); }
} TerminalLiteral::Regex(ref s) => {
TerminalLiteral::Regex(ref s) => { match re::parse_regex(&s) {
match re::parse_regex(&s) { Ok(regex) => regexs.push(regex),
Ok(regex) => regexs.push(regex), Err(error) => {
Err(error) => { let literal_span = spans[&match_entry.match_literal];
let literal_span = spans[&match_entry.match_literal]; // FIXME -- take offset into account for
// FIXME -- take offset into account for // span; this requires knowing how many #
// span; this requires knowing how many # // the user used, which we do not track
// the user used, which we do not track return_err!(literal_span, "invalid regular expression: {}", error);
return_err!(literal_span, "invalid regular expression: {}", error);
}
} }
} }
} }
} }
Ok(()) }
}?;
let dfa = match dfa::build_dfa(&regexs, &precedences) { let dfa = match dfa::build_dfa(&regexs, &precedences) {
Ok(dfa) => dfa, Ok(dfa) => dfa,

View File

@ -3,8 +3,8 @@ use super::{NormError, NormResult};
use grammar::consts::{ERROR, LOCATION}; use grammar::consts::{ERROR, LOCATION};
use grammar::parse_tree::{ use grammar::parse_tree::{
ActionKind, Alternative, Grammar, GrammarItem, Lifetime, NonterminalData, NonterminalString, ActionKind, Alternative, Grammar, GrammarItem, Lifetime, MatchMapping, NonterminalData,
Path, Span, SymbolKind, TypeParameter, TypeRef, NonterminalString, Path, Span, SymbolKind, TypeParameter, TypeRef,
}; };
use grammar::repr::{NominalTypeRepr, TypeRepr, Types}; use grammar::repr::{NominalTypeRepr, TypeRepr, Types};
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
@ -96,7 +96,9 @@ impl<'grammar> TypeInferencer<'grammar> {
let mut types = Types::new(&grammar.prefix, Some(loc_type), error_type, enum_type); let mut types = Types::new(&grammar.prefix, Some(loc_type), error_type, enum_type);
for match_entry in &intern_token.match_entries { for match_entry in &intern_token.match_entries {
types.add_term_type(match_entry.user_name.clone(), input_str.clone()); if let MatchMapping::Terminal(user_name) = &match_entry.user_name {
types.add_term_type(user_name.clone(), input_str.clone());
}
} }
types types

View File

@ -319,7 +319,10 @@ MatchItem: MatchItem = {
MatchSymbol = QuotedLiteral; MatchSymbol = QuotedLiteral;
MatchMapping = Terminal; MatchMapping: MatchMapping = {
Terminal => MatchMapping::Terminal(<>),
"{" "}" => MatchMapping::Skip,
};
EnumToken: EnumToken = EnumToken: EnumToken =
"enum" <lo:@L> <t:TypeRef> <hi:@R> "{" "enum" <lo:@L> <t:TypeRef> <hi:@R> "{"

File diff suppressed because it is too large Load Diff

View File

@ -22,7 +22,7 @@ mod test;
pub enum Top { pub enum Top {
Grammar(Grammar), Grammar(Grammar),
Pattern(Pattern<TypeRef>), Pattern(Pattern<TypeRef>),
MatchMapping(TerminalString), MatchMapping(MatchMapping),
TypeRef(TypeRef), TypeRef(TypeRef),
GrammarWhereClauses(Vec<WhereClause<TypeRef>>), GrammarWhereClauses(Vec<WhereClause<TypeRef>>),
} }