Merge pull request #509 from Marwes/lexer_comment

feat: Allow the tokenizer to contain custom skip regexes/literals
This commit is contained in:
Markus Westerlind 2020-03-03 21:03:21 +01:00 committed by GitHub
commit 723678f364
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 7250 additions and 7065 deletions

View File

@ -2,6 +2,21 @@ use std::str::FromStr;
grammar; grammar;
match {
"+",
"-",
"*",
"/",
"(",
")",
r"[0-9]+",
// Skip whitespace and comments
r"\s*" => { },
r"//[^\n\r]*[\n\r]*" => { }, // `// comment`
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { }, // `/* comment */`
}
pub Expr: i32 = { pub Expr: i32 = {
<l:Expr> "+" <r:Factor> => l + r, <l:Expr> "+" <r:Factor> => l + r,
<l:Expr> "-" <r:Factor> => l - r, <l:Expr> "-" <r:Factor> => l - r,

View File

@ -321,6 +321,20 @@ match {
And now any reference in your grammar to `"BEGIN"` will actually match And now any reference in your grammar to `"BEGIN"` will actually match
any capitalization. any capitalization.
#### Customizing skipping between tokens
If we want to support comments, we will need to skip more than just whitespace in our lexer.
To this end, `ignore patterns` can be specified.
```
match {
r"\s*" => { }, // The default whitespace skipping is disabled when an `ignore pattern` is specified
r"//[^\n\r]*[\n\r]*" => { }, // Skip `// comments`
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { }, // Skip `/* comments */`
}
```
[lexer tutorial]: index.md [lexer tutorial]: index.md
[calculator2b]: ../../calculator/src/calculator2b.lalrpop [calculator2b]: ../../calculator/src/calculator2b.lalrpop
[calculator3]: ../../calculator/src/calculator3.lalrpop [calculator3]: ../../calculator/src/calculator3.lalrpop

View File

@ -0,0 +1,12 @@
grammar;
match {
r"[0-9]+" => NUM,
r"\s*" => { },
r"//[^\n\r]*[\n\r]*" => { },
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { },
}
pub(crate) Term: Vec<&'input str> = {
<NUM*>,
};

View File

@ -143,6 +143,8 @@ lalrpop_mod!(
dyn_argument dyn_argument
); );
lalrpop_mod!(comments);
pub fn use_cfg_created_parser() { pub fn use_cfg_created_parser() {
cfg::CreatedParser::new(); cfg::CreatedParser::new();
} }
@ -996,3 +998,22 @@ fn verify_lalrpop_generates_itself() {
Use ./snap.sh to generate a new snapshot of the lrgrammar", Use ./snap.sh to generate a new snapshot of the lrgrammar",
); );
} }
#[test]
fn comments() {
assert_eq!(
comments::TermParser::new().parse("22 3 5 13").unwrap(),
vec!["22", "3", "5", "13"]
);
assert_eq!(
comments::TermParser::new()
.parse(
"22 /* 123 */ 3 5
// abc
13 // "
)
.unwrap(),
vec!["22", "3", "5", "13"]
);
}

View File

@ -10,22 +10,29 @@ impl<'a> fmt::Display for Token<'a> {
} }
} }
struct RegexEntry {
regex: regex::Regex,
skip: bool,
}
pub struct MatcherBuilder { pub struct MatcherBuilder {
regex_set: regex::RegexSet, regex_set: regex::RegexSet,
regex_vec: Vec<regex::Regex>, regex_vec: Vec<RegexEntry>,
} }
impl MatcherBuilder { impl MatcherBuilder {
pub fn new<S>(exprs: impl IntoIterator<Item = S>) -> Result<MatcherBuilder, regex::Error> pub fn new<S>(
exprs: impl IntoIterator<Item = (S, bool)>,
) -> Result<MatcherBuilder, regex::Error>
where where
S: AsRef<str>, S: AsRef<str>,
{ {
let exprs = exprs.into_iter(); let exprs = exprs.into_iter();
let mut regex_vec = Vec::with_capacity(exprs.size_hint().0); let mut regex_vec = Vec::with_capacity(exprs.size_hint().0);
let mut first_error = None; let mut first_error = None;
let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, s| { let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, (s, skip)| {
regex_vec.push(match regex::Regex::new(s.as_ref()) { regex_vec.push(match regex::Regex::new(s.as_ref()) {
Ok(regex) => regex, Ok(regex) => RegexEntry { regex, skip },
Err(err) => { Err(err) => {
first_error = Some(err); first_error = Some(err);
return None; return None;
@ -62,7 +69,7 @@ pub struct Matcher<'input, 'builder, E> {
text: &'input str, text: &'input str,
consumed: usize, consumed: usize,
regex_set: &'builder regex::RegexSet, regex_set: &'builder regex::RegexSet,
regex_vec: &'builder Vec<regex::Regex>, regex_vec: &'builder Vec<RegexEntry>,
_marker: PhantomData<fn() -> E>, _marker: PhantomData<fn() -> E>,
} }
@ -70,36 +77,52 @@ impl<'input, 'builder, E> Iterator for Matcher<'input, 'builder, E> {
type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>; type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let text = self.text.trim_start(); loop {
let whitespace = self.text.len() - text.len(); let text = self.text;
let start_offset = self.consumed + whitespace; let start_offset = self.consumed;
eprintln!("{:?}", text);
if text.is_empty() { if text.is_empty() {
self.text = text;
self.consumed = start_offset; self.consumed = start_offset;
None return None;
} else { } else {
let matches = self.regex_set.matches(text); let matches = self.regex_set.matches(text);
if !matches.matched_any() { if !matches.matched_any() {
Some(Err(ParseError::InvalidToken { return Some(Err(ParseError::InvalidToken {
location: start_offset, location: start_offset,
})) }));
} else { } else {
let mut longest_match = 0; let mut longest_match = 0;
let mut index = 0; let mut index = 0;
let mut skip = false;
for i in matches.iter() { for i in matches.iter() {
let match_ = self.regex_vec[i].find(text).unwrap(); let entry = &self.regex_vec[i];
let match_ = entry.regex.find(text).unwrap();
let len = match_.end(); let len = match_.end();
if len >= longest_match { if len >= longest_match {
longest_match = len; longest_match = len;
index = i; index = i;
skip = entry.skip;
} }
} }
let result = &text[..longest_match]; let result = &text[..longest_match];
let remaining = &text[longest_match..]; let remaining = &text[longest_match..];
let end_offset = start_offset + longest_match; let end_offset = start_offset + longest_match;
self.text = remaining; self.text = remaining;
self.consumed = end_offset; self.consumed = end_offset;
Some(Ok((start_offset, Token(index, result), end_offset)))
// Skip any whitespace matches
if skip {
if longest_match == 0 {
return Some(Err(ParseError::InvalidToken {
location: start_offset,
}));
}
continue;
}
return Some(Ok((start_offset, Token(index, result), end_offset)));
}
} }
} }
} }

View File

@ -109,7 +109,29 @@ impl MatchItem {
} }
pub type MatchSymbol = TerminalLiteral; pub type MatchSymbol = TerminalLiteral;
pub type MatchMapping = TerminalString;
#[derive(Clone, PartialEq, Eq, Ord, PartialOrd)]
pub enum MatchMapping {
Terminal(TerminalString),
Skip,
}
impl Debug for MatchMapping {
fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
match self {
MatchMapping::Terminal(term) => write!(fmt, "{:?}", term),
MatchMapping::Skip => write!(fmt, "{{ }}"),
}
}
}
impl Display for MatchMapping {
fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
match self {
MatchMapping::Terminal(term) => write!(fmt, "{}", term),
MatchMapping::Skip => write!(fmt, "{{ }}"),
}
}
}
/// Intern tokens are not typed by the user: they are synthesized in /// Intern tokens are not typed by the user: they are synthesized in
/// the absence of an "extern" declaration with information about the /// the absence of an "extern" declaration with information about the
@ -158,7 +180,7 @@ pub struct MatchEntry {
/// NB: This field must go first, so that `PartialOrd` sorts by precedence first! /// NB: This field must go first, so that `PartialOrd` sorts by precedence first!
pub precedence: usize, pub precedence: usize,
pub match_literal: TerminalLiteral, pub match_literal: TerminalLiteral,
pub user_name: TerminalString, pub user_name: MatchMapping,
} }
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]

View File

@ -1,6 +1,6 @@
//! Generates an iterator type `Matcher` that looks roughly like //! Generates an iterator type `Matcher` that looks roughly like
use grammar::parse_tree::InternToken; use grammar::parse_tree::{InternToken, MatchMapping};
use grammar::repr::{Grammar, TerminalLiteral}; use grammar::repr::{Grammar, TerminalLiteral};
use lexer::re; use lexer::re;
use rust::RustWrite; use rust::RustWrite;
@ -25,35 +25,48 @@ pub fn compile<W: Write>(
// create a vector of rust string literals with the text of each // create a vector of rust string literals with the text of each
// regular expression // regular expression
let regex_strings: Vec<String> = { let regex_strings = intern_token
intern_token
.match_entries .match_entries
.iter() .iter()
.map(|match_entry| match match_entry.match_literal { .map(|match_entry| {
(
match match_entry.match_literal {
TerminalLiteral::Quoted(ref s) => re::parse_literal(&s), TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(), TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
},
match match_entry.user_name {
MatchMapping::Terminal(_) => false,
MatchMapping::Skip => true,
},
)
}) })
.map(|regex| { .map(|(regex, skip)| {
// make sure all regex are anchored at the beginning of the input // make sure all regex are anchored at the beginning of the input
format!("^({})", regex) (format!("^({})", regex), skip)
}) })
.map(|regex_str| { .map(|(regex_str, skip)| {
// create a rust string with text of the regex; the Debug impl // create a rust string with text of the regex; the Debug impl
// will add quotes and escape // will add quotes and escape
format!("{:?}", regex_str) (format!("{:?}", regex_str), skip)
}) });
.collect()
};
rust!(out, "let {}strs: &[&str] = &[", prefix); let mut contains_skip = false;
for literal in &regex_strings {
rust!(out, "{},", literal); rust!(out, "let {}strs: &[(&str, bool)] = &[", prefix);
for (literal, skip) in regex_strings {
rust!(out, "({}, {}),", literal, skip);
contains_skip |= skip;
} }
if !contains_skip {
rust!(out, r#"(r"^(\s*)", true),"#);
}
rust!(out, "];"); rust!(out, "];");
rust!( rust!(
out, out,
"{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs).unwrap()", "{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs.iter().copied()).unwrap()",
p = prefix p = prefix
); );

View File

@ -5,8 +5,8 @@ use collections::{map, Map};
use grammar::consts::CFG; use grammar::consts::CFG;
use grammar::parse_tree as pt; use grammar::parse_tree as pt;
use grammar::parse_tree::{ use grammar::parse_tree::{
read_algorithm, GrammarItem, InternToken, Lifetime, Name, NonterminalString, Path, read_algorithm, GrammarItem, InternToken, Lifetime, MatchMapping, Name, NonterminalString,
TerminalString, Path, TerminalString,
}; };
use grammar::pattern::{Pattern, PatternKind}; use grammar::pattern::{Pattern, PatternKind};
use grammar::repr as r; use grammar::repr as r;
@ -79,8 +79,9 @@ impl<'s> LowerState<'s> {
})), })),
}; };
self.conversions self.conversions
.extend(data.match_entries.iter().enumerate().map( .extend(data.match_entries.iter().enumerate().filter_map(
|(index, match_entry)| { |(index, match_entry)| match &match_entry.user_name {
MatchMapping::Terminal(user_name) => {
let pattern = Pattern { let pattern = Pattern {
span, span,
kind: PatternKind::TupleStruct( kind: PatternKind::TupleStruct(
@ -98,7 +99,9 @@ impl<'s> LowerState<'s> {
), ),
}; };
(match_entry.user_name.clone(), pattern) Some((user_name.clone(), pattern))
}
MatchMapping::Skip => None,
}, },
)); ));
self.intern_token = Some(data); self.intern_token = Some(data);

View File

@ -49,7 +49,7 @@ fn resolve_in_place(grammar: &mut Grammar) -> NormResult<()> {
.flat_map(|match_token| &match_token.contents) .flat_map(|match_token| &match_token.contents)
.flat_map(|match_contents| &match_contents.items) .flat_map(|match_contents| &match_contents.items)
.filter_map(|item| match *item { .filter_map(|item| match *item {
MatchItem::Mapped(_, TerminalString::Bare(ref id), _) => { MatchItem::Mapped(_, MatchMapping::Terminal(TerminalString::Bare(ref id)), _) => {
Some((item.span(), id.clone(), Def::Terminal)) Some((item.span(), id.clone(), Def::Terminal))
} }
_ => None, _ => None,

View File

@ -133,7 +133,7 @@ impl MatchBlock {
match_block.add_match_entry( match_block.add_match_entry(
precedence, precedence,
sym.clone(), sym.clone(),
TerminalString::Literal(sym.clone()), MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
span, span,
)?; )?;
} }
@ -162,7 +162,7 @@ impl MatchBlock {
&mut self, &mut self,
match_group_precedence: usize, match_group_precedence: usize,
sym: TerminalLiteral, sym: TerminalLiteral,
user_name: TerminalString, user_name: MatchMapping,
span: Span, span: Span,
) -> NormResult<()> { ) -> NormResult<()> {
if let Some(_old_span) = self.spans.insert(sym.clone(), span) { if let Some(_old_span) = self.spans.insert(sym.clone(), span) {
@ -170,7 +170,9 @@ impl MatchBlock {
} }
// NB: It's legal for multiple regex to produce same terminal. // NB: It's legal for multiple regex to produce same terminal.
if let MatchMapping::Terminal(user_name) = &user_name {
self.match_user_names.insert(user_name.clone()); self.match_user_names.insert(user_name.clone());
}
self.match_entries.push(MatchEntry { self.match_entries.push(MatchEntry {
precedence: match_group_precedence * 2 + sym.base_precedence(), precedence: match_group_precedence * 2 + sym.base_precedence(),
@ -203,7 +205,7 @@ impl MatchBlock {
self.match_entries.push(MatchEntry { self.match_entries.push(MatchEntry {
precedence: sym.base_precedence(), precedence: sym.base_precedence(),
match_literal: sym.clone(), match_literal: sym.clone(),
user_name: TerminalString::Literal(sym.clone()), user_name: MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
}); });
self.spans.insert(sym, span); self.spans.insert(sym, span);
@ -328,7 +330,6 @@ fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
// one of precedences, that are parallel with `literals`. // one of precedences, that are parallel with `literals`.
let mut regexs = Vec::with_capacity(match_entries.len()); let mut regexs = Vec::with_capacity(match_entries.len());
let mut precedences = Vec::with_capacity(match_entries.len()); let mut precedences = Vec::with_capacity(match_entries.len());
{
for match_entry in &match_entries { for match_entry in &match_entries {
precedences.push(Precedence(match_entry.precedence)); precedences.push(Precedence(match_entry.precedence));
match match_entry.match_literal { match match_entry.match_literal {
@ -349,8 +350,6 @@ fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
} }
} }
} }
Ok(())
}?;
let dfa = match dfa::build_dfa(&regexs, &precedences) { let dfa = match dfa::build_dfa(&regexs, &precedences) {
Ok(dfa) => dfa, Ok(dfa) => dfa,

View File

@ -3,8 +3,8 @@ use super::{NormError, NormResult};
use grammar::consts::{ERROR, LOCATION}; use grammar::consts::{ERROR, LOCATION};
use grammar::parse_tree::{ use grammar::parse_tree::{
ActionKind, Alternative, Grammar, GrammarItem, Lifetime, NonterminalData, NonterminalString, ActionKind, Alternative, Grammar, GrammarItem, Lifetime, MatchMapping, NonterminalData,
Path, Span, SymbolKind, TypeParameter, TypeRef, NonterminalString, Path, Span, SymbolKind, TypeParameter, TypeRef,
}; };
use grammar::repr::{NominalTypeRepr, TypeRepr, Types}; use grammar::repr::{NominalTypeRepr, TypeRepr, Types};
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
@ -96,7 +96,9 @@ impl<'grammar> TypeInferencer<'grammar> {
let mut types = Types::new(&grammar.prefix, Some(loc_type), error_type, enum_type); let mut types = Types::new(&grammar.prefix, Some(loc_type), error_type, enum_type);
for match_entry in &intern_token.match_entries { for match_entry in &intern_token.match_entries {
types.add_term_type(match_entry.user_name.clone(), input_str.clone()); if let MatchMapping::Terminal(user_name) = &match_entry.user_name {
types.add_term_type(user_name.clone(), input_str.clone());
}
} }
types types

View File

@ -319,7 +319,10 @@ MatchItem: MatchItem = {
MatchSymbol = QuotedLiteral; MatchSymbol = QuotedLiteral;
MatchMapping = Terminal; MatchMapping: MatchMapping = {
Terminal => MatchMapping::Terminal(<>),
"{" "}" => MatchMapping::Skip,
};
EnumToken: EnumToken = EnumToken: EnumToken =
"enum" <lo:@L> <t:TypeRef> <hi:@R> "{" "enum" <lo:@L> <t:TypeRef> <hi:@R> "{"

File diff suppressed because it is too large Load Diff

View File

@ -22,7 +22,7 @@ mod test;
pub enum Top { pub enum Top {
Grammar(Grammar), Grammar(Grammar),
Pattern(Pattern<TypeRef>), Pattern(Pattern<TypeRef>),
MatchMapping(TerminalString), MatchMapping(MatchMapping),
TypeRef(TypeRef), TypeRef(TypeRef),
GrammarWhereClauses(Vec<WhereClause<TypeRef>>), GrammarWhereClauses(Vec<WhereClause<TypeRef>>),
} }