Merge pull request #509 from Marwes/lexer_comment

feat: Allow the tokenizer to contain custom skip regexes/literals
This commit is contained in:
Markus Westerlind 2020-03-03 21:03:21 +01:00 committed by GitHub
commit 723678f364
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 7250 additions and 7065 deletions

View File

@ -2,6 +2,21 @@ use std::str::FromStr;
grammar; grammar;
match {
"+",
"-",
"*",
"/",
"(",
")",
r"[0-9]+",
// Skip whitespace and comments
r"\s*" => { },
r"//[^\n\r]*[\n\r]*" => { }, // `// comment`
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { }, // `/* comment */`
}
pub Expr: i32 = { pub Expr: i32 = {
<l:Expr> "+" <r:Factor> => l + r, <l:Expr> "+" <r:Factor> => l + r,
<l:Expr> "-" <r:Factor> => l - r, <l:Expr> "-" <r:Factor> => l - r,

View File

@ -209,13 +209,13 @@ match {
} else { } else {
r"\w+", r"\w+",
_ _
} }
``` ```
Here the match contains two levels; each level can have more than one Here the match contains two levels; each level can have more than one
item in it. The top-level contains only `r"[0-9]+"`, which means that this item in it. The top-level contains only `r"[0-9]+"`, which means that this
regular expression is given highest priority. The next level contains regular expression is given highest priority. The next level contains
`r"\w+"`, so that will match afterwards. `r"\w+"`, so that will match afterwards.
The final `_` indicates that other string literals and regular The final `_` indicates that other string literals and regular
expressions that appear elsewhere in the grammar (e.g., `"("` or expressions that appear elsewhere in the grammar (e.g., `"("` or
@ -240,7 +240,7 @@ fn calculator2b() {
let result = calculator2b::TermParser::new().parse("(foo33)").unwrap(); let result = calculator2b::TermParser::new().parse("(foo33)").unwrap();
assert_eq!(result, "Id(foo33)"); assert_eq!(result, "Id(foo33)");
// This one will fail: // This one will fail:
let result = calculator2b::TermParser::new().parse("(22)").unwrap(); let result = calculator2b::TermParser::new().parse("(22)").unwrap();
@ -262,7 +262,7 @@ match {
} else { } else {
r"\w+", r"\w+",
_ _
} }
``` ```
This raises the interesting question of what the precedence is **within** This raises the interesting question of what the precedence is **within**
@ -280,7 +280,7 @@ There is one final twist before we reach the
can also use `match` declarations to give names to regular can also use `match` declarations to give names to regular
expressions, so that we don't have to type them directly in our expressions, so that we don't have to type them directly in our
grammar. For example, maybe instead of writing `r"\w+"`, we would grammar. For example, maybe instead of writing `r"\w+"`, we would
prefer to write `ID`. We could do that by modifying the match declaration like prefer to write `ID`. We could do that by modifying the match declaration like
so: so:
``` ```
@ -321,6 +321,20 @@ match {
And now any reference in your grammar to `"BEGIN"` will actually match And now any reference in your grammar to `"BEGIN"` will actually match
any capitalization. any capitalization.
#### Customizing skipping between tokens
If we want to support comments we will need to skip more than just whitespace in our lexer.
To this end `ignore patterns` can be specified.
```
match {
r"\s*" => { }, // The default whitespace skipping is disabled when an `ignore pattern` is specified
r"//[^\n\r]*[\n\r]*" => { }, // Skip `// comments`
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { }, // Skip `/* comments */`
}
```
[lexer tutorial]: index.md [lexer tutorial]: index.md
[calculator2b]: ../../calculator/src/calculator2b.lalrpop [calculator2b]: ../../calculator/src/calculator2b.lalrpop
[calculator3]: ../../calculator/src/calculator3.lalrpop [calculator3]: ../../calculator/src/calculator3.lalrpop

View File

@ -0,0 +1,12 @@
grammar;
match {
r"[0-9]+" => NUM,
r"\s*" => { },
r"//[^\n\r]*[\n\r]*" => { },
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { },
}
pub(crate) Term: Vec<&'input str> = {
<NUM*>,
};

View File

@ -143,6 +143,8 @@ lalrpop_mod!(
dyn_argument dyn_argument
); );
lalrpop_mod!(comments);
pub fn use_cfg_created_parser() { pub fn use_cfg_created_parser() {
cfg::CreatedParser::new(); cfg::CreatedParser::new();
} }
@ -996,3 +998,22 @@ fn verify_lalrpop_generates_itself() {
Use ./snap.sh to generate a new snapshot of the lrgrammar", Use ./snap.sh to generate a new snapshot of the lrgrammar",
); );
} }
#[test]
fn comments() {
assert_eq!(
comments::TermParser::new().parse("22 3 5 13").unwrap(),
vec!["22", "3", "5", "13"]
);
assert_eq!(
comments::TermParser::new()
.parse(
"22 /* 123 */ 3 5
// abc
13 // "
)
.unwrap(),
vec!["22", "3", "5", "13"]
);
}

View File

@ -10,22 +10,29 @@ impl<'a> fmt::Display for Token<'a> {
} }
} }
struct RegexEntry {
regex: regex::Regex,
skip: bool,
}
pub struct MatcherBuilder { pub struct MatcherBuilder {
regex_set: regex::RegexSet, regex_set: regex::RegexSet,
regex_vec: Vec<regex::Regex>, regex_vec: Vec<RegexEntry>,
} }
impl MatcherBuilder { impl MatcherBuilder {
pub fn new<S>(exprs: impl IntoIterator<Item = S>) -> Result<MatcherBuilder, regex::Error> pub fn new<S>(
exprs: impl IntoIterator<Item = (S, bool)>,
) -> Result<MatcherBuilder, regex::Error>
where where
S: AsRef<str>, S: AsRef<str>,
{ {
let exprs = exprs.into_iter(); let exprs = exprs.into_iter();
let mut regex_vec = Vec::with_capacity(exprs.size_hint().0); let mut regex_vec = Vec::with_capacity(exprs.size_hint().0);
let mut first_error = None; let mut first_error = None;
let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, s| { let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, (s, skip)| {
regex_vec.push(match regex::Regex::new(s.as_ref()) { regex_vec.push(match regex::Regex::new(s.as_ref()) {
Ok(regex) => regex, Ok(regex) => RegexEntry { regex, skip },
Err(err) => { Err(err) => {
first_error = Some(err); first_error = Some(err);
return None; return None;
@ -62,7 +69,7 @@ pub struct Matcher<'input, 'builder, E> {
text: &'input str, text: &'input str,
consumed: usize, consumed: usize,
regex_set: &'builder regex::RegexSet, regex_set: &'builder regex::RegexSet,
regex_vec: &'builder Vec<regex::Regex>, regex_vec: &'builder Vec<RegexEntry>,
_marker: PhantomData<fn() -> E>, _marker: PhantomData<fn() -> E>,
} }
@ -70,36 +77,52 @@ impl<'input, 'builder, E> Iterator for Matcher<'input, 'builder, E> {
type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>; type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let text = self.text.trim_start(); loop {
let whitespace = self.text.len() - text.len(); let text = self.text;
let start_offset = self.consumed + whitespace; let start_offset = self.consumed;
if text.is_empty() { eprintln!("{:?}", text);
self.text = text; if text.is_empty() {
self.consumed = start_offset; self.consumed = start_offset;
None return None;
} else {
let matches = self.regex_set.matches(text);
if !matches.matched_any() {
Some(Err(ParseError::InvalidToken {
location: start_offset,
}))
} else { } else {
let mut longest_match = 0; let matches = self.regex_set.matches(text);
let mut index = 0; if !matches.matched_any() {
for i in matches.iter() { return Some(Err(ParseError::InvalidToken {
let match_ = self.regex_vec[i].find(text).unwrap(); location: start_offset,
let len = match_.end(); }));
if len >= longest_match { } else {
longest_match = len; let mut longest_match = 0;
index = i; let mut index = 0;
let mut skip = false;
for i in matches.iter() {
let entry = &self.regex_vec[i];
let match_ = entry.regex.find(text).unwrap();
let len = match_.end();
if len >= longest_match {
longest_match = len;
index = i;
skip = entry.skip;
}
} }
let result = &text[..longest_match];
let remaining = &text[longest_match..];
let end_offset = start_offset + longest_match;
self.text = remaining;
self.consumed = end_offset;
// Skip any whitespace matches
if skip {
if longest_match == 0 {
return Some(Err(ParseError::InvalidToken {
location: start_offset,
}));
}
continue;
}
return Some(Ok((start_offset, Token(index, result), end_offset)));
} }
let result = &text[..longest_match];
let remaining = &text[longest_match..];
let end_offset = start_offset + longest_match;
self.text = remaining;
self.consumed = end_offset;
Some(Ok((start_offset, Token(index, result), end_offset)))
} }
} }
} }

View File

@ -109,7 +109,29 @@ impl MatchItem {
} }
pub type MatchSymbol = TerminalLiteral; pub type MatchSymbol = TerminalLiteral;
pub type MatchMapping = TerminalString;
#[derive(Clone, PartialEq, Eq, Ord, PartialOrd)]
pub enum MatchMapping {
Terminal(TerminalString),
Skip,
}
impl Debug for MatchMapping {
fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
match self {
MatchMapping::Terminal(term) => write!(fmt, "{:?}", term),
MatchMapping::Skip => write!(fmt, "{{ }}"),
}
}
}
impl Display for MatchMapping {
fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
match self {
MatchMapping::Terminal(term) => write!(fmt, "{}", term),
MatchMapping::Skip => write!(fmt, "{{ }}"),
}
}
}
/// Intern tokens are not typed by the user: they are synthesized in /// Intern tokens are not typed by the user: they are synthesized in
/// the absence of an "extern" declaration with information about the /// the absence of an "extern" declaration with information about the
@ -158,7 +180,7 @@ pub struct MatchEntry {
/// NB: This field must go first, so that `PartialOrd` sorts by precedence first! /// NB: This field must go first, so that `PartialOrd` sorts by precedence first!
pub precedence: usize, pub precedence: usize,
pub match_literal: TerminalLiteral, pub match_literal: TerminalLiteral,
pub user_name: TerminalString, pub user_name: MatchMapping,
} }
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]

View File

@ -1,6 +1,6 @@
//! Generates an iterator type `Matcher` that looks roughly like //! Generates an iterator type `Matcher` that looks roughly like
use grammar::parse_tree::InternToken; use grammar::parse_tree::{InternToken, MatchMapping};
use grammar::repr::{Grammar, TerminalLiteral}; use grammar::repr::{Grammar, TerminalLiteral};
use lexer::re; use lexer::re;
use rust::RustWrite; use rust::RustWrite;
@ -25,35 +25,48 @@ pub fn compile<W: Write>(
// create a vector of rust string literals with the text of each // create a vector of rust string literals with the text of each
// regular expression // regular expression
let regex_strings: Vec<String> = { let regex_strings = intern_token
intern_token .match_entries
.match_entries .iter()
.iter() .map(|match_entry| {
.map(|match_entry| match match_entry.match_literal { (
TerminalLiteral::Quoted(ref s) => re::parse_literal(&s), match match_entry.match_literal {
TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(), TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
}) TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
.map(|regex| { },
// make sure all regex are anchored at the beginning of the input match match_entry.user_name {
format!("^({})", regex) MatchMapping::Terminal(_) => false,
}) MatchMapping::Skip => true,
.map(|regex_str| { },
// create a rust string with text of the regex; the Debug impl )
// will add quotes and escape })
format!("{:?}", regex_str) .map(|(regex, skip)| {
}) // make sure all regex are anchored at the beginning of the input
.collect() (format!("^({})", regex), skip)
}; })
.map(|(regex_str, skip)| {
// create a rust string with text of the regex; the Debug impl
// will add quotes and escape
(format!("{:?}", regex_str), skip)
});
rust!(out, "let {}strs: &[&str] = &[", prefix); let mut contains_skip = false;
for literal in &regex_strings {
rust!(out, "{},", literal); rust!(out, "let {}strs: &[(&str, bool)] = &[", prefix);
for (literal, skip) in regex_strings {
rust!(out, "({}, {}),", literal, skip);
contains_skip |= skip;
} }
if !contains_skip {
rust!(out, r#"(r"^(\s*)", true),"#);
}
rust!(out, "];"); rust!(out, "];");
rust!( rust!(
out, out,
"{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs).unwrap()", "{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs.iter().copied()).unwrap()",
p = prefix p = prefix
); );

View File

@ -5,8 +5,8 @@ use collections::{map, Map};
use grammar::consts::CFG; use grammar::consts::CFG;
use grammar::parse_tree as pt; use grammar::parse_tree as pt;
use grammar::parse_tree::{ use grammar::parse_tree::{
read_algorithm, GrammarItem, InternToken, Lifetime, Name, NonterminalString, Path, read_algorithm, GrammarItem, InternToken, Lifetime, MatchMapping, Name, NonterminalString,
TerminalString, Path, TerminalString,
}; };
use grammar::pattern::{Pattern, PatternKind}; use grammar::pattern::{Pattern, PatternKind};
use grammar::repr as r; use grammar::repr as r;
@ -79,26 +79,29 @@ impl<'s> LowerState<'s> {
})), })),
}; };
self.conversions self.conversions
.extend(data.match_entries.iter().enumerate().map( .extend(data.match_entries.iter().enumerate().filter_map(
|(index, match_entry)| { |(index, match_entry)| match &match_entry.user_name {
let pattern = Pattern { MatchMapping::Terminal(user_name) => {
span, let pattern = Pattern {
kind: PatternKind::TupleStruct( span,
internal_token_path.clone(), kind: PatternKind::TupleStruct(
vec![ internal_token_path.clone(),
Pattern { vec![
span, Pattern {
kind: PatternKind::Usize(index), span,
}, kind: PatternKind::Usize(index),
Pattern { },
span, Pattern {
kind: PatternKind::Choose(input_str.clone()), span,
}, kind: PatternKind::Choose(input_str.clone()),
], },
), ],
}; ),
};
(match_entry.user_name.clone(), pattern) Some((user_name.clone(), pattern))
}
MatchMapping::Skip => None,
}, },
)); ));
self.intern_token = Some(data); self.intern_token = Some(data);

View File

@ -49,7 +49,7 @@ fn resolve_in_place(grammar: &mut Grammar) -> NormResult<()> {
.flat_map(|match_token| &match_token.contents) .flat_map(|match_token| &match_token.contents)
.flat_map(|match_contents| &match_contents.items) .flat_map(|match_contents| &match_contents.items)
.filter_map(|item| match *item { .filter_map(|item| match *item {
MatchItem::Mapped(_, TerminalString::Bare(ref id), _) => { MatchItem::Mapped(_, MatchMapping::Terminal(TerminalString::Bare(ref id)), _) => {
Some((item.span(), id.clone(), Def::Terminal)) Some((item.span(), id.clone(), Def::Terminal))
} }
_ => None, _ => None,

View File

@ -133,7 +133,7 @@ impl MatchBlock {
match_block.add_match_entry( match_block.add_match_entry(
precedence, precedence,
sym.clone(), sym.clone(),
TerminalString::Literal(sym.clone()), MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
span, span,
)?; )?;
} }
@ -162,7 +162,7 @@ impl MatchBlock {
&mut self, &mut self,
match_group_precedence: usize, match_group_precedence: usize,
sym: TerminalLiteral, sym: TerminalLiteral,
user_name: TerminalString, user_name: MatchMapping,
span: Span, span: Span,
) -> NormResult<()> { ) -> NormResult<()> {
if let Some(_old_span) = self.spans.insert(sym.clone(), span) { if let Some(_old_span) = self.spans.insert(sym.clone(), span) {
@ -170,7 +170,9 @@ impl MatchBlock {
} }
// NB: It's legal for multiple regex to produce same terminal. // NB: It's legal for multiple regex to produce same terminal.
self.match_user_names.insert(user_name.clone()); if let MatchMapping::Terminal(user_name) = &user_name {
self.match_user_names.insert(user_name.clone());
}
self.match_entries.push(MatchEntry { self.match_entries.push(MatchEntry {
precedence: match_group_precedence * 2 + sym.base_precedence(), precedence: match_group_precedence * 2 + sym.base_precedence(),
@ -203,7 +205,7 @@ impl MatchBlock {
self.match_entries.push(MatchEntry { self.match_entries.push(MatchEntry {
precedence: sym.base_precedence(), precedence: sym.base_precedence(),
match_literal: sym.clone(), match_literal: sym.clone(),
user_name: TerminalString::Literal(sym.clone()), user_name: MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
}); });
self.spans.insert(sym, span); self.spans.insert(sym, span);
@ -328,29 +330,26 @@ fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
// one of precedences, that are parallel with `literals`. // one of precedences, that are parallel with `literals`.
let mut regexs = Vec::with_capacity(match_entries.len()); let mut regexs = Vec::with_capacity(match_entries.len());
let mut precedences = Vec::with_capacity(match_entries.len()); let mut precedences = Vec::with_capacity(match_entries.len());
{ for match_entry in &match_entries {
for match_entry in &match_entries { precedences.push(Precedence(match_entry.precedence));
precedences.push(Precedence(match_entry.precedence)); match match_entry.match_literal {
match match_entry.match_literal { TerminalLiteral::Quoted(ref s) => {
TerminalLiteral::Quoted(ref s) => { regexs.push(re::parse_literal(&s));
regexs.push(re::parse_literal(&s)); }
} TerminalLiteral::Regex(ref s) => {
TerminalLiteral::Regex(ref s) => { match re::parse_regex(&s) {
match re::parse_regex(&s) { Ok(regex) => regexs.push(regex),
Ok(regex) => regexs.push(regex), Err(error) => {
Err(error) => { let literal_span = spans[&match_entry.match_literal];
let literal_span = spans[&match_entry.match_literal]; // FIXME -- take offset into account for
// FIXME -- take offset into account for // span; this requires knowing how many #
// span; this requires knowing how many # // the user used, which we do not track
// the user used, which we do not track return_err!(literal_span, "invalid regular expression: {}", error);
return_err!(literal_span, "invalid regular expression: {}", error);
}
} }
} }
} }
} }
Ok(()) }
}?;
let dfa = match dfa::build_dfa(&regexs, &precedences) { let dfa = match dfa::build_dfa(&regexs, &precedences) {
Ok(dfa) => dfa, Ok(dfa) => dfa,

View File

@ -3,8 +3,8 @@ use super::{NormError, NormResult};
use grammar::consts::{ERROR, LOCATION}; use grammar::consts::{ERROR, LOCATION};
use grammar::parse_tree::{ use grammar::parse_tree::{
ActionKind, Alternative, Grammar, GrammarItem, Lifetime, NonterminalData, NonterminalString, ActionKind, Alternative, Grammar, GrammarItem, Lifetime, MatchMapping, NonterminalData,
Path, Span, SymbolKind, TypeParameter, TypeRef, NonterminalString, Path, Span, SymbolKind, TypeParameter, TypeRef,
}; };
use grammar::repr::{NominalTypeRepr, TypeRepr, Types}; use grammar::repr::{NominalTypeRepr, TypeRepr, Types};
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
@ -96,7 +96,9 @@ impl<'grammar> TypeInferencer<'grammar> {
let mut types = Types::new(&grammar.prefix, Some(loc_type), error_type, enum_type); let mut types = Types::new(&grammar.prefix, Some(loc_type), error_type, enum_type);
for match_entry in &intern_token.match_entries { for match_entry in &intern_token.match_entries {
types.add_term_type(match_entry.user_name.clone(), input_str.clone()); if let MatchMapping::Terminal(user_name) = &match_entry.user_name {
types.add_term_type(user_name.clone(), input_str.clone());
}
} }
types types

View File

@ -319,7 +319,10 @@ MatchItem: MatchItem = {
MatchSymbol = QuotedLiteral; MatchSymbol = QuotedLiteral;
MatchMapping = Terminal; MatchMapping: MatchMapping = {
Terminal => MatchMapping::Terminal(<>),
"{" "}" => MatchMapping::Skip,
};
EnumToken: EnumToken = EnumToken: EnumToken =
"enum" <lo:@L> <t:TypeRef> <hi:@R> "{" "enum" <lo:@L> <t:TypeRef> <hi:@R> "{"

File diff suppressed because it is too large Load Diff

View File

@ -22,7 +22,7 @@ mod test;
pub enum Top { pub enum Top {
Grammar(Grammar), Grammar(Grammar),
Pattern(Pattern<TypeRef>), Pattern(Pattern<TypeRef>),
MatchMapping(TerminalString), MatchMapping(MatchMapping),
TypeRef(TypeRef), TypeRef(TypeRef),
GrammarWhereClauses(Vec<WhereClause<TypeRef>>), GrammarWhereClauses(Vec<WhereClause<TypeRef>>),
} }