Merge pull request #509 from Marwes/lexer_comment

feat: Allow the tokenizer to contain custom skip regexes/literals
Markus Westerlind 2020-03-03 21:03:21 +01:00 committed by GitHub
commit 723678f364
14 changed files with 7250 additions and 7065 deletions


@@ -2,6 +2,21 @@ use std::str::FromStr;
 grammar;
+match {
+    "+",
+    "-",
+    "*",
+    "/",
+    "(",
+    ")",
+    r"[0-9]+",
+    // Skip whitespace and comments
+    r"\s*" => { },
+    r"//[^\n\r]*[\n\r]*" => { }, // `// comment`
+    r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { }, // `/* comment */`
+}
 pub Expr: i32 = {
     <l:Expr> "+" <r:Factor> => l + r,
     <l:Expr> "-" <r:Factor> => l - r,


@@ -209,13 +209,13 @@ match {
} else {
r"\w+",
_
}
}
```
Here the match contains two levels; each level can have more than one
item in it. The top-level contains only `r"[0-9]+"`, which means that this
regular expression is given highest priority. The next level contains
`r"\w+"`, so that will match afterwards.
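For reference, the full two-level declaration being discussed reads as follows (reassembled here from the surrounding context, since the diff shows only its tail):

```
match {
    r"[0-9]+"
} else {
    r"\w+",
    _
}
```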
The final `_` indicates that other string literals and regular
expressions that appear elsewhere in the grammar (e.g., `"("` or
@@ -240,7 +240,7 @@ fn calculator2b() {
let result = calculator2b::TermParser::new().parse("(foo33)").unwrap();
assert_eq!(result, "Id(foo33)");
// This one will fail:
let result = calculator2b::TermParser::new().parse("(22)").unwrap();
@@ -262,7 +262,7 @@ match {
} else {
r"\w+",
_
}
}
```
This raises the interesting question of what the precedence is **within**
@@ -280,7 +280,7 @@ There is one final twist before we reach the
can also use `match` declarations to give names to regular
expressions, so that we don't have to type them directly in our
grammar. For example, maybe instead of writing `r"\w+"`, we would
prefer to write `ID`. We could do that by modifying the match declaration like
so:
```
@@ -321,6 +321,20 @@ match {
And now any reference in your grammar to `"BEGIN"` will actually match
any capitalization.
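The match declaration elided by the diff above presumably looks something like this sketch, using the regex crate's `(?i)` case-insensitivity flag (the exact patterns are an assumption):

```
match {
    r"(?i)begin" => "BEGIN",
} else {
    _
}
```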
#### Customizing skipping between tokens
If we want to support comments, we will need to skip more than just whitespace in our lexer.
To this end, `ignore patterns` can be specified.
```
match {
r"\s*" => { }, // The default whitespace skipping is disabled an `ignore pattern` is specified
r"//[^\n\r]*[\n\r]*" => { }, // Skip `// comments`
r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { }, // Skip `/* comments */`
}
```
[lexer tutorial]: index.md
[calculator2b]: ../../calculator/src/calculator2b.lalrpop
[calculator3]: ../../calculator/src/calculator3.lalrpop


@@ -0,0 +1,12 @@
+grammar;
+match {
+    r"[0-9]+" => NUM,
+    r"\s*" => { },
+    r"//[^\n\r]*[\n\r]*" => { },
+    r"/\*([^\*]*\*+[^\*/])*([^\*]*\*+|[^\*])*\*/" => { },
+}
+pub(crate) Term: Vec<&'input str> = {
+    <NUM*>,
+};


@@ -143,6 +143,8 @@ lalrpop_mod!(
     dyn_argument
 );
+lalrpop_mod!(comments);
+
 pub fn use_cfg_created_parser() {
     cfg::CreatedParser::new();
 }
@@ -996,3 +998,22 @@ fn verify_lalrpop_generates_itself() {
         Use ./snap.sh to generate a new snapshot of the lrgrammar",
     );
 }
+
+#[test]
+fn comments() {
+    assert_eq!(
+        comments::TermParser::new().parse("22 3 5 13").unwrap(),
+        vec!["22", "3", "5", "13"]
+    );
+    assert_eq!(
+        comments::TermParser::new()
+            .parse(
+                "22 /* 123 */ 3 5
+        // abc
+          13 // "
+            )
+            .unwrap(),
+        vec!["22", "3", "5", "13"]
+    );
+}


@@ -10,22 +10,29 @@ impl<'a> fmt::Display for Token<'a> {
     }
 }
+struct RegexEntry {
+    regex: regex::Regex,
+    skip: bool,
+}
+
 pub struct MatcherBuilder {
     regex_set: regex::RegexSet,
-    regex_vec: Vec<regex::Regex>,
+    regex_vec: Vec<RegexEntry>,
 }
 impl MatcherBuilder {
-    pub fn new<S>(exprs: impl IntoIterator<Item = S>) -> Result<MatcherBuilder, regex::Error>
+    pub fn new<S>(
+        exprs: impl IntoIterator<Item = (S, bool)>,
+    ) -> Result<MatcherBuilder, regex::Error>
     where
         S: AsRef<str>,
     {
         let exprs = exprs.into_iter();
         let mut regex_vec = Vec::with_capacity(exprs.size_hint().0);
         let mut first_error = None;
-        let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, s| {
+        let regex_set_result = regex::RegexSet::new(exprs.scan((), |_, (s, skip)| {
             regex_vec.push(match regex::Regex::new(s.as_ref()) {
-                Ok(regex) => regex,
+                Ok(regex) => RegexEntry { regex, skip },
                 Err(err) => {
                     first_error = Some(err);
                     return None;
@@ -62,7 +69,7 @@ pub struct Matcher<'input, 'builder, E> {
     text: &'input str,
     consumed: usize,
     regex_set: &'builder regex::RegexSet,
-    regex_vec: &'builder Vec<regex::Regex>,
+    regex_vec: &'builder Vec<RegexEntry>,
     _marker: PhantomData<fn() -> E>,
 }
@@ -70,36 +77,52 @@ impl<'input, 'builder, E> Iterator for Matcher<'input, 'builder, E> {
     type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>;
     fn next(&mut self) -> Option<Self::Item> {
-        let text = self.text.trim_start();
-        let whitespace = self.text.len() - text.len();
-        let start_offset = self.consumed + whitespace;
-        if text.is_empty() {
-            self.text = text;
-            self.consumed = start_offset;
-            None
-        } else {
-            let matches = self.regex_set.matches(text);
-            if !matches.matched_any() {
-                Some(Err(ParseError::InvalidToken {
-                    location: start_offset,
-                }))
-            } else {
-                let mut longest_match = 0;
-                let mut index = 0;
-                for i in matches.iter() {
-                    let match_ = self.regex_vec[i].find(text).unwrap();
-                    let len = match_.end();
-                    if len >= longest_match {
-                        longest_match = len;
-                        index = i;
-                    }
-                }
-                let result = &text[..longest_match];
-                let remaining = &text[longest_match..];
-                let end_offset = start_offset + longest_match;
-                self.text = remaining;
-                self.consumed = end_offset;
-                Some(Ok((start_offset, Token(index, result), end_offset)))
+        loop {
+            let text = self.text;
+            let start_offset = self.consumed;
+            if text.is_empty() {
+                self.consumed = start_offset;
+                return None;
+            } else {
+                let matches = self.regex_set.matches(text);
+                if !matches.matched_any() {
+                    return Some(Err(ParseError::InvalidToken {
+                        location: start_offset,
+                    }));
+                } else {
+                    let mut longest_match = 0;
+                    let mut index = 0;
+                    let mut skip = false;
+                    for i in matches.iter() {
+                        let entry = &self.regex_vec[i];
+                        let match_ = entry.regex.find(text).unwrap();
+                        let len = match_.end();
+                        if len >= longest_match {
+                            longest_match = len;
+                            index = i;
+                            skip = entry.skip;
+                        }
+                    }
+                    let result = &text[..longest_match];
+                    let remaining = &text[longest_match..];
+                    let end_offset = start_offset + longest_match;
+                    self.text = remaining;
+                    self.consumed = end_offset;
+                    // Skip this match (whitespace, a comment, ...) instead of
+                    // emitting a token. A zero-length skip match would never
+                    // consume input, so report it rather than loop forever.
+                    if skip {
+                        if longest_match == 0 {
+                            return Some(Err(ParseError::InvalidToken {
+                                location: start_offset,
+                            }));
+                        }
+                        continue;
+                    }
+                    return Some(Ok((start_offset, Token(index, result), end_offset)));
+                }
             }
         }
     }
 }


@@ -109,7 +109,29 @@ impl MatchItem {
 }
 pub type MatchSymbol = TerminalLiteral;
-pub type MatchMapping = TerminalString;
+#[derive(Clone, PartialEq, Eq, Ord, PartialOrd)]
+pub enum MatchMapping {
+    Terminal(TerminalString),
+    Skip,
+}
+impl Debug for MatchMapping {
+    fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
+        match self {
+            MatchMapping::Terminal(term) => write!(fmt, "{:?}", term),
+            MatchMapping::Skip => write!(fmt, "{{ }}"),
+        }
+    }
+}
+impl Display for MatchMapping {
+    fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
+        match self {
+            MatchMapping::Terminal(term) => write!(fmt, "{}", term),
+            MatchMapping::Skip => write!(fmt, "{{ }}"),
+        }
+    }
+}
 /// Intern tokens are not typed by the user: they are synthesized in
 /// the absence of an "extern" declaration with information about the
@@ -158,7 +180,7 @@ pub struct MatchEntry {
     /// NB: This field must go first, so that `PartialOrd` sorts by precedence first!
     pub precedence: usize,
     pub match_literal: TerminalLiteral,
-    pub user_name: TerminalString,
+    pub user_name: MatchMapping,
 }
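Downstream code that previously assumed every match entry names a terminal now has to branch on the two variants. A minimal sketch of the pattern (the helper function is illustrative, not part of this PR):

```rust
// Illustrative only: decide whether a match entry produces a token.
fn is_skipped(mapping: &MatchMapping) -> bool {
    match mapping {
        MatchMapping::Terminal(_) => false, // lexer emits a token under this name
        MatchMapping::Skip => true,         // lexer discards the matched text
    }
}
```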
#[derive(Clone, Debug, PartialEq, Eq)]


@@ -1,6 +1,6 @@
 //! Generates an iterator type `Matcher` that looks roughly like
-use grammar::parse_tree::InternToken;
+use grammar::parse_tree::{InternToken, MatchMapping};
 use grammar::repr::{Grammar, TerminalLiteral};
 use lexer::re;
 use rust::RustWrite;
@@ -25,35 +25,48 @@ pub fn compile<W: Write>(
     // create a vector of rust string literals with the text of each
     // regular expression
-    let regex_strings: Vec<String> = {
-        intern_token
-            .match_entries
-            .iter()
-            .map(|match_entry| match match_entry.match_literal {
-                TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
-                TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
-            })
-            .map(|regex| {
-                // make sure all regex are anchored at the beginning of the input
-                format!("^({})", regex)
-            })
-            .map(|regex_str| {
-                // create a rust string with text of the regex; the Debug impl
-                // will add quotes and escape
-                format!("{:?}", regex_str)
-            })
-            .collect()
-    };
+    let regex_strings = intern_token
+        .match_entries
+        .iter()
+        .map(|match_entry| {
+            (
+                match match_entry.match_literal {
+                    TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
+                    TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
+                },
+                match match_entry.user_name {
+                    MatchMapping::Terminal(_) => false,
+                    MatchMapping::Skip => true,
+                },
+            )
+        })
+        .map(|(regex, skip)| {
+            // make sure all regex are anchored at the beginning of the input
+            (format!("^({})", regex), skip)
+        })
+        .map(|(regex_str, skip)| {
+            // create a rust string with text of the regex; the Debug impl
+            // will add quotes and escape
+            (format!("{:?}", regex_str), skip)
+        });
-    rust!(out, "let {}strs: &[&str] = &[", prefix);
-    for literal in &regex_strings {
-        rust!(out, "{},", literal);
+    let mut contains_skip = false;
+    rust!(out, "let {}strs: &[(&str, bool)] = &[", prefix);
+    for (literal, skip) in regex_strings {
+        rust!(out, "({}, {}),", literal, skip);
+        contains_skip |= skip;
     }
+    if !contains_skip {
+        rust!(out, r#"(r"^(\s*)", true),"#);
+    }
     rust!(out, "];");
     rust!(
         out,
-        "{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs).unwrap()",
+        "{p}lalrpop_util::lexer::MatcherBuilder::new({p}strs.iter().copied()).unwrap()",
         p = prefix
     );
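For the `comments` grammar added above, the emitted initializer then looks roughly like this (a sketch: the `__` prefix, the `__lalrpop_util` alias, and the exact escaping are assumptions; the fallback `(r"^(\s*)", true)` entry is only emitted when a grammar declares no skip patterns of its own):

```rust
let __strs: &[(&str, bool)] = &[
    ("^([0-9]+)", false),               // NUM
    ("^(\\s*)", true),                  // skipped: whitespace
    ("^(//[^\\n\\r]*[\\n\\r]*)", true), // skipped: `// ...` line comments
];
__lalrpop_util::lexer::MatcherBuilder::new(__strs.iter().copied()).unwrap()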


@@ -5,8 +5,8 @@ use collections::{map, Map};
 use grammar::consts::CFG;
 use grammar::parse_tree as pt;
 use grammar::parse_tree::{
-    read_algorithm, GrammarItem, InternToken, Lifetime, Name, NonterminalString, Path,
-    TerminalString,
+    read_algorithm, GrammarItem, InternToken, Lifetime, MatchMapping, Name, NonterminalString,
+    Path, TerminalString,
 };
 use grammar::pattern::{Pattern, PatternKind};
 use grammar::repr as r;
@@ -79,26 +79,29 @@ impl<'s> LowerState<'s> {
             })),
         };
         self.conversions
-            .extend(data.match_entries.iter().enumerate().map(
-                |(index, match_entry)| {
-                    let pattern = Pattern {
-                        span,
-                        kind: PatternKind::TupleStruct(
-                            internal_token_path.clone(),
-                            vec![
-                                Pattern {
-                                    span,
-                                    kind: PatternKind::Usize(index),
-                                },
-                                Pattern {
-                                    span,
-                                    kind: PatternKind::Choose(input_str.clone()),
-                                },
-                            ],
-                        ),
-                    };
-                    (match_entry.user_name.clone(), pattern)
+            .extend(data.match_entries.iter().enumerate().filter_map(
+                |(index, match_entry)| match &match_entry.user_name {
+                    MatchMapping::Terminal(user_name) => {
+                        let pattern = Pattern {
+                            span,
+                            kind: PatternKind::TupleStruct(
+                                internal_token_path.clone(),
+                                vec![
+                                    Pattern {
+                                        span,
+                                        kind: PatternKind::Usize(index),
+                                    },
+                                    Pattern {
+                                        span,
+                                        kind: PatternKind::Choose(input_str.clone()),
+                                    },
+                                ],
+                            ),
+                        };
+                        Some((user_name.clone(), pattern))
+                    }
+                    MatchMapping::Skip => None,
                 },
             ));
         self.intern_token = Some(data);


@@ -49,7 +49,7 @@ fn resolve_in_place(grammar: &mut Grammar) -> NormResult<()> {
         .flat_map(|match_token| &match_token.contents)
         .flat_map(|match_contents| &match_contents.items)
         .filter_map(|item| match *item {
-            MatchItem::Mapped(_, TerminalString::Bare(ref id), _) => {
+            MatchItem::Mapped(_, MatchMapping::Terminal(TerminalString::Bare(ref id)), _) => {
                 Some((item.span(), id.clone(), Def::Terminal))
             }
             _ => None,


@@ -133,7 +133,7 @@ impl MatchBlock {
             match_block.add_match_entry(
                 precedence,
                 sym.clone(),
-                TerminalString::Literal(sym.clone()),
+                MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
                 span,
             )?;
         }
@@ -162,7 +162,7 @@ impl MatchBlock {
         &mut self,
         match_group_precedence: usize,
         sym: TerminalLiteral,
-        user_name: TerminalString,
+        user_name: MatchMapping,
         span: Span,
     ) -> NormResult<()> {
         if let Some(_old_span) = self.spans.insert(sym.clone(), span) {
@@ -170,7 +170,9 @@
         }
         // NB: It's legal for multiple regex to produce same terminal.
-        self.match_user_names.insert(user_name.clone());
+        if let MatchMapping::Terminal(user_name) = &user_name {
+            self.match_user_names.insert(user_name.clone());
+        }
         self.match_entries.push(MatchEntry {
             precedence: match_group_precedence * 2 + sym.base_precedence(),
@@ -203,7 +205,7 @@
         self.match_entries.push(MatchEntry {
             precedence: sym.base_precedence(),
             match_literal: sym.clone(),
-            user_name: TerminalString::Literal(sym.clone()),
+            user_name: MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
         });
         self.spans.insert(sym, span);
@@ -328,29 +330,26 @@ fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
     // one of precedences, that are parallel with `literals`.
     let mut regexs = Vec::with_capacity(match_entries.len());
     let mut precedences = Vec::with_capacity(match_entries.len());
-    {
-        for match_entry in &match_entries {
-            precedences.push(Precedence(match_entry.precedence));
-            match match_entry.match_literal {
-                TerminalLiteral::Quoted(ref s) => {
-                    regexs.push(re::parse_literal(&s));
-                }
-                TerminalLiteral::Regex(ref s) => {
-                    match re::parse_regex(&s) {
-                        Ok(regex) => regexs.push(regex),
-                        Err(error) => {
-                            let literal_span = spans[&match_entry.match_literal];
-                            // FIXME -- take offset into account for
-                            // span; this requires knowing how many #
-                            // the user used, which we do not track
-                            return_err!(literal_span, "invalid regular expression: {}", error);
-                        }
-                    }
-                }
-            }
-        }
-        Ok(())
-    }?;
+    for match_entry in &match_entries {
+        precedences.push(Precedence(match_entry.precedence));
+        match match_entry.match_literal {
+            TerminalLiteral::Quoted(ref s) => {
+                regexs.push(re::parse_literal(&s));
+            }
+            TerminalLiteral::Regex(ref s) => {
+                match re::parse_regex(&s) {
+                    Ok(regex) => regexs.push(regex),
+                    Err(error) => {
+                        let literal_span = spans[&match_entry.match_literal];
+                        // FIXME -- take offset into account for
+                        // span; this requires knowing how many #
+                        // the user used, which we do not track
+                        return_err!(literal_span, "invalid regular expression: {}", error);
+                    }
+                }
+            }
+        }
+    }
 let dfa = match dfa::build_dfa(&regexs, &precedences) {
     Ok(dfa) => dfa,


@@ -3,8 +3,8 @@ use super::{NormError, NormResult};
 use grammar::consts::{ERROR, LOCATION};
 use grammar::parse_tree::{
-    ActionKind, Alternative, Grammar, GrammarItem, Lifetime, NonterminalData, NonterminalString,
-    Path, Span, SymbolKind, TypeParameter, TypeRef,
+    ActionKind, Alternative, Grammar, GrammarItem, Lifetime, MatchMapping, NonterminalData,
+    NonterminalString, Path, Span, SymbolKind, TypeParameter, TypeRef,
 };
 use grammar::repr::{NominalTypeRepr, TypeRepr, Types};
 use std::collections::{HashMap, HashSet};
@@ -96,7 +96,9 @@ impl<'grammar> TypeInferencer<'grammar> {
         let mut types = Types::new(&grammar.prefix, Some(loc_type), error_type, enum_type);
         for match_entry in &intern_token.match_entries {
-            types.add_term_type(match_entry.user_name.clone(), input_str.clone());
+            if let MatchMapping::Terminal(user_name) = &match_entry.user_name {
+                types.add_term_type(user_name.clone(), input_str.clone());
+            }
         }
         types


@@ -319,7 +319,10 @@ MatchItem: MatchItem = {
 MatchSymbol = QuotedLiteral;
-MatchMapping = Terminal;
+MatchMapping: MatchMapping = {
+    Terminal => MatchMapping::Terminal(<>),
+    "{" "}" => MatchMapping::Skip,
+};
 EnumToken: EnumToken =
     "enum" <lo:@L> <t:TypeRef> <hi:@R> "{"

File diff suppressed because it is too large.


@@ -22,7 +22,7 @@ mod test;
 pub enum Top {
     Grammar(Grammar),
     Pattern(Pattern<TypeRef>),
-    MatchMapping(TerminalString),
+    MatchMapping(MatchMapping),
     TypeRef(TypeRef),
     GrammarWhereClauses(Vec<WhereClause<TypeRef>>),
 }