From 1d4c5d3efc4e13fab9437faf5287260d834e6491 Mon Sep 17 00:00:00 2001 From: Niko Matsakis Date: Thu, 23 Jul 2015 08:50:12 -0400 Subject: [PATCH] separate out pre/post-validation in order to properly check for terminals without defined conversions; also, eliminate the default conversion --- lalrpop/src/grammar/parse_tree.rs | 8 -- lalrpop/src/grammar/repr.rs | 22 +---- lalrpop/src/normalize/mod.rs | 18 +++- lalrpop/src/normalize/postvalidate/mod.rs | 98 +++++++++++++++++++ lalrpop/src/normalize/postvalidate/test.rs | 36 +++++++ .../{validate => prevalidate}/mod.rs | 51 ++-------- .../{validate => prevalidate}/test.rs | 7 ++ lalrpop/src/tok/mod.rs | 37 ++++--- lalrpop/src/tok/test.rs | 8 ++ 9 files changed, 199 insertions(+), 86 deletions(-) create mode 100644 lalrpop/src/normalize/postvalidate/mod.rs create mode 100644 lalrpop/src/normalize/postvalidate/test.rs rename lalrpop/src/normalize/{validate => prevalidate}/mod.rs (87%) rename lalrpop/src/normalize/{validate => prevalidate}/test.rs (93%) diff --git a/lalrpop/src/grammar/parse_tree.rs b/lalrpop/src/grammar/parse_tree.rs index ba93b6d..a38c2e8 100644 --- a/lalrpop/src/grammar/parse_tree.rs +++ b/lalrpop/src/grammar/parse_tree.rs @@ -9,7 +9,6 @@ use intern::{intern, InternedString}; use grammar::repr::{NominalTypeRepr, TypeRepr}; use grammar::pattern::Pattern; use std::fmt::{Debug, Display, Formatter, Error}; -use std::iter::once; use util::Sep; #[derive(Clone, Debug, PartialEq, Eq)] @@ -482,13 +481,6 @@ impl Path { None } } - - pub fn append(&self, id: InternedString) -> Path { - Path { - absolute: self.absolute, - ids: self.ids.iter().cloned().chain(once(id)).collect() - } - } } impl ActionKind { diff --git a/lalrpop/src/grammar/repr.rs b/lalrpop/src/grammar/repr.rs index e0c46e7..1af3752 100644 --- a/lalrpop/src/grammar/repr.rs +++ b/lalrpop/src/grammar/repr.rs @@ -5,7 +5,7 @@ */ use intern::{InternedString}; -use grammar::pattern::{Pattern, PatternKind}; +use grammar::pattern::{Pattern}; use std::fmt::{Debug, Display, Formatter, Error}; use util::{map, Map, Sep}; @@ -291,24 +291,8 @@ impl ActionFnDefn { } impl Grammar { - pub fn default_pattern(&self, id: InternedString) -> Pattern { - let path = self.types.terminal_enum_type().path.append(id); - Pattern { - span: self.token_span, - kind: PatternKind::Enum(path, vec![ - Pattern { - span: self.token_span, - kind: PatternKind::DotDot - } - ]) - } - } - - pub fn pattern(&self, t: TerminalString) -> Pattern { - match self.conversions.get(&t).cloned() { - Some(p) => p, - None => self.default_pattern(t.0), - } + pub fn pattern(&self, t: TerminalString) -> &Pattern { + &self.conversions[&t] } pub fn productions_for(&self, nonterminal: NonterminalString) -> &[Production] { diff --git a/lalrpop/src/normalize/mod.rs b/lalrpop/src/normalize/mod.rs index f580edc..d581578 100644 --- a/lalrpop/src/normalize/mod.rs +++ b/lalrpop/src/normalize/mod.rs @@ -25,21 +25,27 @@ macro_rules! return_err { } pub fn normalize(grammar: pt::Grammar) -> NormResult { - try!(validate::validate(&grammar)); - normalize_without_validating(grammar) + normalize_helper(grammar, true) } /// for unit tests, it is convenient to skip the validation step +#[cfg(test)] pub fn normalize_without_validating(grammar: pt::Grammar) -> NormResult { + normalize_helper(grammar, false) +} + +fn normalize_helper(grammar: pt::Grammar, validate: bool) -> NormResult { + if validate { try!(prevalidate::validate(&grammar)); } let grammar = try!(macro_expand::expand_macros(grammar)); + if validate { try!(postvalidate::validate(&grammar)); } let types = try!(tyinfer::infer_types(&grammar)); lower::lower(grammar, types) } // These are executed *IN ORDER*: -// Check some basic safety conditions. -mod validate; +// Check most safety conditions. +mod prevalidate; // Expands macros and expressions // @@ -56,6 +62,10 @@ mod validate; // may occur. mod macro_expand; +// Check some safety conditions that can only be tested +// after macro expansion. +mod postvalidate; + // Computes types where the user omitted them (or from macro // byproducts). // diff --git a/lalrpop/src/normalize/postvalidate/mod.rs b/lalrpop/src/normalize/postvalidate/mod.rs new file mode 100644 index 0000000..21c5bc3 --- /dev/null +++ b/lalrpop/src/normalize/postvalidate/mod.rs @@ -0,0 +1,98 @@ +//! Validate that terminal IDs have conversions. This can only +//! be done after macro expansion because some macro arguments +//! never make it into an actual production and are only used +//! in `if` conditions; we use string literals for those, +//! but they do not have to have a defined conversion. + +use super::{NormResult, NormError}; + +use grammar::parse_tree::*; +use util::{Set}; + +#[cfg(test)] +mod test; + +pub fn validate(grammar: &Grammar) -> NormResult<()> { + let conversions: Set<_> = + grammar.items + .iter() + .filter_map(|item| item.as_extern_token()) + .flat_map(|tt| tt.enum_token.conversions.iter().map(|conversion| conversion.from)) + .collect(); + + let validator = Validator { + grammar: grammar, + conversions: conversions, + }; + + validator.validate() +} + +struct Validator<'grammar> { + grammar: &'grammar Grammar, + conversions: Set, +} + +impl<'grammar> Validator<'grammar> { + fn validate(&self) -> NormResult<()> { + for item in &self.grammar.items { + match *item { + GrammarItem::Use(..) => { } + GrammarItem::ExternToken(_) => { } + GrammarItem::Nonterminal(ref data) => { + for alternative in &data.alternatives { + try!(self.validate_alternative(alternative)); + } + } + } + } + Ok(()) + } + + fn validate_alternative(&self, alternative: &Alternative) -> NormResult<()> { + assert!(alternative.condition.is_none()); // macro expansion should have removed these + try!(self.validate_expr(&alternative.expr)); + Ok(()) + } + + fn validate_expr(&self, expr: &ExprSymbol) -> NormResult<()> { + for symbol in &expr.symbols { + try!(self.validate_symbol(symbol)); + } + Ok(()) + } + + fn validate_symbol(&self, symbol: &Symbol) -> NormResult<()> { + match symbol.kind { + SymbolKind::Expr(ref expr) => { + try!(self.validate_expr(expr)); + } + SymbolKind::Terminal(term) => { + try!(self.validate_terminal(symbol.span, term)); + } + SymbolKind::Nonterminal(_) => { + } + SymbolKind::Macro(..) => { + panic!("macro not removed: {:?}", symbol); + } + SymbolKind::Repeat(ref repeat) => { + try!(self.validate_symbol(&repeat.symbol)); + } + SymbolKind::Choose(ref sym) | SymbolKind::Name(_, ref sym) => { + try!(self.validate_symbol(sym)); + } + SymbolKind::Lookahead | SymbolKind::Lookbehind => { } + } + + Ok(()) + } + + fn validate_terminal(&self, span: Span, term: TerminalString) -> NormResult<()> { + if !self.conversions.contains(&term) { + return_err!(span, "terminal `{}` does not have a pattern defined for it", + term); + } + + Ok(()) + } +} diff --git a/lalrpop/src/normalize/postvalidate/test.rs b/lalrpop/src/normalize/postvalidate/test.rs new file mode 100644 index 0000000..e0ba470 --- /dev/null +++ b/lalrpop/src/normalize/postvalidate/test.rs @@ -0,0 +1,36 @@ +use parser; +use grammar::parse_tree::{Span}; +use regex::Regex; + +fn check_err(expected_err: &str, grammar: &str) { + let expected_err = Regex::new(expected_err).unwrap(); + + // the string will have a `>>>` and `<<<` in it, which serve to + // indicate the span where an error is expected. + let start_index = grammar.find(">>>").unwrap(); + let grammar = grammar.replace(">>>", ""); // remove the `>>>` marker + let end_index = grammar.rfind("<<<").unwrap(); + let grammar = grammar.replace("<<<", ""); + + assert!(start_index <= end_index); + + let parsed_grammar = parser::parse_grammar(&grammar).unwrap(); + match super::validate(&parsed_grammar) { + Ok(()) => { + panic!("expected error for grammar"); + } + Err(err) => { + assert_eq!(err.span, Span(start_index, end_index)); + + assert!(expected_err.is_match(&err.message), + "unexpected error text `{}`, did not match `{}`", err.message, expected_err); + } + } +} + +#[test] +fn unknown_terminal() { + check_err( + r#"terminal `"\+"` does not have a pattern defined for it"#, + r#"grammar; X = X >>>"+"<<<;"#); +} diff --git a/lalrpop/src/normalize/validate/mod.rs b/lalrpop/src/normalize/prevalidate/mod.rs similarity index 87% rename from lalrpop/src/normalize/validate/mod.rs rename to lalrpop/src/normalize/prevalidate/mod.rs index f4e79f7..e54a855 100644 --- a/lalrpop/src/normalize/validate/mod.rs +++ b/lalrpop/src/normalize/prevalidate/mod.rs @@ -4,17 +4,12 @@ use super::{NormResult, NormError}; use super::norm_util::{self, Symbols}; use grammar::parse_tree::*; -use intern::{intern, read, InternedString}; -use regex::Regex; -use util::{Map, Multimap, Sep, set, Set}; +use intern::{intern, InternedString}; +use util::{Map, Multimap, Sep, set}; #[cfg(test)] mod test; -thread_local! { - static IDENTIFIER_RE: Regex = Regex::new("^[a-zA-Z_][a-zA-Z0-9_]*$").unwrap() -} - pub fn validate(grammar: &Grammar) -> NormResult<()> { let globals = ScopeChain { previous: None, @@ -31,18 +26,10 @@ pub fn validate(grammar: &Grammar) -> NormResult<()> { .filter_map(|item| item.as_extern_token()) .next(); - let conversions: Set<_> = - grammar.items - .iter() - .filter_map(|item| item.as_extern_token()) - .flat_map(|tt| tt.enum_token.conversions.iter().map(|conversion| conversion.from)) - .collect(); - let validator = Validator { grammar: grammar, globals: globals, extern_token: extern_token, - conversions: conversions, }; validator.validate() @@ -52,7 +39,6 @@ struct Validator<'grammar> { grammar: &'grammar Grammar, extern_token: Option<&'grammar ExternToken>, globals: ScopeChain<'grammar>, - conversions: Set, } #[derive(Copy, Clone, Debug)] @@ -221,8 +207,8 @@ impl<'grammar> Validator<'grammar> { SymbolKind::Expr(ref expr) => { try!(self.validate_expr(scope, expr)); } - SymbolKind::Terminal(term) => { - try!(self.validate_terminal(symbol.span, term)); + SymbolKind::Terminal(_) => { + /* see postvalidate! */ } SymbolKind::Nonterminal(nt) => { try!(self.validate_nt(scope, symbol.span, nt)); @@ -245,6 +231,10 @@ impl<'grammar> Validator<'grammar> { } } } + + for arg in &msym.args { + try!(self.validate_symbol(scope, arg)); + } } SymbolKind::Repeat(ref repeat) => { try!(self.validate_symbol(scope, &repeat.symbol)); @@ -274,31 +264,6 @@ impl<'grammar> Validator<'grammar> { Ok(()) } - fn validate_terminal(&self, - span: Span, - term: TerminalString) - -> NormResult<()> { - // if this is a valid Rust identifier, then the - // terminal is accepted - if self.is_identifier(term.0) { - return Ok(()); - } - - // otherwise, a remapping must have been defined - if self.conversions.contains(&term) { - return Ok(()); - } - - return_err!(span, "terminal `{}` is neither a valid Rust identifier \ - and nor does it have a defined conversion", term); - } - - fn is_identifier(&self, term: InternedString) -> bool { - IDENTIFIER_RE.with(|identifier_re| - read(|interner| - identifier_re.is_match(interner.data(term)))) - } - fn validate_nt(&self, scope: &ScopeChain, span: Span, diff --git a/lalrpop/src/normalize/validate/test.rs b/lalrpop/src/normalize/prevalidate/test.rs similarity index 93% rename from lalrpop/src/normalize/validate/test.rs rename to lalrpop/src/normalize/prevalidate/test.rs index 4570e38..f4e6a29 100644 --- a/lalrpop/src/normalize/validate/test.rs +++ b/lalrpop/src/normalize/prevalidate/test.rs @@ -35,6 +35,13 @@ fn unknown_nonterminal() { r#"grammar; X = X >>>Y<<<;"#); } +#[test] +fn unknown_nonterminal_in_macro_arg() { + check_err( + "no definition found for nonterminal `Y`", + r#"grammar; X = X Id<>>>Y<<<>; Id = T;"#); +} + #[test] fn repeated_macro_arg() { check_err( diff --git a/lalrpop/src/tok/mod.rs b/lalrpop/src/tok/mod.rs index 558b9d3..6cd6d7b 100644 --- a/lalrpop/src/tok/mod.rs +++ b/lalrpop/src/tok/mod.rs @@ -47,6 +47,7 @@ pub enum Tok<'input> { Equals, EqualsEquals, EqualsGreaterThanCode(&'input str), + EqualsGreaterThanQuestionCode(&'input str), EqualsGreaterThanLookahead, EqualsGreaterThanLookbehind, GreaterThan, @@ -113,33 +114,47 @@ impl<'input> Tokenizer<'input> { // we've seen =>, now we have to choose between: // // => code + // =>? code // =>@L // =>@R - let idx1 = match self.lookahead { + match self.lookahead { Some((_, '@')) => { match self.bump() { Some((idx2, 'L')) => { self.bump(); - return Ok((idx0, EqualsGreaterThanLookahead, idx2+1)); + Ok((idx0, EqualsGreaterThanLookahead, idx2+1)) } Some((idx2, 'R')) => { self.bump(); - return Ok((idx0, EqualsGreaterThanLookbehind, idx2+1)); + Ok((idx0, EqualsGreaterThanLookbehind, idx2+1)) } _ => { - return Err(UnrecognizedToken(idx0)); + Err(UnrecognizedToken(idx0)) } } } - None => { - return Err(UnterminatedCode(idx0)); + Some((idx1, '?')) => { + self.bump(); + let idx2 = try!(self.code(idx0)); + let code = &self.text[idx1+1..idx2]; + Ok((idx0, EqualsGreaterThanQuestionCode(code), idx2)) } - Some((idx1, _)) => { idx1 } - }; + Some((idx1, _)) => { + let idx2 = try!(self.code(idx0)); + let code = &self.text[idx1..idx2]; + Ok((idx0, EqualsGreaterThanCode(code), idx2)) + } + None => { + Err(UnterminatedCode(idx0)) + } + } + } + + fn code(&mut self, idx0: usize) -> Result { // This is the interesting case. To find the end of the code, // we have to scan ahead, matching (), [], and {}, and looking // for a suitable terminator: `,`, `;`, `]`, `}`, or `)`. @@ -163,8 +178,7 @@ impl<'input> Tokenizer<'input> { // terminator. The code is everything *up // to but not including* the terminating // `,`, `;`, etc. - let code = &self.text[idx1..]; - return Ok((idx0, EqualsGreaterThanCode(code), self.text.len())); + return Ok(self.text.len()); } Some((idx2, ';')) | @@ -176,8 +190,7 @@ impl<'input> Tokenizer<'input> { // terminator. The code is everything *up // to but not including* the terminating // `,`, `;`, etc. - let code = &self.text[idx1..idx2]; - return Ok((idx0, EqualsGreaterThanCode(code), idx2)); + return Ok(idx2); } None if balance > 0 => { diff --git a/lalrpop/src/tok/test.rs b/lalrpop/src/tok/test.rs index 9892093..571b9dd 100644 --- a/lalrpop/src/tok/test.rs +++ b/lalrpop/src/tok/test.rs @@ -55,6 +55,14 @@ fn code1() { ]); } +#[test] +fn code2() { + test("=>? a(b, c),", vec![ + ("~~~~~~~~~~~ ", EqualsGreaterThanQuestionCode(" a(b, c)")), + (" ~", Comma), + ]); +} + #[test] #[should_panic] fn code_forgot_comma() {