From 1d4c5d3efc4e13fab9437faf5287260d834e6491 Mon Sep 17 00:00:00 2001
From: Niko Matsakis <niko@alum.mit.edu>
Date: Thu, 23 Jul 2015 08:50:12 -0400
Subject: [PATCH] separate out pre/post-validation in order to properly check
 for terminals without defined conversions; also, eliminate the default
 conversion

---
 lalrpop/src/grammar/parse_tree.rs             |  8 --
 lalrpop/src/grammar/repr.rs                   | 22 +----
 lalrpop/src/normalize/mod.rs                  | 18 +++-
 lalrpop/src/normalize/postvalidate/mod.rs     | 98 +++++++++++++++++++
 lalrpop/src/normalize/postvalidate/test.rs    | 36 +++++++
 .../{validate => prevalidate}/mod.rs          | 51 ++--------
 .../{validate => prevalidate}/test.rs         |  7 ++
 lalrpop/src/tok/mod.rs                        | 37 ++++---
 lalrpop/src/tok/test.rs                       |  8 ++
 9 files changed, 199 insertions(+), 86 deletions(-)
 create mode 100644 lalrpop/src/normalize/postvalidate/mod.rs
 create mode 100644 lalrpop/src/normalize/postvalidate/test.rs
 rename lalrpop/src/normalize/{validate => prevalidate}/mod.rs (87%)
 rename lalrpop/src/normalize/{validate => prevalidate}/test.rs (93%)
diff --git a/lalrpop/src/grammar/parse_tree.rs b/lalrpop/src/grammar/parse_tree.rs
index ba93b6d..a38c2e8 100644
--- a/lalrpop/src/grammar/parse_tree.rs
+++ b/lalrpop/src/grammar/parse_tree.rs
@@ -9,7 +9,6 @@ use intern::{intern, InternedString};
 use grammar::repr::{NominalTypeRepr, TypeRepr};
 use grammar::pattern::Pattern;
 use std::fmt::{Debug, Display, Formatter, Error};
-use std::iter::once;
 use util::Sep;
 
 #[derive(Clone, Debug, PartialEq, Eq)]
@@ -482,13 +481,6 @@ impl Path {
             None
         }
     }
-
-    pub fn append(&self, id: InternedString) -> Path {
-        Path {
-            absolute: self.absolute,
-            ids: self.ids.iter().cloned().chain(once(id)).collect()
-        }
-    }
 }
 
 impl ActionKind {
diff --git a/lalrpop/src/grammar/repr.rs b/lalrpop/src/grammar/repr.rs
index e0c46e7..1af3752 100644
--- a/lalrpop/src/grammar/repr.rs
+++ b/lalrpop/src/grammar/repr.rs
@@ -5,7 +5,7 @@
  */
 
 use intern::{InternedString};
-use grammar::pattern::{Pattern, PatternKind};
+use grammar::pattern::{Pattern};
 use std::fmt::{Debug, Display, Formatter, Error};
 use util::{map, Map, Sep};
 
@@ -291,24 +291,8 @@ impl ActionFnDefn {
 }
 
 impl Grammar {
-    pub fn default_pattern(&self, id: InternedString) -> Pattern<TypeRepr> {
-        let path = self.types.terminal_enum_type().path.append(id);
-        Pattern {
-            span: self.token_span,
-            kind: PatternKind::Enum(path, vec![
-                Pattern {
-                    span: self.token_span,
-                    kind: PatternKind::DotDot
-                }
-            ])
-        }
-    }
-
-    pub fn pattern(&self, t: TerminalString) -> Pattern<TypeRepr> {
-        match self.conversions.get(&t).cloned() {
-            Some(p) => p,
-            None => self.default_pattern(t.0),
-        }
+    pub fn pattern(&self, t: TerminalString) -> &Pattern<TypeRepr> {
+        &self.conversions[&t]
     }
 
     pub fn productions_for(&self, nonterminal: NonterminalString) -> &[Production] {
diff --git a/lalrpop/src/normalize/mod.rs b/lalrpop/src/normalize/mod.rs
index f580edc..d581578 100644
--- a/lalrpop/src/normalize/mod.rs
+++ b/lalrpop/src/normalize/mod.rs
@@ -25,21 +25,27 @@ macro_rules! return_err {
 }
 
 pub fn normalize(grammar: pt::Grammar) -> NormResult<r::Grammar> {
-    try!(validate::validate(&grammar));
-    normalize_without_validating(grammar)
+    normalize_helper(grammar, true)
 }
 
 /// for unit tests, it is convenient to skip the validation step
+#[cfg(test)]
 pub fn normalize_without_validating(grammar: pt::Grammar) -> NormResult<r::Grammar> {
+    normalize_helper(grammar, false)
+}
+
+fn normalize_helper(grammar: pt::Grammar, validate: bool) -> NormResult<r::Grammar> {
+    if validate { try!(prevalidate::validate(&grammar)); }
     let grammar = try!(macro_expand::expand_macros(grammar));
+    if validate { try!(postvalidate::validate(&grammar)); }
     let types = try!(tyinfer::infer_types(&grammar));
     lower::lower(grammar, types)
 }
 
 // These are executed *IN ORDER*:
 
-// Check some basic safety conditions.
-mod validate;
+// Check most safety conditions.
+mod prevalidate;
 
 // Expands macros and expressions
 //
@@ -56,6 +62,10 @@ mod validate;
 // may occur.
 mod macro_expand;
 
+// Check some safety conditions that can only be tested
+// after macro expansion.
+mod postvalidate;
+
 // Computes types where the user omitted them (or from macro
 // byproducts).
 //
diff --git a/lalrpop/src/normalize/postvalidate/mod.rs b/lalrpop/src/normalize/postvalidate/mod.rs
new file mode 100644
index 0000000..21c5bc3
--- /dev/null
+++ b/lalrpop/src/normalize/postvalidate/mod.rs
@@ -0,0 +1,98 @@
+//! Validate that terminal IDs have conversions. This can only
+//! be done after macro expansion because some macro arguments
+//! never make it into an actual production and are only used
+//! in `if` conditions; we use string literals for those,
+//! but they do not have to have a defined conversion. 
+
+use super::{NormResult, NormError};
+
+use grammar::parse_tree::*;
+use util::{Set};
+
+#[cfg(test)]
+mod test;
+
+pub fn validate(grammar: &Grammar) -> NormResult<()> {
+    let conversions: Set<_> =
+        grammar.items
+               .iter()
+               .filter_map(|item| item.as_extern_token())
+               .flat_map(|tt| tt.enum_token.conversions.iter().map(|conversion| conversion.from))
+               .collect();
+
+    let validator = Validator {
+        grammar: grammar,
+        conversions: conversions,
+    };
+
+    validator.validate()
+}
+
+struct Validator<'grammar> {
+    grammar: &'grammar Grammar,
+    conversions: Set<TerminalString>,
+}
+
+impl<'grammar> Validator<'grammar> {
+    fn validate(&self) -> NormResult<()> {
+        for item in &self.grammar.items {
+            match *item {
+                GrammarItem::Use(..) => { }
+                GrammarItem::ExternToken(_) => { }
+                GrammarItem::Nonterminal(ref data) => {
+                    for alternative in &data.alternatives {
+                        try!(self.validate_alternative(alternative));
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn validate_alternative(&self, alternative: &Alternative) -> NormResult<()> {
+        assert!(alternative.condition.is_none()); // macro expansion should have removed these
+        try!(self.validate_expr(&alternative.expr));
+        Ok(())
+    }
+
+    fn validate_expr(&self, expr: &ExprSymbol) -> NormResult<()> {
+        for symbol in &expr.symbols {
+            try!(self.validate_symbol(symbol));
+        }
+        Ok(())
+    }
+
+    fn validate_symbol(&self, symbol: &Symbol) -> NormResult<()> {
+        match symbol.kind {
+            SymbolKind::Expr(ref expr) => {
+                try!(self.validate_expr(expr));
+            }
+            SymbolKind::Terminal(term) => {
+                try!(self.validate_terminal(symbol.span, term));
+            }
+            SymbolKind::Nonterminal(_) => {
+            }
+            SymbolKind::Macro(..) => {
+                panic!("macro not removed: {:?}", symbol);
+            }
+            SymbolKind::Repeat(ref repeat) => {
+                try!(self.validate_symbol(&repeat.symbol));
+            }
+            SymbolKind::Choose(ref sym) | SymbolKind::Name(_, ref sym) => {
+                try!(self.validate_symbol(sym));
+            }
+            SymbolKind::Lookahead | SymbolKind::Lookbehind => { }
+        }
+
+        Ok(())
+    }
+
+    fn validate_terminal(&self, span: Span, term: TerminalString) -> NormResult<()> {
+        if !self.conversions.contains(&term) {
+            return_err!(span, "terminal `{}` does not have a pattern defined for it",
+                        term);
+        }
+
+        Ok(())
+    }
+}
diff --git a/lalrpop/src/normalize/postvalidate/test.rs b/lalrpop/src/normalize/postvalidate/test.rs
new file mode 100644
index 0000000..e0ba470
--- /dev/null
+++ b/lalrpop/src/normalize/postvalidate/test.rs
@@ -0,0 +1,36 @@
+use parser;
+use grammar::parse_tree::{Span};
+use regex::Regex;
+
+fn check_err(expected_err: &str, grammar: &str) {
+    let expected_err = Regex::new(expected_err).unwrap();
+
+    // the string will have a `>>>` and `<<<` in it, which serve to
+    // indicate the span where an error is expected.
+    let start_index = grammar.find(">>>").unwrap();
+    let grammar = grammar.replace(">>>", ""); // remove the `>>>` marker
+    let end_index = grammar.rfind("<<<").unwrap();
+    let grammar = grammar.replace("<<<", "");
+
+    assert!(start_index <= end_index);
+
+    let parsed_grammar = parser::parse_grammar(&grammar).unwrap();
+    match super::validate(&parsed_grammar) {
+        Ok(()) => {
+            panic!("expected error for grammar");
+        }
+        Err(err) => {
+            assert_eq!(err.span, Span(start_index, end_index));
+
+            assert!(expected_err.is_match(&err.message),
+                    "unexpected error text `{}`, did not match `{}`", err.message, expected_err);
+        }
+    }
+}
+
+#[test]
+fn unknown_terminal() {
+    check_err(
+        r#"terminal `"\+"` does not have a pattern defined for it"#,
+        r#"grammar; X = X >>>"+"<<<;"#);
+}
diff --git a/lalrpop/src/normalize/validate/mod.rs b/lalrpop/src/normalize/prevalidate/mod.rs
similarity index 87%
rename from lalrpop/src/normalize/validate/mod.rs
rename to lalrpop/src/normalize/prevalidate/mod.rs
index f4e79f7..e54a855 100644
--- a/lalrpop/src/normalize/validate/mod.rs
+++ b/lalrpop/src/normalize/prevalidate/mod.rs
@@ -4,17 +4,12 @@ use super::{NormResult, NormError};
 use super::norm_util::{self, Symbols};
 
 use grammar::parse_tree::*;
-use intern::{intern, read, InternedString};
-use regex::Regex;
-use util::{Map, Multimap, Sep, set, Set};
+use intern::{intern, InternedString};
+use util::{Map, Multimap, Sep, set};
 
 #[cfg(test)]
 mod test;
 
-thread_local! {
-    static IDENTIFIER_RE: Regex = Regex::new("^[a-zA-Z_][a-zA-Z0-9_]*$").unwrap()
-}
-
 pub fn validate(grammar: &Grammar) -> NormResult<()> {
     let globals = ScopeChain {
         previous: None,
@@ -31,18 +26,10 @@ pub fn validate(grammar: &Grammar) -> NormResult<()> {
                .filter_map(|item| item.as_extern_token())
                .next();
 
-    let conversions: Set<_> =
-        grammar.items
-               .iter()
-               .filter_map(|item| item.as_extern_token())
-               .flat_map(|tt| tt.enum_token.conversions.iter().map(|conversion| conversion.from))
-               .collect();
-
     let validator = Validator {
         grammar: grammar,
         globals: globals,
         extern_token: extern_token,
-        conversions: conversions,
     };
 
     validator.validate()
@@ -52,7 +39,6 @@ struct Validator<'grammar> {
     grammar: &'grammar Grammar,
     extern_token: Option<&'grammar ExternToken>,
     globals: ScopeChain<'grammar>,
-    conversions: Set<TerminalString>,
 }
 
 #[derive(Copy, Clone, Debug)]
@@ -221,8 +207,8 @@ impl<'grammar> Validator<'grammar> {
             SymbolKind::Expr(ref expr) => {
                 try!(self.validate_expr(scope, expr));
             }
-            SymbolKind::Terminal(term) => {
-                try!(self.validate_terminal(symbol.span, term));
+            SymbolKind::Terminal(_) => {
+                /* see postvalidate! */
             }
             SymbolKind::Nonterminal(nt) => {
                 try!(self.validate_nt(scope, symbol.span, nt));
@@ -245,6 +231,10 @@ impl<'grammar> Validator<'grammar> {
                         }
                     }
                 }
+
+                for arg in &msym.args {
+                    try!(self.validate_symbol(scope, arg));
+                }
             }
             SymbolKind::Repeat(ref repeat) => {
                 try!(self.validate_symbol(scope, &repeat.symbol));
@@ -274,31 +264,6 @@ impl<'grammar> Validator<'grammar> {
         Ok(())
     }
 
-    fn validate_terminal(&self,
-                         span: Span,
-                         term: TerminalString)
-                         -> NormResult<()> {
-        // if this is a valid Rust identifier, then the
-        // terminal is accepted
-        if self.is_identifier(term.0) {
-            return Ok(());
-        }
-
-        // otherwise, a remapping must have been defined
-        if self.conversions.contains(&term) {
-            return Ok(());
-        }
-
-        return_err!(span, "terminal `{}` is neither a valid Rust identifier \
-                           and nor does it have a defined conversion", term);
-    }
-
-    fn is_identifier(&self, term: InternedString) -> bool {
-        IDENTIFIER_RE.with(|identifier_re|
-                           read(|interner|
-                                identifier_re.is_match(interner.data(term))))
-    }
-
     fn validate_nt(&self,
                    scope: &ScopeChain,
                    span: Span,
diff --git a/lalrpop/src/normalize/validate/test.rs b/lalrpop/src/normalize/prevalidate/test.rs
similarity index 93%
rename from lalrpop/src/normalize/validate/test.rs
rename to lalrpop/src/normalize/prevalidate/test.rs
index 4570e38..f4e6a29 100644
--- a/lalrpop/src/normalize/validate/test.rs
+++ b/lalrpop/src/normalize/prevalidate/test.rs
@@ -35,6 +35,13 @@ fn unknown_nonterminal() {
         r#"grammar; X = X >>>Y<<<;"#);
 }
 
+#[test]
+fn unknown_nonterminal_in_macro_arg() {
+    check_err(
+        "no definition found for nonterminal `Y`",
+        r#"grammar; X = X Id<>>>Y<<<>; Id<T> = T;"#);
+}
+
 #[test]
 fn repeated_macro_arg() {
     check_err(
diff --git a/lalrpop/src/tok/mod.rs b/lalrpop/src/tok/mod.rs
index 558b9d3..6cd6d7b 100644
--- a/lalrpop/src/tok/mod.rs
+++ b/lalrpop/src/tok/mod.rs
@@ -47,6 +47,7 @@ pub enum Tok<'input> {
     Equals,
     EqualsEquals,
     EqualsGreaterThanCode(&'input str),
+    EqualsGreaterThanQuestionCode(&'input str),
     EqualsGreaterThanLookahead,
     EqualsGreaterThanLookbehind,
     GreaterThan,
@@ -113,33 +114,47 @@ impl<'input> Tokenizer<'input> {
         // we've seen =>, now we have to choose between:
         //
         // => code
+        // =>? code
         // =>@L
         // =>@R
 
-        let idx1 = match self.lookahead {
+        match self.lookahead {
             Some((_, '@')) => {
                 match self.bump() {
                     Some((idx2, 'L')) => {
                         self.bump();
-                        return Ok((idx0, EqualsGreaterThanLookahead, idx2+1));
+                        Ok((idx0, EqualsGreaterThanLookahead, idx2+1))
                     }
                     Some((idx2, 'R')) => {
                         self.bump();
-                        return Ok((idx0, EqualsGreaterThanLookbehind, idx2+1));
+                        Ok((idx0, EqualsGreaterThanLookbehind, idx2+1))
                     }
                     _ => {
-                        return Err(UnrecognizedToken(idx0));
+                        Err(UnrecognizedToken(idx0))
                     }
                 }
             }
 
-            None => {
-                return Err(UnterminatedCode(idx0));
+            Some((idx1, '?')) => {
+                self.bump();
+                let idx2 = try!(self.code(idx0));
+                let code = &self.text[idx1+1..idx2];
+                Ok((idx0, EqualsGreaterThanQuestionCode(code), idx2))
             }
 
-            Some((idx1, _)) => { idx1 }
-        };
+            Some((idx1, _)) => {
+                let idx2 = try!(self.code(idx0));
+                let code = &self.text[idx1..idx2];
+                Ok((idx0, EqualsGreaterThanCode(code), idx2))
+            }
 
+            None => {
+                Err(UnterminatedCode(idx0))
+            }
+        }
+    }
+
+    fn code(&mut self, idx0: usize) -> Result<usize, Error> {
         // This is the interesting case. To find the end of the code,
         // we have to scan ahead, matching (), [], and {}, and looking
         // for a suitable terminator: `,`, `;`, `]`, `}`, or `)`.
@@ -163,8 +178,7 @@ impl<'input> Tokenizer<'input> {
                     // terminator. The code is everything *up
                     // to but not including* the terminating
                     // `,`, `;`, etc.
-                    let code = &self.text[idx1..];
-                    return Ok((idx0, EqualsGreaterThanCode(code), self.text.len()));
+                    return Ok(self.text.len());
                 }
 
                 Some((idx2, ';')) |
@@ -176,8 +190,7 @@ impl<'input> Tokenizer<'input> {
                     // terminator. The code is everything *up
                     // to but not including* the terminating
                     // `,`, `;`, etc.
-                    let code = &self.text[idx1..idx2];
-                    return Ok((idx0, EqualsGreaterThanCode(code), idx2));
+                    return Ok(idx2);
                 }
 
                 None if balance > 0 => {
diff --git a/lalrpop/src/tok/test.rs b/lalrpop/src/tok/test.rs
index 9892093..571b9dd 100644
--- a/lalrpop/src/tok/test.rs
+++ b/lalrpop/src/tok/test.rs
@@ -55,6 +55,14 @@ fn code1() {
     ]);
 }
 
+#[test]
+fn code2() {
+    test("=>? a(b, c),", vec![
+        ("~~~~~~~~~~~ ", EqualsGreaterThanQuestionCode(" a(b, c)")),
+        ("           ~", Comma),
+    ]);
+}
+
 #[test]
 #[should_panic]
 fn code_forgot_comma() {