From 4e5204078a61cfc764dd6393840818948aeeb148 Mon Sep 17 00:00:00 2001
From: Niko Matsakis <niko@alum.mit.edu>
Date: Fri, 19 Jun 2015 06:42:11 -0400
Subject: [PATCH] Start working on a naive LR(1) algorithm (seems like a good
 starting point). Implement first set computation.

---
 src/grammar/repr.rs         |   6 +-
 src/lr1/first.rs            | 139 ++++++++++++++++++++++++++++++++++++
 src/lr1/mod.rs              |  45 ++++++++++++
 src/main.rs                 |   1 +
 src/normalize/lower/mod.rs  |   2 +-
 src/normalize/lower/test.rs |   9 +--
 src/util.rs                 |   6 --
 7 files changed, 192 insertions(+), 16 deletions(-)
 create mode 100644 src/lr1/first.rs
 create mode 100644 src/lr1/mod.rs
diff --git a/src/grammar/repr.rs b/src/grammar/repr.rs
index 8f67113..40d2cd3 100644
--- a/src/grammar/repr.rs
+++ b/src/grammar/repr.rs
@@ -5,11 +5,13 @@
  */
 
 use intern::InternedString;
-use grammar::parse_tree::{NonterminalString, Span, TerminalString};
 use std::collections::HashMap;
 use std::fmt::{Debug, Display, Formatter, Error};
 use util::Sep;
 
+// We re-use these concepts wholesale from the parse tree.
+pub use grammar::parse_tree::{NonterminalString, Span, TerminalString};
+
 #[derive(Clone, Debug)]
 pub struct Grammar {
     pub action_fn_defns: Vec<ActionFnDefn>,
@@ -186,7 +188,7 @@ impl Grammar {
         }
     }
 
-    fn productions_for(&self, nonterminal: NonterminalString) -> &[Production] {
+    pub fn productions_for(&self, nonterminal: NonterminalString) -> &[Production] {
         match self.productions.get(&nonterminal) {
             Some(v) => &v[..],
             None => &[], // this...probably shouldn't happen actually?
diff --git a/src/lr1/first.rs b/src/lr1/first.rs
new file mode 100644
index 0000000..0745e98
--- /dev/null
+++ b/src/lr1/first.rs
@@ -0,0 +1,139 @@
+//! First set construction and computation.
+
+use grammar::repr::*;
+use std::collections::{HashMap, HashSet};
+
+use super::Lookahead;
+
+pub struct FirstSets {
+    map: HashMap<NonterminalString, FirstSet> // one entry per nonterminal that has a production
+}
+
+pub type FirstSet = HashSet<Option<TerminalString>>; // `None` = can derive the empty string
+
+impl FirstSets {
+    /// Computes the first set of every nonterminal in `grammar` by
+    /// re-scanning all productions until no set grows any further.
+    pub fn new(grammar: &Grammar) -> FirstSets {
+        let mut this = FirstSets { map: HashMap::new() };
+        let mut changed = true;
+        while changed {
+            changed = false;
+            for production in grammar.productions.values().flat_map(|p| p.iter()) {
+                let nt = production.nonterminal;
+                // `EOF` is only a sentinel here: `first` appends it exactly
+                // when every symbol can be empty, which is stored as `None`.
+                let lookahead = this.first(&production.symbols, Lookahead::EOF);
+                let first_set = this.map.entry(nt).or_insert_with(HashSet::new);
+                let cardinality = first_set.len();
+                first_set.extend(lookahead.into_iter().map(|la| match la {
+                    Lookahead::EOF => None,
+                    Lookahead::Terminal(t) => Some(t),
+                }));
+                changed |= cardinality != first_set.len();
+            }
+        }
+        this
+    }
+
+    /// Returns the first set computed for `nt`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `nt` has no entry, i.e. `new` saw no production for it.
+    pub fn first_set(&self, nt: NonterminalString) -> &FirstSet {
+        &self.map[&nt]
+    }
+
+    /// Returns the lookaheads that can start `symbols` followed by
+    /// `lookahead`. The result is unordered and may contain duplicates.
+    pub fn first(&self, symbols: &[Symbol], lookahead: Lookahead) -> Vec<Lookahead> {
+        let mut result = vec![];
+        for symbol in symbols {
+            match *symbol {
+                Symbol::Terminal(t) => {
+                    result.push(Lookahead::Terminal(t));
+                    return result;
+                }
+                Symbol::Nonterminal(nt) => {
+                    let mut empty_prod = false;
+                    // A missing entry should only happen during set
+                    // construction (the entry has not been built yet);
+                    // otherwise it would mean a nonterminal with no
+                    // productions. Either way it contributes nothing.
+                    if let Some(set) = self.map.get(&nt) {
+                        for &opt_terminal in set {
+                            match opt_terminal {
+                                Some(terminal) => result.push(Lookahead::Terminal(terminal)),
+                                None => empty_prod = true,
+                            }
+                        }
+                    }
+                    if !empty_prod {
+                        // `nt` cannot be empty, so later symbols are hidden.
+                        return result;
+                    }
+                }
+            }
+        }
+        // Every symbol can be empty, so the trailing lookahead shows through.
+        result.push(lookahead);
+        result
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use intern::intern;
+    use normalize::normalize;
+    use parser;
+    use grammar::repr::*;
+    use lr1::Lookahead::{self, EOF};
+    use super::FirstSets;
+
+    /// Builds a nonterminal symbol named `t`.
+    fn nt(t: &str) -> Symbol {
+        Symbol::Nonterminal(NonterminalString(intern(t)))
+    }
+
+    /// Builds a terminal symbol named `t`.
+    fn t(t: &str) -> Symbol {
+        Symbol::Terminal(TerminalString(intern(t)))
+    }
+
+    /// Builds the lookahead for the terminal named `t`.
+    fn la(t: &str) -> Lookahead {
+        Lookahead::Terminal(TerminalString(intern(t)))
+    }
+
+    /// Like `FirstSets::first`, but sorted so results compare reliably.
+    fn first(first: &FirstSets, symbols: &[Symbol], lookahead: Lookahead) -> Vec<Lookahead> {
+        let mut v = first.first(symbols, lookahead);
+        v.sort();
+        v
+    }
+
+    /// Checks first sets for a grammar with a nonterminal that may be empty.
+    #[test]
+    fn basic() {
+        let grammar = parser::parse_grammar(r#"
+grammar Foo {
+    token Tok where { };
+    A = B "C";
+    B: Option<u32> = {
+        "D" => Some(1);
+        => None;
+    };
+}
+"#).unwrap();
+        let grammar = normalize(grammar).unwrap();
+        let first_sets = FirstSets::new(&grammar);
+
+        // `B` may be empty, so `A = B "C"` can start with "D" or "C".
+        assert_eq!(first(&first_sets, &[nt("A")], EOF), vec![la("C"), la("D")]);
+        // An empty `B` lets the supplied lookahead through.
+        assert_eq!(first(&first_sets, &[nt("B")], EOF), vec![EOF, la("D")]);
+        // A terminal after `B` hides the supplied lookahead.
+        assert_eq!(first(&first_sets, &[nt("B"), t("E")], EOF), vec![la("D"), la("E")]);
+    }
+}
diff --git a/src/lr1/mod.rs b/src/lr1/mod.rs
new file mode 100644
index 0000000..5fd2a06
--- /dev/null
+++ b/src/lr1/mod.rs
@@ -0,0 +1,45 @@
+//! Naive LR(1) generation algorithm.
+
+use grammar::repr::*;
+use std::collections::{HashMap};
+
+mod first;
+
+struct LR1<'grammar> {
+    grammar: &'grammar Grammar, // borrowed input grammar
+    states: Vec<State<'grammar>>, // empty when built by `new`
+    first_sets: first::FirstSets, // computed from `grammar` in `new`
+}
+
+struct State<'grammar> {
+    items: Vec<Configuration<'grammar>>, // configurations held by this state
+    shifts: HashMap<TerminalString, StateIndex>, // per terminal; presumably the next state
+    gotos: HashMap<NonterminalString, StateIndex>, // per nonterminal; presumably the next state
+}
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct StateIndex(usize); // presumably an index into `LR1::states` -- TODO confirm
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+enum Lookahead {
+    EOF, // end of input; the derived `Ord` sorts it before every `Terminal`
+    Terminal(TerminalString), // a specific terminal
+}
+
+struct Configuration<'grammar> {
+    production: &'grammar Production, // production this configuration refers to
+    index: usize, // the dot comes before `index`, so `index` would be 1 for X = A (*) B C
+    lookahead: Lookahead, // presumably what may follow the production -- TODO confirm
+}
+
+impl<'grammar> LR1<'grammar> {
+    /// Creates a generator for `grammar` with no states yet, computing
+    /// the grammar's first sets up front.
+    fn new(grammar: &'grammar Grammar) -> LR1<'grammar> {
+        LR1 {
+            grammar: grammar,
+            states: vec![],
+            first_sets: first::FirstSets::new(grammar),
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 01c3227..4abebb9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -8,6 +8,7 @@ extern crate regex;
 
 mod grammar;
 mod intern;
+mod lr1;
 mod normalize;
 mod parser;
 mod util;
diff --git a/src/normalize/lower/mod.rs b/src/normalize/lower/mod.rs
index 6982f18..85732fe 100644
--- a/src/normalize/lower/mod.rs
+++ b/src/normalize/lower/mod.rs
@@ -6,7 +6,7 @@ use intern::{self, intern, InternedString};
 use normalize::NormResult;
 use normalize::norm_util::{self, Symbols};
 use grammar::parse_tree as pt;
-use grammar::parse_tree::{TerminalString, NonterminalString};
+use grammar::parse_tree::{TerminalString};
 use grammar::repr as r;
 
 #[cfg(test)]
diff --git a/src/normalize/lower/test.rs b/src/normalize/lower/test.rs
index c0258a2..3915b02 100644
--- a/src/normalize/lower/test.rs
+++ b/src/normalize/lower/test.rs
@@ -1,17 +1,12 @@
-use intern::InternedString;
 use grammar::repr::{Grammar, Production};
 use normalize::normalize;
 use normalize::test_util::expect_debug;
 use parser;
-use std::fmt::{Debug, Formatter, Error};
 
 fn flat_productions(grammar: &Grammar) -> Vec<Production> {
     let mut productions: Vec<_> =
-        grammar.productions.iter()
-                           .flat_map(|(&nt, prods)| {
-                               prods.iter()
-                                    .cloned()
-                           })
+        grammar.productions.values()
+                           .flat_map(|prods| prods.iter().cloned())
                            .collect();
 
     // sort by the action fn index just to get a consistent ordering
diff --git a/src/util.rs b/src/util.rs
index 89de3a5..02b4e07 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -15,9 +15,3 @@ impl<'a,S:Display> Display for Sep<&'a Vec<S>> {
         Ok(())
     }
 }
-
-fn shift<T:Clone>(slice: &mut &[T]) -> T {
-    let elem = slice[0].clone();
-    *slice = &slice[1..];
-    elem
-}