Start working on a naive LR(1) algorithm (seems like a good starting
point). Implement first set computation.
2025-03-16 17:00:53 +00:00 · 2015-06-19 06:42:11 -04:00 · 2015-06-19 06:42:11 -04:00 · 4e5204078a
commit 4e5204078a
parent 334a419c49
7 changed files with 192 additions and 16 deletions
--- a/src/grammar/repr.rs
+++ b/src/grammar/repr.rs
@ -5,11 +5,13 @@
 */

 use intern::InternedString;
-use grammar::parse_tree::{NonterminalString, Span, TerminalString};
 use std::collections::HashMap;
 use std::fmt::{Debug, Display, Formatter, Error};
 use util::Sep;

+// These concepts are re-used wholesale from the parse tree.
+pub use grammar::parse_tree::{NonterminalString, Span, TerminalString};
+
 #[derive(Clone, Debug)]
 pub struct Grammar {
    pub action_fn_defns: Vec<ActionFnDefn>,
@ -186,7 +188,7 @@ impl Grammar {
        }
    }

-    fn productions_for(&self, nonterminal: NonterminalString) -> &[Production] {
+    pub fn productions_for(&self, nonterminal: NonterminalString) -> &[Production] {
        match self.productions.get(&nonterminal) {
            Some(v) => &v[..],
            None => &[], // this...probably shouldn't happen actually?
--- a/src/lr1/first.rs
+++ b/src/lr1/first.rs
@ -0,0 +1,139 @@
+//! First set construction and computation.
+
+use grammar::repr::*;
+use std::collections::{HashMap, HashSet};
+
+use super::Lookahead;
+
+/// First sets of a grammar, stored per nonterminal.
+///
+/// Built by `FirstSets::new`, which only creates entries for nonterminals
+/// that own at least one production, and queried through `first_set` and
+/// `first`.
+pub struct FirstSets {
+    // Maps each nonterminal to the set of symbols that can begin it.
+    map: HashMap<NonterminalString, FirstSet>
+}
+
+/// The first set of a single nonterminal.
+///
+/// `Some(t)` records that terminal `t` can begin the nonterminal. `None` is
+/// what `FirstSets::new` stores when it sees `Lookahead::EOF`, which happens
+/// when some production of the nonterminal can be entirely empty.
+pub type FirstSet = HashSet<Option<TerminalString>>;
+
+impl FirstSets {
+    /// Computes the first sets of `grammar` by iterating to a fixed point.
+    ///
+    /// Every production is revisited until one complete pass over the
+    /// grammar adds nothing to any set.
+    pub fn new(grammar: &Grammar) -> FirstSets {
+        let mut this = FirstSets { map: HashMap::new() };
+        let mut changed = true;
+        while changed {
+            changed = false;
+            for production in grammar.productions.values().flat_map(|p| p.iter()) {
+                let nt = production.nonterminal;
+                // `EOF` is only a marker here: `first` hands it back exactly
+                // when every symbol of the production can be empty, and it
+                // is stored in the set as `None`.
+                let lookahead = this.first(&production.symbols, Lookahead::EOF);
+                let first_set = this.map.entry(nt).or_insert_with(HashSet::new);
+                let cardinality = first_set.len();
+                first_set.extend(
+                    lookahead.into_iter()
+                             .map(|la| match la {
+                                 Lookahead::EOF => None,
+                                 Lookahead::Terminal(t) => Some(t),
+                             }));
+                // The set only ever grows, so a size change means new entries.
+                changed |= cardinality != first_set.len();
+            }
+        }
+        this
+    }
+
+    /// Returns the first set recorded for `nt`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if there is no entry for `nt`, i.e. `new` never saw a
+    /// production for it.
+    pub fn first_set(&self, nt: NonterminalString) -> &FirstSet {
+        &self.map[&nt]
+    }
+
+    /// Computes the lookaheads that can begin `symbols`, with `lookahead`
+    /// standing in for whatever follows them.
+    ///
+    /// Symbols are scanned left to right. A terminal ends the scan. A
+    /// nonterminal contributes the terminals of its first set, and the scan
+    /// continues past it only if that set contains `None` (the nonterminal
+    /// can be empty). `lookahead` itself is included only when the scan runs
+    /// off the end of `symbols`.
+    ///
+    /// Each lookahead appears at most once in the result. The order of the
+    /// entries is unspecified because first sets are hash sets.
+    pub fn first(&self, symbols: &[Symbol], lookahead: Lookahead) -> Vec<Lookahead> {
+        let mut result = vec![];
+        // Tracks what is already in `result`, so that two nullable
+        // nonterminals sharing a terminal do not produce duplicates.
+        let mut seen = HashSet::new();
+
+        for symbol in symbols {
+            match *symbol {
+                Symbol::Terminal(t) => {
+                    let la = Lookahead::Terminal(t);
+                    if seen.insert(la) {
+                        result.push(la);
+                    }
+                    return result;
+                }
+
+                Symbol::Nonterminal(nt) => {
+                    let mut empty_prod = false;
+                    // A missing entry should only happen during set
+                    // construction, where it corresponds to an entry that
+                    // has not been built yet. Otherwise it would mean a
+                    // nonterminal with no productions. Either way, it
+                    // contributes nothing and cannot be empty.
+                    if let Some(set) = self.map.get(&nt) {
+                        for &opt_terminal in set {
+                            match opt_terminal {
+                                Some(terminal) => {
+                                    let la = Lookahead::Terminal(terminal);
+                                    if seen.insert(la) {
+                                        result.push(la);
+                                    }
+                                }
+                                None => {
+                                    empty_prod = true;
+                                }
+                            }
+                        }
+                    }
+                    if !empty_prod {
+                        return result;
+                    }
+                }
+            }
+        }
+
+        // Every symbol could be empty, so whatever follows is visible too.
+        if seen.insert(lookahead) {
+            result.push(lookahead);
+        }
+        result
+    }
+}
+
+// Only compiled for `cargo test`, so the imports and helpers below do not
+// end up (unused) in regular builds.
+#[cfg(test)]
+mod test {
+    use intern::intern;
+    use normalize::normalize;
+    use parser;
+    use grammar::repr::*;
+    use lr1::Lookahead;
+    use lr1::Lookahead::EOF;
+    use super::FirstSets;
+
+    /// Builds a nonterminal symbol named `t`.
+    fn nt(t: &str) -> Symbol {
+        Symbol::Nonterminal(NonterminalString(intern(t)))
+    }
+
+    /// Builds a terminal symbol named `t`.
+    fn t(t: &str) -> Symbol {
+        Symbol::Terminal(TerminalString(intern(t)))
+    }
+
+    /// Builds a terminal lookahead named `t`.
+    fn la(t: &str) -> Lookahead {
+        Lookahead::Terminal(TerminalString(intern(t)))
+    }
+
+    /// Calls `FirstSets::first` and sorts the result, so that assertions do
+    /// not depend on hash-set iteration order.
+    fn first(first: &FirstSets, symbols: &[Symbol], lookahead: Lookahead) -> Vec<Lookahead> {
+        let mut v = first.first(symbols, lookahead);
+        v.sort();
+        v
+    }
+
+    #[test]
+    fn basic() {
+        let grammar = parser::parse_grammar(r#"
+grammar Foo {
+    token Tok where { };
+    A = B "C";
+    B: Option<u32> = {
+        "D" => Some(1);
+        => None;
+    };
+}
+"#).unwrap();
+        let grammar = normalize(grammar).unwrap();
+        let first_sets = FirstSets::new(&grammar);
+
+        // `B` may be empty, so `A` can start with either "D" or "C".
+        assert_eq!(
+            first(&first_sets, &[nt("A")], EOF),
+            vec![la("C"), la("D")]);
+
+        // An empty `B` exposes the trailing lookahead.
+        assert_eq!(
+            first(&first_sets, &[nt("B")], EOF),
+            vec![EOF, la("D")]);
+
+        // The terminal "E" ends the scan, so `EOF` is never reached.
+        assert_eq!(
+            first(&first_sets, &[nt("B"), t("E")], EOF),
+            vec![la("D"), la("E")]);
+    }
+}
--- a/src/lr1/mod.rs
+++ b/src/lr1/mod.rs
@ -0,0 +1,45 @@
+//! Naive LR(1) generation algorithm.
+
+use grammar::repr::*;
+use std::collections::{HashMap};
+
+mod first;
+
+/// Working data for the naive LR(1) construction: the grammar being
+/// processed, the list of states (empty right after `LR1::new`), and the
+/// grammar's first sets.
+struct LR1<'grammar> {
+    grammar: &'grammar Grammar,
+    states: Vec<State<'grammar>>,
+    first_sets: first::FirstSets,
+}
+
+/// A single state of the LR(1) machine.
+///
+/// NOTE(review): nothing visible here populates these fields yet, so the
+/// field notes below are inferred from names and types only. Confirm once
+/// the construction code lands.
+struct State<'grammar> {
+    // presumably the items (configurations) that make up this state
+    items: Vec<Configuration<'grammar>>,
+    // presumably the target state for each terminal that can be shifted
+    shifts: HashMap<TerminalString, StateIndex>,
+    // presumably the target state after each nonterminal is recognized
+    gotos: HashMap<NonterminalString, StateIndex>,
+}
+
+/// Newtype around `usize` that identifies a state.
+///
+/// NOTE(review): presumably an index into `LR1::states`; confirm against
+/// the code that creates these.
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct StateIndex(usize);
+
+/// A lookahead symbol: either the `EOF` marker or a specific terminal.
+///
+/// The derived ordering follows declaration order, so `EOF` sorts before
+/// every `Terminal`.
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+enum Lookahead {
+    EOF,
+    Terminal(TerminalString),
+}
+
+/// A production together with a dot position inside it and a lookahead.
+struct Configuration<'grammar> {
+    production: &'grammar Production,
+    index: usize, // the dot comes before `index`, so `index` would be 1 for X = A (*) B C
+    lookahead: Lookahead,
+}
+
+impl<'grammar> LR1<'grammar> {
+    /// Sets up the construction for `grammar`: computes its first sets and
+    /// starts with an empty list of states.
+    fn new(grammar: &'grammar Grammar) -> LR1 {
+        let first_sets = first::FirstSets::new(grammar);
+        let states = Vec::new();
+        LR1 {
+            grammar: grammar,
+            states: states,
+            first_sets: first_sets,
+        }
+    }
+}
+
--- a/src/main.rs
+++ b/src/main.rs
@ -8,6 +8,7 @@ extern crate regex;

 mod grammar;
 mod intern;
+mod lr1;
 mod normalize;
 mod parser;
 mod util;
--- a/src/normalize/lower/mod.rs
+++ b/src/normalize/lower/mod.rs
@ -6,7 +6,7 @@ use intern::{self, intern, InternedString};
 use normalize::NormResult;
 use normalize::norm_util::{self, Symbols};
 use grammar::parse_tree as pt;
-use grammar::parse_tree::{TerminalString, NonterminalString};
+use grammar::parse_tree::{TerminalString};
 use grammar::repr as r;

 #[cfg(test)]
--- a/src/normalize/lower/test.rs
+++ b/src/normalize/lower/test.rs
@ -1,17 +1,12 @@
-use intern::InternedString;
 use grammar::repr::{Grammar, Production};
 use normalize::normalize;
 use normalize::test_util::expect_debug;
 use parser;
-use std::fmt::{Debug, Formatter, Error};

 fn flat_productions(grammar: &Grammar) -> Vec<Production> {
    let mut productions: Vec<_> =
-        grammar.productions.iter()
-                           .flat_map(|(&nt, prods)| {
-                               prods.iter()
-                                    .cloned()
-                           })
+        grammar.productions.values()
+                           .flat_map(|prods| prods.iter().cloned())
                           .collect();

    // sort by the action fn index just to get a consistent ordering
--- a/src/util.rs
+++ b/src/util.rs
@ -15,9 +15,3 @@ impl<'a,S:Display> Display for Sep<&'a Vec<S>> {
        Ok(())
    }
 }
-
-fn shift<T:Clone>(slice: &mut &[T]) -> T {
-    let elem = slice[0].clone();
-    *slice = &slice[1..];
-    elem
-}