From 4e5204078a61cfc764dd6393840818948aeeb148 Mon Sep 17 00:00:00 2001 From: Niko Matsakis Date: Fri, 19 Jun 2015 06:42:11 -0400 Subject: [PATCH] Start working on a naive LR(1) algorithm (seems like a good starting point). Implement first set computation. --- src/grammar/repr.rs | 6 +- src/lr1/first.rs | 139 ++++++++++++++++++++++++++++++++++++ src/lr1/mod.rs | 45 ++++++++++++ src/main.rs | 1 + src/normalize/lower/mod.rs | 2 +- src/normalize/lower/test.rs | 9 +-- src/util.rs | 6 -- 7 files changed, 192 insertions(+), 16 deletions(-) create mode 100644 src/lr1/first.rs create mode 100644 src/lr1/mod.rs diff --git a/src/grammar/repr.rs b/src/grammar/repr.rs index 8f67113..40d2cd3 100644 --- a/src/grammar/repr.rs +++ b/src/grammar/repr.rs @@ -5,11 +5,13 @@ */ use intern::InternedString; -use grammar::parse_tree::{NonterminalString, Span, TerminalString}; use std::collections::HashMap; use std::fmt::{Debug, Display, Formatter, Error}; use util::Sep; +// These concepts we re-use wholesale +pub use grammar::parse_tree::{NonterminalString, Span, TerminalString}; + #[derive(Clone, Debug)] pub struct Grammar { pub action_fn_defns: Vec, @@ -186,7 +188,7 @@ impl Grammar { } } - fn productions_for(&self, nonterminal: NonterminalString) -> &[Production] { + pub fn productions_for(&self, nonterminal: NonterminalString) -> &[Production] { match self.productions.get(&nonterminal) { Some(v) => &v[..], None => &[], // this...probably shouldn't happen actually? diff --git a/src/lr1/first.rs b/src/lr1/first.rs new file mode 100644 index 0000000..0745e98 --- /dev/null +++ b/src/lr1/first.rs @@ -0,0 +1,139 @@ +//! First set construction and computation. 
+ +use grammar::repr::*; +use std::collections::{HashMap, HashSet}; + +use super::Lookahead; + +pub struct FirstSets { + map: HashMap +} + +pub type FirstSet = HashSet>; + +impl FirstSets { + pub fn new(grammar: &Grammar) -> FirstSets { + let mut this = FirstSets { map: HashMap::new() }; + let mut changed = true; + while changed { + changed = false; + for production in grammar.productions.values().flat_map(|p| p.iter()) { + let nt = production.nonterminal; + let lookahead = this.first(&production.symbols, Lookahead::EOF); + let first_set = this.map.entry(nt).or_insert_with(|| HashSet::new()); + let cardinality = first_set.len(); + first_set.extend( + lookahead.into_iter() + .map(|la| match la { + Lookahead::EOF => None, + Lookahead::Terminal(t) => Some(t), + })); + changed |= (cardinality != first_set.len()); + } + } + this + } + + pub fn first_set(&self, nt: NonterminalString) -> &FirstSet { + &self.map[&nt] + } + + pub fn first(&self, symbols: &[Symbol], lookahead: Lookahead) -> Vec { + let mut result = vec![]; + + for symbol in symbols { + match *symbol { + Symbol::Terminal(t) => { + result.push(Lookahead::Terminal(t)); + return result; + } + + Symbol::Nonterminal(nt) => { + let mut empty_prod = false; + match self.map.get(&nt) { + None => { + // This should only happen during set + // construction; it corresponds to an + // entry that has not yet been + // built. Otherwise, it would mean a + // nonterminal with no productions. Either + // way, the resulting first set should be + // empty. 
+ } + Some(set) => { + for &opt_terminal in set { + if let Some(terminal) = opt_terminal { + result.push(Lookahead::Terminal(terminal)); + } else { + empty_prod = true; + } + } + } + } + if !empty_prod { + return result; + } + } + } + } + + result.push(lookahead); + result + } +} + +mod test { + use intern::intern; + use normalize::normalize; + use parser; + use grammar::repr::*; + use lr1::Lookahead; + use lr1::Lookahead::EOF; + use super::FirstSets; + + fn nt(t: &str) -> Symbol { + Symbol::Nonterminal(NonterminalString(intern(t))) + } + + fn t(t: &str) -> Symbol { + Symbol::Terminal(TerminalString(intern(t))) + } + + fn la(t: &str) -> Lookahead { + Lookahead::Terminal(TerminalString(intern(t))) + } + + fn first(first: &FirstSets, symbols: &[Symbol], lookahead: Lookahead) -> Vec { + let mut v = first.first(symbols, lookahead); + v.sort(); + v + } + + #[test] + fn basic() { + let grammar = parser::parse_grammar(r#" +grammar Foo { + token Tok where { }; + A = B "C"; + B: Option = { + "D" => Some(1); + => None; + }; +} +"#).unwrap(); + let grammar = normalize(grammar).unwrap(); + let first_sets = FirstSets::new(&grammar); + + assert_eq!( + first(&first_sets, &[nt("A")], EOF), + vec![la("C"), la("D")]); + + assert_eq!( + first(&first_sets, &[nt("B")], EOF), + vec![EOF, la("D")]); + + assert_eq!( + first(&first_sets, &[nt("B"), t("E")], EOF), + vec![la("D"), la("E")]); + } +} diff --git a/src/lr1/mod.rs b/src/lr1/mod.rs new file mode 100644 index 0000000..5fd2a06 --- /dev/null +++ b/src/lr1/mod.rs @@ -0,0 +1,45 @@ +//! Naive LR(1) generation algorithm. 
+ +use grammar::repr::*; +use std::collections::{HashMap}; + +mod first; + +struct LR1<'grammar> { + grammar: &'grammar Grammar, + states: Vec>, + first_sets: first::FirstSets, +} + +struct State<'grammar> { + items: Vec>, + shifts: HashMap, + gotos: HashMap, +} + +#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +struct StateIndex(usize); + +#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +enum Lookahead { + EOF, + Terminal(TerminalString), +} + +struct Configuration<'grammar> { + production: &'grammar Production, + index: usize, // the dot comes before `index`, so `index` would be 1 for X = A (*) B C + lookahead: Lookahead, +} + +impl<'grammar> LR1<'grammar> { + fn new(grammar: &'grammar Grammar) -> LR1 { + LR1 { + grammar: grammar, + states: vec![], + first_sets: first::FirstSets::new(grammar), + } + } + +} + diff --git a/src/main.rs b/src/main.rs index 01c3227..4abebb9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,7 @@ extern crate regex; mod grammar; mod intern; +mod lr1; mod normalize; mod parser; mod util; diff --git a/src/normalize/lower/mod.rs b/src/normalize/lower/mod.rs index 6982f18..85732fe 100644 --- a/src/normalize/lower/mod.rs +++ b/src/normalize/lower/mod.rs @@ -6,7 +6,7 @@ use intern::{self, intern, InternedString}; use normalize::NormResult; use normalize::norm_util::{self, Symbols}; use grammar::parse_tree as pt; -use grammar::parse_tree::{TerminalString, NonterminalString}; +use grammar::parse_tree::{TerminalString}; use grammar::repr as r; #[cfg(test)] diff --git a/src/normalize/lower/test.rs b/src/normalize/lower/test.rs index c0258a2..3915b02 100644 --- a/src/normalize/lower/test.rs +++ b/src/normalize/lower/test.rs @@ -1,17 +1,12 @@ -use intern::InternedString; use grammar::repr::{Grammar, Production}; use normalize::normalize; use normalize::test_util::expect_debug; use parser; -use std::fmt::{Debug, Formatter, Error}; fn flat_productions(grammar: &Grammar) -> Vec { let mut productions: Vec<_> 
= - grammar.productions.iter() - .flat_map(|(&nt, prods)| { - prods.iter() - .cloned() - }) + grammar.productions.values() + .flat_map(|prods| prods.iter().cloned()) .collect(); // sort by the action fn index just to get a consistent ordering diff --git a/src/util.rs b/src/util.rs index 89de3a5..02b4e07 100644 --- a/src/util.rs +++ b/src/util.rs @@ -15,9 +15,3 @@ impl<'a,S:Display> Display for Sep<&'a Vec> { Ok(()) } } - -fn shift(slice: &mut &[T]) -> T { - let elem = slice[0].clone(); - *slice = &slice[1..]; - elem -}