From 8c577422bff079a156cad952bf491b4d21f67ea4 Mon Sep 17 00:00:00 2001 From: Niko Matsakis Date: Fri, 19 Jun 2015 10:17:35 -0400 Subject: [PATCH] Start the naive LR(1) algorithm with support for transitive closures over epsilon moves --- src/grammar/parse_tree.rs | 2 +- src/grammar/repr.rs | 8 +- src/lr1/{first.rs => first/mod.rs} | 62 +-------------- src/lr1/first/test.rs | 51 +++++++++++++ src/lr1/mod.rs | 116 ++++++++++++++++++++++++++++- src/lr1/test.rs | 40 ++++++++++ src/main.rs | 7 ++ src/normalize/lower/test.rs | 2 +- src/normalize/macro_expand/test.rs | 2 +- src/normalize/mod.rs | 3 - src/{normalize => }/test_util.rs | 5 ++ src/util.rs | 14 ++++ 12 files changed, 240 insertions(+), 72 deletions(-) rename src/lr1/{first.rs => first/mod.rs} (66%) create mode 100644 src/lr1/first/test.rs create mode 100644 src/lr1/test.rs rename src/{normalize => }/test_util.rs (88%) diff --git a/src/grammar/parse_tree.rs b/src/grammar/parse_tree.rs index 2395053..5a9bdd1 100644 --- a/src/grammar/parse_tree.rs +++ b/src/grammar/parse_tree.rs @@ -71,7 +71,7 @@ pub struct Grammar { pub items: Vec, } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct Span(pub usize, pub usize); #[derive(Clone, Debug, PartialEq, Eq)] diff --git a/src/grammar/repr.rs b/src/grammar/repr.rs index 40d2cd3..8672944 100644 --- a/src/grammar/repr.rs +++ b/src/grammar/repr.rs @@ -9,7 +9,7 @@ use std::collections::HashMap; use std::fmt::{Debug, Display, Formatter, Error}; use util::Sep; -// These concepts we re-use wholesale +// These concepts we re-use wholesale pub use grammar::parse_tree::{NonterminalString, Span, TerminalString}; #[derive(Clone, Debug)] @@ -20,7 +20,7 @@ pub struct Grammar { pub types: Types, } -#[derive(Clone, PartialEq, Eq)] +#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct Production { // this overlaps with the key in the hashmap, obviously, but it's // handy to have it @@ -30,7 +30,7 @@ pub struct Production { pub span: Span, } -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] pub enum Symbol { Nonterminal(NonterminalString), Terminal(TerminalString), @@ -101,7 +101,7 @@ impl Debug for TypeRepr { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Hash, PartialOrd, Ord, PartialEq, Eq)] pub struct ActionFn(u32); impl ActionFn { diff --git a/src/lr1/first.rs b/src/lr1/first/mod.rs similarity index 66% rename from src/lr1/first.rs rename to src/lr1/first/mod.rs index 0745e98..e9cdcc2 100644 --- a/src/lr1/first.rs +++ b/src/lr1/first/mod.rs @@ -5,6 +5,9 @@ use std::collections::{HashMap, HashSet}; use super::Lookahead; +#[cfg(test)] +mod test; + pub struct FirstSets { map: HashMap } @@ -34,10 +37,6 @@ impl FirstSets { this } - pub fn first_set(&self, nt: NonterminalString) -> &FirstSet { - &self.map[&nt] - } - pub fn first(&self, symbols: &[Symbol], lookahead: Lookahead) -> Vec { let mut result = vec![]; @@ -82,58 +81,3 @@ impl FirstSets { } } -mod test { - use intern::intern; - use normalize::normalize; - use parser; - use grammar::repr::*; - use lr1::Lookahead; - use lr1::Lookahead::EOF; - use super::FirstSets; - - fn nt(t: &str) -> Symbol { - Symbol::Nonterminal(NonterminalString(intern(t))) - } - - fn t(t: &str) -> Symbol { - Symbol::Terminal(TerminalString(intern(t))) - } - - fn la(t: &str) -> Lookahead { - Lookahead::Terminal(TerminalString(intern(t))) - } - - fn first(first: &FirstSets, symbols: &[Symbol], lookahead: Lookahead) -> Vec { - let mut v = first.first(symbols, lookahead); - v.sort(); - v - } - - #[test] - fn basic() { - let grammar = parser::parse_grammar(r#" -grammar Foo { - token Tok where { }; - A = B "C"; - B: Option = { - "D" => Some(1); - => None; - }; -} -"#).unwrap(); - let grammar = normalize(grammar).unwrap(); - let first_sets = FirstSets::new(&grammar); - - assert_eq!( - first(&first_sets, &[nt("A")], EOF), - vec![la("C"), la("D")]); - - assert_eq!( - first(&first_sets, &[nt("B")], EOF), - vec![EOF, la("D")]); - - assert_eq!( - first(&first_sets, &[nt("B"), t("E")], EOF), - vec![la("D"), la("E")]); - } -} diff --git a/src/lr1/first/test.rs b/src/lr1/first/test.rs new file mode 100644 index 0000000..19a92e9 --- /dev/null +++ b/src/lr1/first/test.rs @@ -0,0 +1,51 @@ +use intern::intern; +use grammar::repr::*; +use lr1::Lookahead; +use lr1::Lookahead::EOF; +use test_util::{normalized_grammar}; +use super::FirstSets; + +pub fn nt(t: &str) -> Symbol { + Symbol::Nonterminal(NonterminalString(intern(t))) +} + +pub fn term(t: &str) -> Symbol { + Symbol::Terminal(TerminalString(intern(t))) +} + +fn la(t: &str) -> Lookahead { + Lookahead::Terminal(TerminalString(intern(t))) +} + +fn first(first: &FirstSets, symbols: &[Symbol], lookahead: Lookahead) -> Vec { + let mut v = first.first(symbols, lookahead); + v.sort(); + v +} + +#[test] +fn basic() { + let grammar = normalized_grammar(r#" +grammar Foo { + token Tok where { }; + A = B "C"; + B: Option = { + "D" => Some(1); + => None; + }; +} +"#); + let first_sets = FirstSets::new(&grammar); + + assert_eq!( + first(&first_sets, &[nt("A")], EOF), + vec![la("C"), la("D")]); + + assert_eq!( + first(&first_sets, &[nt("B")], EOF), + vec![EOF, la("D")]); + + assert_eq!( + first(&first_sets, &[nt("B"), term("E")], EOF), + vec![la("D"), la("E")]); +} diff --git a/src/lr1/mod.rs b/src/lr1/mod.rs index 5fd2a06..04e6c3a 100644 --- a/src/lr1/mod.rs +++ b/src/lr1/mod.rs @@ -1,10 +1,14 @@ //! Naive LR(1) generation algorithm. use grammar::repr::*; -use std::collections::{HashMap}; +use std::collections::{HashMap, HashSet}; +use std::fmt::{Debug, Formatter, Error}; +use util::Prefix; mod first; +#[cfg(test)] mod test; + struct LR1<'grammar> { grammar: &'grammar Grammar, states: Vec>, @@ -12,7 +16,7 @@ struct LR1<'grammar> { } struct State<'grammar> { - items: Vec>, + configurations: Vec>, shifts: HashMap, gotos: HashMap, } @@ -20,12 +24,13 @@ struct State<'grammar> { #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] struct StateIndex(usize); -#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] enum Lookahead { EOF, Terminal(TerminalString), } +#[derive(Copy, Clone, Hash, PartialEq, Eq)] struct Configuration<'grammar> { production: &'grammar Production, index: usize, // the dot comes before `index`, so `index` would be 1 for X = A (*) B C @@ -41,5 +46,110 @@ impl<'grammar> LR1<'grammar> { } } + fn build_states(&mut self, start_nt: NonterminalString) { + debug_assert!(self.states.is_empty()); + + let state0 = self.start_state(start_nt, Lookahead::EOF); + self.states.push(state0); + } + + fn start_state(&self, id: NonterminalString, lookahead: Lookahead) -> State<'grammar> { + let configurations = + self.transitive_closure( + self.start_configurations(id, lookahead)); + State { configurations: configurations, + shifts: HashMap::new(), + gotos: HashMap::new() } + } + + fn start_configurations(&self, + id: NonterminalString, + lookahead: Lookahead) + -> Vec> + { + self.grammar.productions_for(id) + .iter() + .map(|production| { + Configuration { production: production, + index: 0, + lookahead: lookahead } + }) + .collect() + } + + // expands `state` with epsilon moves + fn transitive_closure(&self, mut configurations: Vec>) + -> Vec> + { + println!("expand_configurations({:?})", configurations); + + let mut counter = 0; + + let mut set: HashSet> = + configurations.iter().cloned().collect(); + + while counter < configurations.len() { + println!("expand_configurations: counter={:?}", counter); + + let new_configurations: Vec<_> = + configurations[counter..] + .iter() + .filter_map(|configuration| { + let shift_symbol = configuration.shift_symbol(); + println!("expand_configurations: configuration: {:?} shift_symbol: {:?}", + configuration, shift_symbol); + match shift_symbol { + None => None, // requires a reduce + Some((Symbol::Terminal(_), _)) => None, // requires a shift + Some((Symbol::Nonterminal(nt), remainder)) => { + Some((nt, remainder, configuration.lookahead)) + } + } + }) + .flat_map(|(nt, remainder, lookahead)| { + let first_set = self.first_sets.first(remainder, lookahead); + println!("expand_configurations: ({:?}, {:?}, {:?}) first_set={:?}", + nt, remainder, lookahead, first_set); + first_set.into_iter() + .flat_map(move |l| self.start_configurations(nt, l)) + }) + .filter(|&configuration| set.insert(configuration)) + .collect(); + + counter = configurations.len(); + configurations.extend(new_configurations); + } + + configurations + } } +impl<'grammar> Configuration<'grammar> { + fn shift_symbol(&self) -> Option<(Symbol, &[Symbol])> { + if self.index == self.production.symbols.len() { + None + } else { + Some((self.production.symbols[self.index], + &self.production.symbols[self.index+1..])) + } + } +} + +impl<'grammar> Debug for Configuration<'grammar> { + fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> { + write!(fmt, "{} ={} (*){} [{:?}]", + self.production.nonterminal, + Prefix(" ", &self.production.symbols[..self.index]), + Prefix(" ", &self.production.symbols[self.index..]), + self.lookahead) + } +} + +impl Debug for Lookahead { + fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> { + match *self { + Lookahead::EOF => write!(fmt, "EOF"), + Lookahead::Terminal(s) => write!(fmt, "{}", s), + } + } +} diff --git a/src/lr1/test.rs b/src/lr1/test.rs new file mode 100644 index 0000000..a94c662 --- /dev/null +++ b/src/lr1/test.rs @@ -0,0 +1,40 @@ +use intern::intern; +use grammar::repr::*; +use test_util::{expect_debug, normalized_grammar}; +use super::{Configuration, Lookahead, LR1}; +use super::Lookahead::EOF; + +fn nt(t: &str) -> NonterminalString { + NonterminalString(intern(t)) +} + +fn configurations<'g>(grammar: &'g Grammar, nonterminal: &str, la: Lookahead) + -> Vec> +{ + let lr1 = LR1::new(&grammar); + let configurations = + lr1.transitive_closure( + lr1.start_configurations(nt(nonterminal), la)); + configurations +} + +#[test] +fn start_state() { + let grammar = normalized_grammar(r#" +grammar Foo { + token Tok where { }; + A = B "C"; + B: Option = { + "D" => Some(1); + => None; + }; +} +"#); + let configurations = configurations(&grammar, "A", EOF); + expect_debug(configurations, r#"[ + A = (*) B "C" [EOF], + B = (*) "D" ["C"], + B = (*) ["C"] +]"#); +} + diff --git a/src/main.rs b/src/main.rs index 4abebb9..075869d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,9 @@ +// Need this for rusty_peg #![recursion_limit="256"] +// I hate this lint. +#![allow(unused_parens)] + #[macro_use] extern crate rusty_peg; extern crate diff; @@ -13,6 +17,9 @@ mod normalize; mod parser; mod util; +#[cfg(test)] +mod test_util; + #[cfg(not(test))] fn main() { println!("Hello, world!"); diff --git a/src/normalize/lower/test.rs b/src/normalize/lower/test.rs index 3915b02..ca277bd 100644 --- a/src/normalize/lower/test.rs +++ b/src/normalize/lower/test.rs @@ -1,7 +1,7 @@ use grammar::repr::{Grammar, Production}; use normalize::normalize; -use normalize::test_util::expect_debug; use parser; +use test_util::expect_debug; fn flat_productions(grammar: &Grammar) -> Vec { let mut productions: Vec<_> = diff --git a/src/normalize/macro_expand/test.rs b/src/normalize/macro_expand/test.rs index 68caccf..4a7638e 100644 --- a/src/normalize/macro_expand/test.rs +++ b/src/normalize/macro_expand/test.rs @@ -1,5 +1,5 @@ use parser; -use normalize::test_util::compare; +use test_util::compare; use super::expand_macros; diff --git a/src/normalize/mod.rs b/src/normalize/mod.rs index 5bde4c7..b9fb8cc 100644 --- a/src/normalize/mod.rs +++ b/src/normalize/mod.rs @@ -60,8 +60,5 @@ mod lower; /////////////////////////////////////////////////////////////////////////// // Shared routines -#[cfg(test)] -mod test_util; - mod norm_util; diff --git a/src/normalize/test_util.rs b/src/test_util.rs similarity index 88% rename from src/normalize/test_util.rs rename to src/test_util.rs index e54ad78..f9b3321 100644 --- a/src/normalize/test_util.rs +++ b/src/test_util.rs @@ -1,4 +1,6 @@ use diff; +use grammar::repr as r; +use intern::intern; use regex::Regex; use std::fmt::{Debug, Formatter, Error}; @@ -44,3 +46,6 @@ pub fn compare(actual: D, expected: E) { }); } +pub fn normalized_grammar(s: &str) -> r::Grammar { + ::normalize::normalize(::parser::parse_grammar(s).unwrap()).unwrap() +} diff --git a/src/util.rs b/src/util.rs index 02b4e07..042ca64 100644 --- a/src/util.rs +++ b/src/util.rs @@ -15,3 +15,17 @@ impl<'a,S:Display> Display for Sep<&'a Vec> { Ok(()) } } + +pub struct Prefix(pub &'static str, pub S); + +impl<'a,S:Display> Display for Prefix<&'a [S]> { + fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> { + let &Prefix(prefix, vec) = self; + let mut elems = vec.iter(); + while let Some(elem) = elems.next() { + try!(write!(fmt, "{}{}", prefix, elem)); + } + Ok(()) + } +} +