diff --git a/lalrpop-snap/src/lib.rs b/lalrpop-snap/src/lib.rs index 22c5ded..ae3ec29 100644 --- a/lalrpop-snap/src/lib.rs +++ b/lalrpop-snap/src/lib.rs @@ -10,7 +10,7 @@ extern crate atty; extern crate bit_set; -#[macro_use] extern crate bitflags; +extern crate bitflags; extern crate diff; extern crate itertools; extern crate lalrpop_intern as intern; diff --git a/lalrpop/Cargo.toml b/lalrpop/Cargo.toml index 4d71cf1..34a7ad7 100644 --- a/lalrpop/Cargo.toml +++ b/lalrpop/Cargo.toml @@ -17,9 +17,9 @@ doctest = false ascii-canvas = "1.0" atty = "0.1.2" bit-set = "0.4.0" -bitflags = "0.8.0" diff = "0.1.9" docopt = "0.7" +ena = "0.5" itertools = "0.5.9" regex = "0.2.1" regex-syntax = "0.2" diff --git a/lalrpop/src/lib.rs b/lalrpop/src/lib.rs index fe6aa18..aff725e 100644 --- a/lalrpop/src/lib.rs +++ b/lalrpop/src/lib.rs @@ -12,6 +12,7 @@ extern crate ascii_canvas; extern crate atty; extern crate bit_set; extern crate diff; +extern crate ena; extern crate itertools; extern crate lalrpop_intern as intern; extern crate lalrpop_util; diff --git a/lalrpop/src/lr1/core/mod.rs b/lalrpop/src/lr1/core/mod.rs index 8ba2b19..1e3f04e 100644 --- a/lalrpop/src/lr1/core/mod.rs +++ b/lalrpop/src/lr1/core/mod.rs @@ -118,7 +118,7 @@ pub type LR0Items<'grammar> = Items<'grammar, Nil>; #[allow(dead_code)] pub type LR1Items<'grammar> = Items<'grammar, TokenSet>; -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct State<'grammar, L: Lookahead> { pub index: StateIndex, pub items: Items<'grammar, L>, @@ -130,7 +130,7 @@ pub struct State<'grammar, L: Lookahead> { pub type LR0State<'grammar> = State<'grammar, Nil>; pub type LR1State<'grammar> = State<'grammar, TokenSet>; -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum Action<'grammar> { Shift(TerminalString, StateIndex), Reduce(&'grammar Production), diff --git a/lalrpop/src/lr1/lane_table/README.md b/lalrpop/src/lr1/lane_table/README.md new file mode 100644 index 0000000..ab4593b --- /dev/null +++ b/lalrpop/src/lr1/lane_table/README.md @@ -0,0 +1,417 @@ +This module contains code for LR(1) construction based on a paper by +Pager and Chen, "The Lane Table Method Of Constructing LR(1) Parsers", +published in APPLC '12. Unfortunately, that paper is quite compact -- +only 8 pages! -- which doesn't leave much room for examples and +explanation. This README is my attempt to explain the idea, or at +least how I chose to implement it in LALRPOP, which may or may not be +faithful to the original algorithm. Naturally it also serves as a +guide to the code. + +### First example grammar: G0 + +We will be working through two example grammars. The first I call G0 +-- it is a reduced version of what the paper calls G1. It is +interesting because it does not require splitting any states, and so +we wind up with the same number of states as in LR0. Put another way, +it is an LALR(1) grammar. + +#### Grammar G0 + +``` +G0 = X "c" + | Y "d" +X = "e" X + | "e" +Y = "e" Y + | "e" +``` + +#### Step 1: Construct an LR(0) state machine + +We begin by constructing an LR(0) state machine. 
+The LR(0) states for G0 are as follows:
+
+```
+S0 = G0 = (*) X "c"
+   | G0 = (*) Y "d"
+   | X = (*) "e" X
+   | X = (*) "e"
+   | Y = (*) "e" Y
+   | Y = (*) "e"
+
+S1 = X = "e" (*) X
+   | X = "e" (*)
+   | X = (*) "e"
+   | X = (*) "e" X
+   | Y = "e" (*) Y
+   | Y = "e" (*)
+   | Y = (*) "e"
+   | Y = (*) "e" Y
+
+S2 = X = "e" X (*)
+
+S3 = G0 = X (*) "c"
+
+S4 = Y = "e" Y (*)
+
+S5 = G0 = Y (*) "d"
+
+S6 = G0 = X "c" (*)
+
+S7 = G0 = Y "d" (*)
+```
+
+We can also consider *edges* between the states as follows,
+with the label being the symbol that is pushed onto the stack:
+
+```
+S0 -"e"-> S1
+S1 -"e"-> S1
+S1 --X--> S2
+S0 --X--> S3
+S1 --Y--> S4
+S0 --Y--> S5
+S3 -"c"-> S6
+S5 -"d"-> S7
+```
+
+Note that state S1 is "inconsistent", in that it has conflicting
+actions.
+
+#### Step 2: Convert LR(0) states into LR(0-1) states.
+
+The term LR(0-1) is not standard, but basically the idea is that the
+lookahead in an LR(0-1) state can be either a set of terminals (as in
+LR(1)) or *none* (as in LR(0)). You can alternatively think of it as
+adding a special "wildcard" symbol `_` to the grammar; in our actual
+code, we represent this with `TokenSet::all()`. We will thus denote the
+inconsistent state after transformation as follows, where each line
+has the "wildcard" lookahead:
+
+```
+S1 = X = "e" (*) X [_]
+   | X = "e" (*) [_]
+   | X = (*) "e" [_]
+   | X = (*) "e" X [_]
+   | Y = "e" (*) Y [_]
+   | Y = "e" (*) [_]
+   | Y = (*) "e" [_]
+   | Y = (*) "e" Y [_]
+```
+
+Naturally, the state is still inconsistent.
+
+#### Step 3: Resolve inconsistencies.
+
+In the next step, we iterate over all of our LR(0-1) states. In this
+example, we will not need to create new states, but in future examples
+we will. The iteration thus consists of a queue and some code like
+this:
+
+```rust
+let mut queue = Queue::new();
+queue.extend(/* all states */);
+while let Some(s) = queue.pop_front() {
+    if /* s is an inconsistent state */ {
+        resolve_inconsistencies(s, &mut queue);
+    }
+}
+```
+
+##### Step 3a: Build the lane table.
+
+To resolve an inconsistent state, we first construct a **lane
+table**. This is done by the code in the `lane` module (the `table`
+module maintains the data structure). It works by starting at each
+conflict and tracing **backwards**. Let's start with the final table
+we will get for the state S1 and then we will work our way back to how
+it is constructed. First, let's identify the conflicting actions from
+S1 and give them indices:
+
+```
+S1 = X = (*) "e" [_]   // C0 -- shift "e"
+   | X = "e" (*) [_]   // C1 -- reduce `X = "e" (*)`
+   | X = (*) "e" X [_] // C0 -- shift "e"
+   | X = "e" (*) X [_]
+   | Y = (*) "e" [_]   // C0 -- shift "e"
+   | Y = "e" (*) [_]   // C2 -- reduce `Y = "e" (*)`
+   | Y = (*) "e" Y [_] // C0 -- shift "e"
+   | Y = "e" (*) Y [_]
+```
+
+Several of the items can cause "Conflicting Action 0" (C0), which is
+to shift an `"e"`. These are all mutually compatible. However, there
+are also two incompatible actions: C1 and C2, both reductions. In
+fact, we'll find that when we look back at state S0, these
+'conflicting' actions all occur with distinct lookahead. The purpose
+of the lane table is to summarize that information. The lane table we
+will end up constructing for these conflicting actions is as follows:
+
+```
+| State | C0    | C1    | C2    | Successors |
+| S0    |       | ["c"] | ["d"] | {S1}       |
+| S1    | ["e"] | []    | []    | {S1}       |
+```
+
+Here the idea is that the lane table summarizes the lookahead
+information contributed by each state.
+Note that, for the *shift* (C0), the state S1 already has enough
+lookahead information: we only shift when we see the terminal we need
+next ("e"). But for C1 and C2, the lookahead actually came from S0,
+which is a predecessor state.
+
+As I said earlier, the algorithm for constructing the table works by
+looking at the conflicting item and walking backwards. So let's
+illustrate with conflict C1. We have the conflicting item `X = "e"
+(*)`, and we are basically looking to find its lookahead. We know
+that somewhere in the distant past of our state machine there must be
+an item like
+
+    Foo = ...a (*) X ...b
+
+that led us here. We want to find that item, so we can derive the
+lookahead from `...b` (whatever symbols come after `X`).
+
+To do this, we will walk the graph. Our state at any point in time
+will be the pair of a state and an item in that state. To start out,
+then, we have `(S1, X = "e" (*))`, which is the conflict C1. Because
+the `(*)` is not at the "front" of this item, we have to figure out
+where this `"e"` came from on our stack, so we look for predecessors
+of the state S1 which have an item like `X = (*) e`. This leads us to
+S0 and also S1. So we can push two states in our search:
+`(S0, X = (*) "e")` and `(S1, X = (*) "e")`. Let's consider each in
+turn.
+
+The next state is then `(S0, X = (*) "e")`. Here the `(*)` lies at the
+front of the item, so we search **the same state** S0 for items that
+would have led to this state via an *epsilon move*. This basically
+means an item like `Foo = ... (*) X ...` -- i.e., where the `(*)`
+appears directly before the nonterminal `X`. In our case, we will find
+`G0 = (*) X "c"`. This is great, because it tells us some lookahead
+("c", in particular), and hence we can stop our search. We add to the
+table the entry that the state S0 contributes lookahead "c" to the
+conflict C1. In some cases, we might find something like `Foo =
+... (*) X` instead, where the `X` we are looking for appears at the
+end. In that case, we have to restart our search, but looking for the
+lookahead for `Foo`.
+
+The next state in our case is `(S1, X = (*) e)`. Again the `(*)` lies
+at the beginning and hence we search for things in the state S1 where
+`X` is the next symbol. We find `X = "e" (*) X`. This is not as good
+as last time, because there are no symbols appearing after X in this
+item, so it does not contribute any lookahead. We therefore can't stop
+our search yet, but we push the state `(S1, X = "e" (*) X)` -- this
+corresponds to the `Foo` state I mentioned at the end of the last
+paragraph, except that in this case `Foo` is the same nonterminal `X`
+we started with.
+
+Looking at `(S1, X = "e" (*) X)`, we again have the `(*)` in the
+middle of the item, so we move it left, searching for predecessors
+with the item `X = (*) e X`. We will (again) find that S0 and S1 have
+such items. In the case of S0, we will (again) find the context "c",
+which we dutifully add to the table (this has no effect, since it is
+already present). In the case of S1, we will (again) wind up at the
+state `(S1, X = "e" (*) X)`. Since we've already visited this state,
+we stop our search; it will not lead to new context.
+
+At this point, our table column for C1 is complete. We can repeat the
+process for C2, which plays out in an analogous way.
+
+##### Step 3b: Update the lookahead
+
+Looking at the lane table we built, we can union the context sets in
+any particular column. We see that the context sets for each
+conflicting action are pairwise disjoint.
+Therefore, we can simply update each reduce action in our state with
+those lookaheads in mind, and hence render it consistent:
+
+```
+S1 = X = (*) "e" [_]
+   | X = "e" (*) ["c"] // lookahead from C1
+   | X = (*) "e" X [_]
+   | X = "e" (*) X [_]
+   | Y = (*) "e" [_]
+   | Y = "e" (*) ["d"] // lookahead from C2
+   | Y = (*) "e" Y [_]
+   | Y = "e" (*) Y [_]
+```
+
+This is of course also what the LALR(1) state would look like (though
+it would also include context for the other items; that context
+doesn't play into the final machine execution).
+
+At this point we've covered enough to handle the grammar G0. Let's
+turn to a more complex grammar, grammar G1, and then we'll come back
+to cover the remaining steps.
+
+### Second example: the grammar G1
+
+G1 is a (typo corrected) version of the grammar from the paper. This
+grammar is not LALR(1) and hence it is more interesting, because it
+requires splitting states.
+
+#### Grammar G1
+
+```
+G1 = "a" X "d"
+   | "a" Y "c"
+   | "b" X "c"
+   | "b" Y "d"
+X = "e" X
+  | "e"
+Y = "e" Y
+  | "e"
+```
+
+The key point of this grammar is that when we see `... "e" "c"` and we
+wish to know whether to reduce to `X` or `Y`, we don't have enough
+information. We need to know what is in the `...`, because `"a" "e"
+"c"` means we reduce `"e"` to `Y` and `"b" "e" "c"` means we reduce to
+`X`. In terms of our *state machine*, this corresponds to *splitting*
+the states responsible for X and Y based on earlier context.
+
+Let's look at a subset of the LR(0) states for G1:
+
+```
+S0 = G1 = (*) "a" X "d"
+   | G1 = (*) "a" Y "c"
+   | G1 = (*) "b" X "c"
+   | G1 = (*) "b" Y "d"
+
+S1 = G1 = "a" (*) X "d"
+   | G1 = "a" (*) Y "c"
+   | X = (*) "e" X
+   | X = (*) "e"
+   | Y = (*) "e" Y
+   | Y = (*) "e"
+
+S2 = G1 = "b" (*) X "c"
+   | G1 = "b" (*) Y "d"
+   | X = (*) "e" X
+   | X = (*) "e"
+   | Y = (*) "e" Y
+   | Y = (*) "e"
+
+S3 = X = "e" (*) X
+   | X = "e" (*)    // C1 -- can reduce
+   | X = (*) "e"    // C0 -- can shift "e"
+   | X = (*) "e" X  // C0 -- can shift "e"
+   | Y = "e" (*) Y
+   | Y = "e" (*)    // C2 -- can reduce
+   | Y = (*) "e"    // C0 -- can shift "e"
+   | Y = (*) "e" Y  // C0 -- can shift "e"
+```
+
+Here we can see the problem. The state S3 is inconsistent. But it is
+reachable from both S1 and S2. If we come from S1, then we can have
+(e.g.) `X "d"`, but if we come from S2, we expect `X "c"`.
+
+Let's walk through our algorithm again. I'll start with step 3a.
+
+#### Step 3a: Build the lane table.
+
+The lane table for state S3 will look like this:
+
+```
+| State | C0    | C1    | C2    | Successors |
+| S1    |       | ["d"] | ["c"] | {S3}       |
+| S2    |       | ["c"] | ["d"] | {S3}       |
+| S3    | ["e"] | []    | []    | {S3}       |
+```
+
+Now if we union each column, we see that both C1 and C2 wind up with
+lookahead `{"c", "d"}`. This is our problem. We have to isolate things
+better. Therefore, step 3b ("update lookahead") does not apply. Instead
+we attempt step 3c.
+
+#### Step 3c: Isolate lanes
+
+This part of the algorithm is only loosely described in the paper, but
+I think it works as follows. We will employ a union-find data
+structure. With each set, we will record a "context set", which
+records for each conflict the set of lookahead tokens (e.g.,
+`{C1:{"d"}}`).
+
+A context set tells us how to map the lookahead to an action;
+therefore, to be self-consistent, the lookaheads for each conflict
+must be mutually disjoint. In other words, `{C1:{"d"}, C2:{"c"}}` is
+valid, and says to do C1 if we see a "d" and C2 if we see a "c". But
+`{C1:{"d"}, C2:{"d"}}` is not, because there are two actions.
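+
+To make that concrete before continuing, here is a small,
+self-contained sketch of the idea (this is *not* the actual LALRPOP
+implementation, which uses the `ContextSet` and `TokenSet` types from
+`lr1::lane_table::table::context_set` and `lr1::lookahead`; here the
+lookaheads are just plain string sets):
+
+```rust
+use std::collections::HashSet;
+
+// One lookahead set per conflict; index i corresponds to conflict Ci.
+#[derive(Clone, Debug)]
+struct ContextSet {
+    values: Vec<HashSet<&'static str>>,
+}
+
+impl ContextSet {
+    fn new(num_conflicts: usize) -> Self {
+        ContextSet { values: vec![HashSet::new(); num_conflicts] }
+    }
+
+    // Add `tokens` to conflict `c`; fail if any *other* conflict
+    // already claims one of those tokens.
+    fn insert(&mut self, c: usize, tokens: &HashSet<&'static str>) -> Result<(), ()> {
+        for (i, set) in self.values.iter().enumerate() {
+            if i != c && !set.is_disjoint(tokens) {
+                return Err(()); // overlapping lookahead
+            }
+        }
+        self.values[c].extend(tokens);
+        Ok(())
+    }
+
+    // Union two context sets, failing if some token would end up
+    // mapped to more than one conflict.
+    fn union(a: &ContextSet, b: &ContextSet) -> Result<ContextSet, ()> {
+        let mut result = a.clone();
+        for (c, tokens) in b.values.iter().enumerate() {
+            result.insert(c, tokens)?;
+        }
+        Ok(result)
+    }
+}
+
+fn main() {
+    let c: HashSet<&'static str> = ["c"].iter().cloned().collect();
+    let d: HashSet<&'static str> = ["d"].iter().cloned().collect();
+
+    // {C1: {"d"}, C2: {"c"}} -- self-consistent.
+    let mut s1 = ContextSet::new(3);
+    s1.insert(1, &d).unwrap();
+    s1.insert(2, &c).unwrap();
+
+    // {C1: {"c"}, C2: {"d"}} -- also self-consistent on its own...
+    let mut s2 = ContextSet::new(3);
+    s2.insert(1, &c).unwrap();
+    s2.insert(2, &d).unwrap();
+
+    // ...but merging them would map "c" (and "d") to two different
+    // conflicts, so the union fails.
+    assert!(ContextSet::union(&s1, &s2).is_err());
+}
+```
+
+The real `ContextSet::union` follows the same pattern, except that it
+reports failure as `Err(OverlappingLookahead)` and its `insert` also
+reports whether the set actually changed.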
+ +Initially, each state in the lane table is mapped to itself, and the +conflict set is derived from its column in the lane table: + +``` +S1 = {C1:d, C2:c} +S2 = {C1:c, C2:d} +S3 = {C0:e} +``` + +We designate "beachhead" states as those states in the table that are +not reachable from another state in the table (i.e., using the +successors). In this case, those are the states S1 and S2. We will be +doing a DFS through the table and we want to use those as the starting +points. + +(Question: is there always at least one beachhead state? Seems like +there must be.) + +So we begin by iterating over the beachhead states. + +```rust +for beachhead in beachheads { ... } +``` + +When we visit a state X, we will examine each of its successors Y. We +consider whether the context set for Y can be merged with the context +set for X. So, in our case, X will be S1 to start and Y will be S3. +In this case, the context set can be merged, and hence we union S1, S3 +and wind up with the following union-find state: + +``` +S1,S3 = {C0:e, C1:d, C2:c} +S2 = {C1:c, C2:d} +``` + +(Note that this union is just for the purpose of tracking context; it +doesn't imply that S1 and S3 are the 'same states' or anything like +that.) + +Next we examine the edge S3 -> S3. Here the contexts are already +merged and everything is happy, so we stop. (We already visited S3, +after all.) + +This finishes our first beachhead, so we proceed to the next edge, S2 +-> S3. Here we find that we **cannot** union the context: it would +produce an inconsistent state. So what we do is we **clone** S3 to +make a new state, S3', with the initial setup corresponding to the row +for S3 from the lane table: + +``` +S1,S3 = {C0:e, C1:d, C2:c} +S2 = {C1:c, C2:d} +S3' = {C0:e} +``` + +This also involves updating our LR(0-1) state set to have a new state +S3'. All edges from S2 that led to S3 now lead to S3'; the outgoing +edges from S3' remain unchanged. (At least to start.) + +Therefore, the edge `S2 -> S3` is now `S2 -> S3'`. We can now merge +the conflicts: + +``` +S1,S3 = {C0:e, C1:d, C2:c} +S2,S3' = {C0:e, C1:c, C2:d} +``` + +Now we examine the outgoing edge S3' -> S3. We cannot merge these +conflicts, so we search (greedily, I guess) for a clone of S3 where we +can merge the conflicts. We find one in S3', and hence we redirect the +S3 edge to S3' and we are done. (I think the actual search we want is +to make first look for a clone of S3 that is using literally the same +context as us (i.e., same root node), as in this case. If that is not +found, *then* we search for one with a mergable context. If *that* +fails, then we clone a new state.) + +The final state thus has two copies of S3, one for the path from S1, +and one for the path from S2, which gives us enough context to +proceed. diff --git a/lalrpop/src/lr1/lane_table/construct/merge.rs b/lalrpop/src/lr1/lane_table/construct/merge.rs new file mode 100644 index 0000000..5f77caa --- /dev/null +++ b/lalrpop/src/lr1/lane_table/construct/merge.rs @@ -0,0 +1,203 @@ +use super::*; + +use collections::Multimap; +use lr1::lane_table::table::context_set::ContextSet; + +/// The "merge" phase of the algorithm is described in "Step 3c" of +/// [the README][r]. 
It consists of walking through the various +/// states in the lane table and merging them into sets of states that +/// have compatible context sets; if we encounter a state S that has a +/// successor T but where the context set of S is not compatible with +/// T, then we will clone T into a new T2 (and hopefully the context +/// set of S will be compatible with the reduced context of T2). +/// +/// [r]: ../README.md +pub struct Merge<'m, 'grammar: 'm> { + table: &'m LaneTable<'grammar>, + states: &'m mut Vec>, + visited: Set, + original_indices: Map, + clones: Multimap>, + target_states: Vec, + context_sets: ContextSets<'m>, +} + +impl<'m, 'grammar> Merge<'m, 'grammar> { + pub fn new(table: &'m LaneTable<'grammar>, + unify: &'m mut UnificationTable, + states: &'m mut Vec>, + state_sets: &'m mut Map, + inconsistent_state: StateIndex) + -> Self { + Merge { + table: table, + states: states, + visited: Set::new(), + original_indices: Map::new(), + clones: Multimap::new(), + target_states: vec![inconsistent_state], + context_sets: ContextSets { + unify: unify, + state_sets: state_sets, + } + } + } + + pub fn start(&mut self, beachhead_state: StateIndex) -> Result<(), (StateIndex, StateIndex)> { + debug!("Merge::start(beachhead_state={:?})", beachhead_state); + + // Since we always start walks from beachhead states, and they + // are not reachable from anyone else, this state should not + // have been unioned with anything else yet. + self.walk(beachhead_state) + } + + pub fn patch_target_starts(mut self, actions: &Set>) { + debug!("Merge::patch_target_starts(actions={:?})", actions); + + for &target_state in &self.target_states { + debug!("Merge::patch_target_starts: target_state={:?}", target_state); + let context_set = self.context_sets.context_set(target_state); + debug!("Merge::patch_target_starts: context_set={:?}", context_set); + context_set.apply(&mut self.states[target_state.0], actions); + } + } + + /// If `state` is a cloned state, find its original index. Useful + /// for indexing into the lane table and so forth. 
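+    ///
+    /// (Hypothetical illustration: if S9 was produced by cloning S3,
+    /// then `original_index(S9)` returns S3; a state that was never
+    /// cloned simply maps to itself.)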
+ fn original_index(&self, state: StateIndex) -> StateIndex { + *self.original_indices.get(&state).unwrap_or(&state) + } + + fn successors(&self, state: StateIndex) -> Option<&'m Set> { + self.table.successors(self.original_index(state)) + } + + fn walk(&mut self, state: StateIndex) -> Result<(), (StateIndex, StateIndex)> { + debug!("Merge::walk(state={:?})", state); + + if !self.visited.insert(state) { + debug!("Merge::walk: visited already"); + return Ok(()); + } + + for &successor in self.successors(state).iter().flat_map(|&s| s) { + debug!("Merge::walk: state={:?} successor={:?}", + state, successor); + + if self.context_sets.union(state, successor) { + debug!("Merge::walk: successful union, context-set = {:?}", + self.context_sets.context_set(state)); + self.walk(successor)?; + } else { + // search for an existing clone with which we can merge + debug!("Merge::walk: union failed, seek existing clone"); + let existing_clone = { + let context_sets = &mut self.context_sets; + self.clones.get(&successor) + .into_iter() + .flat_map(|clones| clones) // get() returns an Option + .cloned() + .filter(|&successor1| context_sets.union(state, successor1)) + .next() + }; + + if let Some(successor1) = existing_clone { + debug!("Merge::walk: found existing clone {:?}", successor1); + self.patch_links(state, successor, successor1); + self.walk(successor1)?; + } else { + // if we don't find one, we have to make a new clone + debug!("Merge::walk: creating new clone of {:?}", successor); + let successor1 = self.clone(successor); + if self.context_sets.union(state, successor1) { + self.patch_links(state, successor, successor1); + self.walk(successor1)?; + } else { + debug!("Merge::walk: failed to union {:?} with {:?}", + state, successor1); + debug!("Merge::walk: state context = {:?}", + self.context_sets.context_set(state)); + debug!("Merge::walk: successor context = {:?}", + self.context_sets.context_set(successor1)); + + return Err((self.original_index(state), + self.original_index(successor1))); + } + } + } + } + + Ok(()) + } + + fn clone(&mut self, state: StateIndex) -> StateIndex { + // create a new state with same contents as the old one + let new_index = StateIndex(self.states.len()); + let new_state = self.states[state.0].clone(); + self.states.push(new_state); + + // track the original index and clones + let original_index = self.original_index(state); + self.original_indices.insert(new_index, original_index); + self.clones.push(original_index, new_index); + + // create a new unify key for this new state + let context_set = self.table.context_set(original_index).unwrap(); + self.context_sets.new_state(new_index, context_set); + + // keep track of the clones of the target state + if original_index == self.target_states[0] { + self.target_states.push(new_index); + } + + debug!("Merge::clone: cloned {:?} to {:?}", state, new_index); + new_index + } + + fn patch_links(&mut self, + predecessor: StateIndex, + original_successor: StateIndex, + cloned_successor: StateIndex) + { + let replace = |target_state: &mut StateIndex| { + if *target_state == original_successor { + *target_state = cloned_successor; + } + }; + + let state = &mut self.states[predecessor.0]; + for (_, target_state) in &mut state.shifts { + replace(target_state); + } + for (_, target_state) in &mut state.gotos { + replace(target_state); + } + } +} + +struct ContextSets<'m> { + state_sets: &'m mut Map, + unify: &'m mut UnificationTable, +} + +impl<'m> ContextSets<'m> { + fn context_set(&mut self, state: StateIndex) -> ContextSet { + 
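+        // Look up the union-find key for this state and return the
+        // context set its equivalence class currently carries.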
let state_set = self.state_sets[&state]; + self.unify.probe_value(state_set) + } + + fn union(&mut self, source: StateIndex, target: StateIndex) -> bool { + let set1 = self.state_sets[&source]; + let set2 = self.state_sets[&target]; + let result = self.unify.unify_var_var(set1, set2).is_ok(); + debug!("ContextSets::union: source={:?} target={:?} result={:?}", + source, target, result); + result + } + + fn new_state(&mut self, new_index: StateIndex, context_set: ContextSet) { + let state_set = self.unify.new_key(context_set); + self.state_sets.insert(new_index, state_set); + } +} diff --git a/lalrpop/src/lr1/lane_table/construct/mod.rs b/lalrpop/src/lr1/lane_table/construct/mod.rs new file mode 100644 index 0000000..e48784f --- /dev/null +++ b/lalrpop/src/lr1/lane_table/construct/mod.rs @@ -0,0 +1,190 @@ +//! + +use collections::{Map, Set}; +use ena::unify::UnificationTable; +use grammar::repr::*; +use lr1::build; +use lr1::core::*; +use lr1::first::FirstSets; +use lr1::lookahead::{Lookahead, TokenSet}; +use lr1::lane_table::lane::LaneTracer; +use lr1::lane_table::table::{ConflictIndex, LaneTable}; +use lr1::lane_table::table::context_set::OverlappingLookahead; +use lr1::state_graph::StateGraph; +use std::rc::Rc; + +mod merge; +use self::merge::Merge; + +mod state_set; +use self::state_set::StateSet; + +pub struct LaneTableConstruct<'grammar> { + grammar: &'grammar Grammar, + first_sets: FirstSets, + start: NonterminalString, +} + +impl<'grammar> LaneTableConstruct<'grammar> { + pub fn new(grammar: &'grammar Grammar, start: NonterminalString) -> Self { + let first_sets = FirstSets::new(grammar); + Self { + grammar: grammar, + start: start, + first_sets: first_sets, + } + } + + pub fn construct(self) -> Result>, LR1TableConstructionError<'grammar>> { + let TableConstructionError { states, conflicts: _ } = { + match build::build_lr0_states(self.grammar, self.start) { + // This is the easy (and very rare...) case. + Ok(lr0) => return Ok(self.promote_lr0_states(lr0)), + Err(err) => err, + } + }; + + // Convert the LR(0) states into LR(0-1) states. + let mut states = self.promote_lr0_states(states); + + // For each inconsistent state, apply the lane-table algorithm to + // resolve it. + for i in 0.. { + if i >= states.len() { + break; + } + + match self.resolve_inconsistencies(&mut states, StateIndex(i)) { + Ok(()) => { } + Err(_) => { + // We failed because of irreconcilable conflicts + // somewhere. Just compute the conflicts from the final set of + // states. + let conflicts: Vec> = + states.iter() + .flat_map(|s| Lookahead::conflicts(&s)) + .collect(); + return Err(TableConstructionError { states: states, + conflicts: conflicts }); + } + } + } + + Ok(states) + } + + /// Given a set of LR0 states, returns LR1 states where the lookahead + /// is always `TokenSet::all()`. We refer to these states as LR(0-1) + /// states in the README. 
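+    ///
+    /// For instance (schematically, in the README's notation), the
+    /// LR(0) item `X = "e" (*)` becomes the LR(0-1) item
+    /// `X = "e" (*) [_]`, where `[_]` denotes the wildcard lookahead
+    /// `TokenSet::all()`.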
+ fn promote_lr0_states(&self, lr0: Vec>) -> Vec> { + let all = TokenSet::all(); + lr0.into_iter() + .map(|s| { + let items = s.items + .vec + .iter() + .map(|item| { + Item { + production: item.production, + index: item.index, + lookahead: all.clone(), + } + }) + .collect(); + let reductions = s.reductions + .into_iter() + .map(|(_, p)| (all.clone(), p)) + .collect(); + State { + index: s.index, + items: Items { vec: Rc::new(items) }, + shifts: s.shifts, + reductions: reductions, + gotos: s.gotos, + } + }) + .collect() + } + + fn resolve_inconsistencies(&self, + states: &mut Vec>, + inconsistent_state: StateIndex) + -> Result<(), StateIndex> { + let actions = super::conflicting_actions(&states[inconsistent_state.0]); + if actions.is_empty() { + return Ok(()); + } + + let table = self.build_lane_table(states, inconsistent_state, &actions); + + // Consider first the "LALR" case, where the lookaheads for each + // action are completely disjoint. + if self.attempt_lalr(&mut states[inconsistent_state.0], &table, &actions) { + return Ok(()); + } + + // Construct the initial states; each state will map to a + // context-set derived from its row in the lane-table. This is + // fallible, because a state may be internally inconstent. + // + // (To handle unification, we also map each state to a + // `StateSet` that is its entry in the `ena` table.) + let rows = table.rows()?; + let mut unify = UnificationTable::::new(); + let mut state_sets = Map::new(); + for (&state_index, context_set) in &rows { + let state_set = unify.new_key(context_set.clone()); + state_sets.insert(state_index, state_set); + debug!("resolve_inconsistencies: state_index={:?}, state_set={:?}", + state_index, state_set); + } + + // Now merge state-sets, cloning states where needed. + let mut merge = Merge::new(&table, &mut unify, states, &mut state_sets, inconsistent_state); + let beachhead_states = table.beachhead_states(); + for beachhead_state in beachhead_states { + match merge.start(beachhead_state) { + Ok(()) => { } + Err((source, _)) => return Err(source), + } + } + merge.patch_target_starts(&actions); + + Ok(()) + } + + fn attempt_lalr(&self, + state: &mut LR1State<'grammar>, + table: &LaneTable<'grammar>, + actions: &Set>) + -> bool { + match table.columns() { + Ok(columns) => { + debug!("attempt_lalr, columns={:#?}", columns); + columns.apply(state, actions); + true + } + Err(OverlappingLookahead) => { + debug!("attempt_lalr, OverlappingLookahead"); + false + } + } + } + + fn build_lane_table(&self, + states: &[LR1State<'grammar>], + inconsistent_state: StateIndex, + actions: &Set>) + -> LaneTable<'grammar> { + let state_graph = StateGraph::new(states); + let mut tracer = LaneTracer::new(self.grammar, + states, + &self.first_sets, + &state_graph, + actions.len()); + for (i, &action) in actions.iter().enumerate() { + tracer.start_trace(inconsistent_state, ConflictIndex::new(i), action); + } + tracer.into_table() + } +} diff --git a/lalrpop/src/lr1/lane_table/construct/state_set.rs b/lalrpop/src/lr1/lane_table/construct/state_set.rs new file mode 100644 index 0000000..04bacb5 --- /dev/null +++ b/lalrpop/src/lr1/lane_table/construct/state_set.rs @@ -0,0 +1,44 @@ +use ena::unify::{UnifyKey, UnifyValue}; +use lr1::lane_table::table::context_set::{ContextSet, OverlappingLookahead}; + +/// The unification key for a set of states in the lane table +/// algorithm. Each set of states is associated with a +/// `ContextSet`. 
When two sets of states are merged, their conflict +/// sets are merged as well; this will fail if that would produce an +/// overlapping conflict set. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct StateSet { + index: u32 +} + +impl UnifyKey for StateSet { + type Value = ContextSet; + + fn index(&self) -> u32 { + self.index + } + + fn from_index(u: u32) -> Self { + StateSet { index: u } + } + + fn tag() -> &'static str { + "StateSet" + } +} + +// FIXME: The `ena` interface is really designed around `UnifyValue` +// being cheaply cloneable; we should either refactor `ena` a bit or +// find some other way to associate a `ContextSet` with a state set +// (for example, we could have each state set be associated with an +// index that maps to a `ContextSet`), and do the merging ourselves. +// But this is easier for now, and cloning a `ContextSet` isn't THAT +// expensive, right? :) +impl UnifyValue for ContextSet { + fn unify_values(value1: &Self, value2: &Self) -> Result { + match ContextSet::union(value1, value2) { + Ok(v) => Ok(v), + Err(OverlappingLookahead) => Err((value1.clone(), value2.clone())), + } + } +} diff --git a/lalrpop/src/lr1/lane_table/lane/mod.rs b/lalrpop/src/lr1/lane_table/lane/mod.rs index 8de86f7..e62ba7d 100644 --- a/lalrpop/src/lr1/lane_table/lane/mod.rs +++ b/lalrpop/src/lr1/lane_table/lane/mod.rs @@ -10,22 +10,24 @@ use lr1::state_graph::StateGraph; use super::table::{ConflictIndex, LaneTable}; -pub struct LaneTracer<'trace, 'grammar: 'trace> { - states: &'trace [LR0State<'grammar>], - first_sets: FirstSets, - state_graph: StateGraph, +pub struct LaneTracer<'trace, 'grammar: 'trace, L: Lookahead + 'trace> { + states: &'trace [State<'grammar, L>], + first_sets: &'trace FirstSets, + state_graph: &'trace StateGraph, table: LaneTable<'grammar>, } -impl<'trace, 'grammar> LaneTracer<'trace, 'grammar> { +impl<'trace, 'grammar, L: Lookahead> LaneTracer<'trace, 'grammar, L> { pub fn new(grammar: &'grammar Grammar, - states: &'trace [LR0State<'grammar>], + states: &'trace [State<'grammar, L>], + first_sets: &'trace FirstSets, + state_graph: &'trace StateGraph, conflicts: usize) -> Self { LaneTracer { states: states, - first_sets: FirstSets::new(grammar), - state_graph: StateGraph::new(states), + first_sets: first_sets, + state_graph: state_graph, table: LaneTable::new(grammar, conflicts), } } @@ -37,25 +39,21 @@ impl<'trace, 'grammar> LaneTracer<'trace, 'grammar> { pub fn start_trace(&mut self, state: StateIndex, conflict: ConflictIndex, - item: LR0Item<'grammar>) { + action: Action<'grammar>) { let mut visited_set = Set::default(); // if the conflict item is a "shift" item, then the context // is always the terminal to shift (and conflicts only arise // around shifting terminal, so it must be a terminal) - match item.shift_symbol() { - Some((Symbol::Terminal(term), _)) => { + match action { + Action::Shift(term, _) => { let mut token_set = TokenSet::new(); token_set.insert(Token::Terminal(term)); self.table.add_lookahead(state, conflict, &token_set); } - Some((Symbol::Nonterminal(_), _)) => { - panic!("invalid conflict item `{:?}`: shifts nonterminal", - item); - } - - None => { + Action::Reduce(prod) => { + let item = Item::lr0(prod, prod.symbols.len()); self.continue_trace(state, conflict, item, &mut visited_set); } } @@ -114,14 +112,14 @@ impl<'trace, 'grammar> LaneTracer<'trace, 'grammar> { let state_items = &self.states[state.0].items.vec; let nonterminal = item.production.nonterminal; - for &pred_item in state_items.iter() - .filter(|i| 
i.can_shift_nonterminal(nonterminal)) { + for pred_item in state_items.iter() + .filter(|i| i.can_shift_nonterminal(nonterminal)) { let symbol_sets = pred_item.symbol_sets(); let mut first = self.first_sets.first0(symbol_sets.suffix); let derives_epsilon = first.take_eof(); self.table.add_lookahead(state, conflict, &first); if derives_epsilon { - self.continue_trace(state, conflict, pred_item, visited); + self.continue_trace(state, conflict, pred_item.to_lr0(), visited); } } } diff --git a/lalrpop/src/lr1/lane_table/mod.rs b/lalrpop/src/lr1/lane_table/mod.rs index 98d7afe..904a57a 100644 --- a/lalrpop/src/lr1/lane_table/mod.rs +++ b/lalrpop/src/lr1/lane_table/mod.rs @@ -1,72 +1,26 @@ use collections::Set; -use lr1::build; use lr1::core::*; -use lr1::lookahead::{Lookahead, Nil}; +use lr1::lookahead::Lookahead; use grammar::repr::*; +mod construct; mod lane; mod table; #[cfg(test)] mod test; -use self::lane::*; -use self::table::*; - pub fn build_lane_table_states<'grammar>(grammar: &'grammar Grammar, start: NonterminalString) -> LR1Result<'grammar> { - let (lr0_states, lr0_conflicts) = match build::build_lr0_states(grammar, start) { - Ok(s) => (s, vec![]), - Err(e) => (e.states, e.conflicts), - }; - - // this is mostly just dummy code to ensure that things get used - // and avoid dead-code warnings - for conflict in lr0_conflicts { - let inconsistent_state = &lr0_states[conflict.state.0]; - let conflicting_items = conflicting_items(inconsistent_state); - println!("conflicting_items={:#?}", conflicting_items); - let mut tracer = LaneTracer::new(&grammar, &lr0_states, conflicting_items.len()); - for (i, &conflicting_item) in conflicting_items.iter().enumerate() { - tracer.start_trace(inconsistent_state.index, - ConflictIndex::new(i), - conflicting_item); - } - let _ = tracer.into_table(); - } - - unimplemented!() + construct::LaneTableConstruct::new(grammar, start).construct() } -fn conflicting_items<'grammar>(state: &LR0State<'grammar>) -> Set> { - let conflicts = Nil::conflicts(state); - - let reductions1 = conflicts.iter() - .map(|c| Item::lr0(c.production, c.production.symbols.len())); - - let reductions2 = conflicts.iter() - .filter_map(|c| { - match c.action { - Action::Reduce(p) => Some(Item::lr0(p, p.symbols.len())), - Action::Shift(..) => None, - } - }); - - let shifts = conflicts.iter() - .filter_map(|c| { - match c.action { - Action::Shift(term, _) => Some(term), - Action::Reduce(..) => None, - } - }) - .flat_map(|term| { - state.items - .vec - .iter() - .filter(move |item| item.can_shift_terminal(term)) - .cloned() - }); - - reductions1.chain(reductions2).chain(shifts).collect() +fn conflicting_actions<'grammar, L: Lookahead>(state: &State<'grammar, L>) + -> Set> +{ + let conflicts = L::conflicts(state); + let reductions = conflicts.iter().map(|c| Action::Reduce(c.production)); + let actions = conflicts.iter().map(|c| c.action); + reductions.chain(actions).collect() } diff --git a/lalrpop/src/lr1/lane_table/table/context_set/mod.rs b/lalrpop/src/lr1/lane_table/table/context_set/mod.rs new file mode 100644 index 0000000..83988b0 --- /dev/null +++ b/lalrpop/src/lr1/lane_table/table/context_set/mod.rs @@ -0,0 +1,98 @@ +//! A key part of the lane-table algorithm is the idea of a CONTEXT +//! SET (my name, the paper has no name for this). Basically it +//! represents the LR1 context under which a given conflicting action +//! would take place. +//! +//! So, for example, imagine this grammar: +//! +//! ```notrust +//! A = B x +//! | C y +//! B = z +//! C = z +//! ``` +//! +//! 
+//! This gives rise to states like:
+//!
+//! - `S0 = { * B x, * C y, B = * z, C = * z }`
+//! - `S1 = { B = z *, C = z * }`
+//!
+//! This second state has two conflicting items. Let's call them
+//! conflicts 0 and 1 respectively. The context set would then have
+//! two entries (one for each conflict) and it would map each of them
+//! to a TokenSet supplying context. So when we trace everything
+//! out we might get a ContextSet of:
+//!
+//! - `[ 0: x, 1: y ]`
+//!
+//! In general, you want to ensure that the token sets of all
+//! conflicting items are pairwise-disjoint, or else if you get to a
+//! state that has both of those items (which, by definition, does
+//! arise) you won't know which to take. In this case, we're all set,
+//! because item 0 occurs only with lookahead `x` and item 1 with
+//! lookahead `y`.
+
+use collections::{Set, Map};
+use lr1::core::*;
+use lr1::lookahead::*;
+mod test;
+
+use super::ConflictIndex;
+
+#[derive(Clone, Debug)]
+pub struct ContextSet {
+    values: Vec<TokenSet>
+}
+
+#[derive(Debug)]
+pub struct OverlappingLookahead;
+
+impl ContextSet {
+    pub fn new(num_conflicts: usize) -> Self {
+        ContextSet {
+            values: (0..num_conflicts).map(|_| TokenSet::new()).collect()
+        }
+    }
+
+    pub fn union(set1: &ContextSet, set2: &ContextSet) -> Result<ContextSet, OverlappingLookahead> {
+        let mut result = set1.clone();
+        for (i, t) in set2.values.iter().enumerate() {
+            result.insert(ConflictIndex::new(i), t)?;
+        }
+        Ok(result)
+    }
+
+    /// Attempts to merge the values `conflict: set` into this
+    /// context set. If this would result in an invalid context set
+    /// (where two conflicts have overlapping lookahead), then returns
+    /// `Err(OverlappingLookahead)` and has no effect.
+    ///
+    /// Assuming no errors, returns `Ok(true)` if this resulted in any
+    /// modifications, and `Ok(false)` otherwise.
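+    ///
+    /// (Hypothetical example: starting from `[C0: {"e"}, C1: {}]`,
+    /// inserting `{"d"}` for C1 changes the set and returns `Ok(true)`;
+    /// a subsequent attempt to insert `{"d"}` for C0 would return
+    /// `Err(OverlappingLookahead)`, since "d" is already claimed by C1.)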
+ pub fn insert(&mut self, conflict: ConflictIndex, set: &TokenSet) -> Result { + for (value, index) in self.values.iter().zip((0..).map(ConflictIndex::new)) { + if index != conflict { + if value.is_intersecting(&set) { + return Err(OverlappingLookahead); + } + } + } + + Ok(self.values[conflict.index].union_with(&set)) + } + + pub fn apply<'grammar>(&self, + state: &mut LR1State<'grammar>, + actions: &Set>) { + // create a map from each action to its lookahead + let lookaheads: Map, &TokenSet> = actions.iter() + .cloned() + .zip(&self.values) + .collect(); + + for &mut (ref mut lookahead, production) in &mut state.reductions { + let action = Action::Reduce(production); + *lookahead = lookaheads[&action].clone(); + } + } +} diff --git a/lalrpop/src/lr1/lane_table/table/context_set/test.rs b/lalrpop/src/lr1/lane_table/table/context_set/test.rs new file mode 100644 index 0000000..67005ab --- /dev/null +++ b/lalrpop/src/lr1/lane_table/table/context_set/test.rs @@ -0,0 +1 @@ +#![cfg(test)] diff --git a/lalrpop/src/lr1/lane_table/table/mod.rs b/lalrpop/src/lr1/lane_table/table/mod.rs index e2ba2e9..f4758f4 100644 --- a/lalrpop/src/lr1/lane_table/table/mod.rs +++ b/lalrpop/src/lr1/lane_table/table/mod.rs @@ -21,6 +21,9 @@ use std::default::Default; use std::fmt::{Debug, Error, Formatter}; use std::iter; +pub mod context_set; +use self::context_set::{ContextSet, OverlappingLookahead}; + #[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq)] pub struct ConflictIndex { index: usize, @@ -62,6 +65,79 @@ impl<'grammar> LaneTable<'grammar> { pub fn add_successor(&mut self, state: StateIndex, succ: StateIndex) { self.successors.push(state, succ); } + + /// Unions together the lookaheads for each column and returns a + /// context set containing all of them. For an LALR(1) grammar, + /// these token sets will be mutually disjoint, as discussed in + /// the [README]; otherwise `Err` will be returned. + /// + /// [README]: ../README.md + pub fn columns(&self) -> Result { + let mut columns = ContextSet::new(self.conflicts); + for (&(_, conflict_index), set) in &self.lookaheads { + columns.insert(conflict_index, set)?; + } + Ok(columns) + } + + pub fn successors(&self, state: StateIndex) -> Option<&Set> { + self.successors.get(&state) + } + + /// Returns the state of states in the table that are **not** + /// reachable from another state in the table. These are called + /// "beachhead states". + pub fn beachhead_states(&self) -> Set { + // set of all states that are reachable from another state + let reachable: Set = + self.successors.iter() + .flat_map(|(_pred, succ)| succ) + .cloned() + .collect(); + + self.lookaheads.keys() + .map(|&(state_index, _)| state_index) + .filter(|s| !reachable.contains(s)) + .collect() + } + + pub fn context_set(&self, state: StateIndex) -> Result { + let mut set = ContextSet::new(self.conflicts); + for (&(state_index, conflict_index), token_set) in &self.lookaheads { + if state_index == state { + set.insert(conflict_index, token_set)?; + } + } + Ok(set) + } + + /// Returns a map containing all states that appear in the table, + /// along with the context set for each state (i.e., each row in + /// the table, basically). Returns Err if any state has a conflict + /// between the context sets even within its own row. 
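+    ///
+    /// (For instance, for the G0 lane table worked out in the README,
+    /// this would map S0 to `[C0: {}, C1: {"c"}, C2: {"d"}]` and S1 to
+    /// `[C0: {"e"}, C1: {}, C2: {}]`.)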
+ pub fn rows(&self) -> Result, StateIndex> { + let mut map = Map::new(); + for (&(state_index, conflict_index), token_set) in &self.lookaheads { + match { + map.entry(state_index) + .or_insert_with(|| ContextSet::new(self.conflicts)) + .insert(conflict_index, token_set) + } { + Ok(_changed) => { } + Err(OverlappingLookahead) => return Err(state_index) + } + } + + // In some cases, there are states that have no context at + // all, only successors. In that case, make sure to add an + // empty row for them. + for (&state_index, _) in &self.successors { + map.entry(state_index) + .or_insert_with(|| ContextSet::new(self.conflicts)); + } + + Ok(map) + } } impl<'grammar> Debug for LaneTable<'grammar> { diff --git a/lalrpop/src/lr1/lane_table/test.rs b/lalrpop/src/lr1/lane_table/test.rs index 67b1b96..70369de 100644 --- a/lalrpop/src/lr1/lane_table/test.rs +++ b/lalrpop/src/lr1/lane_table/test.rs @@ -3,15 +3,27 @@ use grammar::repr::*; use test_util::{expect_debug, normalized_grammar}; use lr1::build; use lr1::core::*; +use lr1::first::FirstSets; use lr1::interpret; +use lr1::state_graph::StateGraph; use lr1::tls::Lr1Tls; use tls::Tls; +use super::construct::*; use super::lane::*; use super::table::*; +macro_rules! tokens { + ($($x:expr),*) => { + vec![$(TerminalString::quoted(intern($x))),*].into_iter() + } +} + fn sym(t: &str) -> Symbol { - if t.chars().next().unwrap().is_uppercase() { + if t.chars() + .next() + .unwrap() + .is_uppercase() { Symbol::Nonterminal(nt(t)) } else { Symbol::Terminal(term(t)) @@ -33,7 +45,7 @@ fn traverse(states: &[LR0State], tokens: &[&str]) -> StateIndex { /// A simplified version of the paper's initial grammar; this version /// only has one inconsistent state (the same state they talk about in /// the paper). -pub fn paper_example_small() -> Grammar { +pub fn paper_example_g0() -> Grammar { normalized_grammar(r#" grammar; @@ -42,6 +54,39 @@ pub G: () = { Y "d", }; +X: () = { + "e" X, + "e",} +; + +Y: () = { + "e" Y, + "e" +}; +"#) +} + +/// A (corrected) version of the sample grammar G1 from the paper. The +/// grammar as written in the text omits some items, but the diagrams +/// seem to contain the full set. I believe this is one of the +/// smallest examples that still requires splitting states from the +/// LR0 states. +pub fn paper_example_g1() -> Grammar { + normalized_grammar(r#" +grammar; + +pub G: () = { + // if "a" is input, then lookahead "d" means "reduce X" + // and lookahead "c" means "reduce "Y" + "a" X "d", + "a" Y "c", + + // if "b" is input, then lookahead "d" means "reduce Y" + // and lookahead "c" means "reduce X. 
+ "b" X "c", + "b" Y "d", +}; + X: () = { "e" X, "e", @@ -67,9 +112,15 @@ fn build_table<'grammar>(grammar: &'grammar Grammar, println!("inconsistent_state={:#?}", inconsistent_state.items); // Extract conflicting items and trace the lanes, constructing a table - let conflicting_items = super::conflicting_items(inconsistent_state); + let conflicting_items = super::conflicting_actions(inconsistent_state); println!("conflicting_items={:#?}", conflicting_items); - let mut tracer = LaneTracer::new(&grammar, &lr0_err.states, conflicting_items.len()); + let first_sets = FirstSets::new(&grammar); + let state_graph = StateGraph::new(&lr0_err.states); + let mut tracer = LaneTracer::new(&grammar, + &lr0_err.states, + &first_sets, + &state_graph, + conflicting_items.len()); for (i, &conflicting_item) in conflicting_items.iter().enumerate() { tracer.start_trace(inconsistent_state.index, ConflictIndex::new(i), @@ -80,19 +131,91 @@ fn build_table<'grammar>(grammar: &'grammar Grammar, } #[test] -fn small_conflict_1() { +fn g0_conflict_1() { let _tls = Tls::test(); - let grammar = paper_example_small(); + let grammar = paper_example_g0(); let _lr1_tls = Lr1Tls::install(grammar.terminals.clone()); let table = build_table(&grammar, "G", &["e"]); println!("{:#?}", table); + // conflicting_actions={ + // Shift("e") // C0 + // Reduce(X = "e" => ActionFn(4)) // C1 + // Reduce(Y = "e" => ActionFn(6)) // C2 + // } expect_debug(&table, r#" -| State | C0 | C1 | C2 | C3 | C4 | C5 | Successors | -| S0 | | ["c"] | | | ["d"] | | {S3} | -| S3 | ["e"] | [] | ["e"] | ["e"] | [] | ["e"] | {S3} | +| State | C0 | C1 | C2 | Successors | +| S0 | | ["c"] | ["d"] | {S3} | +| S3 | ["e"] | [] | [] | {S3} | "# - .trim_left()); + .trim_left()); +} + +#[test] +fn paper_example_g1_conflict_1() { + let _tls = Tls::test(); + let grammar = paper_example_g1(); + let _lr1_tls = Lr1Tls::install(grammar.terminals.clone()); + let table = build_table(&grammar, "G", &["a", "e"]); + println!("{:#?}", table); + // conflicting_actions={ + // Shift("e") // C0 + // Reduce(X = "e" => ActionFn(6)) // C1 + // Reduce(Y = "e" => ActionFn(8)) // C2 + // } + expect_debug(&table, + r#" +| State | C0 | C1 | C2 | Successors | +| S1 | | ["d"] | ["c"] | {S5} | +| S2 | | ["c"] | ["d"] | {S5} | +| S5 | ["e"] | [] | [] | {S5} | +"# + .trim_left()); +} + +#[test] +fn paper_example_g0_build() { + let _tls = Tls::test(); + let grammar = paper_example_g0(); + let _lr1_tls = Lr1Tls::install(grammar.terminals.clone()); + let lr0_err = build::build_lr0_states(&grammar, nt("G")).unwrap_err(); + let states = LaneTableConstruct::new(&grammar, nt("G")).construct() + .expect("failed to build lane table states"); + + // we do not require more *states* than LR(0), just different lookahead + assert_eq!(states.len(), lr0_err.states.len()); + + let tree = interpret::interpret(&states, tokens!["e", "c"]).unwrap(); + expect_debug(&tree, r#"[G: [X: "e"], "c"]"#); + + let tree = interpret::interpret(&states, tokens!["e", "e", "c"]).unwrap(); + expect_debug(&tree, r#"[G: [X: "e", [X: "e"]], "c"]"#); + + let tree = interpret::interpret(&states, tokens!["e", "e", "d"]).unwrap(); + expect_debug(&tree, r#"[G: [Y: "e", [Y: "e"]], "d"]"#); + + interpret::interpret(&states, tokens!["e", "e", "e"]).unwrap_err(); +} + +#[test] +fn paper_example_g1_build() { + let _tls = Tls::test(); + let grammar = paper_example_g1(); + let _lr1_tls = Lr1Tls::install(grammar.terminals.clone()); + let lr0_err = build::build_lr0_states(&grammar, nt("G")).unwrap_err(); + let states = 
LaneTableConstruct::new(&grammar, nt("G")).construct() + .expect("failed to build lane table states"); + + // we require more *states* than LR(0), not just different lookahead + assert_eq!(states.len() - lr0_err.states.len(), 1); + + let tree = interpret::interpret(&states, tokens!["a", "e", "e", "d"]).unwrap(); + expect_debug(&tree, r#"[G: "a", [X: "e", [X: "e"]], "d"]"#); + + let tree = interpret::interpret(&states, tokens!["b", "e", "e", "d"]).unwrap(); + expect_debug(&tree, r#"[G: "b", [Y: "e", [Y: "e"]], "d"]"#); + + interpret::interpret(&states, tokens!["e", "e", "e"]).unwrap_err(); } pub fn paper_example_large() -> Grammar { @@ -158,26 +281,35 @@ fn large_conflict_1() { let _lr1_tls = Lr1Tls::install(grammar.terminals.clone()); let table = build_table(&grammar, "G", &["x", "s", "k", "t"]); println!("{:#?}", table); + + // conflicting_actions={ + // Shift("s") // C0 + // Reduce(U = U "k" "t") // C1 + // Reduce(X = "k" "t") // C2 + // Reduce(Y = "k" "t") // C3 + // } + expect_debug(&table, r#" | State | C0 | C1 | C2 | C3 | Successors | -| S1 | ["k"] | | | | {S5} | -| S2 | ["k"] | | | | {S7} | -| S3 | ["k"] | | | | {S7} | -| S4 | ["k"] | | | | {S7} | +| S1 | | ["k"] | | | {S5} | +| S2 | | ["k"] | | | {S7} | +| S3 | | ["k"] | | | {S7} | +| S4 | | ["k"] | | | {S7} | | S5 | | | ["a"] | ["r"] | {S16} | | S7 | | | ["c", "w"] | ["d"] | {S16} | | S16 | | | | | {S27} | -| S27 | ["k"] | ["s"] | | | {S32} | +| S27 | ["s"] | ["k"] | | | {S32} | | S32 | | | ["z"] | ["u"] | {S16} | "# - .trim_left()); + .trim_left()); // ^^ This differs in some particulars from what appears in the // paper, but I believe it to be correct, and the paper to be wrong. // - // Here is the table using the state names from the paper. I've marked - // the differences with `(*)`. + // Here is the table using the state names from the paper. I've + // marked the differences with `(*)`. Note that the paper does not + // include the C0 column (the shift). // // | State | pi1 | pi2 | pi3 | Successors | // | B | ["k"] | | *1 | {G} | @@ -204,3 +336,16 @@ fn large_conflict_1() { // X P", and the lookahead from the "X" here is FIRST(P) which is // "z". } + +#[test] +fn paper_example_large_build() { + let _tls = Tls::test(); + let grammar = paper_example_large(); + let _lr1_tls = Lr1Tls::install(grammar.terminals.clone()); + let states = LaneTableConstruct::new(&grammar, nt("G")).construct() + .expect("failed to build lane table states"); + + let tree = interpret::interpret(&states, tokens!["y", "s", "k", "t", "c", "b"]).unwrap(); + expect_debug(&tree, r#"[G: "y", [W: [U: "s"], [X: "k", "t"], [C: "c"]], "b"]"#); +} + diff --git a/lalrpop/src/lr1/lookahead.rs b/lalrpop/src/lr1/lookahead.rs index 670f004..b3f2099 100644 --- a/lalrpop/src/lr1/lookahead.rs +++ b/lalrpop/src/lr1/lookahead.rs @@ -152,11 +152,23 @@ impl TokenSet { pub fn new() -> Self { with(|terminals| { TokenSet { - bit_set: BitSet::with_capacity(terminals.all.len() + 1) + bit_set: BitSet::with_capacity(terminals.all.len() + 2) } }) } + /// A TokenSet containing all possible terminals + EOF. + pub fn all() -> Self { + let mut s = TokenSet::new(); + with(|terminals| { + for i in 0 .. terminals.all.len() { + s.bit_set.insert(i); + } + s.insert_eof(); + }); + s + } + pub fn eof() -> Self { let mut set = TokenSet::new(); set.insert_eof();