Merge pull request #414 from Marwes/shrink_lrgrammar2

feat: Shrink LALRPOPs own parser to 1Mb
This commit is contained in:
Markus Westerlind 2018-10-27 13:24:36 +02:00 committed by GitHub
commit eba453d876
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 29003 additions and 84447 deletions

2
.gitattributes vendored
View File

@ -1 +1,3 @@
* text=auto
lalrpop/src/parser/lrgrammar.lalrpop text eol=lf

View File

@ -927,6 +927,7 @@ fn verify_lalrpop_generates_itself() {
Command::new("../target/debug/lalrpop")
.args(&[
"--force",
"--no-whitespace",
"--out-dir",
out_dir,
copied_grammar_file

58
lalrpop/build.rs Normal file
View File

@ -0,0 +1,58 @@
use std::env;
use std::error::Error;
use std::fs;
use std::path::{Path, PathBuf};
use std::process::{exit, Command};
/// Build-script entry point: delegates to `main_` and converts any
/// error into a message on stderr plus a non-zero exit status so cargo
/// reports the build-script failure.
fn main() {
    match main_() {
        Ok(()) => {}
        Err(err) => {
            eprintln!("{}", err);
            exit(1);
        }
    }
}
/// Pre-generates LALRPOP's own parser from `lrgrammar.lalrpop` using a
/// previously built `lalrpop` binary, if one exists in the target
/// directory. On a fresh checkout the binary does not exist yet and this
/// step is skipped entirely (the crate then falls back to the checked-in
/// generated parser).
fn main_() -> Result<(), Box<Error>> {
    let grammar_file = "src/parser/lrgrammar.lalrpop";
    println!(r#"cargo:rerun-if-changed={}"#, grammar_file);

    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("cargo did not set OUT_DIR"));
    fs::create_dir_all(out_dir.join("src/parser"))?;

    // Building from the workspace root places binaries in `target`;
    // building from inside the sub-crate places them one level up.
    let target_dir = if Path::new("target").exists() {
        Path::new("target")
    } else {
        Path::new("../target")
    };

    let lalrpop_path = target_dir
        .join("debug/lalrpop")
        .with_extension(env::consts::EXE_EXTENSION);
    println!(r#"cargo:rerun-if-changed={}"#, lalrpop_path.display());

    if lalrpop_path.exists() {
        // If compiling lalrpop itself, enable test parsers
        if target_dir.exists() {
            env::set_var("CARGO_FEATURE_TEST", "1");
            println!(r#"cargo:rustc-cfg=feature="test""#);
        }

        // The grammar must be compiled out-of-tree so the source tree
        // stays pristine; copy it into OUT_DIR first.
        let copied_grammar = out_dir.join("src/parser/lrgrammar.lalrpop");
        fs::copy(grammar_file, &copied_grammar)
            // FIX: message previously read "Unable to grammar to OUT_DIR"
            .map_err(|err| format!("Unable to copy grammar to OUT_DIR: {}", err))?;

        let status = Command::new(lalrpop_path)
            .args(&[
                "--force",
                "--features",
                "test",
                copied_grammar
                    .to_str()
                    .expect("grammar path is not valid UTF-8"),
            ])
            .status()?;
        if !status.success() {
            return Err("Compiling the .lalrpop file failed".into());
        }
    }
    Ok(())
}

View File

@ -103,6 +103,13 @@ impl Configuration {
self
}
/// Controls whitespace in the generated parser source.
///
/// If false, shrinks the generated code by removing redundant white space.
/// Default is true (readable output).
pub fn emit_whitespace(&mut self, val: bool) -> &mut Configuration {
    self.session.emit_whitespace = val;
    self
}
/// If true, emit report file about generated code.
pub fn emit_report(&mut self, val: bool) -> &mut Configuration {
self.session.emit_report = val;

View File

@ -1,57 +1,58 @@
// Need this for rusty_peg
#![recursion_limit = "256"]
// I hate this lint.
#![allow(unused_parens)]
// The builtin tests don't cover the CLI and so forth, and it's just
// too darn annoying to try and make them do so.
#![cfg_attr(test, allow(dead_code))]
extern crate ascii_canvas;
extern crate atty;
extern crate bit_set;
extern crate diff;
extern crate ena;
extern crate itertools;
extern crate lalrpop_util;
extern crate petgraph;
extern crate regex;
extern crate regex_syntax;
extern crate sha2;
extern crate string_cache;
extern crate term;
extern crate unicode_xid;
#[cfg(test)]
extern crate rand;
// hoist the modules that define macros up earlier
#[macro_use]
mod rust;
#[macro_use]
mod log;
mod api;
mod build;
mod collections;
mod file_text;
mod grammar;
mod kernel_set;
mod lexer;
mod lr1;
mod message;
mod normalize;
mod parser;
mod session;
mod tls;
mod tok;
mod util;
#[cfg(test)]
mod generate;
#[cfg(test)]
mod test_util;
pub use api::process_root;
pub use api::process_root_unconditionally;
pub use api::Configuration;
use ascii_canvas::style;
// Need this for rusty_peg
#![recursion_limit = "256"]
// I hate this lint.
#![allow(unused_parens)]
// The builtin tests don't cover the CLI and so forth, and it's just
// too darn annoying to try and make them do so.
#![cfg_attr(test, allow(dead_code))]
extern crate ascii_canvas;
extern crate atty;
extern crate bit_set;
extern crate diff;
extern crate ena;
extern crate itertools;
#[cfg_attr(any(feature = "test", test), macro_use)]
extern crate lalrpop_util;
extern crate petgraph;
extern crate regex;
extern crate regex_syntax;
extern crate sha2;
extern crate string_cache;
extern crate term;
extern crate unicode_xid;
#[cfg(test)]
extern crate rand;
// hoist the modules that define macros up earlier
#[macro_use]
mod rust;
#[macro_use]
mod log;
mod api;
mod build;
mod collections;
mod file_text;
mod grammar;
mod kernel_set;
mod lexer;
mod lr1;
mod message;
mod normalize;
mod parser;
mod session;
mod tls;
mod tok;
mod util;
#[cfg(test)]
mod generate;
#[cfg(test)]
mod test_util;
pub use api::process_root;
pub use api::process_root_unconditionally;
pub use api::Configuration;
use ascii_canvas::style;

View File

@ -51,6 +51,10 @@ fn main1() -> io::Result<()> {
config.emit_comments(true);
}
if args.flag_no_whitespace {
config.emit_whitespace(false);
}
if args.flag_report {
config.emit_report(true);
}
@ -101,6 +105,7 @@ Options:
--features FEATURES Comma separated list of features for conditional compilation.
-f, --force Force execution, even if the .lalrpop file is older than the .rs file.
-c, --color Force colorful output, even if this is not a TTY.
--no-whitespace Removes redundant whitespace from the generated file. (Default: false)
--comments Enable comments in the generated code.
--report Generate report files.
";
@ -114,6 +119,7 @@ struct Args {
flag_force: bool,
flag_color: bool,
flag_comments: bool,
flag_no_whitespace: bool,
flag_report: bool,
flag_version: bool,
}
@ -181,4 +187,13 @@ mod test {
.unwrap();
assert_eq!(args.flag_features, Some("test,abc".to_string()));
}
#[test]
fn emit_whitespace() {
    let argv = || vec!["lalrpop", "--no-whitespace", "file.lalrpop"];
    let args: Args = Docopt::new(USAGE)
        .and_then(|d| d.argv(argv().into_iter()).deserialize())
        .unwrap();
    // FIX: `assert!(flag, true)` treated `true` as a panic *message* and
    // never compared anything; assert the flag directly instead.
    assert!(args.flag_no_whitespace);
}
}

View File

@ -6,9 +6,19 @@ use tok::{self, Tok};
use util::strip;
use lalrpop_util::ParseError;
use super::Top;
grammar<'input>(text: &'input str);
pub Grammar: Grammar =
pub Top: Top = {
"StartGrammar" <Grammar> => Top::Grammar(<>),
"StartPattern" <Pattern> => Top::Pattern(<>),
"StartMatchMapping" <MatchMapping> => Top::MatchMapping(<>),
"StartTypeRef" <TypeRef> => Top::TypeRef(<>),
"StartGrammarWhereClauses" <GrammarWhereClauses> => Top::GrammarWhereClauses(<>),
};
Grammar: Grammar =
<module_attributes:ShebangAttribute*>
<uses:Use*>
<annotations:Annotation*>
@ -36,7 +46,7 @@ TypeParameter: TypeParameter = {
<l:Id> => TypeParameter::Id(l)
};
pub GrammarWhereClauses: Vec<WhereClause<TypeRef>> =
GrammarWhereClauses: Vec<WhereClause<TypeRef>> =
"where" <Comma<GrammarWhereClause>>;
GrammarWhereClause: WhereClause<TypeRef> = {
@ -226,7 +236,7 @@ SymbolKind1: SymbolKind = {
SymbolKind::Error,
};
pub TypeRef: TypeRef = {
TypeRef: TypeRef = {
"(" <Comma<TypeRef>> ")" =>
TypeRef::Tuple(<>),
@ -299,7 +309,7 @@ MatchItem: MatchItem = {
MatchSymbol = QuotedLiteral;
pub MatchMapping = Terminal;
MatchMapping = Terminal;
EnumToken: EnumToken =
"enum" <lo:@L> <t:TypeRef> <hi:@R> "{"
@ -327,7 +337,7 @@ Conversion: Conversion =
to: pattern })
};
pub Pattern: Pattern<TypeRef> =
Pattern: Pattern<TypeRef> =
<lo:@L> <k:PatternKind> <hi:@R> => Pattern { span: Span(lo, hi), kind: k };
PatternKind: PatternKind<TypeRef> = {
@ -472,5 +482,11 @@ extern {
"*" => Tok::Star,
"~~" => Tok::TildeTilde,
"_" => Tok::Underscore,
"StartGrammar" => Tok::StartGrammar,
"StartPattern" => Tok::StartPattern,
"StartMatchMapping" => Tok::StartMatchMapping,
"StartTypeRef" => Tok::StartTypeRef,
"StartGrammarWhereClauses" => Tok::StartGrammarWhereClauses,
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,54 +1,84 @@
use grammar::parse_tree::*;
use grammar::pattern::*;
use lalrpop_util;
use tok;
#[allow(dead_code)]
mod lrgrammar;
#[cfg(test)]
mod test;
pub type ParseError<'input> = lalrpop_util::ParseError<usize, tok::Tok<'input>, tok::Error>;
pub fn parse_grammar<'input>(input: &'input str) -> Result<Grammar, ParseError<'input>> {
let tokenizer = tok::Tokenizer::new(input, 0);
let mut grammar = try!(lrgrammar::GrammarParser::new().parse(input, tokenizer));
// find a unique prefix that does not appear anywhere in the input
while input.contains(&grammar.prefix) {
grammar.prefix.push('_');
}
Ok(grammar)
}
fn parse_pattern<'input>(
input: &'input str,
offset: usize,
) -> Result<Pattern<TypeRef>, ParseError<'input>> {
let tokenizer = tok::Tokenizer::new(input, offset);
lrgrammar::PatternParser::new().parse(input, tokenizer)
}
fn parse_match_mapping<'input>(
input: &'input str,
offset: usize,
) -> Result<MatchMapping, ParseError<'input>> {
let tokenizer = tok::Tokenizer::new(input, offset);
lrgrammar::MatchMappingParser::new().parse(input, tokenizer)
}
#[cfg(test)]
pub fn parse_type_ref<'input>(input: &'input str) -> Result<TypeRef, ParseError<'input>> {
let tokenizer = tok::Tokenizer::new(input, 0);
lrgrammar::TypeRefParser::new().parse(input, tokenizer)
}
#[cfg(test)]
pub fn parse_where_clauses<'input>(
input: &'input str,
) -> Result<Vec<WhereClause<TypeRef>>, ParseError<'input>> {
let tokenizer = tok::Tokenizer::new(input, 0);
lrgrammar::GrammarWhereClausesParser::new().parse(input, tokenizer)
}
use std::iter;
use grammar::parse_tree::*;
use grammar::pattern::*;
use lalrpop_util;
use tok;
#[cfg(not(any(feature = "test", test)))]
#[allow(dead_code)]
mod lrgrammar;
#[cfg(any(feature = "test", test))]
lalrpop_mod!(
// ---------------------------------------------------------------------------------------
// NOTE: Run `cargo build -p lalrpop` once before running `cargo test` to create this file
// ---------------------------------------------------------------------------------------
#[allow(dead_code)]
lrgrammar,
"/src/parser/lrgrammar.rs"
);
#[cfg(test)]
mod test;
/// Result of a parse that was started with one of the `Start*` dummy
/// tokens. A single shared `TopParser` dispatches to one of these five
/// start rules, replacing five separately generated parsers (this is how
/// the generated code was shrunk).
pub enum Top {
    Grammar(Grammar),
    Pattern(Pattern<TypeRef>),
    MatchMapping(TerminalString),
    TypeRef(TypeRef),
    GrammarWhereClauses(Vec<WhereClause<TypeRef>>),
}
pub type ParseError<'input> = lalrpop_util::ParseError<usize, tok::Tok<'input>, tok::Error>;
// Drives the shared `TopParser` for one `Top` variant: a dummy `Start*`
// token is injected ahead of the real token stream so the grammar
// dispatches to the corresponding start rule, then the matching `Top`
// variant is unwrapped (any other variant is impossible by construction,
// hence `unreachable!`).
macro_rules! parser {
    ($input: expr, $offset: expr, $pat: ident, $tok: ident) => {{
        let input = $input;
        // Dummy token spans (0, 0) so it does not disturb reported locations.
        let tokenizer =
            iter::once(Ok((0, tok::Tok::$tok, 0))).chain(tok::Tokenizer::new(input, $offset));
        lrgrammar::TopParser::new()
            .parse(input, tokenizer)
            .map(|top| match top {
                Top::$pat(x) => x,
                _ => unreachable!(),
            })
    }};
}
pub fn parse_grammar<'input>(input: &'input str) -> Result<Grammar, ParseError<'input>> {
let mut grammar = try!(parser!(input, 0, Grammar, StartGrammar));
// find a unique prefix that does not appear anywhere in the input
while input.contains(&grammar.prefix) {
grammar.prefix.push('_');
}
Ok(grammar)
}
/// Parses a standalone pattern.
/// `offset` is forwarded to the tokenizer — presumably the byte offset of
/// `input` within the enclosing source, so error spans line up; confirm
/// against `tok::Tokenizer::new`.
fn parse_pattern<'input>(
    input: &'input str,
    offset: usize,
) -> Result<Pattern<TypeRef>, ParseError<'input>> {
    parser!(input, offset, Pattern, StartPattern)
}
/// Parses a standalone match-mapping (terminal) fragment.
/// `offset` is forwarded to the tokenizer — presumably the byte offset of
/// `input` within the enclosing source; confirm against `tok::Tokenizer::new`.
fn parse_match_mapping<'input>(
    input: &'input str,
    offset: usize,
) -> Result<MatchMapping, ParseError<'input>> {
    parser!(input, offset, MatchMapping, StartMatchMapping)
}
// Test-only helper: parses a standalone type reference (offset fixed at 0).
#[cfg(test)]
pub fn parse_type_ref<'input>(input: &'input str) -> Result<TypeRef, ParseError<'input>> {
    parser!(input, 0, TypeRef, StartTypeRef)
}
// Test-only helper: parses a grammar-level `where` clause list
// (offset fixed at 0).
#[cfg(test)]
pub fn parse_where_clauses<'input>(
    input: &'input str,
) -> Result<Vec<WhereClause<TypeRef>>, ParseError<'input>> {
    parser!(input, 0, GrammarWhereClauses, StartGrammarWhereClauses)
}

View File

@ -53,11 +53,15 @@ impl<W: Write> RustWrite<W> {
}
fn write_indentation(&mut self) -> io::Result<()> {
write!(self.write, "{0:1$}", "", self.indent)
if Tls::session().emit_whitespace {
write!(self.write, "{0:1$}", "", self.indent)?;
}
Ok(())
}
fn write_indented(&mut self, out: &str) -> io::Result<()> {
writeln!(self.write, "{0:1$}{2}", "", self.indent, out)
self.write_indentation()?;
writeln!(self.write, "{}", out)
}
pub fn write_table_row<I, C>(&mut self, iterable: I) -> io::Result<()>
@ -65,7 +69,8 @@ impl<W: Write> RustWrite<W> {
I: IntoIterator<Item = (i32, C)>,
C: fmt::Display,
{
if Tls::session().emit_comments {
let session = Tls::session();
if session.emit_comments {
for (i, comment) in iterable {
try!(self.write_indentation());
try!(writeln!(self.write, "{}, {}", i, comment));
@ -74,7 +79,7 @@ impl<W: Write> RustWrite<W> {
try!(self.write_indentation());
let mut first = true;
for (i, _comment) in iterable {
if !first {
if !first && session.emit_whitespace {
try!(write!(self.write, " "));
}
try!(write!(self.write, "{},", i));

View File

@ -41,6 +41,9 @@ pub struct Session {
/// forth.
pub emit_comments: bool,
/// Emit whitespace in the generated code to improve readability.
pub emit_whitespace: bool,
/// Emit report file about generated code
pub emit_report: bool,
@ -92,6 +95,7 @@ impl Session {
out_dir: None,
force_build: false,
emit_comments: false,
emit_whitespace: true,
emit_report: false,
color_config: ColorConfig::default(),
max_errors: 1,
@ -117,6 +121,7 @@ impl Session {
out_dir: None,
force_build: false,
emit_comments: false,
emit_whitespace: true,
emit_report: false,
color_config: ColorConfig::IfTty,
max_errors: 1,

View File

@ -97,6 +97,15 @@ pub enum Tok<'input> {
Underscore,
Bang,
ShebangAttribute(&'input str), // #![...]
// Dummy tokens for parser sharing
StartGrammar,
StartPattern,
StartMatchMapping,
#[allow(dead_code)]
StartGrammarWhereClauses,
#[allow(dead_code)]
StartTypeRef,
}
pub struct Tokenizer<'input> {

View File

@ -1,3 +1,3 @@
#!/bin/bash
cargo run -p lalrpop -- --force --out-dir . lalrpop/src/parser/lrgrammar.lalrpop
cargo run -p lalrpop -- --force --no-whitespace --out-dir . lalrpop/src/parser/lrgrammar.lalrpop