diff --git a/lalrpop/src/build/mod.rs b/lalrpop/src/build/mod.rs
index 160a99f..f77f56a 100644
--- a/lalrpop/src/build/mod.rs
+++ b/lalrpop/src/build/mod.rs
@@ -233,6 +233,9 @@ fn parse_and_normalize_grammar(session: &Session, file_text: &FileText) -> io::R
                 tok::ErrorCode::UnterminatedCharacterLiteral => {
                     "unterminated character literal; missing `'`?"
                 }
+                tok::ErrorCode::UnterminatedAttribute => {
+                    "unterminated #! attribute; missing ]?"
+                }
                 tok::ErrorCode::ExpectedStringLiteral => "expected string literal; missing `\"`?",
                 tok::ErrorCode::UnterminatedCode => {
                     "unterminated code block; perhaps a missing `;`, `)`, `]` or `}`?"
diff --git a/lalrpop/src/parser/lrgrammar.lalrpop b/lalrpop/src/parser/lrgrammar.lalrpop
index e879bd5..c10e3b8 100644
--- a/lalrpop/src/parser/lrgrammar.lalrpop
+++ b/lalrpop/src/parser/lrgrammar.lalrpop
@@ -401,6 +401,7 @@ extern {
         "=>@R" => Tok::EqualsGreaterThanLookbehind,
         ">" => Tok::GreaterThan,
         "#" => Tok::Hash,
+        "#![...]" => Tok::ShebangAttribute(<&'input str>),
         "{" => Tok::LeftBrace,
         "[" => Tok::LeftBracket,
         "(" => Tok::LeftParen,
diff --git a/lalrpop/src/tok/mod.rs b/lalrpop/src/tok/mod.rs
index 49100dc..792007c 100644
--- a/lalrpop/src/tok/mod.rs
+++ b/lalrpop/src/tok/mod.rs
@@ -21,6 +21,7 @@ pub enum ErrorCode {
     UnterminatedEscape,
     UnterminatedStringLiteral,
     UnterminatedCharacterLiteral,
+    UnterminatedAttribute,
     UnterminatedCode,
     ExpectedStringLiteral,
 }
@@ -87,7 +88,8 @@ pub enum Tok<'input> {
     Star,
     TildeTilde,
     Underscore,
-    Bang
+    Bang,
+    ShebangAttribute(&'input str), // #![...]
 }
 
 pub struct Tokenizer<'input> {
@@ -117,6 +119,40 @@ const KEYWORDS: &'static [(&'static str, Tok<'static>)] = &[
     ("type", Type),
 ];
 
+/*
+ * Helper for backtracking.
+ */
+macro_rules! first {
+    ($this:expr, $action:expr, $fallback:expr) => {
+        {
+            let fallback_state = ($this.chars.clone(), $this.lookahead);
+            let result = $action;
+            match result {
+                Ok(_) => {
+                    Some(result)
+                }
+                _ => {
+                    $this.chars = fallback_state.0;
+                    $this.lookahead = fallback_state.1;
+                    Some($fallback)
+                }
+            }
+        }
+    }
+}
+
+macro_rules! try_opt {
+    ($e:expr, $err:expr) => {
+        {
+            let r = $e;
+            match r {
+                Some(Ok(val)) => val,
+                Some(Err(err)) => return Err(err),
+                None => return $err,
+            }
+        }
+    }
+}
 
 impl<'input> Tokenizer<'input> {
     pub fn new(text: &'input str, shift: usize) -> Tokenizer<'input> {
@@ -130,6 +166,47 @@ impl<'input> Tokenizer<'input> {
         t
     }
 
+    fn shebang_attribute(&mut self, idx0: usize) -> Result<Spanned<Tok<'input>>, Error> {
+        try_opt!(self.expect_char('!'), error(ErrorCode::UnrecognizedToken, idx0));
+        try_opt!(self.expect_char('['), error(ErrorCode::UnterminatedAttribute, idx0));
+        let mut sq_bracket_counter = 1;
+        while let Some((idx1, c)) = self.lookahead {
+            match c {
+                '[' => {
+                    self.bump();
+                    sq_bracket_counter += 1
+                }
+                ']' => {
+                    self.bump();
+                    sq_bracket_counter -= 1;
+                    match sq_bracket_counter {
+                        0 => {
+                            let idx2 = idx1 + 1;
+                            let data = &self.text[idx0..idx2];
+                            self.bump();
+                            return Ok((idx0, ShebangAttribute(data), idx2))
+                        },
+                        n if n < 0 => {
+                            return error(UnrecognizedToken, idx0)
+                        }
+                        _ => ()
+                    }
+                }
+                '"' => {
+                    self.bump();
+                    let _ = try!(self.string_literal(idx1));
+                }
+                '\n' => {
+                    return error(UnrecognizedToken, idx0)
+                }
+                _ => {
+                    self.bump();
+                }
+            }
+        }
+        error(UnrecognizedToken, idx0)
+    }
+
     fn next_unshifted(&mut self) -> Option<Result<Spanned<Tok<'input>>, Error>> {
         loop {
             return match self.lookahead {
@@ -195,7 +272,9 @@ impl<'input> Tokenizer<'input> {
                 }
                 Some((idx0, '#')) => {
                     self.bump();
-                    Some(Ok((idx0, Hash, idx0+1)))
+                    first!(self,
+                           { self.shebang_attribute(idx0) },
+                           { Ok((idx0, Hash, idx0+1)) })
                 }
                 Some((idx0, '>')) => {
                     self.bump();
@@ -493,7 +572,6 @@ impl<'input> Tokenizer<'input> {
                 false
             }
         };
-
         match self.take_until(terminate) {
             Some(idx1) => {
                 self.bump(); // consume the closing quote
@@ -686,6 +764,23 @@ impl<'input> Tokenizer<'input> {
             self.bump().map(|p| {p.0})
         })
     }
+
+    fn expect_char(&mut self, c : char) -> Option<Result<usize, Error>> {
+        match self.lookahead {
+            Some((idx0, cc)) if c == cc => {
+                self.bump();
+                Some(Ok((idx0)))
+            }
+            Some((idx0, cc)) => {
+                self.bump();
+                Some(error(UnrecognizedToken, idx0))
+            }
+            None => {
+                None
+            }
+        }
+    }
+
 }
 
 impl<'input> Iterator for Tokenizer<'input> {
diff --git a/lalrpop/src/tok/test.rs b/lalrpop/src/tok/test.rs
index 9e93ca5..a271979 100644
--- a/lalrpop/src/tok/test.rs
+++ b/lalrpop/src/tok/test.rs
@@ -385,6 +385,55 @@ fn regex1() {
     ]);
 }
 
+#[test]
+fn hash_token() {
+    test(r#" # "#, vec![
+        (r#" ~ "#, Hash)
+    ]);
+}
+
+#[test]
+fn shebang_attribute_normal_text() {
+    test(r#" #![Attribute] "#, vec![
+        (r#" ~~~~~~~~~~~~~ "#, ShebangAttribute("#![Attribute]"))
+    ]);
+}
+
+#[test]
+fn shebang_attribute_special_characters_without_quotes() {
+    test(r#" #![set width = 80] "#, vec![
+        (r#" ~~~~~~~~~~~~~~~~~~ "#, ShebangAttribute("#![set width = 80]"))
+    ]);
+}
+
+#[test]
+fn shebang_attribute_special_characters_with_quotes() {
+    test(r#" #![set width = "80"] "#, vec![
+        (r#" ~~~~~~~~~~~~~~~~~~~~ "#, ShebangAttribute(r#"#![set width = "80"]"#))
+    ]);
+}
+
+#[test]
+fn shebang_attribute_special_characters_closing_sqbracket_in_string_literal() {
+    test(r#" #![set width = "80]"] "#, vec![
+        (r#" ~~~~~~~~~~~~~~~~~~~~~ "#, ShebangAttribute(r#"#![set width = "80]"]"#))
+    ]);
+}
+
+#[test]
+fn shebang_attribute_special_characters_opening_sqbracket_in_string_literal() {
+    test(r#" #![set width = "[80"] "#, vec![
+        (r#" ~~~~~~~~~~~~~~~~~~~~~ "#, ShebangAttribute(r#"#![set width = "[80"]"#))
+    ]);
+}
+
+#[test]
+fn shebang_attribute_special_characters_nested_sqbrackets() {
+    test(r#" #![set width = [80]] "#, vec![
+        (r#" ~~~~~~~~~~~~~~~~~~~~ "#, ShebangAttribute(r#"#![set width = [80]]"#))
+    ]);
+}
+
 #[test]
 fn regex2() {
     test(r#"r"(123""#, vec![