From 5890485b20699facbf8ad76565df4cd0df303c7d Mon Sep 17 00:00:00 2001
From: Niko Matsakis
Date: Fri, 24 Jul 2015 05:06:42 -0400
Subject: [PATCH] Patch up tokenizer for use/where; use is_xid_start from cargo

---
 lalrpop/Cargo.toml                        |   1 +
 lalrpop/src/lib.rs                        |   1 +
 lalrpop/src/normalize/prevalidate/test.rs |   7 ++
 lalrpop/src/tok/mod.rs                    | 118 ++++++++++++----------
 lalrpop/src/tok/test.rs                   |  39 +++++--
 5 files changed, 104 insertions(+), 62 deletions(-)

diff --git a/lalrpop/Cargo.toml b/lalrpop/Cargo.toml
index a2e9386..e291eea 100644
--- a/lalrpop/Cargo.toml
+++ b/lalrpop/Cargo.toml
@@ -9,6 +9,7 @@ diff = "0.1"
 rand = "0.3"
 itertools = "0.3"
 term = "0.2"
+unicode-xid = "0.0.2"
 
 [dependencies.rusty-peg]
 git = "https://github.com/nikomatsakis/rusty-peg.git"
\ No newline at end of file
diff --git a/lalrpop/src/lib.rs b/lalrpop/src/lib.rs
index 80b396d..5dd9594 100644
--- a/lalrpop/src/lib.rs
+++ b/lalrpop/src/lib.rs
@@ -15,6 +15,7 @@ extern crate rand;
 extern crate regex;
 extern crate term;
 extern crate itertools;
+extern crate unicode_xid;
 
 // rust exports a macro that others use, so hoist it early.
 #[macro_use]
diff --git a/lalrpop/src/normalize/prevalidate/test.rs b/lalrpop/src/normalize/prevalidate/test.rs
index f4e6a29..eb38333 100644
--- a/lalrpop/src/normalize/prevalidate/test.rs
+++ b/lalrpop/src/normalize/prevalidate/test.rs
@@ -42,6 +42,13 @@ fn unknown_nonterminal_in_macro_arg() {
         r#"grammar; X = X Id<>>>Y<<<>; Id<T> = T;"#);
 }
 
+#[test]
+fn unknown_nonterminal_in_repeat_question() {
+    check_err(
+        "no definition found for nonterminal `Y`",
+        r#"grammar; X = >>>Y<<?;"#);
+}
+
diff --git a/lalrpop/src/tok/mod.rs b/lalrpop/src/tok/mod.rs
--- a/lalrpop/src/tok/mod.rs
+++ b/lalrpop/src/tok/mod.rs
@@ -1,4 +1,5 @@
 use std::str::CharIndices;
+use unicode_xid::UnicodeXID;
 
 use self::Error::*;
 use self::Tok::*;
@@ -55,8 +56,11 @@ pub enum Tok<'input> {
     Mut,
     Pub,
     Token,
-    Use,
-    Where,
+
+    // Special keywords: these are accompanied by a series of
+    // uninterpreted strings representing imports and stuff.
+    Use(&'input str),
+    Where(Vec<&'input str>),
 
     // Identifiers of various kinds:
     Escape(&'input str),
@@ -90,8 +94,6 @@ const KEYWORDS: &'static [(&'static str, Tok<'static>)] = &[
     ("mut", Mut),
     ("pub", Pub),
     ("token", Token),
-    ("use", Use),
-    ("where", Where),
 ];
 
 impl<'input> Tokenizer<'input> {
@@ -137,13 +139,13 @@ impl<'input> Tokenizer<'input> {
             Some((idx1, '?')) => {
                 self.bump();
-                let idx2 = try!(self.code(idx0));
+                let idx2 = try!(self.code(idx0, "([{", "}])"));
                 let code = &self.text[idx1+1..idx2];
                 Ok((idx0, EqualsGreaterThanQuestionCode(code), idx2))
             }
 
             Some((idx1, _)) => {
-                let idx2 = try!(self.code(idx0));
+                let idx2 = try!(self.code(idx0, "([{", "}])"));
                 let code = &self.text[idx1..idx2];
                 Ok((idx0, EqualsGreaterThanCode(code), idx2))
             }
@@ -154,52 +156,37 @@ impl<'input> Tokenizer<'input> {
         }
     }
 
-    fn code(&mut self, idx0: usize) -> Result<usize, Error> {
+    fn code(&mut self, idx0: usize, open_delims: &str, close_delims: &str) -> Result<usize, Error> {
         // This is the interesting case. To find the end of the code,
         // we have to scan ahead, matching (), [], and {}, and looking
         // for a suitable terminator: `,`, `;`, `]`, `}`, or `)`.
         let mut balance = 0; // number of unclosed `(` etc
         loop {
-            match self.lookahead {
-                Some((_, '(')) |
-                Some((_, '[')) |
-                Some((_, '{')) => {
+            if let Some((idx, c)) = self.lookahead {
+                if open_delims.find(c).is_some() {
                     balance += 1;
-                }
+                } else if balance > 0 {
+                    if close_delims.find(c).is_some() {
+                        balance -= 1;
+                    }
+                } else {
+                    debug_assert!(balance == 0);
 
-                Some((_, ')')) |
-                Some((_, ']')) |
-                Some((_, '}')) if balance > 0 => {
-                    balance -= 1;
+                    if c == ',' || c == ';' || close_delims.find(c).is_some() {
+                        // Note: we do not consume the
+                        // terminator. The code is everything *up
+                        // to but not including* the terminating
+                        // `,`, `;`, etc.
+                        return Ok(idx);
+                    }
                 }
-
-                None if balance == 0 => {
-                    // Note: we do not consume the
-                    // terminator. The code is everything *up
-                    // to but not including* the terminating
-                    // `,`, `;`, etc.
-                    return Ok(self.text.len());
-                }
-
-                Some((idx2, ';')) |
-                Some((idx2, ',')) |
-                Some((idx2, ')')) |
-                Some((idx2, ']')) |
-                Some((idx2, '}')) if balance == 0 => {
-                    // Note: we do not consume the
-                    // terminator. The code is everything *up
-                    // to but not including* the terminating
-                    // `,`, `;`, etc.
-                    return Ok(idx2);
-                }
-
-                None if balance > 0 => {
-                    // the input should not end with an
-                    // unbalanced number of `{` etc!
-                    return Err(UnterminatedCode(idx0));
-                }
-
-                _ => { }
+            } else if balance > 0 {
+                // the input should not end with an
+                // unbalanced number of `{` etc!
+                return Err(UnterminatedCode(idx0));
+            } else {
+                debug_assert!(balance == 0);
+                return Ok(self.text.len());
             }
 
             self.bump();
@@ -251,9 +238,40 @@ impl<'input> Tokenizer<'input> {
         (start, Lifetime(word), end)
     }
 
-    fn identifierish(&mut self, idx0: usize) -> Spanned<Tok<'input>> {
+    fn identifierish(&mut self, idx0: usize) -> Result<Spanned<Tok<'input>>, Error> {
         let (start, word, end) = self.word(idx0);
 
+        if word == "use" {
+            let code_end = try!(self.code(idx0, "([{", "}])"));
+            let code = &self.text[end..code_end];
+            return Ok((start, Tok::Use(code), code_end));
+        }
+
+        if word == "where" {
+            let mut wcs = vec![];
+            let mut wc_start = end;
+            let mut wc_end;
+            loop {
+                // Note: do not include `{` as a delimiter here, as
+                // that is not legal in the trait/where-clause syntax,
+                // and in fact signals start of the fn body. But do
+                // include `<`.
+                wc_end = try!(self.code(wc_start, "([<", ">])"));
+                let wc = &self.text[wc_start..wc_end];
+                wcs.push(wc);
+
+                // if this ended in a comma, maybe expect another where-clause
+                if let Some((_, ',')) = self.lookahead {
+                    self.bump();
+                    wc_start = wc_end + 1;
+                } else {
+                    break;
+                }
+            }
+
+            return Ok((start, Tok::Where(wcs), wc_end));
+        }
+
         let tok =
             // search for a keyword first; if none are found, this is
             // either a MacroId or an Id, depending on whether there
@@ -269,7 +287,7 @@ impl<'input> Tokenizer<'input> {
             }
         });
 
-        (start, tok, end)
+        Ok((start, tok, end))
     }
 
     fn word(&mut self, idx0: usize) -> Spanned<&'input str> {
@@ -473,7 +491,7 @@ impl<'input> Iterator for Tokenizer<'input> {
                 }
             }
             Some((idx0, c)) if is_identifier_start(c) => {
-                Some(Ok(self.identifierish(idx0)))
+                Some(self.identifierish(idx0))
             }
             Some((_, c)) if c.is_whitespace() => {
                 self.bump();
@@ -491,11 +509,9 @@ impl<'input> Iterator for Tokenizer<'input> {
 }
 
 fn is_identifier_start(c: char) -> bool {
-    // for some reason c.is_xid_start() is not stable :(
-    c.is_alphabetic() || c == '_'
+    UnicodeXID::is_xid_start(c)
 }
 
 fn is_identifier_continue(c: char) -> bool {
-    // for some reason c.is_xid_continue() is not stable :(
-    c.is_alphabetic() || c == '_' || c.is_digit(10)
+    UnicodeXID::is_xid_continue(c)
 }
diff --git a/lalrpop/src/tok/test.rs b/lalrpop/src/tok/test.rs
index 571b9dd..4b4cb8d 100644
--- a/lalrpop/src/tok/test.rs
+++ b/lalrpop/src/tok/test.rs
@@ -1,4 +1,4 @@
-use super::{is_identifier_start, is_identifier_continue, Tok, Tokenizer};
+use super::{Tok, Tokenizer};
 use super::Tok::*;
 
 fn test(input: &str,
@@ -21,16 +21,6 @@ fn test(input: &str,
     assert_eq!(None, tokenizer.skip(len).next());
 }
 
-#[test]
-fn identifier_start1() {
-    assert!(is_identifier_start('f'));
-}
-
-#[test]
-fn identifier_continue1() {
-    assert!(is_identifier_continue('o'));
-}
-
 #[test]
 fn basic() {
     test("extern foo", vec![
@@ -97,3 +87,30 @@ fn string_literals() {
         (r#" ~~~"#, Id("baz")),
Id("baz")), ]); } + +#[test] +fn use1() { + test(r#"use foo::bar; baz"#, vec![ + (r#"~~~~~~~~~~~~ "#, Use(" foo::bar")), + (r#" ~ "#, Semi), + (r#" ~~~"#, Id("baz")), + ]); +} + +#[test] +fn use2() { + test(r#"use {foo,bar}; baz"#, vec![ + (r#"~~~~~~~~~~~~~ "#, Use(" {foo,bar}")), + (r#" ~ "#, Semi), + (r#" ~~~"#, Id("baz")), + ]); +} + +#[test] +fn where1() { + test(r#"where ,baz;"#, vec![ + (r#"~~~~~~~~~~~~~~~~~~~ "#, Where(vec![" ", "baz"])), + (r#" ~"#, Semi), + ]); +} +