Patch up tokenizer for use/where; use is_xid_start from the unicode-xid crate

Niko Matsakis 2015-07-24 05:06:42 -04:00
parent 1d4c5d3efc
commit 5890485b20
5 changed files with 104 additions and 62 deletions

View File

@@ -9,6 +9,7 @@ diff = "0.1"
rand = "0.3"
itertools = "0.3"
term = "0.2"
unicode-xid = "0.0.2"
[dependencies.rusty-peg]
git = "https://github.com/nikomatsakis/rusty-peg.git"

View File

@@ -15,6 +15,7 @@ extern crate rand;
extern crate regex;
extern crate term;
extern crate itertools;
extern crate unicode_xid;
// rust exports a macro that others use, so hoist it early.
#[macro_use]

View File

@@ -42,6 +42,13 @@ fn unknown_nonterminal_in_macro_arg() {
r#"grammar; X = X Id<>>>Y<<<>; Id<T> = T;"#);
}
#[test]
fn unknown_nonterminal_in_repeat_question() {
check_err(
"no definition found for nonterminal `Y`",
r#"grammar; X = >>>Y<<<?;"#);
}
#[test]
fn repeated_macro_arg() {
check_err(

View File

@@ -1,6 +1,7 @@
//! A tokenizer for use in LALRPOP itself.
use std::str::CharIndices;
use unicode_xid::UnicodeXID;
use self::Error::*;
use self::Tok::*;
@@ -26,8 +27,11 @@ pub enum Tok<'input> {
Mut,
Pub,
Token,
Use,
Where,
// Special keywords: these are accompanied by a series of
// uninterpreted strings representing imports and stuff.
Use(&'input str),
Where(Vec<&'input str>),
// Identifiers of various kinds:
Escape(&'input str),
@@ -90,8 +94,6 @@ const KEYWORDS: &'static [(&'static str, Tok<'static>)] = &[
("mut", Mut),
("pub", Pub),
("token", Token),
("use", Use),
("where", Where),
];
impl<'input> Tokenizer<'input> {
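For illustration, a cut-down mirror of the two new variants (a hypothetical standalone snippet, not LALRPOP code), with values taken from the use1/where1 tests later in this diff. It also explains the KEYWORDS change above: `use` and `where` leave the plain keyword table because they now trigger special scanning.

enum Tok<'input> {
    Use(&'input str),        // uninterpreted text after `use`, up to the terminator
    Where(Vec<&'input str>), // one uninterpreted string per where-clause
}

fn main() {
    // `use foo::bar; baz` yields Use(" foo::bar"): everything after the
    // keyword, up to but not including the `;`.
    if let Tok::Use(code) = Tok::Use(" foo::bar") {
        assert_eq!(code.trim(), "foo::bar");
    }
    // `where <foo,bar>,baz;` yields one string per clause; the comma
    // inside `<...>` does not split.
    if let Tok::Where(clauses) = Tok::Where(vec![" <foo,bar>", "baz"]) {
        assert_eq!(clauses.len(), 2);
    }
}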
@@ -137,13 +139,13 @@ impl<'input> Tokenizer<'input> {
Some((idx1, '?')) => {
self.bump();
let idx2 = try!(self.code(idx0));
let idx2 = try!(self.code(idx0, "([{", "}])"));
let code = &self.text[idx1+1..idx2];
Ok((idx0, EqualsGreaterThanQuestionCode(code), idx2))
}
Some((idx1, _)) => {
let idx2 = try!(self.code(idx0));
let idx2 = try!(self.code(idx0, "([{", "}])"));
let code = &self.text[idx1..idx2];
Ok((idx0, EqualsGreaterThanCode(code), idx2))
}
@@ -154,52 +156,37 @@ impl<'input> Tokenizer<'input> {
}
}
fn code(&mut self, idx0: usize) -> Result<usize, Error> {
fn code(&mut self, idx0: usize, open_delims: &str, close_delims: &str) -> Result<usize, Error> {
    // This is the interesting case. To find the end of the code,
    // we have to scan ahead, matching (), [], and {}, and looking
    // for a suitable terminator: `,`, `;`, `]`, `}`, or `)`.
    let mut balance = 0; // number of unclosed `(` etc
    loop {
        match self.lookahead {
            Some((_, '(')) |
            Some((_, '[')) |
            Some((_, '{')) => {
                balance += 1;
            }
            Some((_, ')')) |
            Some((_, ']')) |
            Some((_, '}')) if balance > 0 => {
                balance -= 1;
            }
            None if balance == 0 => {
                // Note: we do not consume the
                // terminator. The code is everything *up
                // to but not including* the terminating
                // `,`, `;`, etc.
                return Ok(self.text.len());
            }
            Some((idx2, ';')) |
            Some((idx2, ',')) |
            Some((idx2, ')')) |
            Some((idx2, ']')) |
            Some((idx2, '}')) if balance == 0 => {
                // Note: we do not consume the
                // terminator. The code is everything *up
                // to but not including* the terminating
                // `,`, `;`, etc.
                return Ok(idx2);
            }
            None if balance > 0 => {
                // the input should not end with an
                // unbalanced number of `{` etc!
                return Err(UnterminatedCode(idx0));
            }
            _ => { }
        }
        if let Some((idx, c)) = self.lookahead {
            if open_delims.find(c).is_some() {
                balance += 1;
            } else if balance > 0 {
                if close_delims.find(c).is_some() {
                    balance -= 1;
                }
            } else {
                debug_assert!(balance == 0);
                if c == ',' || c == ';' || close_delims.find(c).is_some() {
                    // Note: we do not consume the
                    // terminator. The code is everything *up
                    // to but not including* the terminating
                    // `,`, `;`, etc.
                    return Ok(idx);
                }
            }
        } else if balance > 0 {
            // the input should not end with an
            // unbalanced number of `{` etc!
            return Err(UnterminatedCode(idx0));
        } else {
            debug_assert!(balance == 0);
            return Ok(self.text.len());
        }
self.bump();
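The new scanning rule is compact enough to sketch on its own. Below is a minimal, hypothetical free function (scan_code_end is not part of LALRPOP) implementing the same balanced-delimiter scan, with the unterminated-code error reduced to None:

fn scan_code_end(text: &str, open_delims: &str, close_delims: &str) -> Option<usize> {
    let mut balance = 0usize; // number of unclosed open delimiters
    for (idx, c) in text.char_indices() {
        if open_delims.find(c).is_some() {
            balance += 1;
        } else if balance > 0 {
            if close_delims.find(c).is_some() {
                balance -= 1;
            }
        } else if c == ',' || c == ';' || close_delims.find(c).is_some() {
            // like `code` above, stop *before* the terminator
            return Some(idx);
        }
    }
    // balanced end-of-input is fine; unbalanced would be UnterminatedCode
    if balance == 0 { Some(text.len()) } else { None }
}

fn main() {
    // the comma inside `(...)` is protected by the balance count;
    // the scan stops at the top-level comma after `}`
    assert_eq!(scan_code_end("{ foo(bar, baz) },", "([{", "}])"), Some(17));
    assert_eq!(scan_code_end("foo(bar", "([{", "}])"), None); // unbalanced
}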
@@ -251,9 +238,40 @@ impl<'input> Tokenizer<'input> {
(start, Lifetime(word), end)
}
fn identifierish(&mut self, idx0: usize) -> Spanned<Tok<'input>> {
fn identifierish(&mut self, idx0: usize) -> Result<Spanned<Tok<'input>>, Error> {
let (start, word, end) = self.word(idx0);
if word == "use" {
let code_end = try!(self.code(idx0, "([{", "}])"));
let code = &self.text[end..code_end];
return Ok((start, Tok::Use(code), code_end));
}
if word == "where" {
let mut wcs = vec![];
let mut wc_start = end;
let mut wc_end;
loop {
// Note: do not include `{` as a delimiter here, as
// that is not legal in the trait/where-clause syntax,
// and in fact signals start of the fn body. But do
// include `<`.
wc_end = try!(self.code(wc_start, "([<", ">])"));
let wc = &self.text[wc_start..wc_end];
wcs.push(wc);
// if this ended in a comma, maybe expect another where-clause
if let Some((_, ',')) = self.lookahead {
self.bump();
wc_start = wc_end + 1;
} else {
break;
}
}
return Ok((start, Tok::Where(wcs), wc_end));
}
let tok =
// search for a keyword first; if none are found, this is
// either a MacroId or an Id, depending on whether there
@@ -269,7 +287,7 @@ impl<'input> Tokenizer<'input> {
}
});
(start, tok, end)
Ok((start, tok, end))
}
fn word(&mut self, idx0: usize) -> Spanned<&'input str> {
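The where-branch above scans one clause at a time, with `<`/`>` in the delimiter sets instead of `{`/`}`, and splits on top-level commas. A hedged sketch of that splitting, reusing the hypothetical scan_code_end function from the earlier sketch:

fn split_where_clauses(text: &str) -> Vec<&str> {
    let mut clauses = vec![];
    let mut start = 0;
    loop {
        // `<` opens and `>` closes here, so commas inside `<...>` do not split
        let end = scan_code_end(&text[start..], "([<", ">])")
            .map(|i| start + i)
            .unwrap_or(text.len()); // error handling elided in this sketch
        clauses.push(&text[start..end]);
        if text[end..].starts_with(',') {
            start = end + 1; // consume the comma, expect another clause
        } else {
            return clauses;
        }
    }
}

fn main() {
    // mirrors the where1 test below: `<foo,bar>` stays one clause
    assert_eq!(split_where_clauses(" <foo,bar>,baz;"),
               vec![" <foo,bar>", "baz"]);
}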
@@ -473,7 +491,7 @@ impl<'input> Iterator for Tokenizer<'input> {
}
}
Some((idx0, c)) if is_identifier_start(c) => {
Some(Ok(self.identifierish(idx0)))
Some(self.identifierish(idx0))
}
Some((_, c)) if c.is_whitespace() => {
self.bump();
@@ -491,11 +509,9 @@ impl<'input> Iterator for Tokenizer<'input> {
}
fn is_identifier_start(c: char) -> bool {
// for some reason c.is_xid_start() is not stable :(
c.is_alphabetic() || c == '_'
UnicodeXID::is_xid_start(c)
}
fn is_identifier_continue(c: char) -> bool {
// for some reason c.is_xid_continue() is not stable :(
c.is_alphabetic() || c == '_' || c.is_digit(10)
UnicodeXID::is_xid_continue(c)
}
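The unicode-xid swap appears to change behavior at the margins, not just stabilize it. A small sketch of two such edges, assuming the unicode-xid 0.0.2 API added in Cargo.toml above:

extern crate unicode_xid;
use unicode_xid::UnicodeXID;

fn main() {
    // `_` is XID_Continue but not XID_Start, so the new predicate rejects
    // identifiers beginning with an underscore, which the old
    // `c.is_alphabetic() || c == '_'` check accepted.
    assert!(!UnicodeXID::is_xid_start('_'));
    assert!(UnicodeXID::is_xid_continue('_'));

    // The old continue-check used `is_digit(10)` (ASCII 0-9 only), while
    // XID_Continue covers all Unicode decimal digits, e.g. Devanagari zero.
    assert!(!'०'.is_digit(10));
    assert!(UnicodeXID::is_xid_continue('०'));
}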

View File

@@ -1,4 +1,4 @@
use super::{is_identifier_start, is_identifier_continue, Tok, Tokenizer};
use super::{Tok, Tokenizer};
use super::Tok::*;
fn test(input: &str,
@@ -21,16 +21,6 @@ fn test(input: &str,
assert_eq!(None, tokenizer.skip(len).next());
}
#[test]
fn identifier_start1() {
assert!(is_identifier_start('f'));
}
#[test]
fn identifier_continue1() {
assert!(is_identifier_continue('o'));
}
#[test]
fn basic() {
test("extern foo", vec![
@@ -97,3 +87,30 @@ fn string_literals() {
(r#" ~~~"#, Id("baz")),
]);
}
#[test]
fn use1() {
test(r#"use foo::bar; baz"#, vec![
(r#"~~~~~~~~~~~~ "#, Use(" foo::bar")),
(r#" ~ "#, Semi),
(r#" ~~~"#, Id("baz")),
]);
}
#[test]
fn use2() {
test(r#"use {foo,bar}; baz"#, vec![
(r#"~~~~~~~~~~~~~ "#, Use(" {foo,bar}")),
(r#" ~ "#, Semi),
(r#" ~~~"#, Id("baz")),
]);
}
#[test]
fn where1() {
test(r#"where <foo,bar>,baz;"#, vec![
(r#"~~~~~~~~~~~~~~~~~~~ "#, Where(vec![" <foo,bar>", "baz"])),
(r#" ~"#, Semi),
]);
}