Patch up tokenizer for use/where; use is_xid_start from the unicode-xid crate

Niko Matsakis 2015-07-24 05:06:42 -04:00
parent 1d4c5d3efc
commit 5890485b20
5 changed files with 104 additions and 62 deletions

View File

@@ -9,6 +9,7 @@ diff = "0.1"
rand = "0.3"
itertools = "0.3"
term = "0.2"
unicode-xid = "0.0.2"
[dependencies.rusty-peg]
git = "https://github.com/nikomatsakis/rusty-peg.git"

View File

@@ -15,6 +15,7 @@ extern crate rand;
extern crate regex;
extern crate term;
extern crate itertools;
extern crate unicode_xid;
// rust exports a macro that others use, so hoist it early.
#[macro_use]

View File

@@ -42,6 +42,13 @@ fn unknown_nonterminal_in_macro_arg() {
r#"grammar; X = X Id<>>>Y<<<>; Id<T> = T;"#);
}
#[test]
fn unknown_nonterminal_in_repeat_question() {
check_err(
"no definition found for nonterminal `Y`",
r#"grammar; X = >>>Y<<<?;"#);
}
#[test]
fn repeated_macro_arg() {
check_err(

View File

@@ -1,6 +1,7 @@
//! A tokenizer for use in LALRPOP itself.
use std::str::CharIndices;
use unicode_xid::UnicodeXID;
use self::Error::*;
use self::Tok::*;
@@ -26,8 +27,11 @@ pub enum Tok<'input> {
Mut,
Pub,
Token,
Use,
Where,
// Special keywords: these are accompanied by a series of
// uninterpreted strings representing imports and stuff.
Use(&'input str),
Where(Vec<&'input str>),
// Identifiers of various kinds:
Escape(&'input str),
@@ -90,8 +94,6 @@ const KEYWORDS: &'static [(&'static str, Tok<'static>)] = &[
("mut", Mut),
("pub", Pub),
("token", Token),
("use", Use),
("where", Where),
];
impl<'input> Tokenizer<'input> {
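For illustration, a cut-down mirror of the two new variants (a hypothetical standalone snippet, not LALRPOP code), with values taken from the use1/where1 tests later in this diff. It also explains the KEYWORDS change above: `use` and `where` leave the plain keyword table because they now trigger special scanning.

enum Tok<'input> {
    Use(&'input str),        // uninterpreted text after `use`, up to the terminator
    Where(Vec<&'input str>), // one uninterpreted string per where-clause
}

fn main() {
    // `use foo::bar; baz` yields Use(" foo::bar"): everything after the
    // keyword, up to but not including the `;`.
    if let Tok::Use(code) = Tok::Use(" foo::bar") {
        assert_eq!(code.trim(), "foo::bar");
    }
    // `where <foo,bar>,baz;` yields one string per clause; the comma
    // inside `<...>` does not split.
    if let Tok::Where(clauses) = Tok::Where(vec![" <foo,bar>", "baz"]) {
        assert_eq!(clauses.len(), 2);
    }
}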
@@ -137,13 +139,13 @@ impl<'input> Tokenizer<'input> {
Some((idx1, '?')) => {
self.bump();
let idx2 = try!(self.code(idx0));
let idx2 = try!(self.code(idx0, "([{", "}])"));
let code = &self.text[idx1+1..idx2];
Ok((idx0, EqualsGreaterThanQuestionCode(code), idx2))
}
Some((idx1, _)) => {
let idx2 = try!(self.code(idx0));
let idx2 = try!(self.code(idx0, "([{", "}])"));
let code = &self.text[idx1..idx2];
Ok((idx0, EqualsGreaterThanCode(code), idx2))
}
@@ -154,52 +156,37 @@ impl<'input> Tokenizer<'input> {
}
}
fn code(&mut self, idx0: usize) -> Result<usize, Error> {
fn code(&mut self, idx0: usize, open_delims: &str, close_delims: &str) -> Result<usize, Error> {
    // This is the interesting case. To find the end of the code,
    // we have to scan ahead, matching (), [], and {}, and looking
    // for a suitable terminator: `,`, `;`, `]`, `}`, or `)`.
    let mut balance = 0; // number of unclosed `(` etc
    loop {
        match self.lookahead {
            Some((_, '(')) |
            Some((_, '[')) |
            Some((_, '{')) => {
                balance += 1;
            }
            Some((_, ')')) |
            Some((_, ']')) |
            Some((_, '}')) if balance > 0 => {
                balance -= 1;
            }
            None if balance == 0 => {
                // Note: we do not consume the
                // terminator. The code is everything *up
                // to but not including* the terminating
                // `,`, `;`, etc.
                return Ok(self.text.len());
            }
            Some((idx2, ';')) |
            Some((idx2, ',')) |
            Some((idx2, ')')) |
            Some((idx2, ']')) |
            Some((idx2, '}')) if balance == 0 => {
                // Note: we do not consume the
                // terminator. The code is everything *up
                // to but not including* the terminating
                // `,`, `;`, etc.
                return Ok(idx2);
            }
            None if balance > 0 => {
                // the input should not end with an
                // unbalanced number of `{` etc!
                return Err(UnterminatedCode(idx0));
            }
            _ => { }
        }
        if let Some((idx, c)) = self.lookahead {
            if open_delims.find(c).is_some() {
                balance += 1;
            } else if balance > 0 {
                if close_delims.find(c).is_some() {
                    balance -= 1;
                }
            } else {
                debug_assert!(balance == 0);
                if c == ',' || c == ';' || close_delims.find(c).is_some() {
                    // Note: we do not consume the
                    // terminator. The code is everything *up
                    // to but not including* the terminating
                    // `,`, `;`, etc.
                    return Ok(idx);
                }
            }
        } else if balance > 0 {
            // the input should not end with an
            // unbalanced number of `{` etc!
            return Err(UnterminatedCode(idx0));
        } else {
            debug_assert!(balance == 0);
            return Ok(self.text.len());
        }
self.bump();
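The new scanning rule is compact enough to sketch on its own. Below is a minimal, hypothetical free function (scan_code_end is not part of LALRPOP) implementing the same balanced-delimiter scan, with the unterminated-code error reduced to None:

fn scan_code_end(text: &str, open_delims: &str, close_delims: &str) -> Option<usize> {
    let mut balance = 0usize; // number of unclosed open delimiters
    for (idx, c) in text.char_indices() {
        if open_delims.find(c).is_some() {
            balance += 1;
        } else if balance > 0 {
            if close_delims.find(c).is_some() {
                balance -= 1;
            }
        } else if c == ',' || c == ';' || close_delims.find(c).is_some() {
            // like `code` above, stop *before* the terminator
            return Some(idx);
        }
    }
    // balanced end-of-input is fine; unbalanced would be UnterminatedCode
    if balance == 0 { Some(text.len()) } else { None }
}

fn main() {
    // the comma inside `(...)` is protected by the balance count;
    // the scan stops at the top-level comma after `}`
    assert_eq!(scan_code_end("{ foo(bar, baz) },", "([{", "}])"), Some(17));
    assert_eq!(scan_code_end("foo(bar", "([{", "}])"), None); // unbalanced
}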
@@ -251,9 +238,40 @@ impl<'input> Tokenizer<'input> {
(start, Lifetime(word), end)
}
fn identifierish(&mut self, idx0: usize) -> Spanned<Tok<'input>> {
fn identifierish(&mut self, idx0: usize) -> Result<Spanned<Tok<'input>>, Error> {
let (start, word, end) = self.word(idx0);
if word == "use" {
let code_end = try!(self.code(idx0, "([{", "}])"));
let code = &self.text[end..code_end];
return Ok((start, Tok::Use(code), code_end));
}
if word == "where" {
let mut wcs = vec![];
let mut wc_start = end;
let mut wc_end;
loop {
// Note: do not include `{` as a delimiter here, as
// that is not legal in the trait/where-clause syntax,
// and in fact signals start of the fn body. But do
// include `<`.
wc_end = try!(self.code(wc_start, "([<", ">])"));
let wc = &self.text[wc_start..wc_end];
wcs.push(wc);
// if this ended in a comma, maybe expect another where-clause
if let Some((_, ',')) = self.lookahead {
self.bump();
wc_start = wc_end + 1;
} else {
break;
}
}
return Ok((start, Tok::Where(wcs), wc_end));
}
let tok =
// search for a keyword first; if none are found, this is
// either a MacroId or an Id, depending on whether there
@@ -269,7 +287,7 @@ impl<'input> Tokenizer<'input> {
}
});
(start, tok, end)
Ok((start, tok, end))
}
fn word(&mut self, idx0: usize) -> Spanned<&'input str> {
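The where-branch above scans one clause at a time, with `<`/`>` in the delimiter sets instead of `{`/`}`, and splits on top-level commas. A hedged sketch of that splitting, reusing the hypothetical scan_code_end function from the earlier sketch:

fn split_where_clauses(text: &str) -> Vec<&str> {
    let mut clauses = vec![];
    let mut start = 0;
    loop {
        // `<` opens and `>` closes here, so commas inside `<...>` do not split
        let end = scan_code_end(&text[start..], "([<", ">])")
            .map(|i| start + i)
            .unwrap_or(text.len()); // error handling elided in this sketch
        clauses.push(&text[start..end]);
        if text[end..].starts_with(',') {
            start = end + 1; // consume the comma, expect another clause
        } else {
            return clauses;
        }
    }
}

fn main() {
    // mirrors the where1 test below: `<foo,bar>` stays one clause
    assert_eq!(split_where_clauses(" <foo,bar>,baz;"),
               vec![" <foo,bar>", "baz"]);
}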
@@ -473,7 +491,7 @@ impl<'input> Iterator for Tokenizer<'input> {
}
}
Some((idx0, c)) if is_identifier_start(c) => {
Some(Ok(self.identifierish(idx0)))
Some(self.identifierish(idx0))
}
Some((_, c)) if c.is_whitespace() => {
self.bump();
@@ -491,11 +509,9 @@ impl<'input> Iterator for Tokenizer<'input> {
}
fn is_identifier_start(c: char) -> bool {
// for some reason c.is_xid_start() is not stable :(
c.is_alphabetic() || c == '_'
UnicodeXID::is_xid_start(c)
}
fn is_identifier_continue(c: char) -> bool {
// for some reason c.is_xid_continue() is not stable :(
c.is_alphabetic() || c == '_' || c.is_digit(10)
UnicodeXID::is_xid_continue(c)
}
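The unicode-xid swap appears to change behavior at the margins, not just stabilize it. A small sketch of two such edges, assuming the unicode-xid 0.0.2 API added in Cargo.toml above:

extern crate unicode_xid;
use unicode_xid::UnicodeXID;

fn main() {
    // `_` is XID_Continue but not XID_Start, so the new predicate rejects
    // identifiers beginning with an underscore, which the old
    // `c.is_alphabetic() || c == '_'` check accepted.
    assert!(!UnicodeXID::is_xid_start('_'));
    assert!(UnicodeXID::is_xid_continue('_'));

    // The old continue-check used `is_digit(10)` (ASCII 0-9 only), while
    // XID_Continue covers all Unicode decimal digits, e.g. Devanagari zero.
    assert!(!'०'.is_digit(10));
    assert!(UnicodeXID::is_xid_continue('०'));
}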

View File

@@ -1,4 +1,4 @@
use super::{is_identifier_start, is_identifier_continue, Tok, Tokenizer};
use super::{Tok, Tokenizer};
use super::Tok::*;
fn test(input: &str,
@@ -21,16 +21,6 @@ fn test(input: &str,
assert_eq!(None, tokenizer.skip(len).next());
}
#[test]
fn identifier_start1() {
assert!(is_identifier_start('f'));
}
#[test]
fn identifier_continue1() {
assert!(is_identifier_continue('o'));
}
#[test]
fn basic() {
test("extern foo", vec![
@@ -97,3 +87,30 @@ fn string_literals() {
(r#" ~~~"#, Id("baz")),
]);
}
#[test]
fn use1() {
test(r#"use foo::bar; baz"#, vec![
(r#"~~~~~~~~~~~~ "#, Use(" foo::bar")),
(r#" ~ "#, Semi),
(r#" ~~~"#, Id("baz")),
]);
}
#[test]
fn use2() {
test(r#"use {foo,bar}; baz"#, vec![
(r#"~~~~~~~~~~~~~ "#, Use(" {foo,bar}")),
(r#" ~ "#, Semi),
(r#" ~~~"#, Id("baz")),
]);
}
#[test]
fn where1() {
test(r#"where <foo,bar>,baz;"#, vec![
(r#"~~~~~~~~~~~~~~~~~~~ "#, Where(vec![" <foo,bar>", "baz"])),
(r#" ~"#, Semi),
]);
}