Add tutorial for tokens with references

2025-03-16 17:00:53 +00:00 · 2019-12-26 23:58:47 +01:00 · 2019-12-26 23:58:47 +01:00 · ad1a4f8863
commit ad1a4f8863
parent 16363051e6
3 changed files with 128 additions and 3 deletions
--- a/doc/calculator/src/calculator8.lalrpop
+++ b/doc/calculator/src/calculator8.lalrpop
@ -1,7 +1,7 @@
 use ast::{ExprSymbol, Opcode};
 use tok8::Tok;

-grammar<'input>(text: &'input str);
+grammar<'input>(input: &'input str);

 pub Expr: Box<ExprSymbol<'input>> = { // (1)
    Expr "ExprOp" Factor => Box::new(ExprSymbol::Op(<>)), // (2)
@ -30,7 +30,6 @@ extern {
        "(" => Tok::ParenOpen,
        ")" => Tok::ParenClose,
    }
-
 }


--- a/doc/calculator/src/tok8.rs
+++ b/doc/calculator/src/tok8.rs
@ -4,7 +4,6 @@ pub type Spanned<Tok, Loc, Error> = Result<(Loc, Tok, Loc), Error>;

 #[derive(Copy, Clone, Debug)]
 pub enum Tok<'input> {
-    Num(i32),
    NumSymbol(&'input str),
    FactorOp(Opcode),
    ExprOp(Opcode),
--- a/doc/src/lexer_tutorial/003_token_references.md
+++ b/doc/src/lexer_tutorial/003_token_references.md
@ -1 +1,128 @@
 # Using tokens with references
+
+When using a custom lexer, you might want tokens to hold references to the original input.
+This lets tokens borrow directly from the input when the grammar accepts arbitrary symbols, such as variable names.
+Using references instead of copying the symbols can improve the performance of the parser and reduce its memory usage.
+
+## The Lexer
+
+We can now create a new calculator parser that can deal with symbols the same way an interpreter would deal with variables.
+First, we need the corresponding AST:
+
+``` rust
+/// AST node of the calculator; `'input` is the lifetime of the text being parsed.
+pub enum ExprSymbol<'input>{
+    /// A symbol, stored as a borrowed slice rather than an owned copy.
+    NumSymbol(&'input str),
+    /// A binary operation: left operand, operator, right operand.
+    Op(Box<ExprSymbol<'input>>, Opcode, Box<ExprSymbol<'input>>),
+    // NOTE(review): presumably a placeholder produced on parse errors — confirm against the grammar.
+    Error,
+}
+```
+
+Then, we need to build the tokens:
+
+
+``` rust
+#[derive(Copy, Clone, Debug)]
+pub enum Tok<'input> {
+    NumSymbol(&'input str),
+    FactorOp(Opcode),
+    ExprOp(Opcode),
+    ParenOpen,
+    ParenClose,
+}
+```
+
+Then, we can build the lexer itself.
+It's quite simple: it returns each operator or parenthesis as its own token, and when it sees any other character, it records the start position, advances to the next operator, parenthesis, or space, and emits the text in between as a symbol.
+
+``` rust
+use std::str::CharIndices;
+
+/// Lexer state: a cursor over the input plus the input itself.
+pub struct Lexer<'input> {
+    /// Peekable iterator over `(byte index, char)` pairs of the input.
+    chars: std::iter::Peekable<CharIndices<'input>>,
+    /// The full input, kept so that symbol tokens can be sliced out of it.
+    input: &'input str,
+}
+
+impl<'input> Lexer<'input> {
+    /// Creates a lexer positioned at the start of `input`.
+    pub fn new(input: &'input str) -> Self {
+        Lexer {
+            // `char_indices` yields byte offsets, which are later used to slice `input`.
+            chars: input.char_indices().peekable(),
+            input,
+        }
+    }
+}
+
+/// Iterating over the lexer yields the tokens of the input together with their byte spans.
+impl<'input> Iterator for Lexer<'input> {
+    /// `Ok((start, token, end))`, where `start..end` is the token's byte range in the input.
+    type Item = Spanned<Tok<'input>, usize, ()>;
+
+    /// Returns the next token, or `None` once the input is exhausted.
+    ///
+    /// Spaces, newlines and tabs are skipped. Parentheses and operators are
+    /// single-character tokens. Any other run of characters becomes a
+    /// `Tok::NumSymbol` that borrows directly from the input.
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            match self.chars.next() {
+                Some((_, ' ')) | Some((_, '\n')) | Some((_, '\t')) => continue,
+                Some((i, ')')) => return Some(Ok((i, Tok::ParenClose, i + 1))),
+                Some((i, '(')) => return Some(Ok((i, Tok::ParenOpen, i + 1))),
+                Some((i, '+')) => return Some(Ok((i, Tok::ExprOp(Opcode::Add), i + 1))),
+                Some((i, '-')) => return Some(Ok((i, Tok::ExprOp(Opcode::Sub), i + 1))),
+                Some((i, '*')) => return Some(Ok((i, Tok::FactorOp(Opcode::Mul), i + 1))),
+                Some((i, '/')) => return Some(Ok((i, Tok::FactorOp(Opcode::Div), i + 1))),
+
+                None => return None, // End of file
+                // Any other character starts a symbol at byte offset `i`.
+                Some((i, _)) => loop {
+                    // Only peek here: the delimiter must stay in the iterator so that
+                    // the next call can skip it or turn it into its own token.
+                    match self.chars.peek() {
+                        // A symbol ends at a parenthesis, an operator, or any of the
+                        // whitespace characters that the outer match skips.
+                        Some(&(j, c)) if "()+-*/ \n\t".contains(c) => {
+                            return Some(Ok((i, Tok::NumSymbol(&self.input[i..j]), j)));
+                        }
+                        // The symbol runs to the end of the input.
+                        None => {
+                            let end = self.input.len();
+                            return Some(Ok((i, Tok::NumSymbol(&self.input[i..]), end)));
+                        }
+                        // Still inside the symbol: consume the character.
+                        _ => {
+                            self.chars.next();
+                        }
+                    }
+                },
+            }
+        }
+    }
+}
+```
+
+## The parser
+
+We can then take a look at the corresponding parser with a new grammar:
+
+``` rust
+Term: Box<ExprSymbol<'input>> = {
+    "num" => Box::new(ExprSymbol::NumSymbol(<>)),
+    "(" <Expr> ")"
+};
+```
+
+We need to pass the input to the parser so that the input's lifetime is known to the borrow checker when compiling the generated parser.
+``` rust
+grammar<'input>(input: &'input str);
+```
+
+Then we just need to define the tokens the same way as before:
+
+``` rust
+extern {
+    type Location = usize;
+    type Error = ();
+    
+    enum Tok<'input> {
+        "num" => Tok::NumSymbol(<&'input str>),
+        "FactorOp" => Tok::FactorOp(<Opcode>),
+        "ExprOp" => Tok::ExprOp(<Opcode>),
+        "(" => Tok::ParenOpen,
+        ")" => Tok::ParenClose,
+    }
+}
+```
+
+## Calling the parser
+We can finally run the parser we built:
+
+``` rust
+// The same `input` is given to both the lexer and the parser.
+let input = "22 * pi + 66";
+let lexer = Lexer::new(input);
+let expr = calculator8::ExprParser::new()
+    .parse(input,lexer)
+    .unwrap();
+// NOTE(review): the expected string depends on the `Debug` impl of `ExprSymbol`, which is not shown here.
+assert_eq!(&format!("{:?}", expr), "((\"22\" * \"pi\") + \"66\")");
+```