mirror of https://github.com/fluencelabs/lalrpop
synced 2025-03-16 17:00:53 +00:00

Merge pull request #496 from Sosthene-Guedon/doc-improvement

Doc improvement

This commit is contained in:
commit 25e88edfe9
@ -6,6 +6,12 @@ pub enum Expr {
    Error,
}

pub enum ExprSymbol<'input> {
    NumSymbol(&'input str),
    Op(Box<ExprSymbol<'input>>, Opcode, Box<ExprSymbol<'input>>),
    Error,
}

#[derive(Copy, Clone)]
pub enum Opcode {
    Mul,
@ -25,6 +31,17 @@ impl Debug for Expr {
    }
}

impl<'input> Debug for ExprSymbol<'input> {
    fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
        use self::ExprSymbol::*;
        match *self {
            NumSymbol(n) => write!(fmt, "{:?}", n),
            Op(ref l, op, ref r) => write!(fmt, "({:?} {:?} {:?})", l, op, r),
            Error => write!(fmt, "error"),
        }
    }
}

impl Debug for Opcode {
    fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
        use self::Opcode::*;

34  doc/calculator/src/calculator7.lalrpop  Normal file

@ -0,0 +1,34 @@
use std::str::FromStr;
use ast::{Expr, Opcode};

grammar(scale: i32);

pub Expr: Box<Expr> = { // (1)
    Expr ExprOp Factor => Box::new(Expr::Op(<>)), // (2)
    Factor,
};

ExprOp: Opcode = { // (3)
    "+" => Opcode::Add,
    "-" => Opcode::Sub,
};

Factor: Box<Expr> = {
    Factor FactorOp Term => Box::new(Expr::Op(<>)),
    Term,
};

FactorOp: Opcode = {
    "*" => Opcode::Mul,
    "/" => Opcode::Div,
};

Term: Box<Expr> = {
    Num => Box::new(Expr::Number(<>)),
    "(" <Expr> ")",
};

Num: i32 = {
    r"[0-9]+" => i32::from_str(<>).unwrap() * scale,
};

36  doc/calculator/src/calculator8.lalrpop  Normal file

@ -0,0 +1,36 @@
use ast::{ExprSymbol, Opcode};
use tok8::Tok;

grammar<'input>(input: &'input str);

pub Expr: Box<ExprSymbol<'input>> = { // (1)
    Expr "ExprOp" Factor => Box::new(ExprSymbol::Op(<>)), // (2)
    Factor,
};

Factor: Box<ExprSymbol<'input>> = {
    Factor "FactorOp" Term => Box::new(ExprSymbol::Op(<>)),
    Term,
};

Term: Box<ExprSymbol<'input>> = {
    "num" => Box::new(ExprSymbol::NumSymbol(<>)),
    "(" <Expr> ")",
};

extern {
    type Location = usize;
    type Error = ();

    enum Tok<'input> {
        "num" => Tok::NumSymbol(<&'input str>),
        "FactorOp" => Tok::FactorOp(<Opcode>),
        "ExprOp" => Tok::ExprOp(<Opcode>),
        "(" => Tok::ParenOpen,
        ")" => Tok::ParenClose,
    }
}
@ -125,6 +125,31 @@ fn calculator6() {
    assert_eq!(errors.len(), 4);
}

lalrpop_mod!(pub calculator7);

#[test]
fn calculator7() {
    let scale = 2;
    let expr = calculator7::ExprParser::new()
        .parse(scale, "11 * 22 + 33")
        .unwrap();
    assert_eq!(&format!("{:?}", expr), "((22 * 44) + 66)");
}

lalrpop_mod!(pub calculator8);
mod tok8;
use tok8::Lexer;

#[test]
fn calculator8() {
    let input = "22 * pi + 66";
    let lexer = Lexer::new(input);
    let expr = calculator8::ExprParser::new()
        .parse(input, lexer)
        .unwrap();
    assert_eq!(&format!("{:?}", expr), "((\"22\" * \"pi\") + \"66\")");
}

#[cfg(not(test))]
fn main() {
    println!("Hello, world!");

59  doc/calculator/src/tok8.rs  Normal file

@ -0,0 +1,59 @@
use ast::Opcode;

pub type Spanned<Tok, Loc, Error> = Result<(Loc, Tok, Loc), Error>;

#[derive(Copy, Clone, Debug)]
pub enum Tok<'input> {
    NumSymbol(&'input str),
    FactorOp(Opcode),
    ExprOp(Opcode),
    ParenOpen,
    ParenClose,
}

use std::str::CharIndices;

pub struct Lexer<'input> {
    chars: std::iter::Peekable<CharIndices<'input>>,
    input: &'input str,
}

impl<'input> Lexer<'input> {
    pub fn new(input: &'input str) -> Self {
        Lexer {
            chars: input.char_indices().peekable(),
            input,
        }
    }
}

impl<'input> Iterator for Lexer<'input> {
    type Item = Spanned<Tok<'input>, usize, ()>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.chars.next() {
                Some((_, ' ')) | Some((_, '\n')) | Some((_, '\t')) => continue,
                Some((i, ')')) => return Some(Ok((i, Tok::ParenClose, i + 1))),
                Some((i, '(')) => return Some(Ok((i, Tok::ParenOpen, i + 1))),
                Some((i, '+')) => return Some(Ok((i, Tok::ExprOp(Opcode::Add), i + 1))),
                Some((i, '-')) => return Some(Ok((i, Tok::ExprOp(Opcode::Sub), i + 1))),
                Some((i, '*')) => return Some(Ok((i, Tok::FactorOp(Opcode::Mul), i + 1))),
                Some((i, '/')) => return Some(Ok((i, Tok::FactorOp(Opcode::Div), i + 1))),

                None => return None, // End of file
                Some((i, _)) => {
                    // Anything else starts a symbol: scan ahead until the next
                    // operator, space, or end of input, then slice it out.
                    loop {
                        match self.chars.peek() {
                            Some((j, ')')) | Some((j, '(')) | Some((j, '+')) | Some((j, '-'))
                            | Some((j, '*')) | Some((j, '/')) | Some((j, ' ')) => {
                                return Some(Ok((i, Tok::NumSymbol(&self.input[i..*j]), *j)))
                            }
                            None => {
                                return Some(Ok((
                                    i,
                                    Tok::NumSymbol(&self.input[i..]),
                                    self.input.len(),
                                )))
                            }
                            _ => {}
                        }
                        self.chars.next();
                    }
                }
            }
        }
    }
}
@ -7,12 +7,15 @@
- [Adding LALRPOP to your project](tutorial/001_adding_lalrpop.md)
- [Parsing parenthesized numbers](tutorial/002_paren_numbers.md)
- [Type inference](tutorial/003_type_inference.md)
- [Controlling the lexer](tutorial/004_controlling_lexer.md)
- [Handling full expressions](tutorial/005_full_expressions.md)
- [Building ASTs](tutorial/006_building_asts.md)
- [Macros](tutorial/007_macros.md)
- [Error recovery](tutorial/008_error_recovery.md)
- [Writing a custom lexer](lexer_tutorial/index.md)
- [Handling full expressions](tutorial/004_full_expressions.md)
- [Building ASTs](tutorial/005_building_asts.md)
- [Macros](tutorial/006_macros.md)
- [Error recovery](tutorial/007_error_recovery.md)
- [Passing state parameter](tutorial/008_state_parameter.md)
- [Controlling the lexer](lexer_tutorial/index.md)
    - [LALRPOP's lexer generator](lexer_tutorial/001_lexer_gen.md)
    - [Writing a custom lexer](lexer_tutorial/002_writing_custom_lexer.md)
    - [Using tokens with references](lexer_tutorial/003_token_references.md)
- [Advanced setup](advanced_setup.md)
- [Generate in source tree](generate_in_source.md)
-----------
@ -1,4 +1,4 @@
# Controlling the lexer
# LALRPOP's lexer generator

This example dives a bit deeper into how LALRPOP works. In particular,
it dives into the meaning of those strings and regular expressions that

201  doc/src/lexer_tutorial/002_writing_custom_lexer.md  Normal file

@ -0,0 +1,201 @@
# Writing a custom lexer

Let's say we want to parse the Whitespace language, so we've put together a grammar like the following:

```lalrpop
pub Program = <Statement*>;

Statement: ast::Stmt = {
    " " <StackOp>,
    "\t" " " <MathOp>,
    "\t" "\t" <HeapOp>,
    "\n" <FlowCtrl>,
    "\t" "\n" <Io>,
};

StackOp: ast::Stmt = {
    " " <Number> => ast::Stmt::Push(<>),
    "\n" " " => ast::Stmt::Dup,
    "\n" "\t" => ast::Stmt::Swap,
    "\n" "\n" => ast::Stmt::Discard,
};

MathOp: ast::Stmt = {
    " " " " => ast::Stmt::Add,
    " " "\t" => ast::Stmt::Sub,
    " " "\n" => ast::Stmt::Mul,
    "\t" " " => ast::Stmt::Div,
    "\t" "\t" => ast::Stmt::Mod,
};

// Remainder omitted
```

Naturally, it doesn't work. By default, LALRPOP generates a tokenizer that skips all whitespace -- including newlines. What we *want* is to capture whitespace characters and ignore the rest as comments, and LALRPOP does the opposite of that.

At the moment, LALRPOP doesn't allow you to configure the default tokenizer. In the future it will become quite flexible, but for now we have to write our own.

Let's start by defining the stream format. The parser will accept an iterator where each item in the stream has the following structure:

```rust
pub type Spanned<Tok, Loc, Error> = Result<(Loc, Tok, Loc), Error>;
```

`Loc` is typically just a `usize`, representing a byte offset into the input string. Each token is accompanied by two of them, marking the start and end positions where it was found. `Error` can be pretty much anything you choose. And of course `Tok` is the meat of the stream, defining what possible values the tokens themselves can have. Following the conventions of Rust iterators, we'll signal a valid token with `Some(Ok(...))`, an error with `Some(Err(...))`, and EOF with `None`.

(Note that the term "tokenizer" normally refers to a piece of code that simply splits up the stream, whereas a "lexer" also tags each token with its lexical category. What we're writing is the latter.)

Whitespace is a simple language from a lexical standpoint, with only three valid tokens:

```rust
pub enum Tok {
    Space,
    Tab,
    Linefeed,
}
```

Everything else is a comment. There are no invalid lexes, so we'll define our own error type, a void enum:

```rust
pub enum LexicalError {
    // Not possible
}
```
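
To make the stream format concrete: for an input like `" x\t"`, the finished lexer should emit a span for the space, silently skip the `x` (a comment), emit a span for the tab, and then signal EOF. A small usage sketch (assuming the `Lexer` type we build below):

```rust
let mut lexer = Lexer::new(" x\t");
lexer.next(); // Some(Ok((0, Tok::Space, 1))) -- space at byte offset 0
lexer.next(); // Some(Ok((2, Tok::Tab, 3)))   -- the 'x' at offset 1 was skipped
lexer.next(); // None                         -- end of input
```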

Now for the lexer itself. We'll take a string slice as its input. For each token we process, we'll want to know the character value, and the byte offset in the string where it begins. We can do that by wrapping the `CharIndices` iterator, which yields tuples of `(usize, char)` representing exactly that information.

```rust
use std::str::CharIndices;

pub struct Lexer<'input> {
    chars: CharIndices<'input>,
}

impl<'input> Lexer<'input> {
    pub fn new(input: &'input str) -> Self {
        Lexer { chars: input.char_indices() }
    }
}
```

(The lifetime parameter `'input` indicates that the Lexer cannot outlive the string it's trying to parse.)

Let's review our rules:

- For a space character, we output `Tok::Space`.
- For a tab character, we output `Tok::Tab`.
- For a linefeed (newline) character, we output `Tok::Linefeed`.
- We skip all other characters.
- If we've reached the end of the string, we'll return `None` to signal EOF.

Writing a lexer for a language with multi-character tokens can get very complicated, but this is so straightforward, we can translate it directly into code without thinking very hard. Here's our `Iterator` implementation:

```rust
impl<'input> Iterator for Lexer<'input> {
    type Item = Spanned<Tok, usize, LexicalError>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.chars.next() {
                Some((i, ' ')) => return Some(Ok((i, Tok::Space, i + 1))),
                Some((i, '\t')) => return Some(Ok((i, Tok::Tab, i + 1))),
                Some((i, '\n')) => return Some(Ok((i, Tok::Linefeed, i + 1))),

                None => return None, // End of file
                _ => continue,       // Comment; skip this character
            }
        }
    }
}
```

That's it. That's all we need.

## Updating the parser

To use this with LALRPOP, we need to expose its API to the parser. It's pretty easy to do, but also somewhat magical, so pay close attention. Pick a convenient place in the grammar file (I chose the bottom) and insert an `extern` block:

```lalrpop
extern {
    // ...
}
```

Now we tell LALRPOP about the `Location` and `Error` types, as if we're writing a trait:

```lalrpop
extern {
    type Location = usize;
    type Error = lexer::LexicalError;

    // ...
}
```

We expose the `Tok` type by kinda sorta redeclaring it:

```lalrpop
extern {
    type Location = usize;
    type Error = lexer::LexicalError;

    enum lexer::Tok {
        // ...
    }
}
```

Now we have to declare each of our terminals. For each variant of `Tok`, we pick what name the parser will see, and write a pattern of the form `name => lexer::Tok::Variant`, similar to how action code works in grammar rules. The name can be an identifier, or a string literal. We'll use the latter.

Here's the whole thing:

```lalrpop
extern {
    type Location = usize;
    type Error = lexer::LexicalError;

    enum lexer::Tok {
        " " => lexer::Tok::Space,
        "\t" => lexer::Tok::Tab,
        "\n" => lexer::Tok::Linefeed,
    }
}
```

From now on, the parser will take a `Lexer` as its input instead of a string slice, like so:

```rust
let lexer = lexer::Lexer::new("\n\n\n");
match parser::parse_Program(lexer) {
    ...
}
```

And any time we write a string literal in the grammar, it'll substitute a variant of our `Tok` enum. This means **we don't have to change any of the rules we already wrote!** This will work as-is:

```lalrpop
FlowCtrl: ast::Stmt = {
    " " " " <Label> => ast::Stmt::Mark(<>),
    " " "\t" <Label> => ast::Stmt::Call(<>),
    " " "\n" <Label> => ast::Stmt::Jump(<>),
    "\t" " " <Label> => ast::Stmt::Jz(<>),
    "\t" "\t" <Label> => ast::Stmt::Js(<>),
    "\t" "\n" => ast::Stmt::Return,
    "\n" "\n" => ast::Stmt::Exit,
};
```

The complete grammar is available in `whitespace/src/parser.lalrpop`.

## Where to go from here

Things to try that apply to lexers in general:

- Longer tokens (see the sketch after this list)
- Tokens that require tracking internal lexer state

Things to try that are LALRPOP-specific:

- Persuade a lexer generator to output the `Spanned` format
- Make this tutorial better
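
As a sketch of the first item: a longer token is usually scanned by peeking ahead and growing the token until it can't continue. The names here (`LongTok`, `lex_number`) are hypothetical, not part of the Whitespace grammar:

```rust
use std::iter::Peekable;
use std::str::CharIndices;

/// Hypothetical token type for a language with multi-character numbers.
pub enum LongTok {
    Number(i64),
}

/// Scan a run of ASCII digits whose first digit sits at byte offset `start`.
fn lex_number(
    chars: &mut Peekable<CharIndices<'_>>,
    input: &str,
    start: usize,
) -> (usize, LongTok, usize) {
    let mut end = start + 1;
    // Keep consuming while the next character extends the number.
    while let Some(&(j, c)) = chars.peek() {
        if !c.is_ascii_digit() {
            break;
        }
        chars.next();
        end = j + 1;
    }
    let value = input[start..end].parse().unwrap();
    (start, LongTok::Number(value), end)
}
```

The `tok8.rs` lexer in the next chapter uses this same peek-and-slice pattern, but keeps the token as a `&str` slice instead of parsing it.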

133  doc/src/lexer_tutorial/003_token_references.md  Normal file

@ -0,0 +1,133 @@
# Using tokens with references

When using a custom lexer, you might want tokens to hold references to the original input.
This makes it possible to refer back to the input when the grammar can contain arbitrary symbols such as variable names.
Using references instead of copying the symbols can improve the performance and memory usage of the parser.

## The Lexer

We can now create a new calculator parser that can deal with symbols the same way an interpreter would deal with variables.
First we need the corresponding AST:

```rust
pub enum ExprSymbol<'input> {
    NumSymbol(&'input str),
    Op(Box<ExprSymbol<'input>>, Opcode, Box<ExprSymbol<'input>>),
    Error,
}
```

Then, we need to build the tokens:

```rust
#[derive(Copy, Clone, Debug)]
pub enum Tok<'input> {
    NumSymbol(&'input str),
    FactorOp(Opcode),
    ExprOp(Opcode),
    ParenOpen,
    ParenClose,
}
```

Notice the `NumSymbol` variant holding a reference to the original input: it represents both numbers and variable names as slices of the original input.
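
In other words (a tiny sketch, with a hypothetical `input` string), a token is just a borrow:

```rust
let input = "22 * pi";
let tok = Tok::NumSymbol(&input[5..7]); // borrows "pi" straight from the input
```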

Then, we can build the lexer itself.

```rust
use std::str::CharIndices;

pub struct Lexer<'input> {
    chars: std::iter::Peekable<CharIndices<'input>>,
    input: &'input str,
}

impl<'input> Lexer<'input> {
    pub fn new(input: &'input str) -> Self {
        Lexer {
            chars: input.char_indices().peekable(),
            input,
        }
    }
}
```

The lexer needs to hold a reference to the input so that it can put slices of it in the tokens.

```rust
impl<'input> Iterator for Lexer<'input> {
    type Item = Spanned<Tok<'input>, usize, ()>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.chars.next() {
                Some((_, ' ')) | Some((_, '\n')) | Some((_, '\t')) => continue,
                Some((i, ')')) => return Some(Ok((i, Tok::ParenClose, i + 1))),
                Some((i, '(')) => return Some(Ok((i, Tok::ParenOpen, i + 1))),
                Some((i, '+')) => return Some(Ok((i, Tok::ExprOp(Opcode::Add), i + 1))),
                Some((i, '-')) => return Some(Ok((i, Tok::ExprOp(Opcode::Sub), i + 1))),
                Some((i, '*')) => return Some(Ok((i, Tok::FactorOp(Opcode::Mul), i + 1))),
                Some((i, '/')) => return Some(Ok((i, Tok::FactorOp(Opcode::Div), i + 1))),

                None => return None, // End of file
                Some((i, _)) => {
                    // Anything else starts a symbol: scan ahead until the next
                    // operator, space, or end of input, then slice it out.
                    loop {
                        match self.chars.peek() {
                            Some((j, ')')) | Some((j, '(')) | Some((j, '+')) | Some((j, '-'))
                            | Some((j, '*')) | Some((j, '/')) | Some((j, ' ')) => {
                                return Some(Ok((i, Tok::NumSymbol(&self.input[i..*j]), *j)))
                            }
                            None => {
                                return Some(Ok((
                                    i,
                                    Tok::NumSymbol(&self.input[i..]),
                                    self.input.len(),
                                )))
                            }
                            _ => {
                                self.chars.next();
                            }
                        }
                    }
                }
            }
        }
    }
}
```

The logic is quite simple: the lexer returns operator tokens directly; for any other character it records the starting position, advances until it peeks the next operator, space, or the end of input, and then emits the symbol it just scanned.
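
For instance (a small usage sketch, assuming the `Lexer` above is in scope), the input `"22 * pi"` produces three tokens:

```rust
let mut lexer = Lexer::new("22 * pi");
lexer.next(); // Some(Ok((0, Tok::NumSymbol("22"), 2)))
lexer.next(); // Some(Ok((3, Tok::FactorOp(Opcode::Mul), 4)))
lexer.next(); // Some(Ok((5, Tok::NumSymbol("pi"), 7)))
lexer.next(); // None
```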

## The parser

We can then take a look at the corresponding parser with a new grammar:

```lalrpop
Term: Box<ExprSymbol<'input>> = {
    "num" => Box::new(ExprSymbol::NumSymbol(<>)),
    "(" <Expr> ")",
};
```

We need to pass the input to the parser so that the input's lifetime is known to the borrow checker when compiling the generated parser.

```lalrpop
grammar<'input>(input: &'input str);
```

Then we just need to define the tokens the same as before:

```lalrpop
extern {
    type Location = usize;
    type Error = ();

    enum Tok<'input> {
        "num" => Tok::NumSymbol(<&'input str>),
        "FactorOp" => Tok::FactorOp(<Opcode>),
        "ExprOp" => Tok::ExprOp(<Opcode>),
        "(" => Tok::ParenOpen,
        ")" => Tok::ParenClose,
    }
}
```

## Calling the parser

We can finally run the parser we built:

```rust
let input = "22 * pi + 66";
let lexer = Lexer::new(input);
let expr = calculator8::ExprParser::new()
    .parse(input, lexer)
    .unwrap();
assert_eq!(&format!("{:?}", expr), "((\"22\" * \"pi\") + \"66\")");
```

@ -1,201 +1,7 @@
# Writing a custom lexer
# Fine control over the lexer

Let's say we want to parse the Whitespace language, so we've put together a grammar like the following:
This part is about controlling the inner workings of LALRPOP's built-in lexer generator and using your own hand-written lexer.

- [LALRPOP's lexer generator](001_lexer_gen.md)
- [Writing a custom lexer](002_writing_custom_lexer.md)
- [Using tokens with references](003_token_references.md)

36  doc/src/tutorial/008_state_parameter.md  Normal file

@ -0,0 +1,36 @@
# Passing state parameter

By default, the parser doesn't take any argument other than the input.
When building the AST, it might be useful to pass parameters to the parser that are needed for the construction of the tree.

Going back to the calculator4 example, it is possible to pass an argument to the parser:

```rust
grammar(scale: i32);
```

```rust
Num: i32 = {
    r"[0-9]+" => i32::from_str(<>).unwrap() * scale,
};
```

Here the parser will accept a `scale` parameter that will scale every number encountered.

We can then call the parser with the state parameter:

```rust
#[test]
fn calculator7() {
    let scale = 2;
    let expr = calculator7::ExprParser::new()
        .parse(scale, "11 * 22 + 33")
        .unwrap();
    assert_eq!(&format!("{:?}", expr), "((22 * 44) + 66)");
}
```

For a more practical example with a custom tree structure, check out [this parser](https://github.com/lalrpop/lalrpop/blob/master/lalrpop-test/src/expr_arena.lalrpop) using [this structure](https://github.com/lalrpop/lalrpop/blob/master/lalrpop-test/src/expr_arena_ast.rs) to build the AST.

@ -3,14 +3,14 @@ This is a tutorial for how to write a complete parser for a simple calculator us
If you are unfamiliar with what a parser generator is, you should read [Crash course on parsers]
first.

- [Adding LALRPOP to your project](tutorial/001_adding_lalrpop.html)
- [Parsing parenthesized numbers](tutorial/002_paren_numbers.html)
- [Type inference](tutorial/003_type_inference.html)
- [Controlling the lexer](tutorial/004_controlling_lexer.html)
- [Handling full expressions](tutorial/005_full_expressions.html)
- [Building ASTs](tutorial/006_building_asts.html)
- [Macros](tutorial/007_macros.html)
- [Error recovery](tutorial/008_error_recovery.html)
- [Adding LALRPOP to your project](001_adding_lalrpop.md)
- [Parsing parenthesized numbers](002_paren_numbers.md)
- [Type inference](003_type_inference.md)
- [Handling full expressions](004_full_expressions.md)
- [Building ASTs](005_building_asts.md)
- [Macros](006_macros.md)
- [Error recovery](007_error_recovery.md)
- [Passing state parameter](008_state_parameter.md)

This tutorial is still incomplete. Here are some topics that I aim to
cover when I get time to write about them: