From a1a046f9dde731e6e3d0ffe825ad01aeb1c9e70e Mon Sep 17 00:00:00 2001 From: Johannes Stoelp Date: Fri, 3 Sep 2021 00:06:38 +0200 Subject: ch2: added parser following chapter 2 --- src/main.rs | 66 ++++++++- src/parser.rs | 433 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 494 insertions(+), 5 deletions(-) create mode 100644 src/parser.rs (limited to 'src') diff --git a/src/main.rs b/src/main.rs index 361f76f..0e91734 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,18 +1,74 @@ mod lexer; +mod parser; -use lexer::Lexer; +use lexer::{Lexer, Token}; +use parser::Parser; use std::io::Read; +fn handle_definition(p: &mut Parser) +where + I: Iterator, +{ + match p.parse_definition() { + Ok(expr) => println!("Parse 'def'\n{:?}", expr), + Err(err) => { + eprintln!("Error: {:?}", err); + p.get_next_token(); + } + } +} + +fn handle_extern(p: &mut Parser) +where + I: Iterator, +{ + match p.parse_extern() { + Ok(expr) => println!("Parse 'extern'\n{:?}", expr), + Err(err) => { + eprintln!("Error: {:?}", err); + p.get_next_token(); + } + } +} + +fn handle_top_level_expression(p: &mut Parser) +where + I: Iterator, +{ + match p.parse_top_level_expr() { + Ok(expr) => println!("Parse top-level expression\n{:?}", expr), + Err(err) => { + eprintln!("Error: {:?}", err); + p.get_next_token(); + } + } +} + fn main() { - println!("Lex stdin."); - println!("ENTER to lex current input."); + println!("Parse stdin."); + println!("ENTER to parse current input."); println!("C-c to exit."); - let mut lex = Lexer::new(std::io::stdin().bytes().filter_map(|v| { + + let lexer = Lexer::new(std::io::stdin().bytes().filter_map(|v| { let v = v.ok()?; Some(v.into()) })); + let mut parser = Parser::new(lexer); + + // Throw first coin and initialize cur_tok. + parser.get_next_token(); + loop { - println!("{:?}", lex.gettok()); + match *parser.cur_tok() { + Token::Eof => break, + Token::Char(';') => { + // Ignore top-level semicolon. + parser.get_next_token() + } + Token::Def => handle_definition(&mut parser), + Token::Extern => handle_extern(&mut parser), + _ => handle_top_level_expression(&mut parser), + } } } diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..f18d49e --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,433 @@ +use crate::lexer::{Lexer, Token}; + +#[derive(Debug, PartialEq)] +pub enum ExprAST { + /// Number - Expression class for numeric literals like "1.0". + Number(f64), + + /// Variable - Expression class for referencing a variable, like "a". + Variable(String), + + /// Binary - Expression class for a binary operator. + Binary(char, Box, Box), + + /// Call - Expression class for function calls. + Call(String, Vec>), +} + +/// PrototypeAST - This class represents the "prototype" for a function, +/// which captures its name, and its argument names (thus implicitly the number +/// of arguments the function takes). +#[derive(Debug)] +pub struct PrototypeAST(String, Vec); + +/// FunctionAST - This class represents a function definition itself. +#[derive(Debug)] +pub struct FunctionAST(Box, Box); + +/// Parse result with String as Error type (to be compliant with tutorial). +type ParseResult = Result; + +pub struct Parser +where + I: Iterator, +{ + lexer: Lexer, + cur_tok: Option, +} + +impl Parser +where + I: Iterator, +{ + pub fn new(lexer: Lexer) -> Self { + Parser { + lexer, + cur_tok: None, + } + } + + // ----------------------- + // Simple Token Buffer + // ----------------------- + + /// Implement the global variable `int CurTok;` from the tutorial. + /// + /// # Panics + /// Panics if the parser doesn't have a current token. + pub fn cur_tok(&self) -> &Token { + self.cur_tok.as_ref().expect("Parser: Expected cur_token!") + } + + /// Advance the `cur_tok` by getting the next token from the lexer. + /// + /// Implement the fucntion `int getNextToken();` from the tutorial. + pub fn get_next_token(&mut self) { + self.cur_tok = Some(self.lexer.gettok()); + } + + // ---------------------------- + // Basic Expression Parsing + // ---------------------------- + + /// numberexpr ::= number + /// + /// Implement `std::unique_ptr ParseNumberExpr();` from the tutorial. + fn parse_num_expr(&mut self) -> ParseResult> { + match *self.cur_tok() { + Token::Number(num) => { + // Consume the number token. + self.get_next_token(); + Ok(Box::new(ExprAST::Number(num))) + } + _ => unreachable!(), + } + } + + /// parenexpr ::= '(' expression ')' + /// + /// Implement `std::unique_ptr ParseParenExpr();` from the tutorial. + fn parse_paren_expr(&mut self) -> ParseResult> { + // Eat '(' token. + assert_eq!(*self.cur_tok(), Token::Char('(')); + self.get_next_token(); + + let v = self.parse_expression()?; + + if *self.cur_tok() == Token::Char(')') { + // Eat ')' token. + self.get_next_token(); + Ok(v) + } else { + Err("expected ')'".into()) + } + } + + /// identifierexpr + /// ::= identifier + /// ::= identifier '(' expression* ')' + /// + /// Implement `std::unique_ptr ParseIdentifierExpr();` from the tutorial. + fn parse_identifier_expr(&mut self) -> ParseResult> { + let id_name = match self.cur_tok.take() { + Some(Token::Identifier(id)) => { + // Consume identifier. + self.get_next_token(); + id + } + _ => unreachable!(), + }; + + if *self.cur_tok() != Token::Char('(') { + // Simple variable reference. + Ok(Box::new(ExprAST::Variable(id_name))) + } else { + // Call. + + // Eat '(' token. + self.get_next_token(); + + let mut args: Vec> = Vec::new(); + + // If there are arguments collect them. + if *self.cur_tok() != Token::Char(')') { + loop { + let arg = self.parse_expression()?; + args.push(arg); + + if *self.cur_tok() == Token::Char(')') { + // Eat ')' token. + self.get_next_token(); + break; + } + + if *self.cur_tok() != Token::Char(',') { + return Err("Expected ')' or ',' in argument list".into()); + } + + self.get_next_token(); + } + } + + Ok(Box::new(ExprAST::Call(id_name, args))) + } + } + + /// primary + /// ::= identifierexpr + /// ::= numberexpr + /// ::= parenexpr + /// + /// Implement `std::unique_ptr ParsePrimary();` from the tutorial. + fn parse_primary(&mut self) -> ParseResult> { + match *self.cur_tok() { + Token::Identifier(_) => self.parse_identifier_expr(), + Token::Number(_) => self.parse_num_expr(), + Token::Char('(') => self.parse_paren_expr(), + _ => Err("unknown token when expecting an expression".into()), + } + } + + // ----------------------------- + // Binary Expression Parsing + // ----------------------------- + + /// /// expression + /// ::= primary binoprhs + /// + /// Implement `std::unique_ptr ParseExpression();` from the tutorial. + fn parse_expression(&mut self) -> ParseResult> { + let lhs = self.parse_primary()?; + self.parse_bin_op_rhs(0, lhs) + } + + /// binoprhs + /// ::= ('+' primary)* + /// + /// Implement `std::unique_ptr ParseBinOpRHS(int ExprPrec, std::unique_ptr LHS);` from the tutorial. + fn parse_bin_op_rhs( + &mut self, + expr_prec: isize, + mut lhs: Box, + ) -> ParseResult> { + loop { + let tok_prec = get_tok_precedence(self.cur_tok()); + + // Not a binary operator or precedence is too small. + if tok_prec < expr_prec { + return Ok(lhs); + } + + let binop = match self.cur_tok.take() { + Some(Token::Char(c)) => { + // Eat binary operator. + self.get_next_token(); + c + } + _ => unreachable!(), + }; + + // lhs BINOP1 rhs BINOP2 remrhs + // ^^^^^^ ^^^^^^ + // tok_prec next_prec + // + // In case BINOP1 has higher precedence, we are done here and can build a 'Binary' AST + // node between 'lhs' and 'rhs'. + // + // In case BINOP2 has higher precedence, we take 'rhs' as 'lhs' and recurse into the + // 'remrhs' expression first. + + // Parse primary expression after binary operator. + let mut rhs = self.parse_primary()?; + + let next_prec = get_tok_precedence(self.cur_tok()); + if tok_prec < next_prec { + // BINOP2 has higher precedence thatn BINOP1, recurse into 'remhs'. + rhs = self.parse_bin_op_rhs(tok_prec + 1, rhs)? + } + + lhs = Box::new(ExprAST::Binary(binop, lhs, rhs)); + } + } + + // -------------------- + // Parsing the Rest + // -------------------- + + /// prototype + /// ::= id '(' id* ')' + /// + /// Implement `std::unique_ptr ParsePrototype();` from the tutorial. + fn parse_prototype(&mut self) -> ParseResult> { + let id_name = match self.cur_tok.take() { + Some(Token::Identifier(id)) => { + // Consume the identifier. + self.get_next_token(); + id + } + other => { + // Plug back current token. + self.cur_tok = other; + return Err("Expected function name in prototype".into()); + } + }; + + if *self.cur_tok() != Token::Char('(') { + return Err("Expected '(' in prototype".into()); + } + + let mut args: Vec = Vec::new(); + loop { + self.get_next_token(); + + match self.cur_tok.take() { + Some(Token::Identifier(arg)) => args.push(arg), + other => { + self.cur_tok = other; + break; + } + } + } + + if *self.cur_tok() != Token::Char(')') { + return Err("Expected ')' in prototype".into()); + } + + // Consume ')'. + self.get_next_token(); + + Ok(Box::new(PrototypeAST(id_name, args))) + } + + /// definition ::= 'def' prototype expression + /// + /// Implement `std::unique_ptr ParseDefinition();` from the tutorial. + pub fn parse_definition(&mut self) -> ParseResult> { + // Consume 'def' token. + assert_eq!(*self.cur_tok(), Token::Def); + self.get_next_token(); + + let proto = self.parse_prototype()?; + let expr = self.parse_expression()?; + + Ok(Box::new(FunctionAST(proto, expr))) + } + + /// external ::= 'extern' prototype + /// + /// Implement `std::unique_ptr ParseExtern();` from the tutorial. + pub fn parse_extern(&mut self) -> ParseResult> { + // Consume 'extern' token. + assert_eq!(*self.cur_tok(), Token::Extern); + self.get_next_token(); + + self.parse_prototype() + } + + /// toplevelexpr ::= expression + /// + /// Implement `std::unique_ptr ParseTopLevelExpr();` from the tutorial. + pub fn parse_top_level_expr(&mut self) -> ParseResult> { + let e = self.parse_expression()?; + let proto = Box::new(PrototypeAST("".into(), Vec::new())); + Ok(Box::new(FunctionAST(proto, e))) + } +} + +/// Get the binary operator precedence. +/// +/// Implement `int GetTokPrecedence();` from the tutorial. +fn get_tok_precedence(tok: &Token) -> isize { + match tok { + Token::Char('<') => 10, + Token::Char('+') => 20, + Token::Char('-') => 20, + Token::Char('*') => 40, + _ => -1, + } +} + +#[cfg(test)] +mod test { + use super::{ExprAST, Parser}; + use crate::lexer::Lexer; + + fn parser(input: &str) -> Parser { + let l = Lexer::new(input.chars()); + let mut p = Parser::new(l); + + // Drop initial coin, initialize cur_tok. + p.get_next_token(); + + p + } + + #[test] + fn parse_number() { + let mut p = parser("13.37"); + + assert_eq!(p.parse_num_expr(), Ok(Box::new(ExprAST::Number(13.37f64)))); + } + + #[test] + fn parse_variable() { + let mut p = parser("foop"); + + assert_eq!( + p.parse_identifier_expr(), + Ok(Box::new(ExprAST::Variable("foop".into()))) + ); + } + + #[test] + fn parse_primary() { + let mut p = parser("1337 foop \n bla(123)"); + + assert_eq!(p.parse_primary(), Ok(Box::new(ExprAST::Number(1337f64)))); + + assert_eq!( + p.parse_primary(), + Ok(Box::new(ExprAST::Variable("foop".into()))) + ); + + assert_eq!( + p.parse_primary(), + Ok(Box::new(ExprAST::Call( + "bla".into(), + vec![Box::new(ExprAST::Number(123f64))] + ))) + ); + } + + #[test] + fn parse_binary_op() { + // Operator before RHS has higher precedence, expected AST + // + // - + // / \ + // + c + // / \ + // a b + let mut p = parser("a + b - c"); + + let binexpr_ab = Box::new(ExprAST::Binary( + '+', + Box::new(ExprAST::Variable("a".into())), + Box::new(ExprAST::Variable("b".into())), + )); + + let binexpr_abc = Box::new(ExprAST::Binary( + '-', + binexpr_ab, + Box::new(ExprAST::Variable("c".into())), + )); + + assert_eq!(p.parse_expression(), Ok(binexpr_abc)); + } + + #[test] + fn parse_binary_op2() { + // Operator after RHS has higher precedence, expected AST + // + // + + // / \ + // a * + // / \ + // b c + let mut p = parser("a + b * c"); + + let binexpr_bc = Box::new(ExprAST::Binary( + '*', + Box::new(ExprAST::Variable("b".into())), + Box::new(ExprAST::Variable("c".into())), + )); + + let binexpr_abc = Box::new(ExprAST::Binary( + '+', + Box::new(ExprAST::Variable("a".into())), + binexpr_bc, + )); + + assert_eq!(p.parse_expression(), Ok(binexpr_abc)); + } +} -- cgit v1.2.3