// Recovered from the patch for commit dfe15c16de2f8c80de55ac442ff8d8ff3687fb11
// (Johannes Stoelp, Mon, 30 Aug 2021 22:13:08 +0200), which creates `src/lexer.rs`:
// "ch1: added minimal lexer implementation following chapter 1".

/// A token produced by [`Lexer::gettok`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// The input is exhausted.
    Eof,
    /// The keyword `def`.
    Def,
    /// The keyword `extern`.
    Extern,
    /// An identifier matching `[a-zA-Z][a-zA-Z0-9]*` that is not a keyword.
    Identifier(String),
    /// A numeric literal lexed from a run of `[0-9.]` characters.
    Number(f64),
    /// Any other single character (for example an operator such as `+`).
    Char(char),
}

/// Lexer turning a stream of characters into [`Token`]s.
///
/// The lexer always holds one character of lookahead in `last_char`;
/// `None` means the underlying iterator is exhausted.
pub struct Lexer<I>
where
    I: Iterator<Item = char>,
{
    input: I,
    last_char: Option<char>,
}

impl<I> Lexer<I>
where
    I: Iterator<Item = char>,
{
    /// Create a lexer over `input`, priming the lookahead with its first character.
    pub fn new(mut input: I) -> Lexer<I> {
        let last_char = input.next();
        Lexer { input, last_char }
    }

    /// Advance the input by one character, store it as the new lookahead and return it.
    fn step(&mut self) -> Option<char> {
        self.last_char = self.input.next();
        self.last_char
    }

    /// Collect `first` plus every following character accepted by `keep`.
    ///
    /// On return the lookahead is the first rejected character, or `None`
    /// if the input ended while collecting.
    fn consume_while(&mut self, first: char, keep: impl Fn(char) -> bool) -> String {
        let mut text = String::new();
        text.push(first);

        while let Some(c) = self.step() {
            if !keep(c) {
                break;
            }
            text.push(c);
        }

        text
    }

    /// Lex and return the next token.
    ///
    /// Implement `int gettok();` from the tutorial.
    ///
    /// ASCII whitespace and `#` comments (running up to the next `\r` or `\n`)
    /// are skipped. Once the input is exhausted every call returns
    /// [`Token::Eof`].
    pub fn gettok(&mut self) -> Token {
        // Loop instead of recursing after a comment, so that a long run of
        // comment lines cannot grow the call stack.
        loop {
            // Eat up whitespaces.
            while matches!(self.last_char, Some(c) if c.is_ascii_whitespace()) {
                self.step();
            }

            // Unpack last char or return EOF.
            let last_char = match self.last_char {
                Some(c) => c,
                None => return Token::Eof,
            };

            // Identifier: [a-zA-Z][a-zA-Z0-9]*
            if last_char.is_ascii_alphabetic() {
                let ident = self.consume_while(last_char, |c| c.is_ascii_alphanumeric());
                return match ident.as_str() {
                    "def" => Token::Def,
                    "extern" => Token::Extern,
                    _ => Token::Identifier(ident),
                };
            }

            // Number: [0-9.]+
            if last_char.is_ascii_digit() || last_char == '.' {
                let num = self.consume_while(last_char, |c| c.is_ascii_digit() || c == '.');
                // Text that is not a valid float (e.g. `12.34.56`) falls back to
                // the default value and is reported as `Number(0.0)`.
                let num: f64 = num.parse().unwrap_or_default();
                return Token::Number(num);
            }

            // Eat up comment.
            if last_char == '#' {
                loop {
                    match self.step() {
                        // End of line: resume lexing behind the comment.
                        Some('\r') | Some('\n') => break,
                        None => return Token::Eof,
                        _ => { /* consume comment */ }
                    }
                }
                continue;
            }

            // Advance last char and return currently last char.
            self.step();
            return Token::Char(last_char);
        }
    }
}

#[cfg(test)]
mod test {
    use super::{Lexer, Token};

    #[test]
    fn test_identifier() {
        let mut lex = Lexer::new("a b c".chars());
        assert_eq!(Token::Identifier("a".into()), lex.gettok());
        assert_eq!(Token::Identifier("b".into()), lex.gettok());
        assert_eq!(Token::Identifier("c".into()), lex.gettok());
        assert_eq!(Token::Eof, lex.gettok());
    }

    #[test]
    fn test_keyword() {
        let mut lex = Lexer::new("def extern".chars());
        assert_eq!(Token::Def, lex.gettok());
        assert_eq!(Token::Extern, lex.gettok());
        assert_eq!(Token::Eof, lex.gettok());
    }

    #[test]
    fn test_number() {
        let mut lex = Lexer::new("12.34".chars());
        assert_eq!(Token::Number(12.34f64), lex.gettok());
        assert_eq!(Token::Eof, lex.gettok());

        let mut lex = Lexer::new(" 1.0 2.0 3.0".chars());
        assert_eq!(Token::Number(1.0f64), lex.gettok());
        assert_eq!(Token::Number(2.0f64), lex.gettok());
        assert_eq!(Token::Number(3.0f64), lex.gettok());
        assert_eq!(Token::Eof, lex.gettok());

        let mut lex = Lexer::new("12.34.56".chars());
        assert_eq!(Token::Number(0f64), lex.gettok());
        assert_eq!(Token::Eof, lex.gettok());
    }

    #[test]
    fn test_comment() {
        let mut lex = Lexer::new("# some comment".chars());
        assert_eq!(Token::Eof, lex.gettok());

        let mut lex = Lexer::new("abc # some comment \n xyz".chars());
        assert_eq!(Token::Identifier("abc".into()), lex.gettok());
        assert_eq!(Token::Identifier("xyz".into()), lex.gettok());
        assert_eq!(Token::Eof, lex.gettok());
    }

    #[test]
    fn test_chars() {
        let mut lex = Lexer::new("a+b-c".chars());
        assert_eq!(Token::Identifier("a".into()), lex.gettok());
        assert_eq!(Token::Char('+'), lex.gettok());
        assert_eq!(Token::Identifier("b".into()), lex.gettok());
        assert_eq!(Token::Char('-'), lex.gettok());
        assert_eq!(Token::Identifier("c".into()), lex.gettok());
        assert_eq!(Token::Eof, lex.gettok());
    }

    #[test]
    fn test_whitespaces() {
        let mut lex = Lexer::new(" +a b c! ".chars());
        assert_eq!(Token::Char('+'), lex.gettok());
        assert_eq!(Token::Identifier("a".into()), lex.gettok());
        assert_eq!(Token::Identifier("b".into()), lex.gettok());
        assert_eq!(Token::Identifier("c".into()), lex.gettok());
        assert_eq!(Token::Char('!'), lex.gettok());
        assert_eq!(Token::Eof, lex.gettok());

        let mut lex = Lexer::new("\n a \n\r b \r \n c \r\r \n ".chars());
        assert_eq!(Token::Identifier("a".into()), lex.gettok());
        assert_eq!(Token::Identifier("b".into()), lex.gettok());
        assert_eq!(Token::Identifier("c".into()), lex.gettok());
        assert_eq!(Token::Eof, lex.gettok());
    }
}