From dfe15c16de2f8c80de55ac442ff8d8ff3687fb11 Mon Sep 17 00:00:00 2001 From: Johannes Stoelp Date: Mon, 30 Aug 2021 22:13:08 +0200 Subject: ch1: added minimal lexer implementation following chapter 1 --- .gitignore | 1 + Cargo.lock | 7 +++ Cargo.toml | 6 ++ LICENSE | 21 +++++++ README.md | 9 +++ src/lexer.rs | 181 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 18 ++++++ 7 files changed, 243 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 src/lexer.rs create mode 100644 src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..dd2a11a --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "llvm-kaleidoscope-rs" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..e345bab --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "llvm-kaleidoscope-rs" +version = "0.1.0" +edition = "2018" + +[dependencies] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6144be6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Johannes Stoelp + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..8d1b4e6 --- /dev/null +++ b/README.md @@ -0,0 +1,9 @@ +# llvm-kaleidoscope-rs + +Follow the official llvm tutorial [`Kaleidoscope: Implementing a Language with +LLVM`][llvm-tutorial] to learn about `llvm` and practice some `rust`. + +## License +This project is licensed under the [MIT](LICENSE) license. + +[llvm-tutorial]: https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/index.html diff --git a/src/lexer.rs b/src/lexer.rs new file mode 100644 index 0000000..a25f0ab --- /dev/null +++ b/src/lexer.rs @@ -0,0 +1,181 @@ +#[derive(Debug, PartialEq)] +pub enum Token { + Eof, + Def, + Extern, + Identifier(String), + Number(f64), + Char(char), +} + +pub struct Lexer +where + I: Iterator, +{ + input: I, + last_char: Option, +} + +impl Lexer +where + I: Iterator, +{ + pub fn new(mut input: I) -> Lexer { + let last_char = input.next(); + Lexer { input, last_char } + } + + fn step(&mut self) -> Option { + self.last_char = self.input.next(); + self.last_char + } + + /// Lex and return the next token. + /// + /// Implement `int gettok();` from the tutorial. + pub fn gettok(&mut self) -> Token { + // Eat up whitespaces. + while matches!(self.last_char, Some(c) if c.is_ascii_whitespace()) { + self.step(); + } + + // Unpack last char or return EOF. + let last_char = if let Some(c) = self.last_char { + c + } else { + return Token::Eof; + }; + + // Identifier: [a-zA-Z][a-zA-Z0-9]* + if last_char.is_ascii_alphabetic() { + let mut ident = String::new(); + ident.push(last_char); + + while let Some(c) = self.step() { + if c.is_ascii_alphanumeric() { + ident.push(c) + } else { + break; + } + } + + match ident.as_ref() { + "def" => return Token::Def, + "extern" => return Token::Extern, + _ => {} + } + + return Token::Identifier(ident); + } + + // Number: [0-9.]+ + if last_char.is_ascii_digit() || last_char == '.' { + let mut num = String::new(); + num.push(last_char); + + while let Some(c) = self.step() { + if c.is_ascii_digit() || c == '.' { + num.push(c) + } else { + break; + } + } + + let num: f64 = num.parse().unwrap_or_default(); + return Token::Number(num); + } + + // Eat up comment. + if last_char == '#' { + loop { + match self.step() { + Some(c) if c == '\r' || c == '\n' => return self.gettok(), + None => return Token::Eof, + _ => { /* consume comment */ } + } + } + } + + // Advance last char and return currently last char. + self.step(); + Token::Char(last_char) + } +} + +#[cfg(test)] +mod test { + use super::{Lexer, Token}; + + #[test] + fn test_identifier() { + let mut lex = Lexer::new("a b c".chars()); + assert_eq!(Token::Identifier("a".into()), lex.gettok()); + assert_eq!(Token::Identifier("b".into()), lex.gettok()); + assert_eq!(Token::Identifier("c".into()), lex.gettok()); + assert_eq!(Token::Eof, lex.gettok()); + } + + #[test] + fn test_keyword() { + let mut lex = Lexer::new("def extern".chars()); + assert_eq!(Token::Def, lex.gettok()); + assert_eq!(Token::Extern, lex.gettok()); + assert_eq!(Token::Eof, lex.gettok()); + } + + #[test] + fn test_number() { + let mut lex = Lexer::new("12.34".chars()); + assert_eq!(Token::Number(12.34f64), lex.gettok()); + assert_eq!(Token::Eof, lex.gettok()); + + let mut lex = Lexer::new(" 1.0 2.0 3.0".chars()); + assert_eq!(Token::Number(1.0f64), lex.gettok()); + assert_eq!(Token::Number(2.0f64), lex.gettok()); + assert_eq!(Token::Number(3.0f64), lex.gettok()); + assert_eq!(Token::Eof, lex.gettok()); + + let mut lex = Lexer::new("12.34.56".chars()); + assert_eq!(Token::Number(0f64), lex.gettok()); + assert_eq!(Token::Eof, lex.gettok()); + } + + #[test] + fn test_comment() { + let mut lex = Lexer::new("# some comment".chars()); + assert_eq!(Token::Eof, lex.gettok()); + + let mut lex = Lexer::new("abc # some comment \n xyz".chars()); + assert_eq!(Token::Identifier("abc".into()), lex.gettok()); + assert_eq!(Token::Identifier("xyz".into()), lex.gettok()); + assert_eq!(Token::Eof, lex.gettok()); + } + + #[test] + fn test_chars() { + let mut lex = Lexer::new("a+b-c".chars()); + assert_eq!(Token::Identifier("a".into()), lex.gettok()); + assert_eq!(Token::Char('+'), lex.gettok()); + assert_eq!(Token::Identifier("b".into()), lex.gettok()); + assert_eq!(Token::Char('-'), lex.gettok()); + assert_eq!(Token::Identifier("c".into()), lex.gettok()); + assert_eq!(Token::Eof, lex.gettok()); + } + + #[test] + fn test_whitespaces() { + let mut lex = Lexer::new(" +a b c! ".chars()); + assert_eq!(Token::Char('+'), lex.gettok()); + assert_eq!(Token::Identifier("a".into()), lex.gettok()); + assert_eq!(Token::Identifier("b".into()), lex.gettok()); + assert_eq!(Token::Identifier("c".into()), lex.gettok()); + assert_eq!(Token::Char('!'), lex.gettok()); + assert_eq!(Token::Eof, lex.gettok()); + + let mut lex = Lexer::new("\n a \n\r b \r \n c \r\r \n ".chars()); + assert_eq!(Token::Identifier("a".into()), lex.gettok()); + assert_eq!(Token::Identifier("b".into()), lex.gettok()); + assert_eq!(Token::Identifier("c".into()), lex.gettok()); + assert_eq!(Token::Eof, lex.gettok()); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..361f76f --- /dev/null +++ b/src/main.rs @@ -0,0 +1,18 @@ +mod lexer; + +use lexer::Lexer; +use std::io::Read; + +fn main() { + println!("Lex stdin."); + println!("ENTER to lex current input."); + println!("C-c to exit."); + let mut lex = Lexer::new(std::io::stdin().bytes().filter_map(|v| { + let v = v.ok()?; + Some(v.into()) + })); + + loop { + println!("{:?}", lex.gettok()); + } +} -- cgit v1.2.3