about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: Johannes Stoelp <johannes.stoelp@gmail.com>, 2021-08-30 22:13:08 +0200
committer: Johannes Stoelp <johannes.stoelp@gmail.com>, 2021-09-02 23:46:44 +0200
commit: dfe15c16de2f8c80de55ac442ff8d8ff3687fb11 (patch)
tree: 36c2989b35995692df4f8d2552eab3de87c2ee8b
download: llvm-kaleidoscope-rs-dfe15c16de2f8c80de55ac442ff8d8ff3687fb11.tar.gz
download: llvm-kaleidoscope-rs-dfe15c16de2f8c80de55ac442ff8d8ff3687fb11.zip
ch1: added minimal lexer implementation following chapter 1 (ref: chapter1)
-rw-r--r-- .gitignore 1
-rw-r--r-- Cargo.lock 7
-rw-r--r-- Cargo.toml 6
-rw-r--r-- LICENSE 21
-rw-r--r-- README.md 9
-rw-r--r-- src/lexer.rs 181
-rw-r--r-- src/main.rs 18
7 files changed, 243 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..dd2a11a
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "llvm-kaleidoscope-rs"
+version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..e345bab
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "llvm-kaleidoscope-rs"
+version = "0.1.0"
+edition = "2018"
+
+[dependencies]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6144be6
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Johannes Stoelp
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8d1b4e6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+# llvm-kaleidoscope-rs
+
+Follow the official llvm tutorial [`Kaleidoscope: Implementing a Language with
+LLVM`][llvm-tutorial] to learn about `llvm` and practice some `rust`.
+
+## License
+This project is licensed under the [MIT](LICENSE) license.
+
+[llvm-tutorial]: https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/index.html
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..a25f0ab
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,181 @@
+/// A lexical token produced by [`Lexer::gettok`].
+///
+/// `Clone` is derived so that consumers can keep a copy of a token (for
+/// example as look-ahead) without having to re-lex the input.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Token {
+    /// End of the input stream.
+    Eof,
+    /// The `def` keyword.
+    Def,
+    /// The `extern` keyword.
+    Extern,
+    /// An identifier matching `[a-zA-Z][a-zA-Z0-9]*` that is not a keyword.
+    Identifier(String),
+    /// A numeric literal built from the characters `[0-9.]`.
+    Number(f64),
+    /// Any other single character, e.g. `+`, `-` or `!`.
+    Char(char),
+}
+
+/// Character-level lexer with one character of look-ahead.
+///
+/// The struct itself places no bound on `I`; the `Iterator<Item = char>`
+/// requirement is stated on the `impl` block that actually needs it, so code
+/// that merely names `Lexer<I>` does not have to repeat the bound.
+pub struct Lexer<I> {
+    /// Source of the characters that have not been looked at yet.
+    input: I,
+    /// Most recently read character (the look-ahead); `None` once the input
+    /// is exhausted.
+    last_char: Option<char>,
+}
+
+impl<I> Lexer<I>
+where
+    I: Iterator<Item = char>,
+{
+    /// Create a lexer over `input`.
+    ///
+    /// The first character is read immediately so that `last_char` always
+    /// holds the look-ahead character `gettok` starts from.
+    pub fn new(mut input: I) -> Self {
+        let last_char = input.next();
+        Lexer { input, last_char }
+    }
+
+    /// Read the next character from the input, store it as the look-ahead
+    /// and return it (`None` at end of input).
+    fn step(&mut self) -> Option<char> {
+        self.last_char = self.input.next();
+        self.last_char
+    }
+
+    /// Collect `first` plus every directly following character accepted by
+    /// `accept`.
+    ///
+    /// The first rejected character (or `None` at end of input) is left in
+    /// `last_char` for the next call to `gettok`.
+    fn read_run(&mut self, first: char, accept: impl Fn(char) -> bool) -> String {
+        let mut text = String::new();
+        text.push(first);
+        while let Some(c) = self.step() {
+            if !accept(c) {
+                break;
+            }
+            text.push(c);
+        }
+        text
+    }
+
+    /// Lex and return the next token.
+    ///
+    /// Implement `int gettok();` from the tutorial.
+    ///
+    /// ASCII whitespace and `#` comments (running to the next `\r` or `\n`)
+    /// are skipped. Once the input is exhausted every call returns
+    /// [`Token::Eof`].
+    ///
+    /// A run of `[0-9.]` that is not a valid `f64` literal (for example
+    /// `12.34.56`) is reported as `Token::Number(0.0)`.
+    pub fn gettok(&mut self) -> Token {
+        // Comments are skipped by looping rather than by calling `gettok`
+        // recursively, so a long run of comment lines cannot grow the stack.
+        loop {
+            // Eat up whitespaces.
+            while matches!(self.last_char, Some(c) if c.is_ascii_whitespace()) {
+                self.step();
+            }
+
+            // Unpack the look-ahead character or report end of input.
+            let first = match self.last_char {
+                Some(c) => c,
+                None => return Token::Eof,
+            };
+
+            // Comment: discard everything up to the end of the line, then
+            // start over with whatever follows.
+            if first == '#' {
+                loop {
+                    match self.step() {
+                        Some('\r') | Some('\n') => break,
+                        None => return Token::Eof,
+                        Some(_) => { /* consume comment */ }
+                    }
+                }
+                continue;
+            }
+
+            // Identifier or keyword: [a-zA-Z][a-zA-Z0-9]*
+            if first.is_ascii_alphabetic() {
+                let ident = self.read_run(first, |c| c.is_ascii_alphanumeric());
+                return if ident == "def" {
+                    Token::Def
+                } else if ident == "extern" {
+                    Token::Extern
+                } else {
+                    Token::Identifier(ident)
+                };
+            }
+
+            // Number: [0-9.]+
+            if first.is_ascii_digit() || first == '.' {
+                let text = self.read_run(first, |c| c.is_ascii_digit() || c == '.');
+                // Malformed literals deliberately fall back to 0.0.
+                let value: f64 = text.parse().unwrap_or_default();
+                return Token::Number(value);
+            }
+
+            // Anything else is a single-character token; advance past it.
+            self.step();
+            return Token::Char(first);
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::{Lexer, Token};
+
+    /// Lex `src` to completion and return every token, including the
+    /// terminating `Token::Eof`.
+    fn lex_all(src: &str) -> Vec<Token> {
+        let mut lexer = Lexer::new(src.chars());
+        let mut tokens = Vec::new();
+        loop {
+            let tok = lexer.gettok();
+            let done = tok == Token::Eof;
+            tokens.push(tok);
+            if done {
+                return tokens;
+            }
+        }
+    }
+
+    /// Shorthand for an identifier token.
+    fn ident(name: &str) -> Token {
+        Token::Identifier(name.into())
+    }
+
+    #[test]
+    fn test_identifier() {
+        assert_eq!(
+            lex_all("a b c"),
+            vec![ident("a"), ident("b"), ident("c"), Token::Eof]
+        );
+    }
+
+    #[test]
+    fn test_keyword() {
+        assert_eq!(
+            lex_all("def extern"),
+            vec![Token::Def, Token::Extern, Token::Eof]
+        );
+    }
+
+    #[test]
+    fn test_number() {
+        assert_eq!(lex_all("12.34"), vec![Token::Number(12.34f64), Token::Eof]);
+
+        assert_eq!(
+            lex_all(" 1.0 2.0 3.0"),
+            vec![
+                Token::Number(1.0f64),
+                Token::Number(2.0f64),
+                Token::Number(3.0f64),
+                Token::Eof,
+            ]
+        );
+
+        // A malformed literal is lexed as a single number with value 0.
+        assert_eq!(lex_all("12.34.56"), vec![Token::Number(0f64), Token::Eof]);
+    }
+
+    #[test]
+    fn test_comment() {
+        assert_eq!(lex_all("# some comment"), vec![Token::Eof]);
+
+        assert_eq!(
+            lex_all("abc # some comment \n xyz"),
+            vec![ident("abc"), ident("xyz"), Token::Eof]
+        );
+    }
+
+    #[test]
+    fn test_chars() {
+        assert_eq!(
+            lex_all("a+b-c"),
+            vec![
+                ident("a"),
+                Token::Char('+'),
+                ident("b"),
+                Token::Char('-'),
+                ident("c"),
+                Token::Eof,
+            ]
+        );
+    }
+
+    #[test]
+    fn test_whitespaces() {
+        assert_eq!(
+            lex_all(" +a b c! "),
+            vec![
+                Token::Char('+'),
+                ident("a"),
+                ident("b"),
+                ident("c"),
+                Token::Char('!'),
+                Token::Eof,
+            ]
+        );
+
+        assert_eq!(
+            lex_all("\n a \n\r b \r \n c \r\r \n "),
+            vec![ident("a"), ident("b"), ident("c"), Token::Eof]
+        );
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..361f76f
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,18 @@
+mod lexer;
+
+use lexer::Lexer;
+use std::io::Read;
+
+/// Lex standard input and print every token on its own line.
+///
+/// Lexing stops after the first `Eof` token: once stdin is exhausted the
+/// lexer keeps answering `Eof`, so continuing would print it forever.
+fn main() {
+    println!("Lex stdin.");
+    println!("ENTER to lex current input.");
+    println!("C-c to exit.");
+
+    // Feed the lexer one char per input byte (bytes above 0x7f are therefore
+    // not decoded as UTF-8). Reading ends at the first I/O error instead of
+    // skipping it, so a persistently failing stdin cannot stall the program.
+    let input = std::io::stdin()
+        .bytes()
+        .take_while(|byte| byte.is_ok())
+        .filter_map(|byte| byte.ok().map(char::from));
+    let mut lex = Lexer::new(input);
+
+    loop {
+        let tok = lex.gettok();
+        println!("{:?}", tok);
+        if tok == lexer::Token::Eof {
+            break;
+        }
+    }
+}