From 5838b7eec96e40cfd1d0a0bd370bdf9bdd21cf32 Mon Sep 17 00:00:00 2001
From: Nick Krichevsky <nick@ollien.com>
Date: Tue, 14 May 2024 22:00:28 -0400
Subject: [PATCH] Add initial Lexer

---
 Cargo.lock  |  88 +++++++++++++
 Cargo.toml  |   3 +
 src/lex.rs  | 347 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/lib.rs  |  42 +++++++
 src/main.rs |  49 +++++++-
 5 files changed, 527 insertions(+), 2 deletions(-)
 create mode 100644 Cargo.lock
 create mode 100644 src/lex.rs
 create mode 100644 src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..e617b40
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,88 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "anyhow"
+version = "1.0.83"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25bdb32cbbdce2b519a9cd7df3a678443100e265d5e25ca763b7572a5104f5f3"
+
+[[package]]
+name = "either"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2"
+
+[[package]]
+name = "itertools"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "jlox-rust"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "itertools",
+ "thiserror",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.82"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ad3d49ab951a01fbaafe34f2ec74122942fe18a3f9814c3268f1bb72042131b"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.63"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf5be731623ca1a1fb7d8be6f261a3be6d3e2337b8a1f97be944d020c8fcb704"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "579e9083ca58dd9dcf91a9923bb9054071b9ebbd800b342194c9feb0ee89fc18"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2470041c06ec3ac1ab38d0356a6119054dedaea53e12fbefc0de730a1c08524"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
diff --git a/Cargo.toml b/Cargo.toml
index c3f4df5..a7adcfa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,3 +6,6 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+anyhow = "1.0.83"
+itertools = "0.12.1"
+thiserror = "1.0.60"
diff --git a/src/lex.rs b/src/lex.rs
new file mode 100644
index 0000000..5523961
--- /dev/null
+++ b/src/lex.rs
@@ -0,0 +1,347 @@
+use itertools::{FoldWhile, Itertools};
+use std::iter::{self, Peekable};
+
+use thiserror::Error;
+
+use crate::{ScriptError, ScriptErrors};
+
+#[derive(Debug, Clone)]
+pub enum TokenKind {
+    // Punctuation and operators (one or two characters each)
+    LeftParen,
+    RightParen,
+    LeftBrace,
+    RightBrace,
+    Comma,
+    Dot,
+    Minus,
+    Plus,
+    SemiColon,
+    Slash,
+    Star,
+    Bang,
+    BangEqual,
+    Equal,
+    EqualEqual,
+    Greater,
+    GreaterEqual,
+    Less,
+    LessEqual,
+
+    // Literals, each carrying the value that was lexed
+    Identifier(String),
+    String(String),
+    Number(f64),
+
+    // Keywords
+    And,
+    Class,
+    Else,
+    False,
+    Fun,
+    For,
+    If,
+    Nil,
+    Or,
+    Print,
+    Return,
+    Super,
+    This,
+    True,
+    Var,
+    While,
+    Eof, // not a keyword: end-of-input marker that `scan_source` pushes last
+}
+
+#[derive(Debug, Clone)]
+pub struct Token {
+    kind: TokenKind, // what was lexed, including any literal value
+    lexeme: String, // the token's text; for strings this includes the surrounding quotes
+    line: usize, // line number; a multi-line string reports the line of its closing quote
+}
+
+#[derive(Debug, Clone)]
+struct Consumed {
+    token: Option<Token>, // `None` when only whitespace or a comment was consumed
+    span: Span, // how much input the scan used up
+}
+
+#[derive(Debug, Clone, Copy)]
+struct Span {
+    lines: usize, // newlines consumed; `scan_source` adds it to its running line number
+    chars: usize, // length consumed; `scan_source` adds it to its byte offset into the source
+}
+
+#[derive(Error, Debug, Clone)]
+#[error("{error}")]
+struct ScanError {
+    span: Span, // input to skip so that scanning can resume after the error
+    error: ScriptError, // the reportable error; also the whole `Display` output
+}
+
+impl Token {
+    pub fn kind(&self) -> &TokenKind {
+        &self.kind // borrowed: `TokenKind` can own a `String`, so callers decide when to clone
+    }
+
+    pub fn lexeme(&self) -> &str {
+        &self.lexeme // the token's text as recorded by the lexer
+    }
+
+    pub fn line(&self) -> usize {
+        self.line // line recorded at lex time; `scan_source` starts counting at 1
+    }
+}
+
+/// Lexes all of `source` into tokens, ending with an `Eof` token on the last line reached.
+/// Scanning continues past lexical errors so that all of them are returned together.
+pub fn scan_source(source: &str) -> Result<Vec<Token>, ScriptErrors> {
+    let (mut idx, mut line) = (0_usize, 1_usize);
+    let mut tokens = Vec::new();
+    let mut errors = Vec::new();
+    while idx < source.len() {
+        let span = match scan_token(&source[idx..], line) {
+            Ok(consumed) => {
+                tokens.extend(consumed.token);
+                consumed.span
+            }
+            Err(err) => {
+                errors.push(err.error);
+                err.span
+            }
+        };
+        idx += span.chars;
+        line += span.lines;
+    }
+
+    if !errors.is_empty() {
+        return Err(errors.into());
+    }
+
+    // `line` already is the current line (it starts at 1), so `Eof` must not add one to it.
+    tokens.push(Token {
+        kind: TokenKind::Eof,
+        lexeme: String::new(),
+        line,
+    });
+    Ok(tokens)
+}
+
+/// Scans one token, comment, or whitespace character from the start of `partial_source`.
+///
+/// The returned span reports the bytes and newlines consumed so that the caller can advance;
+/// `start_line` only labels tokens and errors. Panics if `partial_source` is empty.
+fn scan_token(partial_source: &str, start_line: usize) -> Result<Consumed, ScanError> {
+    // Iterate bytes widened to chars: each item is then exactly one unit of the reported spans
+    let mut source_chars = partial_source.bytes().map(|c| c as char).peekable();
+    let maybe_first_char = source_chars.next();
+    let mut one_or_two = |first_char: char, if_matches: TokenKind, if_doesnt: TokenKind| {
+        match_one_or_two(&mut source_chars, first_char, '=', if_matches, if_doesnt)
+    };
+
+    let (token_kind, lexeme) = match maybe_first_char {
+        None => panic!("cannot scan zero length source"),
+
+        Some(c @ '(') => (TokenKind::LeftParen, c.to_string()),
+        Some(c @ ')') => (TokenKind::RightParen, c.to_string()),
+        Some(c @ '{') => (TokenKind::LeftBrace, c.to_string()),
+        Some(c @ '}') => (TokenKind::RightBrace, c.to_string()),
+        Some(c @ ',') => (TokenKind::Comma, c.to_string()),
+        Some(c @ '.') => (TokenKind::Dot, c.to_string()),
+        Some(c @ '-') => (TokenKind::Minus, c.to_string()),
+        Some(c @ '+') => (TokenKind::Plus, c.to_string()),
+        Some(c @ ';') => (TokenKind::SemiColon, c.to_string()),
+        Some(c @ '*') => (TokenKind::Star, c.to_string()),
+
+        Some('!') => one_or_two('!', TokenKind::BangEqual, TokenKind::Bang),
+        Some('=') => one_or_two('=', TokenKind::EqualEqual, TokenKind::Equal),
+        Some('<') => one_or_two('<', TokenKind::LessEqual, TokenKind::Less),
+        Some('>') => one_or_two('>', TokenKind::GreaterEqual, TokenKind::Greater),
+
+        Some(c @ '/') => {
+            if next_matches(&mut source_chars, '/') {
+                return Ok(lex_comment_contents(&mut source_chars));
+            }
+
+            (TokenKind::Slash, c.to_string())
+        }
+
+        // Strings may hold any UTF-8 text, so they are lexed from real chars (opening quote
+        // skipped); the byte-wise iterator would split multi-byte characters apart.
+        Some('"') => {
+            return lex_remaining_string_literal(&mut partial_source.chars().skip(1), start_line)
+        }
+
+        Some(c @ (' ' | '\r' | '\t' | '\n')) => {
+            let lines = usize::from(c == '\n');
+            return Ok(Consumed {
+                token: None,
+                span: Span { lines, chars: 1 },
+            });
+        }
+
+        Some(c) if c.is_ascii_digit() => {
+            let bytes = partial_source.as_bytes();
+            let digits_at =
+                |from: usize| bytes.iter().skip(from).take_while(|b| b.is_ascii_digit()).count();
+            let whole = digits_at(0);
+            // A '.' only belongs to the number when a digit follows it: `1.foo` lexes as `1`,
+            // `.`, `foo`, and input like `1.2.3` can no longer make the float parse panic.
+            let fraction = match (bytes.get(whole), digits_at(whole + 1)) {
+                (Some(b'.'), digits) if digits > 0 => digits + 1,
+                _ => 0,
+            };
+            let raw_number = partial_source[..whole + fraction].to_string();
+
+            let number = raw_number.parse().unwrap_or_else(|err| {
+                panic!("could not parse lexed numeric literal '{raw_number}': {err}")
+            });
+
+            (TokenKind::Number(number), raw_number)
+        }
+
+        Some(c) if c.is_ascii_alphabetic() || c == '_' => {
+            let raw_word = iter::once(c)
+                .chain(source_chars.take_while(|&inner_char| {
+                    inner_char.is_ascii_alphanumeric() || inner_char == '_'
+                }))
+                .collect::<String>();
+
+            (tokenize_word(raw_word.clone()), raw_word)
+        }
+
+        Some(_) => {
+            // This byte may begin a multi-byte character: report the real character and skip
+            // all of it, otherwise the caller's next slice would split it and panic.
+            let c = partial_source.chars().next().expect("non-empty source has a first char");
+            let chars = c.len_utf8();
+            return Err(ScanError {
+                span: Span { chars, lines: 0 },
+                error: ScriptError {
+                    line: start_line,
+                    location: String::new(),
+                    message: format!("Unexpected character {c}"),
+                },
+            });
+        }
+    };
+
+    let chars = lexeme.len();
+    Ok(Consumed {
+        span: Span { chars, lines: 0 },
+        token: Some(Token {
+            kind: token_kind,
+            lexeme,
+            line: start_line,
+        }),
+    })
+}
+
+/// Skips a `//` comment's text; its newline is left out of the span for the caller to count.
+fn lex_comment_contents<I: Iterator<Item = char>>(line_iter: &mut I) -> Consumed {
+    // +2 covers the leading `//`, which the caller has already taken from the iterator.
+    let chars = length_until_end_of_line(line_iter) + 2;
+    Consumed {
+        // `lines` is 0: the newline is outside this span and is counted when it is scanned.
+        span: Span { lines: 0, chars },
+        token: None,
+    }
+}
+
+/// Lexes the rest of a string literal whose opening quote has already been consumed.
+///
+/// Reads `line_iter` up to and including the closing `"`; newlines may appear inside the
+/// literal. Fails with "Unterminated string" if the input ends before the closing quote.
+fn lex_remaining_string_literal<I: Iterator<Item = char>>(
+    line_iter: &mut I,
+    start_line: usize,
+) -> Result<Consumed, ScanError> {
+    let (body, terminated, line_breaks) = line_iter
+        .fold_while((String::new(), false, 0), |(mut acc, _, breaks), ch| {
+            if ch == '"' {
+                return FoldWhile::Done((acc, true, breaks));
+            }
+            acc.push(ch);
+            FoldWhile::Continue((acc, false, breaks + usize::from(ch == '\n')))
+        })
+        .into_inner();
+
+    if !terminated {
+        return Err(ScanError {
+            span: Span {
+                // the closing quote is missing, so only the opening one is added
+                chars: body.len() + 1,
+                lines: line_breaks,
+            },
+            error: ScriptError {
+                line: start_line,
+                location: String::new(),
+                message: "Unterminated string".to_string(),
+            },
+        });
+    }
+
+    Ok(Consumed {
+        span: Span {
+            lines: line_breaks,
+            // both surrounding quotes count towards the consumed length
+            chars: body.len() + 2,
+        },
+        token: Some(Token {
+            lexeme: format!("\"{body}\""),
+            kind: TokenKind::String(body),
+            line: start_line + line_breaks,
+        }),
+    })
+}
+
+/// Consumes the next character of `line_iter` only if it equals `c`.
+///
+/// Returns `true` when a character was consumed. If the next character differs, or there is
+/// none, nothing is consumed and `false` is returned.
+fn next_matches<I: Iterator<Item = char>>(line_iter: &mut Peekable<I>, c: char) -> bool {
+    // `next_if_eq` peeks and advances in a single step, so a non-matching character stays
+    // available to whoever reads the iterator next; only the outcome of that step is
+    // reported back to the caller.
+    line_iter.next_if_eq(&c).is_some()
+}
+
+/// Returns `if_matches`, consuming `second_char`, when it comes next; otherwise `if_doesnt`.
+fn match_one_or_two<I: Iterator<Item = char>>(
+    line_iter: &mut Peekable<I>,
+    first_char: char,
+    second_char: char,
+    if_matches: TokenKind,
+    if_doesnt: TokenKind,
+) -> (TokenKind, String) {
+    if !next_matches(line_iter, second_char) {
+        return (if_doesnt, first_char.to_string());
+    }
+    (if_matches, format!("{first_char}{second_char}"))
+}
+
+fn length_until_end_of_line<I: Iterator<Item = char>>(iter: &mut I) -> usize {
+    iter.by_ref().take_while(|ch| *ch != '\n').count() // the newline is consumed, not counted
+}
+
+fn tokenize_word(word: String) -> TokenKind {
+    match word.as_str() {
+        "and" => TokenKind::And,
+        "class" => TokenKind::Class,
+        "else" => TokenKind::Else,
+        "false" => TokenKind::False,
+        "for" => TokenKind::For,
+        "fun" => TokenKind::Fun,
+        "if" => TokenKind::If,
+        "nil" => TokenKind::Nil,
+        "or" => TokenKind::Or,
+        "print" => TokenKind::Print,
+        "return" => TokenKind::Return,
+        "super" => TokenKind::Super,
+        "this" => TokenKind::This,
+        "true" => TokenKind::True,
+        "var" => TokenKind::Var,
+        "while" => TokenKind::While,
+        _ => TokenKind::Identifier(word), // not reserved: the word itself is the identifier
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..bce58f0
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,42 @@
+#![warn(clippy::pedantic)]
+
+use std::fmt::{self, Display, Formatter};
+
+mod lex;
+
+#[derive(thiserror::Error, Debug, Clone)]
+#[error("[line {line}] Error{location}: {message}")]
+pub struct ScriptError {
+    line: usize, // source line the error is reported against
+    location: String, // where on the line, with its own leading space (" at end"), or empty
+    message: String, // human-readable description
+}
+
+#[derive(thiserror::Error, Debug)]
+pub struct ScriptErrors(Vec<ScriptError>); // a batch of errors; its `Display` prints one per line
+
+impl From<Vec<ScriptError>> for ScriptErrors {
+    fn from(errs: Vec<ScriptError>) -> Self {
+        Self(errs) // wraps the Vec as-is: order is kept and an empty Vec is not rejected
+    }
+}
+
+impl Display for ScriptErrors {
+    /// Writes every contained error in order, one per line.
+    ///
+    /// Each entry, including the last, ends with a newline; the first failed write aborts.
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        // `try_for_each` stops at, and returns, the first write error.
+        self.0.iter().try_for_each(|err| writeln!(f, "{err}"))
+    }
+}
+
+/// Lexes `script` and echoes the lexemes of its tokens, back to back, on one line of stdout.
+///
+/// # Errors
+/// Returns every lexical error found in `script`; nothing is printed in that case.
+pub fn run(script: &str) -> Result<(), ScriptErrors> {
+    let echoed: String = lex::scan_source(script)?.iter().map(lex::Token::lexeme).collect();
+    println!("{echoed}");
+    Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
index e7a11a9..6d5a04f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,48 @@
-fn main() {
-    println!("Hello, world!");
+#![warn(clippy::pedantic)]
+
+use anyhow::Context;
+use std::{
+    env, fs,
+    io::{self, Write},
+    process,
+};
+
+/// Starts the interactive prompt when given no arguments, or runs the one script file named.
+fn main() -> anyhow::Result<()> {
+    match &env::args().collect::<Vec<String>>()[..] {
+        [_executable] => run_prompt(),
+        [_executable, filename] => run_file(filename),
+        args => {
+            // `first()` instead of `args[0]`: argv can be empty, and indexing would then panic.
+            eprintln!("Usage: {} [script]", args.first().map_or("jlox-rust", String::as_str));
+            process::exit(1)
+        }
+    }
+}
+
+/// Runs a prompt loop: every line read from stdin is run as its own script until EOF.
+/// Script errors are printed and the loop goes on; I/O failures end it with an error.
+fn run_prompt() -> anyhow::Result<()> {
+    let mut stdout = io::stdout();
+    let mut print_prompt = || -> anyhow::Result<()> {
+        print!("> ");
+        stdout.flush().context("failed to flush prompt")
+    };
+
+    print_prompt()?;
+    for line_res in io::stdin().lines() {
+        let line = line_res.context("failed to read input line")?;
+        if let Err(err) = jlox_rust::run(&line) {
+            // `Display`, not `Debug`: show the messages; each one already ends in a newline.
+            eprint!("{err}");
+        }
+        print_prompt()?;
+    }
+
+    Ok(())
+}
+
+fn run_file(path: &str) -> anyhow::Result<()> {
+    let script = fs::read_to_string(path).with_context(|| format!("failed to read {path}"))?;
+    jlox_rust::run(&script).map_err(anyhow::Error::new) // script errors pass through as-is
 }