From 5838b7eec96e40cfd1d0a0bd370bdf9bdd21cf32 Mon Sep 17 00:00:00 2001
From: Nick Krichevsky
Date: Tue, 14 May 2024 22:00:28 -0400
Subject: [PATCH] Add initial Lexer

---
 Cargo.lock  |  88 ++++++++++++
 Cargo.toml  |   3 +
 src/lex.rs  | 392 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/lib.rs  |  48 +++++++
 src/main.rs |  53 +++++++-
 5 files changed, 582 insertions(+), 2 deletions(-)
 create mode 100644 Cargo.lock
 create mode 100644 src/lex.rs
 create mode 100644 src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..e617b40
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,88 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "anyhow"
+version = "1.0.83"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25bdb32cbbdce2b519a9cd7df3a678443100e265d5e25ca763b7572a5104f5f3"
+
+[[package]]
+name = "either"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2"
+
+[[package]]
+name = "itertools"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "jlox-rust"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "itertools",
+ "thiserror",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.82"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ad3d49ab951a01fbaafe34f2ec74122942fe18a3f9814c3268f1bb72042131b"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.63"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf5be731623ca1a1fb7d8be6f261a3be6d3e2337b8a1f97be944d020c8fcb704"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "579e9083ca58dd9dcf91a9923bb9054071b9ebbd800b342194c9feb0ee89fc18"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2470041c06ec3ac1ab38d0356a6119054dedaea53e12fbefc0de730a1c08524"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
diff --git a/Cargo.toml b/Cargo.toml
index c3f4df5..a7adcfa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,3 +6,6 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+anyhow = "1.0.83"
+itertools = "0.12.1"
+thiserror = "1.0.60"
diff --git a/src/lex.rs b/src/lex.rs
new file mode 100644
index 0000000..5523961
--- /dev/null
+++ b/src/lex.rs
@@ -0,0 +1,392 @@
+//! Lexer for Lox source text: turns a script into a flat list of [`Token`]s.
+
+use itertools::{FoldWhile, Itertools};
+use std::iter::{self, Peekable};
+
+use thiserror::Error;
+
+use crate::{ScriptError, ScriptErrors};
+
+/// Every kind of token the lexer can produce.
+#[derive(Debug, Clone)]
+pub enum TokenKind {
+    // Punctuation
+    LeftParen,
+    RightParen,
+    LeftBrace,
+    RightBrace,
+    Comma,
+    Dot,
+    Minus,
+    Plus,
+    SemiColon,
+    Slash,
+    Star,
+    Bang,
+    BangEqual,
+    Equal,
+    EqualEqual,
+    Greater,
+    GreaterEqual,
+    Less,
+    LessEqual,
+
+    // Literals
+    Identifier(String),
+    String(String),
+    Number(f64),
+
+    // Keywords
+    And,
+    Class,
+    Else,
+    False,
+    Fun,
+    For,
+    If,
+    Nil,
+    Or,
+    Print,
+    Return,
+    Super,
+    This,
+    True,
+    Var,
+    While,
+    Eof,
+}
+
+/// A single lexed token, together with its source text and line number.
+#[derive(Debug, Clone)]
+pub struct Token {
+    kind: TokenKind,
+    lexeme: String,
+    line: usize,
+}
+
+/// The outcome of scanning one lexical element, which may not produce a token at all.
+#[derive(Debug, Clone)]
+struct Consumed {
+    token: Option<Token>,
+    span: Span,
+}
+
+/// How much source text a single scan step covered.
+#[derive(Debug, Clone, Copy)]
+struct Span {
+    /// Number of newlines inside the span
+    lines: usize,
+    /// Length of the span in bytes; used to advance the byte index into the source
+    bytes: usize,
+}
+
+/// A scan failure, along with the amount of source to skip before scanning can resume.
+#[derive(Error, Debug, Clone)]
+#[error("{error}")]
+struct ScanError {
+    span: Span,
+    error: ScriptError,
+}
+
+impl Token {
+    pub fn kind(&self) -> &TokenKind {
+        &self.kind
+    }
+
+    pub fn lexeme(&self) -> &str {
+        &self.lexeme
+    }
+
+    pub fn line(&self) -> usize {
+        self.line
+    }
+}
+
+/// Lex `source` into tokens, always ending with a [`TokenKind::Eof`] token.
+///
+/// Scanning continues past errors so that every problem in the script is reported at once.
+///
+/// # Errors
+/// Returns every lexical error found if the source contains at least one.
+pub fn scan_source(source: &str) -> Result<Vec<Token>, ScriptErrors> {
+    let mut idx = 0_usize;
+    let mut line = 1_usize;
+    let mut tokens = Vec::new();
+    let mut errors = Vec::new();
+    while idx < source.len() {
+        match scan_token(&source[idx..], line) {
+            Ok(consumed) => {
+                if let Some(token) = consumed.token {
+                    tokens.push(token);
+                }
+                idx += consumed.span.bytes;
+                line += consumed.span.lines;
+            }
+            Err(err) => {
+                errors.push(err.error);
+                idx += err.span.bytes;
+                line += err.span.lines;
+            }
+        }
+    }
+
+    if errors.is_empty() {
+        tokens.push(Token {
+            kind: TokenKind::Eof,
+            lexeme: String::new(),
+            // `line` already names the line the source ends on
+            line,
+        });
+        Ok(tokens)
+    } else {
+        Err(errors.into())
+    }
+}
+
+/// Scan the single lexical element at the start of `partial_source`.
+///
+/// # Panics
+/// Panics if `partial_source` is empty.
+fn scan_token(partial_source: &str, start_line: usize) -> Result<Consumed, ScanError> {
+    let mut source_chars = partial_source.chars().peekable();
+    let maybe_first_char = source_chars.next();
+    // Spans are measured in bytes, so skipping an unrecognised character must skip all of it
+    let first_char_len = maybe_first_char.map_or(0, char::len_utf8);
+    let mut match_one_or_two =
+        |first_char: char, second_char: char, if_matches: TokenKind, if_doesnt: TokenKind| {
+            Ok(match_one_or_two(
+                &mut source_chars,
+                first_char,
+                second_char,
+                if_matches,
+                if_doesnt,
+            ))
+        };
+
+    let match_res = match maybe_first_char {
+        None => panic!("cannot scan zero length source"),
+
+        Some(c @ '(') => Ok((TokenKind::LeftParen, c.to_string())),
+        Some(c @ ')') => Ok((TokenKind::RightParen, c.to_string())),
+        Some(c @ '{') => Ok((TokenKind::LeftBrace, c.to_string())),
+        Some(c @ '}') => Ok((TokenKind::RightBrace, c.to_string())),
+        Some(c @ ',') => Ok((TokenKind::Comma, c.to_string())),
+        Some(c @ '.') => Ok((TokenKind::Dot, c.to_string())),
+        Some(c @ '-') => Ok((TokenKind::Minus, c.to_string())),
+        Some(c @ '+') => Ok((TokenKind::Plus, c.to_string())),
+        Some(c @ ';') => Ok((TokenKind::SemiColon, c.to_string())),
+        Some(c @ '*') => Ok((TokenKind::Star, c.to_string())),
+
+        Some('!') => match_one_or_two('!', '=', TokenKind::BangEqual, TokenKind::Bang),
+        Some('=') => match_one_or_two('=', '=', TokenKind::EqualEqual, TokenKind::Equal),
+        Some('<') => match_one_or_two('<', '=', TokenKind::LessEqual, TokenKind::Less),
+        Some('>') => match_one_or_two('>', '=', TokenKind::GreaterEqual, TokenKind::Greater),
+
+        Some(c @ '/') => {
+            if next_matches(&mut source_chars, '/') {
+                return Ok(lex_comment_contents(&mut source_chars));
+            }
+
+            Ok((TokenKind::Slash, c.to_string()))
+        }
+
+        Some('"') => return lex_remaining_string_literal(&mut source_chars, start_line),
+
+        Some(' ' | '\r' | '\t') => {
+            return Ok(Consumed {
+                token: None,
+                span: Span { lines: 0, bytes: 1 },
+            })
+        }
+
+        Some('\n') => {
+            return Ok(Consumed {
+                token: None,
+                span: Span { lines: 1, bytes: 1 },
+            })
+        }
+
+        Some(c) if c.is_ascii_digit() => {
+            let raw_number = lex_number(partial_source);
+            let number = raw_number.parse().unwrap_or_else(|err| {
+                panic!("could not parse lexed numeric literal '{raw_number}': {err}")
+            });
+
+            Ok((TokenKind::Number(number), raw_number))
+        }
+
+        Some(c) if c.is_ascii_alphabetic() || c == '_' => {
+            let raw_word = iter::once(c)
+                .chain(source_chars.take_while(|&inner_char| {
+                    inner_char.is_ascii_alphanumeric() || inner_char == '_'
+                }))
+                .collect::<String>();
+
+            Ok((tokenize_word(raw_word.clone()), raw_word))
+        }
+
+        Some(c) => Err(ScriptError {
+            line: start_line,
+            location: String::new(),
+            message: format!("Unexpected character {c}"),
+        }),
+    };
+
+    let (token_kind, lexeme) = match_res.map_err(|err| ScanError {
+        error: err,
+        span: Span {
+            bytes: first_char_len,
+            lines: 0,
+        },
+    })?;
+
+    let span = Span {
+        bytes: lexeme.len(),
+        lines: 0,
+    };
+
+    Ok(Consumed {
+        span,
+        token: Some(Token {
+            kind: token_kind,
+            lexeme,
+            line: start_line,
+        }),
+    })
+}
+
+/// Extract the numeric literal at the start of `source`: digits, optionally followed by a `.`
+/// and at least one more digit. A `.` with no digit after it is left for the next token.
+fn lex_number(source: &str) -> String {
+    let whole_len = leading_digit_count(source);
+    let fraction_len = source[whole_len..]
+        .strip_prefix('.')
+        .map_or(0, leading_digit_count);
+    let literal_len = if fraction_len == 0 {
+        whole_len
+    } else {
+        // account for the `.` separating the two digit runs
+        whole_len + 1 + fraction_len
+    };
+
+    source[..literal_len].to_string()
+}
+
+/// Count the ASCII digits at the very start of `source`.
+fn leading_digit_count(source: &str) -> usize {
+    source.bytes().take_while(u8::is_ascii_digit).count()
+}
+
+/// Consume the rest of a `//` comment, whose leading slashes have already been read.
+fn lex_comment_contents<I: Iterator<Item = char>>(line_iter: &mut I) -> Consumed {
+    let comment_length = length_until_end_of_line(line_iter);
+    Consumed {
+        span: Span {
+            // The newline ending the comment is not part of the span, so it must not be counted
+            // here; the main loop scans it next and bumps the line number then.
+            lines: 0,
+            bytes: comment_length + 2,
+        },
+        token: None,
+    }
+}
+
+/// Lex a string literal whose opening quote has already been read.
+///
+/// Strings may span lines; the token is tagged with the line the string ends on.
+fn lex_remaining_string_literal<I: Iterator<Item = char>>(
+    line_iter: &mut I,
+    start_line: usize,
+) -> Result<Consumed, ScanError> {
+    let (contents, end_found, newlines) = line_iter
+        .fold_while(
+            (String::new(), false, 0),
+            |(mut s, _end_found, newlines), c| {
+                if c == '"' {
+                    FoldWhile::Done((s, true, newlines))
+                } else {
+                    s.push(c);
+                    FoldWhile::Continue((s, false, if c == '\n' { newlines + 1 } else { newlines }))
+                }
+            },
+        )
+        .into_inner();
+
+    if end_found {
+        Ok(Consumed {
+            token: Some(Token {
+                kind: TokenKind::String(contents.clone()),
+                lexeme: format!("\"{contents}\""),
+                line: start_line + newlines,
+            }),
+            span: Span {
+                lines: newlines,
+                // must include the quotes
+                bytes: contents.len() + 2,
+            },
+        })
+    } else {
+        Err(ScanError {
+            span: Span {
+                // only need to include start quote
+                bytes: contents.len() + 1,
+                lines: newlines,
+            },
+            error: ScriptError {
+                line: start_line,
+                location: String::new(),
+                message: "Unterminated string".to_string(),
+            },
+        })
+    }
+}
+
+/// Consume the next character of `line_iter` if (and only if) it is `c`.
+fn next_matches<I: Iterator<Item = char>>(line_iter: &mut Peekable<I>, c: char) -> bool {
+    line_iter.next_if_eq(&c).is_some()
+}
+
+/// Lex an operator that is either `first_char` alone or `first_char` followed by `second_char`.
+///
+/// `first_char` must already have been consumed from `line_iter`.
+fn match_one_or_two<I: Iterator<Item = char>>(
+    line_iter: &mut Peekable<I>,
+    first_char: char,
+    second_char: char,
+    if_matches: TokenKind,
+    if_doesnt: TokenKind,
+) -> (TokenKind, String) {
+    if next_matches(line_iter, second_char) {
+        (if_matches, format!("{first_char}{second_char}"))
+    } else {
+        (if_doesnt, first_char.to_string())
+    }
+}
+
+/// Consume `iter` through the next newline, returning the byte length of the text before it.
+fn length_until_end_of_line<I: Iterator<Item = char>>(iter: &mut I) -> usize {
+    iter.take_while(|&c| c != '\n').map(char::len_utf8).sum()
+}
+
+/// Map a lexed word to its keyword token, or to an identifier if it is not a keyword.
+fn tokenize_word(word: String) -> TokenKind {
+    match word.as_ref() {
+        "and" => TokenKind::And,
+        "class" => TokenKind::Class,
+        "else" => TokenKind::Else,
+        "false" => TokenKind::False,
+        "for" => TokenKind::For,
+        "fun" => TokenKind::Fun,
+        "if" => TokenKind::If,
+        "nil" => TokenKind::Nil,
+        "or" => TokenKind::Or,
+        "print" => TokenKind::Print,
+        "return" => TokenKind::Return,
+        "super" => TokenKind::Super,
+        "this" => TokenKind::This,
+        "true" => TokenKind::True,
+        "var" => TokenKind::Var,
+        "while" => TokenKind::While,
+        _other => TokenKind::Identifier(word),
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..bce58f0
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,48 @@
+#![warn(clippy::pedantic)]
+
+use std::fmt::{self, Display, Formatter};
+
+mod lex;
+
+/// A single error found in a script, tagged with the line it occurred on.
+#[derive(thiserror::Error, Debug, Clone)]
+#[error("[line {line}] Error {location}: {message}")]
+pub struct ScriptError {
+    line: usize,
+    location: String,
+    message: String,
+}
+
+/// Every error found in a script; displayed as one error per line.
+#[derive(thiserror::Error, Debug)]
+pub struct ScriptErrors(Vec<ScriptError>);
+
+impl From<Vec<ScriptError>> for ScriptErrors {
+    fn from(errs: Vec<ScriptError>) -> Self {
+        Self(errs)
+    }
+}
+
+impl Display for ScriptErrors {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        for err in &self.0 {
+            writeln!(f, "{err}")?;
+        }
+
+        Ok(())
+    }
+}
+
+/// Run a Lox script. For now this only lexes it and echoes the lexeme of every token.
+///
+/// # Errors
+/// Returns every error found while lexing `script`.
+pub fn run(script: &str) -> Result<(), ScriptErrors> {
+    let tokens = lex::scan_source(script)?;
+    for token in &tokens {
+        print!("{}", token.lexeme());
+    }
+    println!();
+
+    Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
index e7a11a9..6d5a04f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,52 @@
-fn main() {
-    println!("Hello, world!");
+#![warn(clippy::pedantic)]
+
+use anyhow::Context;
+use std::{
+    env, fs,
+    io::{self, Write},
+    process,
+};
+
+/// Start a REPL when given no arguments, or run the script file named by the only argument.
+fn main() -> anyhow::Result<()> {
+    let args = env::args().collect::<Vec<String>>();
+    match &args[..] {
+        [_executable] => run_prompt(),
+        [_executable, filename] => run_file(filename),
+
+        _ => {
+            eprintln!("Usage: {} [script]", args[0]);
+            process::exit(1)
+        }
+    }
+}
+
+/// Prompt for, read and run one line of input at a time until stdin is exhausted.
+fn run_prompt() -> anyhow::Result<()> {
+    let stdin = io::stdin();
+    let mut stdout = io::stdout();
+    let mut print_prompt = || -> anyhow::Result<()> {
+        print!("> ");
+        stdout.flush().map_err(anyhow::Error::new)
+    };
+
+    print_prompt()?;
+    for line_res in stdin.lines() {
+        let line = line_res.with_context(|| "failed to read input line")?;
+        let run_res = jlox_rust::run(&line);
+        if let Err(err) = run_res {
+            // `ScriptErrors` displays as one newline-terminated line per error
+            eprint!("{err}");
+        }
+
+        print_prompt()?;
+    }
+
+    Ok(())
+}
+
+/// Run the script stored in the file at `path`.
+fn run_file(path: &str) -> anyhow::Result<()> {
+    let script = fs::read_to_string(path)?;
+    jlox_rust::run(&script).map_err(anyhow::Error::new)
 }