Add initial Lexer

master
Nick Krichevsky 2024-05-14 22:00:28 -04:00
parent c5efd89e03
commit 5838b7eec9
5 changed files with 527 additions and 2 deletions

Cargo.lock generated Normal file

@@ -0,0 +1,88 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "anyhow"
version = "1.0.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25bdb32cbbdce2b519a9cd7df3a678443100e265d5e25ca763b7572a5104f5f3"
[[package]]
name = "either"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2"
[[package]]
name = "itertools"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
dependencies = [
"either",
]
[[package]]
name = "jlox-rust"
version = "0.1.0"
dependencies = [
"anyhow",
"itertools",
"thiserror",
]
[[package]]
name = "proc-macro2"
version = "1.0.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ad3d49ab951a01fbaafe34f2ec74122942fe18a3f9814c3268f1bb72042131b"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
dependencies = [
"proc-macro2",
]
[[package]]
name = "syn"
version = "2.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf5be731623ca1a1fb7d8be6f261a3be6d3e2337b8a1f97be944d020c8fcb704"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "1.0.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "579e9083ca58dd9dcf91a9923bb9054071b9ebbd800b342194c9feb0ee89fc18"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2470041c06ec3ac1ab38d0356a6119054dedaea53e12fbefc0de730a1c08524"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"

Cargo.toml

@@ -6,3 +6,6 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.83"
itertools = "0.12.1"
thiserror = "1.0.60"

src/lex.rs Normal file

@@ -0,0 +1,347 @@
use itertools::{FoldWhile, Itertools};
use std::iter::{self, Peekable};
use thiserror::Error;
use crate::{ScriptError, ScriptErrors};
#[derive(Debug, Clone)]
pub enum TokenKind {
// Punctuation
LeftParen,
RightParen,
LeftBrace,
RightBrace,
Comma,
Dot,
Minus,
Plus,
SemiColon,
Slash,
Star,
Bang,
BangEqual,
Equal,
EqualEqual,
Greater,
GreaterEqual,
Less,
LessEqual,
// Literals
Identifier(String),
String(String),
Number(f64),
// Keywords
And,
Class,
Else,
False,
Fun,
For,
If,
Nil,
Or,
Print,
Return,
Super,
This,
True,
Var,
While,
Eof,
}
#[derive(Debug, Clone)]
pub struct Token {
kind: TokenKind,
lexeme: String,
line: usize,
}
#[derive(Debug, Clone)]
struct Consumed {
token: Option<Token>,
span: Span,
}
#[derive(Debug, Clone, Copy)]
struct Span {
lines: usize,
chars: usize,
}
#[derive(Error, Debug, Clone)]
#[error("{error}")]
struct ScanError {
span: Span,
error: ScriptError,
}
impl Token {
pub fn kind(&self) -> &TokenKind {
&self.kind
}
pub fn lexeme(&self) -> &str {
&self.lexeme
}
pub fn line(&self) -> usize {
self.line
}
}
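/// Scans an entire source string into tokens, collecting every error it
/// encounters rather than stopping at the first one. On success, the token
/// stream is terminated with an Eof token.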
pub fn scan_source(source: &str) -> Result<Vec<Token>, ScriptErrors> {
let mut idx = 0_usize;
let mut line = 1_usize;
let mut tokens = Vec::new();
let mut errors = Vec::new();
while idx < source.len() {
match scan_token(&source[idx..], line) {
Ok(consumed) => {
if let Some(token) = consumed.token {
tokens.push(token);
}
idx += consumed.span.chars;
line += consumed.span.lines;
}
Err(err) => {
errors.push(err.error);
idx += err.span.chars;
line += err.span.lines;
}
}
}
if errors.is_empty() {
tokens.push(Token {
kind: TokenKind::Eof,
lexeme: String::new(),
line: line + 1,
});
Ok(tokens)
} else {
Err(errors.into())
}
}
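/// Scans a single token from the front of `partial_source`, reporting how many
/// characters and newlines were consumed so the caller can advance.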
fn scan_token(partial_source: &str, start_line: usize) -> Result<Consumed, ScanError> {
// We deal in chars, but iterate over bytes (cast to char) to keep life
// simple; this assumes the source is ASCII
let mut source_chars = partial_source.bytes().map(|c| c as char).peekable();
let maybe_first_char = source_chars.next();
let mut match_one_or_two =
|first_char: char, second_char: char, if_matches: TokenKind, if_doesnt: TokenKind| {
Ok(match_one_or_two(
&mut source_chars,
first_char,
second_char,
if_matches,
if_doesnt,
))
};
let match_res = match maybe_first_char {
None => panic!("cannot scan zero length source"),
Some(c @ '(') => Ok((TokenKind::LeftParen, c.to_string())),
Some(c @ ')') => Ok((TokenKind::RightParen, c.to_string())),
Some(c @ '{') => Ok((TokenKind::LeftBrace, c.to_string())),
Some(c @ '}') => Ok((TokenKind::RightBrace, c.to_string())),
Some(c @ ',') => Ok((TokenKind::Comma, c.to_string())),
Some(c @ '.') => Ok((TokenKind::Dot, c.to_string())),
Some(c @ '-') => Ok((TokenKind::Minus, c.to_string())),
Some(c @ '+') => Ok((TokenKind::Plus, c.to_string())),
Some(c @ ';') => Ok((TokenKind::SemiColon, c.to_string())),
Some(c @ '*') => Ok((TokenKind::Star, c.to_string())),
Some('!') => match_one_or_two('!', '=', TokenKind::BangEqual, TokenKind::Bang),
Some('=') => match_one_or_two('=', '=', TokenKind::EqualEqual, TokenKind::Equal),
Some('<') => match_one_or_two('<', '=', TokenKind::LessEqual, TokenKind::Less),
Some('>') => match_one_or_two('>', '=', TokenKind::GreaterEqual, TokenKind::Greater),
Some(c @ '/') => {
if next_matches(&mut source_chars, '/') {
return Ok(lex_comment_contents(&mut source_chars));
}
Ok((TokenKind::Slash, c.to_string()))
}
Some('"') => return lex_remaining_string_literal(&mut source_chars, start_line),
Some(' ' | '\r' | '\t') => {
return Ok(Consumed {
token: None,
span: Span { lines: 0, chars: 1 },
})
}
Some('\n') => {
return Ok(Consumed {
token: None,
span: Span { lines: 1, chars: 1 },
})
}
Some(c) if c.is_ascii_digit() => {
let raw_number = iter::once(c)
.chain(
source_chars
.take_while(|&inner_char| inner_char.is_ascii_digit() || inner_char == '.'),
)
.collect::<String>();
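// NOTE: take_while accepts any number of '.'s, so a malformed literal
// like 1.2.3 will fail to parse and hit the panic below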
let number = raw_number.parse().unwrap_or_else(|err| {
panic!("could not parse lexed numeric literal '{raw_number}': {err}")
});
Ok((TokenKind::Number(number), raw_number))
}
Some(c) if c.is_ascii_alphabetic() => {
let raw_word = iter::once(c)
.chain(source_chars.take_while(|&inner_char| {
inner_char.is_ascii_alphanumeric() || inner_char == '_'
}))
.collect::<String>();
Ok((tokenize_word(raw_word.clone()), raw_word))
}
Some(c) => Err(ScriptError {
line: start_line,
location: String::new(),
message: format!("Unexpected character {c}"),
}),
};
let (token_kind, lexeme) = match_res.map_err(|err| ScanError {
error: err,
span: Span { chars: 1, lines: 0 },
})?;
let span = Span {
chars: lexeme.len(),
lines: 0,
};
Ok(Consumed {
span,
token: Some(Token {
kind: token_kind,
lexeme,
line: start_line,
}),
})
}
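/// Consumes the remainder of a `//` comment. Comments produce no token, only
/// a span for the caller to skip over.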
fn lex_comment_contents<I: Iterator<Item = char>>(line_iter: &mut I) -> Consumed {
let comment_length = length_until_end_of_line(line_iter);
Consumed {
span: Span {
// the trailing newline (if any) is left in the source and scanned
// separately as whitespace, so it must not be counted here too
lines: 0,
chars: comment_length + 2,
},
token: None,
}
}
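/// Lexes a string literal after its opening quote has been consumed, tracking
/// any newlines inside the literal so line numbers stay accurate.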
fn lex_remaining_string_literal<I: Iterator<Item = char>>(
line_iter: &mut I,
start_line: usize,
) -> Result<Consumed, ScanError> {
let (contents, end_found, newlines) = line_iter
.fold_while(
(String::new(), false, 0),
|(mut s, _end_found, newlines), c| {
if c == '"' {
FoldWhile::Done((s, true, newlines))
} else {
s.push(c);
FoldWhile::Continue((s, false, if c == '\n' { newlines + 1 } else { newlines }))
}
},
)
.into_inner();
if end_found {
Ok(Consumed {
token: Some(Token {
kind: TokenKind::String(contents.clone()),
lexeme: format!("\"{contents}\""),
line: start_line + newlines,
}),
span: Span {
lines: newlines,
// must include the quotes
chars: contents.len() + 2,
},
})
} else {
Err(ScanError {
span: Span {
// only need to include start quote
chars: contents.len() + 1,
lines: newlines,
},
error: ScriptError {
line: start_line,
location: String::new(),
message: "Unterminated string".to_string(),
},
})
}
}
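/// Returns true, and consumes the character, if the next character in the
/// iterator is `c`.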
fn next_matches<I: Iterator<Item = char>>(line_iter: &mut Peekable<I>, c: char) -> bool {
match line_iter.peek() {
Some(&peeked) if peeked == c => {
// Consume the matched character
line_iter.next();
true
}
None | Some(_) => false,
}
}
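/// Chooses between a one-character and a two-character token based on whether
/// `second_char` follows `first_char`.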
fn match_one_or_two<I: Iterator<Item = char>>(
line_iter: &mut Peekable<I>,
first_char: char,
second_char: char,
if_matches: TokenKind,
if_doesnt: TokenKind,
) -> (TokenKind, String) {
if next_matches(line_iter, second_char) {
(if_matches, format!("{first_char}{second_char}"))
} else {
(if_doesnt, first_char.to_string())
}
}
fn length_until_end_of_line<I: Iterator<Item = char>>(iter: &mut I) -> usize {
iter.take_while(|&c| c != '\n').count()
}
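/// Maps a lexed word to its keyword TokenKind, or to an Identifier if it is
/// not a reserved word.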
fn tokenize_word(word: String) -> TokenKind {
match word.as_ref() {
"and" => TokenKind::And,
"class" => TokenKind::Class,
"else" => TokenKind::Else,
"false" => TokenKind::False,
"for" => TokenKind::For,
"fun" => TokenKind::Fun,
"if" => TokenKind::If,
"nil" => TokenKind::Nil,
"or" => TokenKind::Or,
"print" => TokenKind::Print,
"return" => TokenKind::Return,
"super" => TokenKind::Super,
"this" => TokenKind::This,
"true" => TokenKind::True,
"var" => TokenKind::Var,
"while" => TokenKind::While,
_other => TokenKind::Identifier(word),
}
}
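// A minimal smoke-test sketch for scan_source (hypothetical; not part of this
// commit). The trailing empty lexeme comes from the Eof token.
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn scans_simple_statement() {
let tokens = scan_source("var x = 1;").expect("source should lex cleanly");
let lexemes: Vec<&str> = tokens.iter().map(Token::lexeme).collect();
assert_eq!(lexemes, ["var", "x", "=", "1", ";", ""]);
}
}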

src/lib.rs Normal file

@@ -0,0 +1,42 @@
#![warn(clippy::pedantic)]
use std::fmt::{self, Display, Formatter};
mod lex;
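/// A single error encountered while running a script, rendered in the
/// "[line N] Error location: message" format given below.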
#[derive(thiserror::Error, Debug, Clone)]
#[error("[line {line}] Error {location}: {message}")]
pub struct ScriptError {
line: usize,
location: String,
message: String,
}
#[derive(thiserror::Error, Debug)]
pub struct ScriptErrors(Vec<ScriptError>);
impl From<Vec<ScriptError>> for ScriptErrors {
fn from(errs: Vec<ScriptError>) -> Self {
Self(errs)
}
}
impl Display for ScriptErrors {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
for err in &self.0 {
writeln!(f, "{err}")?;
}
Ok(())
}
}
pub fn run(script: &str) -> Result<(), ScriptErrors> {
let tokens = lex::scan_source(script)?;
for token in &tokens {
print!("{}", token.lexeme());
}
println!();
Ok(())
}

src/main.rs

@@ -1,3 +1,48 @@
-fn main() {
-println!("Hello, world!");
#![warn(clippy::pedantic)]
use anyhow::Context;
use std::{
env, fs,
io::{self, Write},
process,
};
fn main() -> anyhow::Result<()> {
let args = env::args().collect::<Vec<String>>();
match &args[..] {
[_executable] => run_prompt(),
[_executable, filename] => run_file(filename),
_ => {
eprintln!("Usage: {} [script]", args[0]);
process::exit(1)
}
}
}
fn run_prompt() -> anyhow::Result<()> {
let stdin = io::stdin();
let mut stdout = io::stdout();
let mut print_prompt = || -> anyhow::Result<()> {
print!("> ");
stdout.flush().map_err(anyhow::Error::new)
};
print_prompt()?;
for line_res in stdin.lines() {
let line = line_res.with_context(|| "failed to read input line")?;
let run_res = jlox_rust::run(&line);
if let Err(err) = run_res {
eprintln!("{err:?}");
}
print_prompt()?;
}
Ok(())
}
fn run_file(path: &str) -> anyhow::Result<()> {
let script = fs::read_to_string(path)?;
jlox_rust::run(&script).map_err(anyhow::Error::new)
}
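With these changes, running the binary with no arguments starts the interactive prompt, and running it with a single script path lexes that file; in both cases run() prints back the lexemes it scanned.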