Add initial Lexer
parent
c5efd89e03
commit
5838b7eec9
|
@ -0,0 +1,88 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.83"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "25bdb32cbbdce2b519a9cd7df3a678443100e265d5e25ca763b7572a5104f5f3"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2"
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jlox-rust"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.82"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ad3d49ab951a01fbaafe34f2ec74122942fe18a3f9814c3268f1bb72042131b"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.63"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf5be731623ca1a1fb7d8be6f261a3be6d3e2337b8a1f97be944d020c8fcb704"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.60"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "579e9083ca58dd9dcf91a9923bb9054071b9ebbd800b342194c9feb0ee89fc18"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.60"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2470041c06ec3ac1ab38d0356a6119054dedaea53e12fbefc0de730a1c08524"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
|
@ -6,3 +6,6 @@ edition = "2021"
|
|||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.83"
|
||||
itertools = "0.12.1"
|
||||
thiserror = "1.0.60"
|
||||
|
|
|
@ -0,0 +1,347 @@
|
|||
use itertools::{FoldWhile, Itertools};
|
||||
use std::iter::{self, Peekable};
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::{ScriptError, ScriptErrors};
|
||||
|
||||
/// Every kind of lexeme the Lox scanner can produce.
///
/// Literal-carrying variants (`Identifier`, `String`, `Number`) store the
/// parsed value alongside the kind; `Eof` is appended once at the end of a
/// successful scan.
#[derive(Debug, Clone)]
pub enum TokenKind {
    // Punctuation
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    Comma,
    Dot,
    Minus,
    Plus,
    SemiColon,
    Slash,
    Star,
    Bang,
    BangEqual,
    Equal,
    EqualEqual,
    Greater,
    GreaterEqual,
    Less,
    LessEqual,

    // Literals
    Identifier(String),
    String(String),
    Number(f64),

    // Keywords
    And,
    Class,
    Else,
    False,
    Fun,
    For,
    If,
    Nil,
    Or,
    Print,
    Return,
    Super,
    This,
    True,
    Var,
    While,
    Eof,
}
|
||||
|
||||
/// A single scanned token: its kind, the exact source text it was scanned
/// from, and the 1-based line it appeared on.
#[derive(Debug, Clone)]
pub struct Token {
    // What kind of lexeme this is (plus any literal value).
    kind: TokenKind,
    // The exact source text the token was produced from.
    lexeme: String,
    // 1-based source line number.
    line: usize,
}
|
||||
|
||||
/// The result of scanning one lexeme: the token produced (`None` for
/// whitespace and comments, which consume input but emit nothing) plus how
/// much input the scan consumed.
#[derive(Debug, Clone)]
struct Consumed {
    token: Option<Token>,
    span: Span,
}
|
||||
|
||||
/// How much input a scan consumed: newlines crossed, and characters read.
/// The scanner iterates bytes-as-chars, so `chars` is also a byte count —
/// the caller uses it to advance a byte index into the source.
#[derive(Debug, Clone, Copy)]
struct Span {
    lines: usize,
    chars: usize,
}
|
||||
|
||||
/// A scan failure: the user-facing error plus the span to skip past so
/// scanning can continue and report any further errors in the same pass.
#[derive(Error, Debug, Clone)]
#[error("{error}")]
struct ScanError {
    span: Span,
    error: ScriptError,
}
|
||||
|
||||
impl Token {
    /// The token's kind (and literal value, where applicable).
    pub fn kind(&self) -> &TokenKind {
        &self.kind
    }

    /// The exact source text this token was scanned from.
    pub fn lexeme(&self) -> &str {
        &self.lexeme
    }

    /// The 1-based line the token appeared on.
    pub fn line(&self) -> usize {
        self.line
    }
}
|
||||
|
||||
pub fn scan_source(source: &str) -> Result<Vec<Token>, ScriptErrors> {
|
||||
let mut idx = 0_usize;
|
||||
let mut line = 1_usize;
|
||||
let mut tokens = Vec::new();
|
||||
let mut errors = Vec::new();
|
||||
while idx < source.len() {
|
||||
match scan_token(&source[idx..], line) {
|
||||
Ok(consumed) => {
|
||||
if let Some(token) = consumed.token {
|
||||
tokens.push(token);
|
||||
}
|
||||
idx += consumed.span.chars;
|
||||
line += consumed.span.lines;
|
||||
}
|
||||
Err(err) => {
|
||||
errors.push(err.error);
|
||||
idx += err.span.chars;
|
||||
line += err.span.lines;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if errors.is_empty() {
|
||||
tokens.push(Token {
|
||||
kind: TokenKind::Eof,
|
||||
lexeme: String::new(),
|
||||
line: line + 1,
|
||||
});
|
||||
Ok(tokens)
|
||||
} else {
|
||||
Err(errors.into())
|
||||
}
|
||||
}
|
||||
|
||||
fn scan_token(partial_source: &str, start_line: usize) -> Result<Consumed, ScanError> {
|
||||
// We will deal in chars, but want to iterate in bytes to keep life simple
|
||||
let mut source_chars = partial_source.bytes().map(|c| c as char).peekable();
|
||||
let maybe_first_char = source_chars.next();
|
||||
let mut match_one_or_two =
|
||||
|first_char: char, second_char: char, if_matches: TokenKind, if_doesnt: TokenKind| {
|
||||
Ok(match_one_or_two(
|
||||
&mut source_chars,
|
||||
first_char,
|
||||
second_char,
|
||||
if_matches,
|
||||
if_doesnt,
|
||||
))
|
||||
};
|
||||
|
||||
let match_res = match maybe_first_char {
|
||||
None => panic!("cannot scan zero length source"),
|
||||
|
||||
Some(c @ '(') => Ok((TokenKind::LeftParen, c.to_string())),
|
||||
Some(c @ ')') => Ok((TokenKind::RightParen, c.to_string())),
|
||||
Some(c @ '{') => Ok((TokenKind::LeftBrace, c.to_string())),
|
||||
Some(c @ '}') => Ok((TokenKind::RightBrace, c.to_string())),
|
||||
Some(c @ ',') => Ok((TokenKind::Comma, c.to_string())),
|
||||
Some(c @ '.') => Ok((TokenKind::Dot, c.to_string())),
|
||||
Some(c @ '-') => Ok((TokenKind::Minus, c.to_string())),
|
||||
Some(c @ '+') => Ok((TokenKind::Plus, c.to_string())),
|
||||
Some(c @ ';') => Ok((TokenKind::SemiColon, c.to_string())),
|
||||
Some(c @ '*') => Ok((TokenKind::Star, c.to_string())),
|
||||
|
||||
Some('!') => match_one_or_two('!', '=', TokenKind::BangEqual, TokenKind::Bang),
|
||||
Some('=') => match_one_or_two('=', '=', TokenKind::EqualEqual, TokenKind::Equal),
|
||||
Some('<') => match_one_or_two('<', '=', TokenKind::Less, TokenKind::LessEqual),
|
||||
Some('>') => match_one_or_two('>', '=', TokenKind::Greater, TokenKind::GreaterEqual),
|
||||
|
||||
Some(c @ '/') => {
|
||||
if next_matches(&mut source_chars, '/') {
|
||||
return Ok(lex_comment_contents(&mut source_chars));
|
||||
}
|
||||
|
||||
Ok((TokenKind::Slash, c.to_string()))
|
||||
}
|
||||
|
||||
Some('"') => return lex_remaining_string_literal(&mut source_chars, start_line),
|
||||
|
||||
Some(' ' | '\r' | '\t') => {
|
||||
return Ok(Consumed {
|
||||
token: None,
|
||||
span: Span { lines: 0, chars: 1 },
|
||||
})
|
||||
}
|
||||
|
||||
Some('\n') => {
|
||||
return Ok(Consumed {
|
||||
token: None,
|
||||
span: Span { lines: 1, chars: 1 },
|
||||
})
|
||||
}
|
||||
|
||||
Some(c) if c.is_ascii_digit() => {
|
||||
let raw_number = iter::once(c)
|
||||
.chain(
|
||||
source_chars
|
||||
.take_while(|&inner_char| inner_char.is_ascii_digit() || inner_char == '.'),
|
||||
)
|
||||
.collect::<String>();
|
||||
|
||||
let number = raw_number.parse().unwrap_or_else(|err| {
|
||||
panic!("could not parse lexed numeric literal '{raw_number}': {err}")
|
||||
});
|
||||
|
||||
Ok((TokenKind::Number(number), raw_number))
|
||||
}
|
||||
|
||||
Some(c) if c.is_ascii_alphabetic() => {
|
||||
let raw_word = iter::once(c)
|
||||
.chain(source_chars.take_while(|&inner_char| {
|
||||
inner_char.is_ascii_alphanumeric() || inner_char == '_'
|
||||
}))
|
||||
.collect::<String>();
|
||||
|
||||
Ok((tokenize_word(raw_word.clone()), raw_word))
|
||||
}
|
||||
|
||||
Some(c) => Err(ScriptError {
|
||||
line: start_line,
|
||||
location: String::new(),
|
||||
message: format!("Unexpected character {c}"),
|
||||
}),
|
||||
};
|
||||
|
||||
let (token_kind, lexeme) = match_res.map_err(|err| ScanError {
|
||||
error: err,
|
||||
span: Span { chars: 1, lines: 0 },
|
||||
})?;
|
||||
|
||||
let span = Span {
|
||||
chars: lexeme.len(),
|
||||
lines: 0,
|
||||
};
|
||||
|
||||
Ok(Consumed {
|
||||
span,
|
||||
token: Some(Token {
|
||||
kind: token_kind,
|
||||
lexeme,
|
||||
line: start_line,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn lex_comment_contents<I: Iterator<Item = char>>(line_iter: &mut I) -> Consumed {
|
||||
let comment_length = length_until_end_of_line(line_iter);
|
||||
Consumed {
|
||||
span: Span {
|
||||
lines: 1,
|
||||
chars: comment_length + 2,
|
||||
},
|
||||
token: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Lexes the rest of a string literal (the opening '"' was already consumed
/// by the caller).
///
/// Multi-line strings are allowed; `newlines` counts the '\n's inside the
/// literal so the caller can keep its line counter accurate. An exhausted
/// iterator before the closing quote is an "Unterminated string" error.
fn lex_remaining_string_literal<I: Iterator<Item = char>>(
    line_iter: &mut I,
    start_line: usize,
) -> Result<Consumed, ScanError> {
    // Accumulate characters until the closing quote; `end_found` stays false
    // if the input runs out first.
    let (contents, end_found, newlines) = line_iter
        .fold_while(
            (String::new(), false, 0),
            |(mut s, _end_found, newlines), c| {
                if c == '"' {
                    FoldWhile::Done((s, true, newlines))
                } else {
                    s.push(c);
                    FoldWhile::Continue((s, false, if c == '\n' { newlines + 1 } else { newlines }))
                }
            },
        )
        .into_inner();

    if end_found {
        Ok(Consumed {
            token: Some(Token {
                kind: TokenKind::String(contents.clone()),
                lexeme: format!("\"{contents}\""),
                // NOTE(review): the token is tagged with the line the
                // literal ENDS on, not where it starts — confirm intended.
                line: start_line + newlines,
            }),
            span: Span {
                lines: newlines,
                // must include the quotes
                chars: contents.len() + 2,
            },
        })
    } else {
        Err(ScanError {
            span: Span {
                // only need to include start quote
                chars: contents.len() + 1,
                lines: newlines,
            },
            error: ScriptError {
                line: start_line,
                location: String::new(),
                message: "Unterminated string".to_string(),
            },
        })
    }
}
|
||||
|
||||
/// Peeks at the next character and, when it equals `c`, advances the
/// iterator past it. Returns whether the match happened.
fn next_matches<I: Iterator<Item = char>>(line_iter: &mut Peekable<I>, c: char) -> bool {
    let matched = line_iter.peek() == Some(&c);
    if matched {
        // Consume the matched character so the caller doesn't see it again.
        line_iter.next();
    }
    matched
}
|
||||
|
||||
fn match_one_or_two<I: Iterator<Item = char>>(
|
||||
line_iter: &mut Peekable<I>,
|
||||
first_char: char,
|
||||
second_char: char,
|
||||
if_matches: TokenKind,
|
||||
if_doesnt: TokenKind,
|
||||
) -> (TokenKind, String) {
|
||||
if next_matches(line_iter, '=') {
|
||||
(if_matches, format!("{first_char}{second_char}"))
|
||||
} else {
|
||||
(if_doesnt, first_char.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Counts the characters before the first '\n' (or to the end of input),
/// consuming the newline itself from the iterator when one is present.
fn length_until_end_of_line<I: Iterator<Item = char>>(iter: &mut I) -> usize {
    let mut length = 0;
    for c in iter {
        if c == '\n' {
            break;
        }
        length += 1;
    }
    length
}
|
||||
|
||||
/// Maps a scanned word to its reserved-keyword `TokenKind`, or wraps it in
/// `Identifier` when it is not a keyword.
fn tokenize_word(word: String) -> TokenKind {
    match word.as_ref() {
        "and" => TokenKind::And,
        "class" => TokenKind::Class,
        "else" => TokenKind::Else,
        "false" => TokenKind::False,
        "for" => TokenKind::For,
        "fun" => TokenKind::Fun,
        "if" => TokenKind::If,
        "nil" => TokenKind::Nil,
        "or" => TokenKind::Or,
        "print" => TokenKind::Print,
        "return" => TokenKind::Return,
        "super" => TokenKind::Super,
        "this" => TokenKind::This,
        "true" => TokenKind::True,
        "var" => TokenKind::Var,
        "while" => TokenKind::While,
        _other => TokenKind::Identifier(word),
    }
}
|
|
@ -0,0 +1,42 @@
|
|||
#![warn(clippy::pedantic)]
|
||||
|
||||
use std::fmt::{self, Display, Formatter};
|
||||
|
||||
mod lex;
|
||||
|
||||
/// A user-facing script error, rendered via thiserror in the
/// "[line N] Error ...: message" shape.
#[derive(thiserror::Error, Debug, Clone)]
#[error("[line {line}] Error {location}: {message}")]
pub struct ScriptError {
    // 1-based source line the error occurred on.
    line: usize,
    // Where in the line the error occurred; every current construction
    // passes String::new() — presumably to be filled in later. TODO confirm.
    location: String,
    message: String,
}
|
||||
|
||||
/// A batch of script errors reported together — the scanner keeps going
/// after an error so all problems surface in one run.
#[derive(thiserror::Error, Debug)]
pub struct ScriptErrors(Vec<ScriptError>);
||||
|
||||
// Allows `errors.into()` at collection sites to build the batch type.
impl From<Vec<ScriptError>> for ScriptErrors {
    fn from(errs: Vec<ScriptError>) -> Self {
        Self(errs)
    }
}
|
||||
|
||||
impl Display for ScriptErrors {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
||||
for err in &self.0 {
|
||||
writeln!(f, "{err}")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Scans `script` into tokens and, for now, just echoes the lexemes to
/// stdout followed by a newline.
///
/// # Errors
/// Returns all scan errors found in `script` as a `ScriptErrors` batch.
pub fn run(script: &str) -> Result<(), ScriptErrors> {
    let tokens = lex::scan_source(script)?;
    for token in &tokens {
        // NOTE(review): lexemes are printed with no separator between
        // them — confirm the concatenated output is intended.
        print!("{}", token.lexeme());
    }
    println!();

    Ok(())
}
|
49
src/main.rs
49
src/main.rs
|
@ -1,3 +1,48 @@
|
|||
fn main() {
|
||||
println!("Hello, world!");
|
||||
#![warn(clippy::pedantic)]
|
||||
|
||||
use anyhow::Context;
|
||||
use std::{
|
||||
env, fs,
|
||||
io::{self, Write},
|
||||
process,
|
||||
};
|
||||
|
||||
/// Entry point: with no arguments starts the REPL, with one argument runs
/// the named script file, otherwise prints usage and exits with status 1.
fn main() -> anyhow::Result<()> {
    let args = env::args().collect::<Vec<String>>();
    match &args[..] {
        [_executable] => run_prompt(),
        [_executable, filename] => run_file(filename),

        _ => {
            // args[0] is the executable name supplied by the OS.
            eprintln!("Usage: {} [script]", args[0]);
            process::exit(1)
        }
    }
}
|
||||
|
||||
/// Runs the interactive REPL: prints a "> " prompt, reads lines from stdin,
/// and runs each one. Script errors are reported to stderr without ending
/// the session; I/O failures abort with context.
fn run_prompt() -> anyhow::Result<()> {
    let stdin = io::stdin();
    let mut stdout = io::stdout();
    // print! does not flush, so flush explicitly or the prompt may never
    // appear before the read blocks.
    let mut print_prompt = || -> anyhow::Result<()> {
        print!("> ");
        stdout.flush().map_err(anyhow::Error::new)
    };

    print_prompt()?;
    for line_res in stdin.lines() {
        let line = line_res.with_context(|| "failed to read input line")?;
        let run_res = jlox_rust::run(&line);
        if let Err(err) = run_res {
            // Keep the REPL alive: report the error and prompt again.
            eprintln!("{err:?}");
        }

        print_prompt()?;
    }

    Ok(())
}
|
||||
|
||||
fn run_file(path: &str) -> anyhow::Result<()> {
|
||||
let script = fs::read_to_string(path)?;
|
||||
jlox_rust::run(&script).map_err(anyhow::Error::new)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue