js: Use buffered tokenizer
This is a large change that introduces a proxy-like tokenizer that
buffers tokens and should allow us to implement features like
automatic semicolon insertion in the future. This meant rewriting all
the parser code from the previous functional style into an SLL-like
style with explicit lookahead (it's also easier to reason about and
debug).
simonwuelker committed Mar 28, 2024
1 parent 1174402 commit 2be7a27
Showing 22 changed files with 796 additions and 386 deletions.
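For context, the core of this change is a tokenizer proxy that buffers tokens so the parser can peek arbitrarily far ahead and only commit by advancing, instead of speculatively parsing and rolling back. The following is a minimal sketch of that buffering pattern; the name BufferedTokenizer, the iterator-based token source, and the two-variant Token type are illustrative assumptions, not the actual API introduced by this commit.

use std::collections::VecDeque;

// Illustrative token type; the commit's real Token has more variants
// (identifiers, punctuators, string and numeric literals, ...).
#[derive(Clone, Debug, PartialEq)]
enum Token {
    Identifier(String),
    Punctuator(char),
}

// A proxy over a raw token source that buffers tokens to provide
// arbitrary lookahead: peek(n) fills the buffer with up to n + 1
// tokens without consuming anything, advance(n) discards n tokens.
struct BufferedTokenizer<I: Iterator<Item = Token>> {
    source: I,
    buffer: VecDeque<Token>,
}

impl<I: Iterator<Item = Token>> BufferedTokenizer<I> {
    fn new(source: I) -> Self {
        Self {
            source,
            buffer: VecDeque::new(),
        }
    }

    // Look at the token n positions ahead without consuming it.
    fn peek(&mut self, n: usize) -> Option<&Token> {
        while self.buffer.len() <= n {
            let token = self.source.next()?;
            self.buffer.push_back(token);
        }
        self.buffer.get(n)
    }

    // Consume n tokens, committing a lookahead that matched.
    fn advance(&mut self, n: usize) {
        for _ in 0..n {
            if self.buffer.pop_front().is_none() {
                self.source.next();
            }
        }
    }
}

With a buffer like this in place, parser code takes the lookahead style visible throughout the diff below: match on peek(0, ...) and call advance(1) only once the token is known to belong to the current production, replacing the earlier attempt-and-rollback combinators.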
1 change: 1 addition & 0 deletions crates/js/Cargo.toml
@@ -10,6 +10,7 @@ license.workspace = true

[dependencies]
sl-std = { workspace = true }
log = { workspace = true }

[lints]
workspace = true
47 changes: 47 additions & 0 deletions crates/js/examples/shell.rs
@@ -0,0 +1,47 @@
use std::{
    env, fs,
    io::{self, Write},
};

fn main() -> io::Result<()> {
    if let Some(filename) = env::args().nth(1) {
        let script = fs::read_to_string(&filename)?;

        let program: js::bytecode::Program = script.parse().unwrap();

        println!("{program:#?}");

        let mut vm = js::bytecode::Vm::default();
        vm.execute_program(&program);
        vm.dump();
    } else {
        run_shell()?;
    }

    Ok(())
}

fn run_shell() -> io::Result<()> {
    let mut buffer = String::new();
    let stdin = io::stdin();

    loop {
        buffer.clear();
        let mut stdout = io::stdout();
        write!(stdout, ">>> ")?;
        stdout.flush()?;

        stdin.read_line(&mut buffer)?;

        match buffer.parse::<js::bytecode::Program>() {
            Ok(program) => {
                writeln!(stdout, "{program:#?}")?;

                let mut vm = js::bytecode::Vm::default();
                vm.execute_program(&program);
                vm.dump();
            },
            Err(error) => error.get_context(&buffer).dump(),
        }
    }
}
5 changes: 4 additions & 1 deletion crates/js/src/bytecode/program.rs
@@ -1,6 +1,9 @@
use std::str::FromStr;

use crate::parser::{tokenizer::GoalSymbol, Script, SyntaxError, Tokenizer};
use crate::parser::{
    tokenization::{GoalSymbol, Tokenizer},
    Script, SyntaxError,
};

use super::{CompileToBytecode, Instruction, ProgramBuilder, Register};

25 changes: 13 additions & 12 deletions crates/js/src/parser/expressions/binary_expression.rs
@@ -1,6 +1,9 @@
use crate::{
    bytecode::{self, CompileToBytecode},
    parser::{tokenizer::Punctuator, SyntaxError, Tokenizer},
    parser::{
        tokenization::{Punctuator, SkipLineTerminators, Token, Tokenizer},
        SyntaxError,
    },
};

use super::{parse_primary_expression, Expression};
@@ -75,23 +78,21 @@ macro_rules! binary_op {
        ) -> Result<Expression, SyntaxError> {
            let mut expression: Expression = $next(tokenizer)?.into();

            let parse_or_term = |tokenizer: &mut Tokenizer<'_>| {
                let punctuator = tokenizer.attempt(Tokenizer::consume_punctuator)?;

                let operator = match punctuator {
                    $($symbol => $op,)*
                    _ => return Err(tokenizer.syntax_error()),
            loop {
                let operator = match tokenizer.peek(0, SkipLineTerminators::Yes)? {
                    $(Some(Token::Punctuator($symbol)) => {
                        tokenizer.advance(1);
                        $op
                    },)*
                    _ => break
                };

                let rhs = tokenizer.attempt($next)?;
                Ok((operator, rhs))
            };
                let rhs = $next(tokenizer)?.into();

            while let Ok((operator, rhs)) = tokenizer.attempt(parse_or_term) {
                expression = BinaryExpression {
                    op: operator.into(),
                    lhs: Box::new(expression),
                    rhs: Box::new(rhs.into()),
                    rhs: Box::new(rhs),
                }
                .into();
            }
91 changes: 68 additions & 23 deletions crates/js/src/parser/expressions/mod.rs
@@ -5,11 +5,19 @@ mod object;

pub use binary_expression::BinaryExpression;

use crate::bytecode::{self, CompileToBytecode};
use crate::{
    bytecode::{self, CompileToBytecode},
    Number,
};

use self::object::ObjectLiteral;

use super::{identifiers::parse_identifier_reference, literals::Literal, SyntaxError, Tokenizer};
use super::{
    identifiers::parse_identifier_reference,
    literals::Literal,
    tokenization::{Punctuator, SkipLineTerminators, Token, Tokenizer},
    SyntaxError,
};

#[derive(Clone, Debug)]
pub enum Expression {
@@ -41,21 +49,49 @@ pub struct NewExpression {
fn parse_primary_expression<const YIELD: bool, const AWAIT: bool>(
    tokenizer: &mut Tokenizer<'_>,
) -> Result<Expression, SyntaxError> {
    let primary_expression = if tokenizer
        .attempt(|tokenizer| tokenizer.expect_keyword("this"))
        .is_ok()
    {
        Expression::This
    } else if let Ok(identifier) = tokenizer.attempt(parse_identifier_reference::<YIELD, AWAIT>) {
        Expression::IdentifierReference(identifier)
    } else if let Ok(literal) = Literal::parse(tokenizer) {
        Expression::Literal(literal)
    } else if let Ok(object_literal) = ObjectLiteral::parse::<YIELD, AWAIT>(tokenizer) {
        Expression::ObjectLiteral(object_literal)
    } else {
    let Some(next_token) = tokenizer.peek(0, SkipLineTerminators::Yes)? else {
        return Err(tokenizer.syntax_error());
    };

    let primary_expression = match next_token {
        Token::Identifier(ident) if ident == "this" => {
            tokenizer.advance(1);
            Expression::This
        },
        Token::Identifier(ident) if ident == "true" => {
            tokenizer.advance(1);
            Literal::BooleanLiteral(true).into()
        },
        Token::Identifier(ident) if ident == "false" => {
            tokenizer.advance(1);
            Literal::BooleanLiteral(false).into()
        },
        Token::Identifier(ident) if ident == "null" => {
            tokenizer.advance(1);
            Literal::NullLiteral.into()
        },
        Token::NumericLiteral(n) => {
            let n = Number::new(*n as f64);
            tokenizer.advance(1);
            Literal::NumericLiteral(n).into()
        },
        Token::StringLiteral(s) => {
            // FIXME: avoiding a clone here would be nice
            let s = s.clone();
            tokenizer.advance(1);
            Literal::StringLiteral(s).into()
        },
        Token::Identifier(_) => {
            let identifier_reference = parse_identifier_reference::<YIELD, AWAIT>(tokenizer)?;
            Expression::IdentifierReference(identifier_reference)
        },
        Token::Punctuator(Punctuator::CurlyBraceOpen) => {
            let object_literal = ObjectLiteral::parse::<YIELD, AWAIT>(tokenizer)?;
            object_literal.into()
        },
        _ => return Err(tokenizer.syntax_error()),
    };

    Ok(primary_expression)
}

@@ -66,25 +102,28 @@ impl NewExpression {
    ) -> Result<Expression, SyntaxError> {
        let mut nest_level = 0;

        while matches!(
            tokenizer.attempt(Tokenizer::consume_identifier).as_deref(),
            Ok("new")
        ) {
        while tokenizer
            .peek(0, SkipLineTerminators::Yes)?
            .is_some_and(|t| t.is_identifier("new"))
        {
            tokenizer.advance(1);
            nest_level += 1;
        }

        // FIXME: This should be a MemberExpression instead of a PrimaryExpression
        let member_expression = parse_primary_expression::<YIELD, AWAIT>(tokenizer)?;

        if nest_level == 0 {
            Ok(member_expression)
        let new_expression = if nest_level == 0 {
            member_expression
        } else {
            Ok(Self {
            Self {
                nest_level,
                expression: Box::new(member_expression),
            }
            .into())
        }
            .into()
        };

        Ok(new_expression)
    }
}

@@ -135,3 +174,9 @@ impl From<NewExpression> for Expression {
        Self::New(value)
    }
}

impl From<ObjectLiteral> for Expression {
    fn from(value: ObjectLiteral) -> Self {
        Self::ObjectLiteral(value)
    }
}
26 changes: 22 additions & 4 deletions crates/js/src/parser/expressions/object.rs
@@ -3,7 +3,9 @@
use crate::{
    bytecode::{self, CompileToBytecode},
    parser::{
        identifiers::parse_identifier_reference, tokenizer::Punctuator, SyntaxError, Tokenizer,
        identifiers::parse_identifier_reference,
        tokenization::{Punctuator, SkipLineTerminators, Token, Tokenizer},
        SyntaxError,
    },
};

@@ -42,9 +44,25 @@ impl ObjectLiteral {
        tokenizer: &mut Tokenizer<'_>,
    ) -> Result<Self, SyntaxError> {
        tokenizer.expect_punctuator(Punctuator::CurlyBraceOpen)?;
        let property_definitions =
            tokenizer.parse_comma_separated_list(PropertyDefinition::parse::<YIELD, AWAIT>);
        tokenizer.expect_punctuator(Punctuator::CurlyBraceClose)?;

        let mut property_definitions = vec![];

        while !matches!(
            tokenizer.peek(0, SkipLineTerminators::Yes)?,
            Some(Token::Punctuator(Punctuator::CurlyBraceClose))
        ) {
            let property_definition = PropertyDefinition::parse::<YIELD, AWAIT>(tokenizer)?;
            property_definitions.push(property_definition);

            if let Some(Token::Punctuator(Punctuator::Comma)) =
                tokenizer.peek(0, SkipLineTerminators::Yes)?
            {
                tokenizer.advance(1);
            }
        }

        // Discard the closing brace
        tokenizer.advance(1);

        let object_literal = Self {
            property_definitions,
@@ -3,8 +3,10 @@
use crate::{
    bytecode::{self, CompileToBytecode},
    parser::{
        identifiers::parse_binding_identifier, statements_and_declarations::Statement,
        tokenizer::Punctuator, SyntaxError, Tokenizer,
        identifiers::parse_binding_identifier,
        statements_and_declarations::Statement,
        tokenization::{Punctuator, Tokenizer},
        SyntaxError,
    },
};

65 changes: 37 additions & 28 deletions crates/js/src/parser/identifiers.rs
@@ -1,6 +1,9 @@
//! <https://262.ecma-international.org/14.0/#sec-identifiers>
use super::{tokenizer::GoalSymbol, SyntaxError, Tokenizer};
use super::{
    tokenization::{GoalSymbol, SkipLineTerminators, Token, Tokenizer},
    SyntaxError,
};

const RESERVED_WORDS: [&str; 37] = [
"await",
@@ -46,25 +49,23 @@ const RESERVED_WORDS: [&str; 37] = [
pub(crate) fn parse_binding_identifier<const YIELD: bool, const AWAIT: bool>(
    tokenizer: &mut Tokenizer<'_>,
) -> Result<String, SyntaxError> {
    let binding_identifier = if let Ok(identifier) = tokenizer.attempt(Identifier::parse) {
        if tokenizer.is_strict() && matches!(identifier.0.as_str(), "arguments" | "eval") {
            return Err(tokenizer.syntax_error());
        }
    let Some(Token::Identifier(identifier)) = tokenizer.next(SkipLineTerminators::Yes)? else {
        return Err(tokenizer.syntax_error());
    };

        identifier.0
    } else {
        let identifier_name = tokenizer.consume_identifier()?;
    if !YIELD && identifier.as_str() == "yield" {
        return Err(tokenizer.syntax_error());
    }

        if !YIELD && identifier_name.as_str() == "yield" {
            identifier_name
        } else if !AWAIT && identifier_name.as_str() == "await" {
            identifier_name
        } else {
            return Err(tokenizer.syntax_error());
        }
    };
    if !AWAIT && identifier.as_str() == "await" {
        return Err(tokenizer.syntax_error());
    }

    if tokenizer.is_strict() && matches!(identifier.as_str(), "arguments" | "eval") {
        return Err(tokenizer.syntax_error());
    }

    Ok(binding_identifier)
    Ok(identifier)
}

/// <https://262.ecma-international.org/14.0/#prod-Identifier>
@@ -86,7 +87,11 @@ const DISALLOWED_IDENTIFIERS_IN_STRICT_MODE: [&str; 9] = [
impl Identifier {
    /// <https://262.ecma-international.org/14.0/#prod-Identifier>
    pub(crate) fn parse(tokenizer: &mut Tokenizer<'_>) -> Result<Self, SyntaxError> {
        let identifier_name = tokenizer.consume_identifier()?;
        let Some(Token::Identifier(identifier_name)) = tokenizer.next(SkipLineTerminators::Yes)?
        else {
            return Err(tokenizer.syntax_error());
        };

        if RESERVED_WORDS.contains(&identifier_name.as_str()) {
            return Err(tokenizer.syntax_error());
        }
@@ -109,17 +114,21 @@ impl Identifier {
pub(crate) fn parse_identifier_reference<const YIELD: bool, const AWAIT: bool>(
    tokenizer: &mut Tokenizer<'_>,
) -> Result<String, SyntaxError> {
    if let Ok(identifier) = tokenizer.attempt(Identifier::parse) {
        return Ok(identifier.0);
    }
    let next_token = tokenizer.peek(0, SkipLineTerminators::Yes)?;

    if YIELD && tokenizer.expect_keyword("yield").is_ok() {
        return Ok("yield".to_string());
    }
    if let Some(Token::Identifier(ident)) = next_token {
        if YIELD && ident == "yield" {
            tokenizer.advance(1);
            return Ok("yield".to_string());
        }

    if AWAIT && tokenizer.expect_keyword("await").is_ok() {
        return Ok("await".to_string());
    }
        if AWAIT && ident == "await" {
            tokenizer.advance(1);
            return Ok("await".to_string());
        }

    Err(tokenizer.syntax_error())
        Identifier::parse(tokenizer).map(|i| i.0)
    } else {
        Err(tokenizer.syntax_error())
    }
}