diff --git a/crates/oxc_parser/src/cursor.rs b/crates/oxc_parser/src/cursor.rs index ff4dbe798de14..eb9e3e7f10bb5 100644 --- a/crates/oxc_parser/src/cursor.rs +++ b/crates/oxc_parser/src/cursor.rs @@ -11,7 +11,7 @@ use crate::{ pub struct ParserCheckpoint<'a> { lexer: LexerCheckpoint<'a>, - cur_token: Token<'a>, + cur_token: Token, prev_span_end: u32, errors_pos: usize, } @@ -29,8 +29,8 @@ impl<'a> Parser<'a> { } /// Get current token - pub(crate) fn cur_token(&self) -> &Token<'a> { - &self.token + pub(crate) fn cur_token(&self) -> Token { + self.token } /// Get current Kind @@ -47,12 +47,12 @@ impl<'a> Parser<'a> { } /// Get current string - pub(crate) fn cur_string(&self) -> Option<&str> { - self.cur_token().value.get_string() + pub(crate) fn cur_string(&self) -> &'a str { + self.lexer.get_string(self.token) } /// Peek next token, returns EOF for final peek - pub(crate) fn peek_token(&mut self) -> &Token { + pub(crate) fn peek_token(&mut self) -> Token { self.lexer.lookahead(1) } @@ -67,7 +67,7 @@ impl<'a> Parser<'a> { } /// Peek nth token - pub(crate) fn nth(&mut self, n: u8) -> &Token { + pub(crate) fn nth(&mut self, n: u8) -> Token { if n == 0 { return self.cur_token(); } @@ -94,7 +94,7 @@ impl<'a> Parser<'a> { /// whose code point sequence is the same as a `ReservedWord`. 
#[inline] fn test_escaped_keyword(&mut self, kind: Kind) { - if self.cur_token().escaped && kind.is_all_keyword() { + if self.cur_token().escaped() && kind.is_all_keyword() { let span = self.cur_token().span(); self.error(diagnostics::EscapedKeyword(span)); } diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs index fbbed1556e1fa..7cf643f4d4e57 100644 --- a/crates/oxc_parser/src/js/expression.rs +++ b/crates/oxc_parser/src/js/expression.rs @@ -17,8 +17,7 @@ use super::{ }; use crate::{ diagnostics, - lexer::{parse_big_int, parse_float, parse_int}, - lexer::{Kind, TokenValue}, + lexer::{parse_big_int, parse_float, parse_int, Kind}, list::SeparatedList, Context, Parser, }; @@ -96,10 +95,7 @@ impl<'a> Parser<'a> { pub(crate) fn parse_identifier_kind(&mut self, kind: Kind) -> (Span, Atom) { let span = self.start_span(); - let name = match std::mem::take(&mut self.token.value) { - TokenValue::String(value) => value, - TokenValue::None => "", - }; + let name = self.cur_string(); self.bump_remap(kind); (self.end_span(span), Atom::from(name)) } @@ -121,7 +117,7 @@ impl<'a> Parser<'a> { /// # Panics pub(crate) fn parse_private_identifier(&mut self) -> PrivateIdentifier { let span = self.start_span(); - let name = Atom::from(self.cur_string().unwrap()); + let name = Atom::from(self.cur_string()); self.bump_any(); PrivateIdentifier { span: self.end_span(span), name } } @@ -349,9 +345,7 @@ impl<'a> Parser<'a> { if !self.at(Kind::Str) { return Err(self.unexpected()); } - let TokenValue::String(value) = std::mem::take(&mut self.token.value) else { - unreachable!() - }; + let value = self.cur_string(); let span = self.start_span(); self.bump_any(); Ok(StringLiteral { span: self.end_span(span), value: value.into() }) @@ -454,8 +448,9 @@ impl<'a> Parser<'a> { _ => unreachable!(), }; - // cooked = None when template literal has invalid escape sequence - let cooked = self.cur_string().map(Atom::from); + // `cooked = None` when template literal 
has invalid escape sequence + // This is matched by `is_valid_escape_sequence` in `Lexer::read_template_literal` + let cooked = self.cur_token().escaped_string_id.map(|_| self.cur_string()); let raw = &self.cur_src()[1..self.cur_src().len() - end_offset as usize]; let raw = Atom::from(if cooked.is_some() && raw.contains('\r') { @@ -475,7 +470,11 @@ impl<'a> Parser<'a> { } let tail = matches!(cur_kind, Kind::TemplateTail | Kind::NoSubstitutionTemplate); - TemplateElement { span, tail, value: TemplateElementValue { raw, cooked } } + TemplateElement { + span, + tail, + value: TemplateElementValue { raw, cooked: cooked.map(Atom::from) }, + } } /// Section 13.3 Meta Property diff --git a/crates/oxc_parser/src/js/function.rs b/crates/oxc_parser/src/js/function.rs index 153321df6c93e..02556a393f505 100644 --- a/crates/oxc_parser/src/js/function.rs +++ b/crates/oxc_parser/src/js/function.rs @@ -50,7 +50,7 @@ impl<'a> Parser<'a> { } pub(crate) fn at_async_no_new_line(&mut self) -> bool { - self.at(Kind::Async) && !self.cur_token().escaped && !self.peek_token().is_on_new_line + self.at(Kind::Async) && !self.cur_token().escaped() && !self.peek_token().is_on_new_line } pub(crate) fn parse_function_body(&mut self) -> Result>> { diff --git a/crates/oxc_parser/src/js/statement.rs b/crates/oxc_parser/src/js/statement.rs index 54297b88bf4d9..4e8c8ef5e5f84 100644 --- a/crates/oxc_parser/src/js/statement.rs +++ b/crates/oxc_parser/src/js/statement.rs @@ -127,7 +127,7 @@ impl<'a> Parser<'a> { Kind::Const if !(self.ts_enabled() && self.is_at_enum_declaration()) => { self.parse_variable_statement(stmt_ctx) } - Kind::Let if !self.cur_token().escaped => self.parse_let(stmt_ctx), + Kind::Let if !self.cur_token().escaped() => self.parse_let(stmt_ctx), Kind::Await if self.peek_kind() == Kind::Using && self.nth_kind(2).is_binding_identifier() => { @@ -276,7 +276,7 @@ impl<'a> Parser<'a> { let is_let_of = self.at(Kind::Let) && self.peek_at(Kind::Of); let is_async_of = - self.at(Kind::Async) && 
!self.cur_token().escaped && self.peek_at(Kind::Of); + self.at(Kind::Async) && !self.cur_token().escaped() && self.peek_at(Kind::Of); let expr_span = self.start_span(); if self.at(Kind::RParen) { diff --git a/crates/oxc_parser/src/jsx/mod.rs b/crates/oxc_parser/src/jsx/mod.rs index ed6ae2a7adc43..4d5fd7db11370 100644 --- a/crates/oxc_parser/src/jsx/mod.rs +++ b/crates/oxc_parser/src/jsx/mod.rs @@ -360,14 +360,15 @@ impl<'a> Parser<'a> { } // we are at a valid normal Ident or Keyword, let's keep on lexing for `-` self.re_lex_jsx_identifier(); - let name = Atom::from(self.cur_string().unwrap()); self.bump_any(); - Ok(self.ast.jsx_identifier(self.end_span(span), name)) + let span = self.end_span(span); + let name = span.source_text(self.source_text); + Ok(self.ast.jsx_identifier(span, name.into())) } fn parse_jsx_text(&mut self) -> JSXText { let span = self.start_span(); - let value = Atom::from(self.cur_string().unwrap()); + let value = Atom::from(self.cur_string()); self.bump_any(); self.ast.jsx_text(self.end_span(span), value) } diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index e91f23a1a86a8..15195c350c438 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -24,13 +24,13 @@ use oxc_syntax::{ }, unicode_id_start::is_id_start_unicode, }; -pub use token::{Token, TokenValue}; pub use self::{ kind::Kind, number::{parse_big_int, parse_float, parse_int}, + token::Token, }; -use self::{string_builder::AutoCow, trivia_builder::TriviaBuilder}; +use self::{string_builder::AutoCow, token::EscapedStringId, trivia_builder::TriviaBuilder}; use crate::{diagnostics, MAX_LEN}; #[derive(Debug, Clone)] @@ -38,7 +38,7 @@ pub struct LexerCheckpoint<'a> { /// Remaining chars to be tokenized chars: Chars<'a>, - token: Token<'a>, + token: Token, errors_pos: usize, } @@ -66,6 +66,9 @@ pub struct Lexer<'a> { context: LexerContext, pub(crate) trivia_builder: TriviaBuilder, + + /// Data store for escaped strings, 
indexed by `Token.escaped_string_id` + escaped_strings: Vec<&'a str>, } #[allow(clippy::unused_self)] @@ -91,6 +94,7 @@ impl<'a> Lexer<'a> { lookahead: VecDeque::with_capacity(4), // 4 is the maximum lookahead for TypeScript context: LexerContext::Regular, trivia_builder: TriviaBuilder::default(), + escaped_strings: vec![], } } @@ -117,12 +121,12 @@ impl<'a> Lexer<'a> { } /// Find the nth lookahead token lazily - pub fn lookahead(&mut self, n: u8) -> &Token<'a> { + pub fn lookahead(&mut self, n: u8) -> Token { let n = n as usize; debug_assert!(n > 0); if self.lookahead.len() > n - 1 { - return &self.lookahead[n - 1].token; + return self.lookahead[n - 1].token; } let checkpoint = self.checkpoint(); @@ -148,7 +152,7 @@ impl<'a> Lexer<'a> { self.current = checkpoint; - &self.lookahead[n - 1].token + self.lookahead[n - 1].token } /// Set context @@ -157,7 +161,7 @@ impl<'a> Lexer<'a> { } /// Main entry point - pub fn next_token(&mut self) -> Token<'a> { + pub fn next_token(&mut self) -> Token { if let Some(checkpoint) = self.lookahead.pop_front() { self.current.chars = checkpoint.chars; self.current.errors_pos = checkpoint.errors_pos; @@ -167,13 +171,13 @@ impl<'a> Lexer<'a> { self.finish_next(kind) } - pub fn next_jsx_child(&mut self) -> Token<'a> { + pub fn next_jsx_child(&mut self) -> Token { self.current.token.start = self.offset(); let kind = self.read_jsx_child(); self.finish_next(kind) } - fn finish_next(&mut self, kind: Kind) -> Token<'a> { + fn finish_next(&mut self, kind: Kind) -> Token { self.current.token.kind = kind; self.current.token.end = self.offset(); debug_assert!(self.current.token.start <= self.current.token.end); @@ -188,7 +192,7 @@ impl<'a> Lexer<'a> { /// where a `RegularExpressionLiteral` is permitted /// Which means the parser needs to re-tokenize on `PrimaryExpression`, /// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression` - pub fn next_regex(&mut self, kind: Kind) -> Token<'a> { + pub fn next_regex(&mut 
self, kind: Kind) -> Token { self.current.token.start = self.offset() - match kind { Kind::Slash => 1, @@ -200,7 +204,7 @@ impl<'a> Lexer<'a> { self.finish_next(kind) } - pub fn next_right_angle(&mut self) -> Token<'a> { + pub fn next_right_angle(&mut self) -> Token { let kind = self.read_right_angle(); self.lookahead.clear(); self.finish_next(kind) @@ -208,7 +212,7 @@ impl<'a> Lexer<'a> { /// Re-tokenize the current `}` token for `TemplateSubstitutionTail` /// See Section 12, the parser needs to re-tokenize on `TemplateSubstitutionTail`, - pub fn next_template_substitution_tail(&mut self) -> Token<'a> { + pub fn next_template_substitution_tail(&mut self) -> Token { self.current.token.start = self.offset() - 1; let kind = self.read_template_literal(Kind::TemplateMiddle, Kind::TemplateTail); self.lookahead.clear(); @@ -216,14 +220,14 @@ impl<'a> Lexer<'a> { } /// Expand the current token for `JSXIdentifier` - pub fn next_jsx_identifier(&mut self, start_offset: u32) -> Token<'a> { + pub fn next_jsx_identifier(&mut self, start_offset: u32) -> Token { let kind = self.read_jsx_identifier(start_offset); self.lookahead.clear(); self.finish_next(kind) } /// Re-tokenize '<<' or '<=' or '<<=' to '<' - pub fn re_lex_as_typescript_l_angle(&mut self, kind: Kind) -> Token<'a> { + pub fn re_lex_as_typescript_l_angle(&mut self, kind: Kind) -> Token { let offset = match kind { Kind::ShiftLeft | Kind::LtEq => 2, Kind::ShiftLeftEq => 3, @@ -297,6 +301,44 @@ impl<'a> Lexer<'a> { } } + /// Save the string if it is escaped + /// This reduces the overall memory consumption while keeping the `Token` size small + /// Strings without escaped values can be retrieved as is from the token span + #[allow(clippy::cast_possible_truncation)] + fn save_string(&mut self, has_escape: bool, s: &'a str) { + if !has_escape { + return; + } + self.escaped_strings.push(s); + let escaped_string_id = self.escaped_strings.len() as u32; + // SAFETY: escaped_string_id is the length of `self.escaped_strings` 
after an item is pushed, which can never be 0 + let escaped_string_id = unsafe { EscapedStringId::new_unchecked(escaped_string_id) }; + self.current.token.escaped_string_id.replace(escaped_string_id); + } + + pub(crate) fn get_string(&self, token: Token) -> &'a str { + if let Some(escaped_string_id) = token.escaped_string_id { + return self.escaped_strings[escaped_string_id.get() as usize - 1]; + } + + let raw = &self.source[token.start as usize..token.end as usize]; + match token.kind { + Kind::Str | Kind::NoSubstitutionTemplate => { + // omit surrounding quotes + &raw[1..raw.len() - 1] + } + Kind::TemplateHead => { + // omit leading "`${" + &raw[3..] + } + Kind::TemplateTail => { + // omit trailing "$`" + &raw[..raw.len() - 2] + } + _ => raw, + } + } + /// Read each char and set the current token /// Whitespace and line terminators are skipped fn read_next_token(&mut self) -> Kind { @@ -402,7 +444,7 @@ impl<'a> Lexer<'a> { } /// Section 12.7.1 Identifier Names - fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> (bool, &'a str) { + fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> &'a str { // ident tail while let Some(c) = self.peek() { if !is_identifier_part(c) { @@ -418,14 +460,13 @@ impl<'a> Lexer<'a> { builder.push_matching(c); } let has_escape = builder.has_escape(); - (has_escape, builder.finish(self)) + let text = builder.finish(self); + self.save_string(has_escape, text); + text } fn identifier_name(&mut self, builder: AutoCow<'a>) -> &'a str { - let (has_escape, text) = self.identifier_tail(builder); - self.current.token.escaped = has_escape; - self.current.token.value = TokenValue::String(text); - text + self.identifier_tail(builder) } fn identifier_name_handler(&mut self) -> &'a str { @@ -532,8 +573,7 @@ impl<'a> Lexer<'a> { return Kind::Undetermined; } } - let (_, name) = self.identifier_tail(builder); - self.current.token.value = TokenValue::String(name); + self.identifier_tail(builder); Kind::PrivateIdentifier } @@ -765,8 +805,7 @@ 
impl<'a> Lexer<'a> { } Some(c @ ('"' | '\'')) => { if c == delimiter { - self.current.token.value = - TokenValue::String(builder.finish_without_push(self)); + self.save_string(builder.has_escape(), builder.finish_without_push(self)); return Kind::Str; } builder.push_matching(c); @@ -850,16 +889,14 @@ impl<'a> Lexer<'a> { match c { '$' if self.peek() == Some('{') => { if is_valid_escape_sequence { - self.current.token.value = - TokenValue::String(builder.finish_without_push(self)); + self.save_string(true, builder.finish_without_push(self)); } self.current.chars.next(); return substitute; } '`' => { if is_valid_escape_sequence { - self.current.token.value = - TokenValue::String(builder.finish_without_push(self)); + self.save_string(true, builder.finish_without_push(self)); } return tail; } @@ -872,6 +909,7 @@ impl<'a> Lexer<'a> { '\\' => { let text = builder.get_mut_string_without_current_ascii_char(self); self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence); + if !is_valid_escape_sequence {} } _ => builder.push_matching(c), } @@ -884,18 +922,13 @@ impl<'a> Lexer<'a> { /// `IdentifierStart` /// `JSXIdentifier` `IdentifierPart` /// `JSXIdentifier` [no `WhiteSpace` or Comment here] - - fn read_jsx_identifier(&mut self, start_offset: u32) -> Kind { - let prev_str = &self.source[start_offset as usize..self.offset() as usize]; - - let mut builder = AutoCow::new(self); + fn read_jsx_identifier(&mut self, _start_offset: u32) -> Kind { while let Some(c) = self.peek() { if c == '-' || is_identifier_start_all(c) { self.current.chars.next(); - builder.push_matching(c); while let Some(c) = self.peek() { if is_identifier_part(c) { - let c = self.current.chars.next().unwrap(); - builder.push_matching(c); + self.current.chars.next().unwrap(); } else { break; } @@ -904,9 +937,6 @@ impl<'a> Lexer<'a> { break; } } - let mut s = String::from_str_in(prev_str, self.allocator); - s.push_str(builder.finish(self)); - self.current.token.value = 
TokenValue::String(s.into_bump_str()); Kind::Ident } @@ -941,7 +971,6 @@ impl<'a> Lexer<'a> { break; } } - self.current.token.value = TokenValue::String(builder.finish(self)); Kind::JSXText } None => Kind::Eof, @@ -964,8 +993,7 @@ impl<'a> Lexer<'a> { match self.current.chars.next() { Some(c @ ('"' | '\'')) => { if c == delimiter { - self.current.token.value = - TokenValue::String(builder.finish_without_push(self)); + self.save_string(builder.has_escape(), builder.finish_without_push(self)); return Kind::Str; } builder.push_matching(c); diff --git a/crates/oxc_parser/src/lexer/string_builder.rs b/crates/oxc_parser/src/lexer/string_builder.rs index 0aecc63c54c98..8f648e3edda89 100644 --- a/crates/oxc_parser/src/lexer/string_builder.rs +++ b/crates/oxc_parser/src/lexer/string_builder.rs @@ -33,14 +33,14 @@ impl<'a> AutoCow<'a> { // and return the reference to it pub fn get_mut_string_without_current_ascii_char<'b>( &'b mut self, - lexer: &'_ Lexer<'a>, + lexer: &Lexer<'a>, ) -> &'b mut String<'a> { self.force_allocation_without_current_ascii_char(lexer); self.value.as_mut().unwrap() } // Force allocation of a String, excluding the current ASCII character. - pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &'_ Lexer<'a>) { + pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &Lexer<'a>) { if self.value.is_some() { return; } diff --git a/crates/oxc_parser/src/lexer/token.rs b/crates/oxc_parser/src/lexer/token.rs index 40c7cd36ca52d..e688692627af3 100644 --- a/crates/oxc_parser/src/lexer/token.rs +++ b/crates/oxc_parser/src/lexer/token.rs @@ -4,8 +4,10 @@ use oxc_span::Span; use super::kind::Kind; +pub type EscapedStringId = std::num::NonZeroU32; + #[derive(Debug, Clone, Copy, Default)] -pub struct Token<'a> { +pub struct Token { /// Token Kind pub kind: Kind, @@ -18,40 +20,22 @@ pub struct Token<'a> { /// Indicates the token is on a newline pub is_on_new_line: bool, - /// Is the original string escaped? 
- pub escaped: bool, - - pub value: TokenValue<'a>, + /// An index handle to `Lexer::escaped_strings` + /// See https://floooh.github.io/2018/06/17/handles-vs-pointers.html for some background reading + pub escaped_string_id: Option<EscapedStringId>, } #[cfg(target_pointer_width = "64")] mod size_asserts { - oxc_index::assert_eq_size!(super::Token, [u8; 32]); + oxc_index::assert_eq_size!(super::Token, [u8; 16]); } -impl<'a> Token<'a> { +impl Token { pub fn span(&self) -> Span { Span::new(self.start, self.end) } -} - -#[derive(Debug, Copy, Clone)] -pub enum TokenValue<'a> { - None, - String(&'a str), -} - -impl<'a> Default for TokenValue<'a> { - fn default() -> Self { - Self::None - } -} -impl<'a> TokenValue<'a> { - pub fn get_string(&self) -> Option<&str> { - match self { - Self::String(s) => Some(s), - Self::None => None, - } + pub fn escaped(&self) -> bool { + self.escaped_string_id.is_some() } } diff --git a/crates/oxc_parser/src/lib.rs b/crates/oxc_parser/src/lib.rs index 45dd4d3b5fcb9..7cce4a0685428 100644 --- a/crates/oxc_parser/src/lib.rs +++ b/crates/oxc_parser/src/lib.rs @@ -117,7 +117,7 @@ pub struct Parser<'a> { errors: Vec, /// The current parsing token - token: Token<'a>, + token: Token, /// The end range of the previous token prev_token_end: u32, diff --git a/crates/oxc_parser/src/ts/types.rs b/crates/oxc_parser/src/ts/types.rs index 9847351ab8344..62cd64e5f5270 100644 --- a/crates/oxc_parser/src/ts/types.rs +++ b/crates/oxc_parser/src/ts/types.rs @@ -302,13 +302,8 @@ impl<'a> Parser<'a> { return self.parse_ts_infer_type(); } - let mut operator = None; - - if !self.at(Kind::Str) { - if let Some(atom) = self.cur_string() { - operator = TSTypeOperator::from_src(atom); - } - } + let operator = + if self.at(Kind::Str) { None } else { TSTypeOperator::from_src(self.cur_string()) }; // test ts ts_type_operator // type B = keyof A;