LibJS: Cache length-in-code-units in SourceCode

This avoids some bit twiddling whenever accessing the length in code
units in the lexer.
This commit is contained in:
Andreas Kling 2025-11-08 22:57:01 +01:00 committed by Andreas Kling
parent 7c7a035347
commit 201803f601
Notes: github-actions[bot] 2025-11-09 11:15:02 +00:00
3 changed files with 16 additions and 13 deletions

View file

@@ -280,16 +280,16 @@ Lexer::Lexer(NonnullRefPtr<SourceCode const> source_code, size_t line_number, si
void Lexer::consume()
{
auto did_reach_eof = [this] {
-if (m_position < source().length_in_code_units())
+if (m_position < source_code().length_in_code_units())
return false;
m_eof = true;
m_current_code_unit = '\0';
-m_position = source().length_in_code_units() + 1;
+m_position = source_code().length_in_code_units() + 1;
m_line_column++;
return true;
};
-if (m_position > source().length_in_code_units())
+if (m_position > source_code().length_in_code_units())
return;
if (did_reach_eof())
@@ -325,7 +325,7 @@ void Lexer::consume()
dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
}
} else {
-if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < source().length_in_code_units()) {
+if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < source_code().length_in_code_units()) {
if (AK::UnicodeUtils::is_utf16_low_surrogate(source().code_unit_at(m_position))) {
++m_position;
@@ -412,7 +412,7 @@ bool Lexer::consume_binary_number()
template<typename Callback>
bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const
{
-if (m_position >= source().length_in_code_units())
+if (m_position >= source_code().length_in_code_units())
return false;
return m_current_code_unit == '_'
&& callback(source().code_unit_at(m_position));
@@ -420,7 +420,7 @@ bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const
bool Lexer::match(char16_t a, char16_t b) const
{
-if (m_position >= source().length_in_code_units())
+if (m_position >= source_code().length_in_code_units())
return false;
return m_current_code_unit == a
@@ -429,7 +429,7 @@ bool Lexer::match(char16_t a, char16_t b) const
bool Lexer::match(char16_t a, char16_t b, char16_t c) const
{
-if (m_position + 1 >= source().length_in_code_units())
+if (m_position + 1 >= source_code().length_in_code_units())
return false;
return m_current_code_unit == a
@@ -439,7 +439,7 @@ bool Lexer::match(char16_t a, char16_t b, char16_t c) const
bool Lexer::match(char16_t a, char16_t b, char16_t c, char16_t d) const
{
-if (m_position + 2 >= source().length_in_code_units())
+if (m_position + 2 >= source_code().length_in_code_units())
return false;
return m_current_code_unit == a
@@ -591,7 +591,7 @@ bool Lexer::is_block_comment_end() const
bool Lexer::is_numeric_literal_start() const
{
-return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < source().length_in_code_units() && is_ascii_digit(source().code_unit_at(m_position)));
+return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < source_code().length_in_code_units() && is_ascii_digit(source().code_unit_at(m_position)));
}
bool Lexer::slash_means_division() const
@@ -837,7 +837,7 @@ Token const& Lexer::next()
while (m_current_code_unit != stop_char && m_current_code_unit != '\r' && m_current_code_unit != '\n' && !is_eof()) {
if (m_current_code_unit == '\\') {
consume();
-if (m_current_code_unit == '\r' && m_position < source().length_in_code_units() && source().code_unit_at(m_position) == '\n') {
+if (m_current_code_unit == '\r' && m_position < source_code().length_in_code_units() && source().code_unit_at(m_position) == '\n') {
consume();
}
}
@@ -872,7 +872,7 @@ Token const& Lexer::next()
consume();
}
-if (!found_token && m_position + 1 < source().length_in_code_units()) {
+if (!found_token && m_position + 1 < source_code().length_in_code_units()) {
auto three_chars_view = source().substring_view(m_position - 1, 3);
if (auto type = parse_three_char_token(three_chars_view); type != TokenType::Invalid) {
found_token = true;
@@ -883,11 +883,11 @@ Token const& Lexer::next()
}
}
-if (!found_token && m_position < source().length_in_code_units()) {
+if (!found_token && m_position < source_code().length_in_code_units()) {
auto two_chars_view = source().substring_view(m_position - 1, 2);
if (auto type = parse_two_char_token(two_chars_view); type != TokenType::Invalid) {
// OptionalChainingPunctuator :: ?. [lookahead ∉ DecimalDigit]
-if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < source().length_in_code_units() && is_ascii_digit(source().code_unit_at(m_position + 1)))) {
+if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < source_code().length_in_code_units() && is_ascii_digit(source().code_unit_at(m_position + 1)))) {
found_token = true;
token_type = type;
consume();

View file

@@ -21,6 +21,7 @@ SourceCode::SourceCode(String filename, Utf16String code)
: m_filename(move(filename))
, m_code(move(code))
, m_code_view(m_code.utf16_view())
+, m_length_in_code_units(m_code_view.length_in_code_units())
{
}

View file

@@ -22,6 +22,7 @@ public:
String const& filename() const { return m_filename; }
Utf16String const& code() const { return m_code; }
Utf16View const& code_view() const { return m_code_view; }
+size_t length_in_code_units() const { return m_length_in_code_units; }
SourceRange range_from_offsets(u32 start_offset, u32 end_offset) const;
@@ -31,6 +32,7 @@ private:
String m_filename;
Utf16String m_code;
Utf16View m_code_view;
+size_t m_length_in_code_units { 0 };
// For fast mapping of offsets to line/column numbers, we build a list of
// starting points (with byte offsets into the source string) and which