mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2026-04-19 10:20:22 +00:00
Introduce JS::Substring as a lazily materialized PrimitiveString variant that stores an originating string plus a UTF-16 offset and length. This makes substring creation cheap while still reifying to a normal string when character data is requested. Track which short strings actually live in the VM caches so lazily resolved ropes and substrings do not evict unrelated cached strings when they are finalized. Add focused unit tests for nested ranges, rope-backed substrings, surrogate boundaries, and cache behavior.
467 lines
15 KiB
C++
467 lines
15 KiB
C++
/*
|
|
* Copyright (c) 2020-2025, Andreas Kling <andreas@ladybird.org>
|
|
* Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#include <AK/CharacterTypes.h>
|
|
#include <AK/FlyString.h>
|
|
#include <AK/StringBuilder.h>
|
|
#include <AK/UnicodeUtils.h>
|
|
#include <AK/Utf16FlyString.h>
|
|
#include <AK/Utf16View.h>
|
|
#include <AK/Utf8View.h>
|
|
#include <LibJS/Runtime/AbstractOperations.h>
|
|
#include <LibJS/Runtime/GlobalObject.h>
|
|
#include <LibJS/Runtime/PrimitiveString.h>
|
|
#include <LibJS/Runtime/PropertyKey.h>
|
|
#include <LibJS/Runtime/VM.h>
|
|
#include <LibJS/Runtime/Value.h>
|
|
|
|
namespace JS {
|
|
|
|
// Strings shorter than or equal to this length are cached in the VM and deduplicated.
|
|
// Longer strings are not cached to avoid excessive hashing and lookup costs.
|
|
static constexpr size_t MAX_LENGTH_FOR_STRING_CACHE = 256;
|
|
|
|
GC_DEFINE_ALLOCATOR(PrimitiveString);
|
|
GC_DEFINE_ALLOCATOR(RopeString);
|
|
GC_DEFINE_ALLOCATOR(Substring);
|
|
|
|
GC::Ref<PrimitiveString> PrimitiveString::create(VM& vm, Utf16String const& string)
|
|
{
|
|
if (string.is_empty())
|
|
return vm.empty_string();
|
|
|
|
auto const length_in_code_units = string.length_in_code_units();
|
|
|
|
if (length_in_code_units == 1) {
|
|
if (auto code_unit = string.code_unit_at(0); is_ascii(code_unit))
|
|
return vm.single_ascii_character_string(static_cast<u8>(code_unit));
|
|
}
|
|
|
|
if (length_in_code_units > MAX_LENGTH_FOR_STRING_CACHE) {
|
|
return vm.heap().allocate<PrimitiveString>(string);
|
|
}
|
|
|
|
auto& string_cache = vm.utf16_string_cache();
|
|
if (auto it = string_cache.find(string); it != string_cache.end())
|
|
return *it->value;
|
|
|
|
auto new_string = vm.heap().allocate<PrimitiveString>(string);
|
|
new_string->m_utf16_string_is_in_cache = true;
|
|
string_cache.set(move(string), new_string);
|
|
return *new_string;
|
|
}
|
|
|
|
GC::Ref<PrimitiveString> PrimitiveString::create(VM& vm, Utf16View const& string)
|
|
{
|
|
return create(vm, Utf16String::from_utf16(string));
|
|
}
|
|
|
|
GC::Ref<PrimitiveString> PrimitiveString::create(VM& vm, Utf16FlyString const& string)
|
|
{
|
|
return create(vm, string.to_utf16_string());
|
|
}
|
|
|
|
GC::Ref<PrimitiveString> PrimitiveString::create(VM& vm, String const& string)
|
|
{
|
|
if (string.is_empty())
|
|
return vm.empty_string();
|
|
|
|
auto const length_in_code_units = string.length_in_code_units();
|
|
|
|
if (length_in_code_units == 1) {
|
|
auto bytes = string.bytes();
|
|
if (auto ch = bytes[0]; is_ascii(ch))
|
|
return vm.single_ascii_character_string(ch);
|
|
}
|
|
|
|
if (string.length_in_code_units() > MAX_LENGTH_FOR_STRING_CACHE) {
|
|
return vm.heap().allocate<PrimitiveString>(string);
|
|
}
|
|
|
|
auto& string_cache = vm.string_cache();
|
|
if (auto it = string_cache.find(string); it != string_cache.end())
|
|
return *it->value;
|
|
|
|
auto new_string = vm.heap().allocate<PrimitiveString>(string);
|
|
new_string->m_utf8_string_is_in_cache = true;
|
|
string_cache.set(move(string), new_string);
|
|
return *new_string;
|
|
}
|
|
|
|
GC::Ref<PrimitiveString> PrimitiveString::create(VM& vm, StringView string)
|
|
{
|
|
return create(vm, String::from_utf8(string).release_value());
|
|
}
|
|
|
|
GC::Ref<PrimitiveString> PrimitiveString::create(VM& vm, FlyString const& string)
|
|
{
|
|
return create(vm, string.to_string());
|
|
}
|
|
|
|
GC::Ref<PrimitiveString> PrimitiveString::create_from_unsigned_integer(VM& vm, u64 number)
|
|
{
|
|
if (number < vm.numeric_string_cache().size()) {
|
|
auto& cache_slot = vm.numeric_string_cache()[number];
|
|
if (!cache_slot) {
|
|
auto string = Utf16String::number(number);
|
|
cache_slot = create(vm, string);
|
|
}
|
|
return *cache_slot;
|
|
}
|
|
return create(vm, Utf16String::number(number));
|
|
}
|
|
|
|
GC::Ref<PrimitiveString> PrimitiveString::create(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
|
|
{
|
|
// We're here to concatenate two strings into a new rope string. However, if any of them are empty, no rope is required.
|
|
bool lhs_empty = lhs.is_empty();
|
|
bool rhs_empty = rhs.is_empty();
|
|
|
|
if (lhs_empty && rhs_empty)
|
|
return vm.empty_string();
|
|
|
|
if (lhs_empty)
|
|
return rhs;
|
|
|
|
if (rhs_empty)
|
|
return lhs;
|
|
|
|
return vm.heap().allocate<RopeString>(lhs, rhs);
|
|
}
|
|
|
|
GC::Ref<PrimitiveString> PrimitiveString::create(VM& vm, PrimitiveString& string, size_t code_unit_offset, size_t code_unit_length)
|
|
{
|
|
auto string_length = string.length_in_utf16_code_units();
|
|
VERIFY(code_unit_offset <= string_length);
|
|
VERIFY(code_unit_length <= string_length - code_unit_offset);
|
|
|
|
if (code_unit_length == 0)
|
|
return vm.empty_string();
|
|
|
|
if (code_unit_offset == 0 && code_unit_length == string_length)
|
|
return string;
|
|
|
|
if (string.m_deferred_kind == DeferredKind::Substring) {
|
|
auto const& substring = static_cast<Substring const&>(string);
|
|
return create(vm, *substring.m_source_string, substring.m_code_unit_offset + code_unit_offset, code_unit_length);
|
|
}
|
|
|
|
return vm.heap().allocate<Substring>(string, code_unit_offset, code_unit_length);
|
|
}
|
|
|
|
PrimitiveString::PrimitiveString(Utf16String string)
|
|
: m_utf16_string(move(string))
|
|
{
|
|
}
|
|
|
|
PrimitiveString::PrimitiveString(String string)
|
|
: m_utf8_string(move(string))
|
|
{
|
|
}
|
|
|
|
PrimitiveString::~PrimitiveString() = default;
|
|
|
|
void PrimitiveString::finalize()
|
|
{
|
|
Base::finalize();
|
|
if (m_utf16_string_is_in_cache) {
|
|
auto const& string = *m_utf16_string;
|
|
if (string.length_in_code_units() <= MAX_LENGTH_FOR_STRING_CACHE)
|
|
vm().utf16_string_cache().remove(string);
|
|
}
|
|
if (m_utf8_string_is_in_cache) {
|
|
auto const& string = *m_utf8_string;
|
|
if (string.length_in_code_units() <= MAX_LENGTH_FOR_STRING_CACHE)
|
|
vm().string_cache().remove(string);
|
|
}
|
|
}
|
|
|
|
bool PrimitiveString::is_empty() const
|
|
{
|
|
if (m_deferred_kind == DeferredKind::Rope) {
|
|
// NOTE: We never make an empty rope string.
|
|
return false;
|
|
}
|
|
if (m_deferred_kind == DeferredKind::Substring)
|
|
return static_cast<Substring const&>(*this).m_code_unit_length == 0;
|
|
|
|
if (has_utf16_string())
|
|
return m_utf16_string->is_empty();
|
|
if (has_utf8_string())
|
|
return m_utf8_string->is_empty();
|
|
VERIFY_NOT_REACHED();
|
|
}
|
|
|
|
String PrimitiveString::utf8_string() const
|
|
{
|
|
resolve_if_needed(EncodingPreference::UTF8);
|
|
|
|
if (!has_utf8_string()) {
|
|
VERIFY(has_utf16_string());
|
|
m_utf8_string = m_utf16_string->to_utf8();
|
|
}
|
|
|
|
return *m_utf8_string;
|
|
}
|
|
|
|
StringView PrimitiveString::utf8_string_view() const
|
|
{
|
|
if (!has_utf8_string()) {
|
|
if (has_utf16_string() && m_utf16_string->has_ascii_storage())
|
|
return m_utf16_string->ascii_view();
|
|
|
|
(void)utf8_string();
|
|
}
|
|
|
|
return m_utf8_string->bytes_as_string_view();
|
|
}
|
|
|
|
Utf16String PrimitiveString::utf16_string() const
|
|
{
|
|
resolve_if_needed(EncodingPreference::UTF16);
|
|
|
|
if (!has_utf16_string()) {
|
|
VERIFY(has_utf8_string());
|
|
m_utf16_string = Utf16String::from_utf8(*m_utf8_string);
|
|
}
|
|
|
|
return *m_utf16_string;
|
|
}
|
|
|
|
Utf16View PrimitiveString::utf16_string_view() const
|
|
{
|
|
if (!has_utf16_string()) {
|
|
if (m_deferred_kind == DeferredKind::Substring) {
|
|
auto const& substring = static_cast<Substring const&>(*this);
|
|
return substring.m_source_string->utf16_string_view().substring_view(substring.m_code_unit_offset, substring.m_code_unit_length);
|
|
}
|
|
(void)utf16_string();
|
|
}
|
|
return *m_utf16_string;
|
|
}
|
|
|
|
size_t PrimitiveString::length_in_utf16_code_units() const
|
|
{
|
|
if (m_deferred_kind == DeferredKind::Substring)
|
|
return static_cast<Substring const&>(*this).m_code_unit_length;
|
|
return utf16_string_view().length_in_code_units();
|
|
}
|
|
|
|
bool PrimitiveString::operator==(PrimitiveString const& other) const
|
|
{
|
|
if (this == &other)
|
|
return true;
|
|
if (m_utf8_string.has_value() && other.m_utf8_string.has_value())
|
|
return m_utf8_string->bytes_as_string_view() == other.m_utf8_string->bytes_as_string_view();
|
|
if (m_utf16_string.has_value() && other.m_utf16_string.has_value())
|
|
return *m_utf16_string == *other.m_utf16_string;
|
|
return utf8_string_view() == other.utf8_string_view();
|
|
}
|
|
|
|
ThrowCompletionOr<Optional<Value>> PrimitiveString::get(VM& vm, PropertyKey const& property_key) const
|
|
{
|
|
if (property_key.is_symbol())
|
|
return Optional<Value> {};
|
|
|
|
if (property_key.is_string()) {
|
|
if (property_key.as_string() == vm.names.length.as_string()) {
|
|
return Value(static_cast<double>(length_in_utf16_code_units()));
|
|
}
|
|
}
|
|
|
|
auto index = canonical_numeric_index_string(property_key, CanonicalIndexMode::IgnoreNumericRoundtrip);
|
|
if (!index.is_index())
|
|
return Optional<Value> {};
|
|
|
|
auto string = utf16_string_view();
|
|
if (string.length_in_code_units() <= index.as_index())
|
|
return Optional<Value> {};
|
|
|
|
return create(vm, string.substring_view(index.as_index(), 1));
|
|
}
|
|
|
|
void PrimitiveString::resolve_if_needed(EncodingPreference preference) const
|
|
{
|
|
switch (m_deferred_kind) {
|
|
case DeferredKind::None:
|
|
return;
|
|
case DeferredKind::Rope:
|
|
static_cast<RopeString const&>(*this).resolve(preference);
|
|
return;
|
|
case DeferredKind::Substring:
|
|
static_cast<Substring const&>(*this).resolve(preference);
|
|
return;
|
|
}
|
|
|
|
VERIFY_NOT_REACHED();
|
|
}
|
|
|
|
void RopeString::resolve(EncodingPreference preference) const
|
|
{
|
|
|
|
// This vector will hold all the pieces of the rope that need to be assembled
|
|
// into the resolved string.
|
|
Vector<PrimitiveString const*, 2> pieces;
|
|
size_t approximate_length = 0;
|
|
size_t length_in_utf16_code_units = 0;
|
|
|
|
// NOTE: We traverse the rope tree without using recursion, since we'd run out of
|
|
// stack space quickly when handling a long sequence of unresolved concatenations.
|
|
Vector<PrimitiveString const*, 2> stack;
|
|
stack.append(m_rhs);
|
|
stack.append(m_lhs);
|
|
while (!stack.is_empty()) {
|
|
auto const* current = stack.take_last();
|
|
if (current->m_deferred_kind == DeferredKind::Rope) {
|
|
auto& current_rope_string = static_cast<RopeString const&>(*current);
|
|
stack.append(current_rope_string.m_rhs);
|
|
stack.append(current_rope_string.m_lhs);
|
|
continue;
|
|
}
|
|
|
|
if (current->has_utf8_string())
|
|
approximate_length += current->utf8_string_view().length();
|
|
if (preference == EncodingPreference::UTF16)
|
|
length_in_utf16_code_units += current->length_in_utf16_code_units();
|
|
pieces.append(current);
|
|
}
|
|
|
|
if (preference == EncodingPreference::UTF16) {
|
|
// The caller wants a UTF-16 string, so we can simply concatenate all the pieces
|
|
// into a UTF-16 code unit buffer and create a Utf16String from it.
|
|
StringBuilder builder(StringBuilder::Mode::UTF16, length_in_utf16_code_units);
|
|
|
|
for (auto const* current : pieces) {
|
|
if (current->has_utf16_string())
|
|
builder.append(current->utf16_string_view());
|
|
else
|
|
builder.append(current->utf8_string_view());
|
|
}
|
|
|
|
m_utf16_string = builder.to_utf16_string();
|
|
m_deferred_kind = DeferredKind::None;
|
|
m_lhs = nullptr;
|
|
m_rhs = nullptr;
|
|
return;
|
|
}
|
|
|
|
// Now that we have all the pieces, we can concatenate them using a StringBuilder.
|
|
StringBuilder builder(approximate_length);
|
|
|
|
// We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
|
|
PrimitiveString const* previous = nullptr;
|
|
for (auto const* current : pieces) {
|
|
if (!previous) {
|
|
// This is the very first piece, just append it and continue.
|
|
builder.append(current->utf8_string());
|
|
previous = current;
|
|
continue;
|
|
}
|
|
|
|
// Get the UTF-8 representations for both strings.
|
|
auto current_string_as_utf8 = current->utf8_string_view();
|
|
auto previous_string_as_utf8 = previous->utf8_string_view();
|
|
|
|
// NOTE: Now we need to look at the end of the previous string and the start
|
|
// of the current string, to see if they should be combined into a surrogate.
|
|
|
|
// Surrogates encoded as UTF-8 are 3 bytes.
|
|
if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
|
|
builder.append(current_string_as_utf8);
|
|
previous = current;
|
|
continue;
|
|
}
|
|
|
|
// Might the previous string end with a UTF-8 encoded surrogate?
|
|
if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
|
|
// If not, just append the current string and continue.
|
|
builder.append(current_string_as_utf8);
|
|
previous = current;
|
|
continue;
|
|
}
|
|
|
|
// Might the current string begin with a UTF-8 encoded surrogate?
|
|
if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
|
|
// If not, just append the current string and continue.
|
|
builder.append(current_string_as_utf8);
|
|
previous = current;
|
|
continue;
|
|
}
|
|
|
|
auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
|
|
auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
|
|
|
|
if (!AK::UnicodeUtils::is_utf16_high_surrogate(high_surrogate) || !AK::UnicodeUtils::is_utf16_low_surrogate(low_surrogate)) {
|
|
builder.append(current_string_as_utf8);
|
|
previous = current;
|
|
continue;
|
|
}
|
|
|
|
// Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
|
|
builder.trim(3);
|
|
builder.append_code_point(AK::UnicodeUtils::decode_utf16_surrogate_pair(high_surrogate, low_surrogate));
|
|
|
|
// Append the remaining part of the current string.
|
|
builder.append(current_string_as_utf8.substring_view(3));
|
|
previous = current;
|
|
}
|
|
|
|
// NOTE: We've already produced valid UTF-8 above, so there's no need for additional validation.
|
|
m_utf8_string = builder.to_string_without_validation();
|
|
m_deferred_kind = DeferredKind::None;
|
|
m_lhs = nullptr;
|
|
m_rhs = nullptr;
|
|
}
|
|
|
|
RopeString::RopeString(GC::Ref<PrimitiveString> lhs, GC::Ref<PrimitiveString> rhs)
|
|
: PrimitiveString(DeferredKind::Rope)
|
|
, m_lhs(lhs)
|
|
, m_rhs(rhs)
|
|
{
|
|
}
|
|
|
|
RopeString::~RopeString() = default;
|
|
|
|
void RopeString::visit_edges(Cell::Visitor& visitor)
|
|
{
|
|
Base::visit_edges(visitor);
|
|
visitor.visit(m_lhs);
|
|
visitor.visit(m_rhs);
|
|
}
|
|
|
|
void Substring::resolve(EncodingPreference preference) const
|
|
{
|
|
auto source_view = m_source_string->utf16_string_view().substring_view(m_code_unit_offset, m_code_unit_length);
|
|
|
|
if (preference == EncodingPreference::UTF16) {
|
|
m_utf16_string = Utf16String::from_utf16(source_view);
|
|
} else {
|
|
auto substring = Utf16String::from_utf16(source_view);
|
|
m_utf8_string = substring.to_utf8();
|
|
}
|
|
|
|
m_deferred_kind = DeferredKind::None;
|
|
m_source_string = nullptr;
|
|
}
|
|
|
|
Substring::Substring(GC::Ref<PrimitiveString> source_string, size_t code_unit_offset, size_t code_unit_length)
|
|
: PrimitiveString(DeferredKind::Substring)
|
|
, m_source_string(source_string)
|
|
, m_code_unit_offset(code_unit_offset)
|
|
, m_code_unit_length(code_unit_length)
|
|
{
|
|
}
|
|
|
|
Substring::~Substring() = default;
|
|
|
|
void Substring::visit_edges(Cell::Visitor& visitor)
|
|
{
|
|
Base::visit_edges(visitor);
|
|
visitor.visit(m_source_string);
|
|
}
|
|
|
|
}
|