ladybird/Libraries/LibWeb/DOM/CharacterData.cpp

/*
 * Copyright (c) 2018-2022, Andreas Kling <andreas@ladybird.org>
 * Copyright (c) 2025, Jelle Raaijmakers <jelle@ladybird.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <LibUnicode/Segmenter.h>
#include <LibWeb/Bindings/CharacterDataPrototype.h>
#include <LibWeb/DOM/CharacterData.h>
#include <LibWeb/DOM/Document.h>
#include <LibWeb/DOM/MutationType.h>
#include <LibWeb/DOM/Range.h>
#include <LibWeb/Layout/TextNode.h>

namespace Web::DOM {

// Expands the GC allocator definition macro for CharacterData (see GC_DEFINE_ALLOCATOR for what it generates).
GC_DEFINE_ALLOCATOR(CharacterData);

// Forwards `document` and `type` to the Node base class and stores `data` as this node's initial data.
CharacterData::CharacterData(Document& document, NodeType type, String const& data)
    : Node(document, type)
    , m_data(data)
{
}

// Out-of-line defaulted destructor.
CharacterData::~CharacterData() = default;

// Initializes this object for `realm`: invokes WEB_SET_PROTOTYPE_FOR_INTERFACE for the CharacterData interface
// first, then hands off to the base class initialization.
void CharacterData::initialize(JS::Realm& realm)
{
    WEB_SET_PROTOTYPE_FOR_INTERFACE(CharacterData);
    Base::initialize(realm);
}

// https://dom.spec.whatwg.org/#dom-characterdata-data
void CharacterData::set_data(String const& data)
{
    // The data setter is specified as "replace data" with node this, offset 0, count this's length, and data
    // new value, i.e. the entire current contents are swapped out for the new value.
    auto const current_length = this->length_in_utf16_code_units();

    // NOTE: An offset of 0 is never beyond the data's length, which means replace_data() cannot throw here.
    // NOTE: A mutation observer callback is still triggered when the new value equals the current data.
    // FIXME: Figure out a way to make this a no-op again if the passed in data is the same as the current data.
    MUST(replace_data(0, current_length, data));
}

// https://dom.spec.whatwg.org/#concept-cd-substring
// Returns up to `count` UTF-16 code units of this node's data, starting at code unit `offset`, as a String.
//
// `offset` and `count` are measured in UTF-16 code units. An `offset` beyond the node's length yields an
// "IndexSizeError" DOMException. A `count` that reaches past the end of the data is not an error; the result
// then runs from `offset` to the end of the data.
WebIDL::ExceptionOr<String> CharacterData::substring_data(size_t offset, size_t count) const
{
    // 1. Let length be node’s length.
    // FIXME: This is very inefficient!
    auto utf16_result = MUST(AK::utf8_to_utf16(m_data));
    Utf16View utf16_view { utf16_result };
    auto length = utf16_view.length_in_code_units();

    // 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
    if (offset > length)
        return WebIDL::IndexSizeError::create(realm(), "Substring offset out of range."_string);

    // 3. If offset plus count is greater than length, return a string whose value is the code units from the offsetth code unit
    //    to the end of node’s data, and then return.
    // NOTE: offset <= length holds at this point, so comparing count against the remaining length is equivalent to
    //       the spec's "offset plus count" check, but cannot wrap around for very large counts.
    if (count > length - offset)
        return MUST(utf16_view.substring_view(offset).to_utf8());

    // 4. Return a string whose value is the code units from the offsetth code unit to the offset+countth code unit in node’s data.
    return MUST(utf16_view.substring_view(offset, count).to_utf8());
}

// https://dom.spec.whatwg.org/#concept-cd-replace
// Replaces `count` UTF-16 code units of this node's data, starting at code unit `offset`, with `data`.
//
// `offset` and `count` are measured in UTF-16 code units. An `offset` beyond the node's length yields an
// "IndexSizeError" DOMException; a `count` that reaches past the end of the data is clamped to the remaining
// length. The "characterData" mutation record, the live range adjustments and the parent's children changed
// steps all happen even when the resulting data is identical to the current data; only the layout
// invalidation, the character data version bump and the segmenter updates are skipped in that case.
WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t count, String const& data)
{
    // 1. Let length be node’s length.
    // FIXME: This is very inefficient!
    auto utf16_data = MUST(AK::utf8_to_utf16(m_data));
    Utf16View utf16_view { utf16_data };
    auto length = utf16_view.length_in_code_units();

    // 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
    if (offset > length)
        return WebIDL::IndexSizeError::create(realm(), "Replacement offset out of range."_string);

    // 3. If offset plus count is greater than length, then set count to length minus offset.
    // NOTE: offset <= length holds at this point, so comparing count against the remaining length is equivalent to
    //       the spec's "offset plus count" check, but cannot wrap around for very large counts. Once count is
    //       clamped, every later use of offset + count is bounded by length.
    if (count > length - offset)
        count = length - offset;

    // 5. Insert data into node’s data after offset code units.
    // 6. Let delete offset be offset + data’s length.
    // 7. Starting from delete offset code units, remove count code units from node’s data.
    // NOTE: The new contents are assembled in UTF-16 as: the code units before offset, the inserted data, and
    //       the code units from offset + count onwards.
    auto before_data = utf16_view.substring_view(0, offset);
    auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
    auto after_data = utf16_view.substring_view(offset + count);

    Utf16Data full_data;
    full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
    full_data.append(before_data.utf16_span().data(), before_data.length_in_code_units());
    full_data.extend(inserted_data_result.data);
    full_data.append(after_data.utf16_span().data(), after_data.length_in_code_units());
    Utf16View full_view { full_data };

    bool characters_are_the_same = utf16_view == full_view;
    // Keep the previous value around; it is passed along with the mutation record below.
    auto old_data = m_data;

    // OPTIMIZATION: Skip UTF-8 encoding if the characters are the same.
    if (!characters_are_the_same) {
        m_data = MUST(full_view.to_utf8());
    }

    // 4. Queue a mutation record of "characterData" for node with null, null, node’s data, « », « », null, and null.
    // NOTE: We do this later so that the mutation observer may notify UI clients of this node's new value.
    queue_mutation_record(MutationType::characterData, {}, {}, old_data, {}, {}, nullptr, nullptr);

    // 8. For each live range whose start node is node and start offset is greater than offset but less than or equal to
    //    offset plus count, set its start offset to offset.
    for (auto* range : Range::live_ranges()) {
        if (range->start_container() == this && range->start_offset() > offset && range->start_offset() <= (offset + count))
            range->set_start_offset(offset);
    }

    // 9. For each live range whose end node is node and end offset is greater than offset but less than or equal to
    //    offset plus count, set its end offset to offset.
    for (auto* range : Range::live_ranges()) {
        if (range->end_container() == this && range->end_offset() > offset && range->end_offset() <= (offset + count))
            range->set_end_offset(offset);
    }

    // 10. For each live range whose start node is node and start offset is greater than offset plus count, increase its
    //     start offset by data’s length and decrease it by count.
    for (auto* range : Range::live_ranges()) {
        if (range->start_container() == this && range->start_offset() > (offset + count))
            range->set_start_offset(range->start_offset() + inserted_data_result.data.size() - count);
    }

    // 11. For each live range whose end node is node and end offset is greater than offset plus count, increase its end
    //     offset by data’s length and decrease it by count.
    for (auto* range : Range::live_ranges()) {
        if (range->end_container() == this && range->end_offset() > (offset + count))
            range->set_end_offset(range->end_offset() + inserted_data_result.data.size() - count);
    }

    // 12. If node’s parent is non-null, then run the children changed steps for node’s parent.
    if (parent())
        parent()->children_changed(nullptr);

    // OPTIMIZATION: If the characters are the same, we can skip the remainder of this function.
    if (characters_are_the_same)
        return {};

    if (auto* layout_node = this->layout_node(); layout_node && layout_node->is_text_node()) {
        // NOTE: Since the text node's data has changed, we need to invalidate the text for rendering.
        //       This ensures that the new text is reflected in layout, even if we don't end up
        //       doing a full layout tree rebuild.
        static_cast<Layout::TextNode&>(*layout_node).invalidate_text_for_rendering();

        // We also need to relayout.
        layout_node->set_needs_layout_update(SetNeedsLayoutReason::CharacterDataReplaceData);
    }

    document().bump_character_data_version();

    // Segmenters that have already been created hold the old text, so hand them the new data.
    if (m_grapheme_segmenter)
        m_grapheme_segmenter->set_segmented_text(m_data);
    if (m_word_segmenter)
        m_word_segmenter->set_segmented_text(m_data);

    return {};
}

// https://dom.spec.whatwg.org/#dom-characterdata-appenddata
WebIDL::ExceptionOr<void> CharacterData::append_data(String const& data)
{
    // appendData(data) is "replace data" with node this, offset this’s length, count 0, and data data:
    // nothing is removed, and the new data goes in at the very end of the existing data.
    auto const end_offset = this->length_in_utf16_code_units();
    return replace_data(end_offset, 0, data);
}

// https://dom.spec.whatwg.org/#dom-characterdata-insertdata
WebIDL::ExceptionOr<void> CharacterData::insert_data(size_t offset, String const& data)
{
    // insertData(offset, data) is "replace data" with node this, offset offset, count 0, and data data:
    // a pure insertion that removes no existing code units.
    constexpr size_t code_units_to_remove = 0;
    return replace_data(offset, code_units_to_remove, data);
}

// https://dom.spec.whatwg.org/#dom-characterdata-deletedata
WebIDL::ExceptionOr<void> CharacterData::delete_data(size_t offset, size_t count)
{
    // deleteData(offset, count) is "replace data" with node this, offset offset, count count, and data the
    // empty string: the selected code units are replaced with nothing.
    String const empty_replacement {};
    return replace_data(offset, count, empty_replacement);
}

// Returns this node's grapheme segmenter, creating it on first use as a clone of the document's grapheme
// segmenter with this node's data set as its segmented text.
Unicode::Segmenter& CharacterData::grapheme_segmenter() const
{
    // Reuse the segmenter if an earlier call already created it.
    if (m_grapheme_segmenter)
        return *m_grapheme_segmenter;

    m_grapheme_segmenter = document().grapheme_segmenter().clone();
    m_grapheme_segmenter->set_segmented_text(m_data);
    return *m_grapheme_segmenter;
}

// Returns this node's word segmenter, creating it on first use as a clone of the document's word segmenter
// with this node's data set as its segmented text.
Unicode::Segmenter& CharacterData::word_segmenter() const
{
    // Reuse the segmenter if an earlier call already created it.
    if (m_word_segmenter)
        return *m_word_segmenter;

    m_word_segmenter = document().word_segmenter().clone();
    m_word_segmenter->set_segmented_text(m_data);
    return *m_word_segmenter;
}

}
-												Meta: Add license header to source files

As suggested by Joshua, this commit adds the 2-clause BSD license as a
comment block to the top of every source file.

For the first pass, I've just added myself for simplicity. I encourage
everyone to add themselves as copyright holders of any file they've
added or modified in some significant way. If I've added myself in
error somewhere, feel free to replace it with the appropriate copyright
holder instead.

Going forward, all new source files should include a license header.

											
										
										
											2020-01-18 09:38:21 +01:00
+								/*
-												Meta: Update my e-mail address everywhere

											
										
										
											2024-10-04 13:19:50 +02:00
+								 * Copyright (c) 2018-2022, Andreas Kling <andreas@ladybird.org>
-												LibWeb: Modify range start & end directly where applicable

We were calling into `Range::set_start_or_end()` indirectly through
`::set_start()` and `::set_end()`, but that algorithm only calls for an
invocation whenever the start or end of a range needs to be set to a
boundary point. If an algorithm step calls for setting the node or
offset, we should directly modify the range.

The problem with calling into `::set_start_or_end()` is that this
algorithm potentially modifies _both_ the start and end of the range,
but algorithms trying to update a range's start or end often have
explicit steps to take both the start and end into account and end up
overcompensating for the start or end offset resulting in an invalid
range (e.g. with an end offset beyond a node's length).

This makes updating a range's start/end a bit more efficient and removes
a piece of ad-hoc code in CharacterData needed to make it work before.

											
										
										
											2025-05-14 12:56:03 +02:00
+								 * Copyright (c) 2025, Jelle Raaijmakers <jelle@ladybird.org>
-												Meta: Add license header to source files

As suggested by Joshua, this commit adds the 2-clause BSD license as a
comment block to the top of every source file.

For the first pass, I've just added myself for simplicity. I encourage
everyone to add themselves as copyright holders of any file they've
added or modified in some significant way. If I've added myself in
error somewhere, feel free to replace it with the appropriate copyright
holder instead.

Going forward, all new source files should include a license header.

											
										
										
											2020-01-18 09:38:21 +01:00
+								 *
-												Everything: Move to SPDX license identifiers in all files.

SPDX License Identifiers are a more compact / standardized
way of representing file license information.

See: https://spdx.dev/resources/use/#identifiers

This was done with the `ambr` search and replace tool.

 ambr --no-parent-ignore --key-from-file --rep-from-file key.txt rep.txt *

											
										
										
											2021-04-22 01:24:48 -07:00
+								 * SPDX-License-Identifier: BSD-2-Clause
-												Meta: Add license header to source files

As suggested by Joshua, this commit adds the 2-clause BSD license as a
comment block to the top of every source file.

For the first pass, I've just added myself for simplicity. I encourage
everyone to add themselves as copyright holders of any file they've
added or modified in some significant way. If I've added myself in
error somewhere, feel free to replace it with the appropriate copyright
holder instead.

Going forward, all new source files should include a license header.

											
										
										
											2020-01-18 09:38:21 +01:00
+								 */
-												LibUnicode+Everywhere: Merge LibLocale back into LibUnicode

LibLocale was split off from LibUnicode a couple years ago to reduce the
number of applications on SerenityOS that depend on CLDR data. Now that
we use ICU, both LibUnicode and LibLocale are actually linking in this
data. And since vcpkg gives us static libraries, both libraries are over
30MB in size.

This patch reverts the separation and merges LibLocale into LibUnicode
again. We now have just one library that includes the ICU data.

Further, this will let LibUnicode share the locale cache that previously
would only exist in LibLocale.

											
										
										
											2024-06-23 09:14:27 -04:00
+								#include <LibUnicode/Segmenter.h>
-												LibWeb+LibJS: Make the EventTarget hierarchy (incl. DOM) GC-allocated

This is a monster patch that turns all EventTargets into GC-allocated
PlatformObjects. Their C++ wrapper classes are removed, and the LibJS
garbage collector is now responsible for their lifetimes.

There's a fair amount of hacks and band-aids in this patch, and we'll
have a lot of cleanup to do after this.

											
										
										
											2022-08-28 13:42:07 +02:00
+								#include <LibWeb/Bindings/CharacterDataPrototype.h>
-												LibWeb: Rename directory LibHTML => LibWeb

Let's rename this to LibWeb since it aims to provide more parts of the
web platform than just HTML. :^)

											
										
										
											2020-03-07 10:32:51 +01:00
+								#include <LibWeb/DOM/CharacterData.h>
-												LibWeb: Relayout document on CharacterData data change

This can definitely be optimized to avoid full relayouts in many
situations, but for now let's just go for correctness.

											
										
										
											2021-02-10 18:32:16 +01:00
+								#include <LibWeb/DOM/Document.h>
-												LibWeb: Implement "characterData" mutation record for MutationObserver


											
										
										
											2022-07-11 16:39:32 +01:00
+								#include <LibWeb/DOM/MutationType.h>
-												LibWeb: Update live DOM ranges on Text and CharacterData mutations

Taking care of the FIXMEs I added in earlier patches. :^)

											
										
										
											2022-03-21 20:05:25 +01:00
+								#include <LibWeb/DOM/Range.h>
-												LibWeb: Invalidate layout-transformed text on DOM text node change

This fixes an issue where programmatically changing the value of an
input element wasn't reflected visually.

											
										
										
											2023-08-16 11:03:00 +02:00
+								#include <LibWeb/Layout/TextNode.h>
-												LibHTML: Add Comment and CharacterData nodes and improve HTML parsing

This patch adds the CharacterData subclass of Node, which is now the
parent class of Text and a new Comment class.

A Comment node is one of these in HTML: <!--hello friends-->
Since these occur somewhat frequently on the web, we need to be able
to parse them.

This patch also adds a child rejection mechanism to the DOM tree.
Nodes can now override is_child_allowed(Node) and return false if they
don't want a particular Node to become a child of theirs. This is used
to prevent Document from taking on unwanted children.

											
										
										
											2019-10-12 23:26:47 +02:00
-												LibWeb: Move DOM classes into the Web::DOM namespace

LibWeb keeps growing and the Web namespace is filling up fast.
Let's put DOM stuff into Web::DOM, just like we already started doing
with SVG stuff in Web::SVG.

											
										
										
											2020-07-26 19:37:56 +02:00
+								namespace Web::DOM {
-												LibWeb: Move everything into the Web namespace

											
										
										
											2020-03-07 10:27:02 +01:00
-												LibGC+Everywhere: Factor out a LibGC from LibJS

Resulting in a massive rename across almost everywhere! Alongside the
namespace change, we now have the following names:

 * JS::NonnullGCPtr -> GC::Ref
 * JS::GCPtr -> GC::Ptr
 * JS::HeapFunction -> GC::Function
 * JS::CellImpl -> GC::Cell
 * JS::Handle -> GC::Root

											
										
										
											2024-11-15 04:01:23 +13:00
+								GC_DEFINE_ALLOCATOR(CharacterData);
-												LibWeb: Put most LibWeb GC objects in type-specific heap blocks

With this change, we now have ~1200 CellAllocators across both LibJS and
LibWeb in a normal WebContent instance.

This gives us a minimum heap size of 4.7 MiB in the scenario where we
only have one cell allocated per type. Of course, in practice there will
be many more of each type, so the effective overhead is quite a bit
smaller than that in practice.

I left a few types unconverted to this mechanism because I got tired of
doing this. :^)

											
										
										
											2023-11-19 19:47:52 +01:00
-												LibWeb: Port CharacterData from DeprecatedString to String

The existing implementation has some pre-existing issues where it
incorrectly assumes that byte offsets are given through the IDL instead
of UTF-16 code units. While making these changes, leave some FIXMEs for
that.

											
										
										
											2023-09-07 21:36:05 +12:00
+								CharacterData::CharacterData(Document& document, NodeType type, String const& data)
-												LibHTML: Add Comment and CharacterData nodes and improve HTML parsing

This patch adds the CharacterData subclass of Node, which is now the
parent class of Text and a new Comment class.

A Comment node is one of these in HTML: <!--hello friends-->
Since these occur somewhat frequently on the web, we need to be able
to parse them.

This patch also adds a child rejection mechanism to the DOM tree.
Nodes can now override is_child_allowed(Node) and return false if they
don't want a particular Node to become a child of theirs. This is used
to prevent Document from taking on unwanted children.

											
										
										
											2019-10-12 23:26:47 +02:00
+								    : Node(document, type)
 								    , m_data(data)
 								{
-												LibWeb: Move setting of Web object prototypes to initialize()

This needs to happen before prototype/constructor initialization can be
made lazy. Otherwise, GC could run during the C++ constructor and try to
collect the object currently being created.

											
										
										
											2023-01-10 06:28:20 -05:00
+								}
-												LibWeb: Port text segmentation to the ICU text segmenter

											
										
										
											2024-06-19 09:02:21 -04:00
+								CharacterData::~CharacterData() = default;
-												LibJS: Make Cell::initialize() return void

Stop worrying about tiny OOMs.

Work towards #20405

											
										
										
											2023-08-07 08:41:28 +02:00
+								void CharacterData::initialize(JS::Realm& realm)
-												LibWeb: Move setting of Web object prototypes to initialize()

This needs to happen before prototype/constructor initialization can be
made lazy. Otherwise, GC could run during the C++ constructor and try to
collect the object currently being created.

											
										
										
											2023-01-10 06:28:20 -05:00
+								{
-												LibWeb: Avoid FlyString lookups when setting IDL interface prototypes

This commit introduces a WEB_SET_PROTOTYPE_FOR_INTERFACE macro that
caches the interface name in a local static FlyString. This means that
we only pay for FlyString-from-literal lookup once per browser lifetime
instead of every time the interface is instantiated.

											
										
										
											2024-03-16 13:13:08 +01:00
+								    WEB_SET_PROTOTYPE_FOR_INTERFACE(CharacterData);
-												LibWeb: Only set prototype once for object with IDL interface

Before this change, we were going through the chain of base classes for
each IDL interface object and having them set the prototype to their
prototype.

Instead of doing that, reorder things so that we set the right prototype
immediately in Foo::initialize(), and then don't bother in all the base
class overrides.

This knocks off a ~1% profile item on Speedometer 3.

											
										
										
											2025-04-20 16:22:57 +02:00
+								    Base::initialize(realm);
-												LibHTML: Add Comment and CharacterData nodes and improve HTML parsing

This patch adds the CharacterData subclass of Node, which is now the
parent class of Text and a new Comment class.

A Comment node is one of these in HTML: <!--hello friends-->
Since these occur somewhat frequently on the web, we need to be able
to parse them.

This patch also adds a child rejection mechanism to the DOM tree.
Nodes can now override is_child_allowed(Node) and return false if they
don't want a particular Node to become a child of theirs. This is used
to prevent Document from taking on unwanted children.

											
										
										
											2019-10-12 23:26:47 +02:00
+								}
-												LibWeb: Implement CharacterData::set_data in terms of replace_data

This makes it so that it always queues a mutation record, even if
`data` is set to the same value. It also makes it follow the spec
steps.
											
										
										
											2022-07-11 16:13:16 +01:00
+								// https://dom.spec.whatwg.org/#dom-characterdata-data
-												LibWeb: Port CharacterData from DeprecatedString to String

The existing implementation has some pre-existing issues where it
incorrectly assumes that byte offsets are given through the IDL instead
of UTF-16 code units. While making these changes, leave some FIXMEs for
that.

											
										
										
											2023-09-07 21:36:05 +12:00
+								void CharacterData::set_data(String const& data)
-												LibWeb: Relayout document on CharacterData data change

This can definitely be optimized to avoid full relayouts in many
situations, but for now let's just go for correctness.

											
										
										
											2021-02-10 18:32:16 +01:00
+								{
-												LibWeb: Implement CharacterData::set_data in terms of replace_data

This makes it so that it always queues a mutation record, even if
`data` is set to the same value. It also makes it follow the spec
steps.
											
										
										
											2022-07-11 16:13:16 +01:00
+								    // [The data] setter must replace data with node this, offset 0, count this’s length, and data new value.
 								    // NOTE: Since the offset is 0, it can never be above data's length, so this can never throw.
 								    // NOTE: Setting the data to the same value as the current data still causes a mutation observer callback.
 								    // FIXME: Figure out a way to make this a no-op again if the passed in data is the same as the current data.
-												LibWeb: Use UTF-16 code unit offsets and lengths in CharacterData

We were previously assuming that the input offsets and lengths were all
in raw byte offsets into a UTF-8 string. While internally our String
representation may be in UTF-8 from the external world it is seen as
UTF-16, with code unit offsets passed through, and used as the returned
length.

Beforehand, the test included in this commit would crash
ladybird (and otherwise return wrong values).

The implementation here is very inefficient, I am sure there is a
much smarter way to write it so that we would not need a conversion
from UTF-8 to a UTF-16 string (and then back again).

Fixes: #20971

											
										
										
											2023-12-22 20:41:34 +13:00
+								    MUST(replace_data(0, this->length_in_utf16_code_units(), data));
-												LibWeb: Relayout document on CharacterData data change

This can definitely be optimized to avoid full relayouts in many
situations, but for now let's just go for correctness.

											
										
										
											2021-02-10 18:32:16 +01:00
+								}
-												LibWeb: Add CharacterData.substringData(offset, count)

											
										
										
											2022-03-21 17:20:42 +01:00
+								// https://dom.spec.whatwg.org/#concept-cd-substring
-												LibWeb: Port CharacterData from DeprecatedString to String

The existing implementation has some pre-existing issues where it
incorrectly assumes that byte offsets are given through the IDL instead
of UTF-16 code units. While making these changes, leave some FIXMEs for
that.

											
										
										
											2023-09-07 21:36:05 +12:00
+								WebIDL::ExceptionOr<String> CharacterData::substring_data(size_t offset, size_t count) const
-												LibWeb: Add CharacterData.substringData(offset, count)

											
										
										
											2022-03-21 17:20:42 +01:00
+								{
 								    // 1. Let length be node’s length.
-												LibWeb: Use UTF-16 code unit offsets and lengths in CharacterData

We were previously assuming that the input offsets and lengths were all
in raw byte offsets into a UTF-8 string. While internally our String
representation may be in UTF-8 from the external world it is seen as
UTF-16, with code unit offsets passed through, and used as the returned
length.

Beforehand, the test included in this commit would crash
ladybird (and otherwise return wrong values).

The implementation here is very inefficient, I am sure there is a
much smarter way to write it so that we would not need a conversion
from UTF-8 to a UTF-16 string (and then back again).

Fixes: #20971

											
										
										
											2023-12-22 20:41:34 +13:00
+								    // FIXME: This is very inefficient!
-												AK+Everywhere: Recognise that surrogates in utf16 aren't all that common

For the slight cost of counting code points when converting between
encodings and a teeny bit of memory, this commit adds a fast path for
all-happy utf-16 substrings and code point operations.

This seems to be a significant chunk of time spent in many regex
benchmarks.

											
										
										
											2025-04-02 17:56:49 +02:00
+								    auto utf16_result = MUST(AK::utf8_to_utf16(m_data));
 								    Utf16View utf16_view { utf16_result };
-												LibWeb: Use UTF-16 code unit offsets and lengths in CharacterData

We were previously assuming that the input offsets and lengths were all
in raw byte offsets into a UTF-8 string. While internally our String
representation may be in UTF-8 from the external world it is seen as
UTF-16, with code unit offsets passed through, and used as the returned
length.

Beforehand, the test included in this commit would crash
ladybird (and otherwise return wrong values).

The implementation here is very inefficient, I am sure there is a
much smarter way to write it so that we would not need a conversion
from UTF-8 to a UTF-16 string (and then back again).

Fixes: #20971

											
										
										
											2023-12-22 20:41:34 +13:00
+								    auto length = utf16_view.length_in_code_units();
-												LibWeb: Add CharacterData.substringData(offset, count)

											
										
										
											2022-03-21 17:20:42 +01:00
 								    // 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
 								    if (offset > length)
-												LibWeb: Make DOMException take error message as a String

There was no need to use FlyString for error messages, and it just
caused a bunch of churn since these strings typically only existed
during the lifetime of the error.

											
										
										
											2024-10-12 20:56:21 +02:00
+								        return WebIDL::IndexSizeError::create(realm(), "Substring offset out of range."_string);
-												LibWeb: Add CharacterData.substringData(offset, count)

											
										
										
											2022-03-21 17:20:42 +01:00
 								    // 3. If offset plus count is greater than length, return a string whose value is the code units from the offsetth code unit
 								    //    to the end of node’s data, and then return.
 								    if (offset + count > length)
-												AK+Everywhere: Allow lonely UTF-16 surrogates by default

By definition, the web allows lonely surrogates by default. Let's have
our string APIs reflect this, so we don't have to pass an allow option
all over the place.

											
										
										
											2025-06-26 19:52:09 -04:00
+								        return MUST(utf16_view.substring_view(offset).to_utf8());
-												LibWeb: Add CharacterData.substringData(offset, count)

											
										
										
											2022-03-21 17:20:42 +01:00
 								    // 4. Return a string whose value is the code units from the offsetth code unit to the offset+countth code unit in node’s data.
-												AK+Everywhere: Allow lonely UTF-16 surrogates by default

By definition, the web allows lonely surrogates by default. Let's have
our string APIs reflect this, so we don't have to pass an allow option
all over the place.

											
										
										
											2025-06-26 19:52:09 -04:00
+								    return MUST(utf16_view.substring_view(offset, count).to_utf8());
-												LibWeb: Add CharacterData.substringData(offset, count)

											
										
										
											2022-03-21 17:20:42 +01:00
+								}
-												LibWeb: Add CharacterData.replaceData(offset, count, data)

Note that we don't queue mutation records or update live ranges yet,
I've left those as FIXMEs.

											
										
										
											2022-03-21 18:05:20 +01:00
+								// https://dom.spec.whatwg.org/#concept-cd-replace
-												LibWeb: Port CharacterData from DeprecatedString to String

The existing implementation has some pre-existing issues where it
incorrectly assumes that byte offsets are given through the IDL instead
of UTF-16 code units. While making these changes, leave some FIXMEs for
that.

											
										
										
											2023-09-07 21:36:05 +12:00
+								WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t count, String const& data)
-												LibWeb: Add CharacterData.replaceData(offset, count, data)

Note that we don't queue mutation records or update live ranges yet,
I've left those as FIXMEs.

											
										
										
											2022-03-21 18:05:20 +01:00
+								{
 								    // 1. Let length be node’s length.
-												LibWeb: Use UTF-16 code unit offsets and lengths in CharacterData

We were previously assuming that the input offsets and lengths were all
in raw byte offsets into a UTF-8 string. While internally our String
representation may be in UTF-8 from the external world it is seen as
UTF-16, with code unit offsets passed through, and used as the returned
length.

Beforehand, the test included in this commit would crash
ladybird (and otherwise return wrong values).

The implementation here is very inefficient, I am sure there is a
much smarter way to write it so that we would not need a conversion
from UTF-8 to a UTF-16 string (and then back again).

Fixes: #20971

											
										
										
											2023-12-22 20:41:34 +13:00
+								    // FIXME: This is very inefficient!
 								    auto utf16_data = MUST(AK::utf8_to_utf16(m_data));
 								    Utf16View utf16_view { utf16_data };
 								    auto length = utf16_view.length_in_code_units();
-												LibWeb: Add CharacterData.replaceData(offset, count, data)

Note that we don't queue mutation records or update live ranges yet,
I've left those as FIXMEs.

											
										
										
											2022-03-21 18:05:20 +01:00
 								    // 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
 								    if (offset > length)
-												LibWeb: Make DOMException take error message as a String

There was no need to use FlyString for error messages, and it just
caused a bunch of churn since these strings typically only existed
during the lifetime of the error.

											
										
										
											2024-10-12 20:56:21 +02:00
+								        return WebIDL::IndexSizeError::create(realm(), "Replacement offset out of range."_string);
-												LibWeb: Add CharacterData.replaceData(offset, count, data)

Note that we don't queue mutation records or update live ranges yet,
I've left those as FIXMEs.

											
										
										
											2022-03-21 18:05:20 +01:00
 								    // 3. If offset plus count is greater than length, then set count to length minus offset.
 								    if (offset + count > length)
 								        count = length - offset;
 								    // 5. Insert data into node’s data after offset code units.
 								    // 6. Let delete offset be offset + data’s length.
 								    // 7. Starting from delete offset code units, remove count code units from node’s data.
-												LibWeb: Make replaceData create new surrogate pairs

When inserting a new utf-16 surrogate next to an existing surrogate
with replaceData, the surrogates would not get merged correctly into a
single code point. This is because internally the text data is stored
as utf-8, and the two surrogates would be converted seperately. This
has now been fixed by first recreating the whole string in utf-16 and
then converting it back to utf-8.

It's not the most efficient solution, but this fixes at least 6 WPT
subtests.

											
										
										
											2024-11-23 14:46:42 +01:00
+								    auto before_data = utf16_view.substring_view(0, offset);
-												AK+Everywhere: Recognise that surrogates in utf16 aren't all that common

For the slight cost of counting code points when converting between
encodings and a teeny bit of memory, this commit adds a fast path for
all-happy utf-16 substrings and code point operations.

This seems to be a significant chunk of time spent in many regex
benchmarks.

											
										
										
											2025-04-02 17:56:49 +02:00
+								    auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
-												LibWeb: Make replaceData create new surrogate pairs

When inserting a new utf-16 surrogate next to an existing surrogate
with replaceData, the surrogates would not get merged correctly into a
single code point. This is because internally the text data is stored
as utf-8, and the two surrogates would be converted seperately. This
has now been fixed by first recreating the whole string in utf-16 and
then converting it back to utf-8.

It's not the most efficient solution, but this fixes at least 6 WPT
subtests.

											
										
										
											2024-11-23 14:46:42 +01:00
+								    auto after_data = utf16_view.substring_view(offset + count);
-												AK+Everywhere: Prepare Utf16View for integration with a UTF-16 string

To prepare for an upcoming Utf16String, this migrates Utf16View to store
its data as a char16_t. Most function definitions are moved inline and
made constexpr.

This also adds a UDL to construct a Utf16View from a string literal:

    auto string = u"hello"sv;

This let's us remove the NTTP Utf16View constructor, as we have found
that such constructors bloat binary size quite a bit.

											
										
										
											2025-06-26 12:52:23 -04:00
-												LibWeb: Make replaceData create new surrogate pairs

When inserting a new utf-16 surrogate next to an existing surrogate
with replaceData, the surrogates would not get merged correctly into a
single code point. This is because internally the text data is stored
as utf-8, and the two surrogates would be converted seperately. This
has now been fixed by first recreating the whole string in utf-16 and
then converting it back to utf-8.

It's not the most efficient solution, but this fixes at least 6 WPT
subtests.

											
										
										
											2024-11-23 14:46:42 +01:00
+								    Utf16Data full_data;
-												AK+Everywhere: Recognise that surrogates in utf16 aren't all that common

For the slight cost of counting code points when converting between
encodings and a teeny bit of memory, this commit adds a fast path for
all-happy utf-16 substrings and code point operations.

This seems to be a significant chunk of time spent in many regex
benchmarks.

											
										
										
											2025-04-02 17:56:49 +02:00
+								    full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
-												AK: Add a UTF-16 string with optimized short- and ASCII-string storage

This is a strictly UTF-16 string with some optimizations for ASCII.

* If created from a short UTF-8 or UTF-16 string that is also ASCII,
  then the string is stored in an inlined byte buffer.

* If created with a long UTF-8 or UTF-16 string that is also ASCII,
  then the string is stored in an outlined char buffer.

* If created with a short or long UTF-8 or UTF-16 string that is not
  ASCII, then the string is stored in an outlined char16 buffer.

We do not store short non-ASCII text in the inlined buffer to avoid
confusion with operations such as `length_in_code_units` and
`code_unit_at`. For example, "😀" would be stored as 4 UTF-8 bytes
in short string form. But we still want `length_in_code_units` to
be 2, and `code_unit_at(0)` to be 0xD83D.

											
										
										
											2025-06-12 19:29:41 -04:00
+								    full_data.append(before_data.utf16_span().data(), before_data.length_in_code_units());
-												AK+Everywhere: Recognise that surrogates in utf16 aren't all that common

For the slight cost of counting code points when converting between
encodings and a teeny bit of memory, this commit adds a fast path for
all-happy utf-16 substrings and code point operations.

This seems to be a significant chunk of time spent in many regex
benchmarks.

											
										
										
											2025-04-02 17:56:49 +02:00
+								    full_data.extend(inserted_data_result.data);
-												AK: Add a UTF-16 string with optimized short- and ASCII-string storage

This is a strictly UTF-16 string with some optimizations for ASCII.

* If created from a short UTF-8 or UTF-16 string that is also ASCII,
  then the string is stored in an inlined byte buffer.

* If created with a long UTF-8 or UTF-16 string that is also ASCII,
  then the string is stored in an outlined char buffer.

* If created with a short or long UTF-8 or UTF-16 string that is not
  ASCII, then the string is stored in an outlined char16 buffer.

We do not store short non-ASCII text in the inlined buffer to avoid
confusion with operations such as `length_in_code_units` and
`code_unit_at`. For example, "😀" would be stored as 4 UTF-8 bytes
in short string form. But we still want `length_in_code_units` to
be 2, and `code_unit_at(0)` to be 0xD83D.

											
										
										
											2025-06-12 19:29:41 -04:00
+								    full_data.append(after_data.utf16_span().data(), after_data.length_in_code_units());
-												LibWeb: Make replaceData create new surrogate pairs

When inserting a new utf-16 surrogate next to an existing surrogate
with replaceData, the surrogates would not get merged correctly into a
single code point. This is because internally the text data is stored
as utf-8, and the two surrogates would be converted seperately. This
has now been fixed by first recreating the whole string in utf-16 and
then converting it back to utf-8.

It's not the most efficient solution, but this fixes at least 6 WPT
subtests.

											
										
										
											2024-11-23 14:46:42 +01:00
+								    Utf16View full_view { full_data };
-												LibWeb: Early return from `replace_data()` if data didn't change

Allows us to avoid invalidating layout when CharacterData didn't change.

Results in visible improvement on Discord that continuously invokes
this function with the same data, which previously resulted in relayout
on every frame.

											
										
										
											2025-02-19 16:05:43 +01:00
-												LibWeb: Don't neglect DOM range updates on CharacterData changes

Regressed in 036327332ffeca6f7d0b84645098837bd87145bf.

This commit moves the optimization a little later in replaceData(),
still avoiding relayout (the important part).

Recovers 480 points on WPT. :^)

											
										
										
											2025-02-21 11:05:59 +01:00
+								    bool characters_are_the_same = utf16_view == full_view;
-												LibWeb: Slightly delay queueing a character data mutation event

For DevTools, we will want to forward mutation events to the UI in order
to inform the DevTools client about changed DOM nodes. The API for this
requires the new values associated with the events; for example, for
character data events, this will be the node's new text data.

This patch moves the queueing of the mutation record until after we have
the new character data stored. This is not observable.

											
										
										
											2025-03-06 17:17:20 -05:00
+								    auto old_data = m_data;
-												LibWeb: Early return from `replace_data()` if data didn't change

Allows us to avoid invalidating layout when CharacterData didn't change.

Results in visible improvement on Discord that continuously invokes
this function with the same data, which previously resulted in relayout
on every frame.

											
										
										
											2025-02-19 16:05:43 +01:00
-												LibWeb: Don't neglect DOM range updates on CharacterData changes

Regressed in 036327332ffeca6f7d0b84645098837bd87145bf.

This commit moves the optimization a little later in replaceData(),
still avoiding relayout (the important part).

Recovers 480 points on WPT. :^)

											
										
										
											2025-02-21 11:05:59 +01:00
+								    // OPTIMIZATION: Skip UTF-8 encoding if the characters are the same.
 								    if (!characters_are_the_same) {
-												AK+Everywhere: Allow lonely UTF-16 surrogates by default

By definition, the web allows lonely surrogates by default. Let's have
our string APIs reflect this, so we don't have to pass an allow option
all over the place.

											
										
										
											2025-06-26 19:52:09 -04:00
+								        m_data = MUST(full_view.to_utf8());
-												LibWeb: Don't neglect DOM range updates on CharacterData changes

Regressed in 036327332ffeca6f7d0b84645098837bd87145bf.

This commit moves the optimization a little later in replaceData(),
still avoiding relayout (the important part).

Recovers 480 points on WPT. :^)

											
										
										
											2025-02-21 11:05:59 +01:00
+								    }
-												LibWeb: Add CharacterData.replaceData(offset, count, data)

Note that we don't queue mutation records or update live ranges yet,
I've left those as FIXMEs.

											
										
										
											2022-03-21 18:05:20 +01:00
-												LibWeb: Slightly delay queueing a character data mutation event

For DevTools, we will want to forward mutation events to the UI in order
to inform the DevTools client about changed DOM nodes. The API for this
requires the new values associated with the events; for example, for
character data events, this will be the node's new text data.

This patch moves the queueing of the mutation record until after we have
the new character data stored. This is not observable.

											
										
										
											2025-03-06 17:17:20 -05:00
+								    // 4. Queue a mutation record of "characterData" for node with null, null, node’s data, « », « », null, and null.
 								    // NOTE: We do this later so that the mutation observer may notify UI clients of this node's new value.
 								    queue_mutation_record(MutationType::characterData, {}, {}, old_data, {}, {}, nullptr, nullptr);
-												LibWeb: Modify range start & end directly where applicable

We were calling into `Range::set_start_or_end()` indirectly through
`::set_start()` and `::set_end()`, but that algorithm only calls for an
invocation whenever the start or end of a range needs to be set to a
boundary point. If an algorithm step calls for setting the node or
offset, we should directly modify the range.

The problem with calling into `::set_start_or_end()` is that this
algorithm potentially modifies _both_ the start and end of the range,
but algorithms trying to update a range's start or end often have
explicit steps to take both the start and end into account and end up
overcompensating for the start or end offset resulting in an invalid
range (e.g. with an end offset beyond a node's length).

This makes updating a range's start/end a bit more efficient and removes
a piece of ad-hoc code in CharacterData needed to make it work before.

											
										
										
											2025-05-14 12:56:03 +02:00
+								    // 8. For each live range whose start node is node and start offset is greater than offset but less than or equal to
 								    //    offset plus count, set its start offset to offset.
 								    for (auto* range : Range::live_ranges()) {
-												LibWeb: Update live DOM ranges on Text and CharacterData mutations

Taking care of the FIXMEs I added in earlier patches. :^)

											
										
										
											2022-03-21 20:05:25 +01:00
+								        if (range->start_container() == this && range->start_offset() > offset && range->start_offset() <= (offset + count))
-												LibWeb: Modify range start & end directly where applicable

We were calling into `Range::set_start_or_end()` indirectly through
`::set_start()` and `::set_end()`, but that algorithm only calls for an
invocation whenever the start or end of a range needs to be set to a
boundary point. If an algorithm step calls for setting the node or
offset, we should directly modify the range.

The problem with calling into `::set_start_or_end()` is that this
algorithm potentially modifies _both_ the start and end of the range,
but algorithms trying to update a range's start or end often have
explicit steps to take both the start and end into account and end up
overcompensating for the start or end offset resulting in an invalid
range (e.g. with an end offset beyond a node's length).

This makes updating a range's start/end a bit more efficient and removes
a piece of ad-hoc code in CharacterData needed to make it work before.

											
										
										
											2025-05-14 12:56:03 +02:00
+								            range->set_start_offset(offset);
-												LibWeb: Update live DOM ranges on Text and CharacterData mutations

Taking care of the FIXMEs I added in earlier patches. :^)

											
										
										
											2022-03-21 20:05:25 +01:00
+								    }
-												LibWeb: Modify range start & end directly where applicable

We were calling into `Range::set_start_or_end()` indirectly through
`::set_start()` and `::set_end()`, but that algorithm only calls for an
invocation whenever the start or end of a range needs to be set to a
boundary point. If an algorithm step calls for setting the node or
offset, we should directly modify the range.

The problem with calling into `::set_start_or_end()` is that this
algorithm potentially modifies _both_ the start and end of the range,
but algorithms trying to update a range's start or end often have
explicit steps to take both the start and end into account and end up
overcompensating for the start or end offset resulting in an invalid
range (e.g. with an end offset beyond a node's length).

This makes updating a range's start/end a bit more efficient and removes
a piece of ad-hoc code in CharacterData needed to make it work before.

											
										
										
											2025-05-14 12:56:03 +02:00
+								    // 9. For each live range whose end node is node and end offset is greater than offset but less than or equal to
 								    //    offset plus count, set its end offset to offset.
 								    for (auto* range : Range::live_ranges()) {
-												LibWeb: Update live DOM ranges on Text and CharacterData mutations

Taking care of the FIXMEs I added in earlier patches. :^)

											
										
										
											2022-03-21 20:05:25 +01:00
+								        if (range->end_container() == this && range->end_offset() > offset && range->end_offset() <= (offset + count))
-												LibWeb: Modify range start & end directly where applicable

We were calling into `Range::set_start_or_end()` indirectly through
`::set_start()` and `::set_end()`, but that algorithm only calls for an
invocation whenever the start or end of a range needs to be set to a
boundary point. If an algorithm step calls for setting the node or
offset, we should directly modify the range.

The problem with calling into `::set_start_or_end()` is that this
algorithm potentially modifies _both_ the start and end of the range,
but algorithms trying to update a range's start or end often have
explicit steps to take both the start and end into account and end up
overcompensating for the start or end offset resulting in an invalid
range (e.g. with an end offset beyond a node's length).

This makes updating a range's start/end a bit more efficient and removes
a piece of ad-hoc code in CharacterData needed to make it work before.

											
										
										
											2025-05-14 12:56:03 +02:00
+								            range->set_end_offset(offset);
-												LibWeb: Update live DOM ranges on Text and CharacterData mutations

Taking care of the FIXMEs I added in earlier patches. :^)

											
										
										
											2022-03-21 20:05:25 +01:00
+								    }
-												LibWeb: Modify range start & end directly where applicable

We were calling into `Range::set_start_or_end()` indirectly through
`::set_start()` and `::set_end()`, but that algorithm only calls for an
invocation whenever the start or end of a range needs to be set to a
boundary point. If an algorithm step calls for setting the node or
offset, we should directly modify the range.

The problem with calling into `::set_start_or_end()` is that this
algorithm potentially modifies _both_ the start and end of the range,
but algorithms trying to update a range's start or end often have
explicit steps to take both the start and end into account and end up
overcompensating for the start or end offset resulting in an invalid
range (e.g. with an end offset beyond a node's length).

This makes updating a range's start/end a bit more efficient and removes
a piece of ad-hoc code in CharacterData needed to make it work before.

											
										
										
											2025-05-14 12:56:03 +02:00
+								    // 10. For each live range whose start node is node and start offset is greater than offset plus count, increase its
 								    //     start offset by data’s length and decrease it by count.
 								    for (auto* range : Range::live_ranges()) {
-												LibWeb: Update live DOM ranges on Text and CharacterData mutations

Taking care of the FIXMEs I added in earlier patches. :^)

											
										
										
											2022-03-21 20:05:25 +01:00
+								        if (range->start_container() == this && range->start_offset() > (offset + count))
-												LibWeb: Modify range start & end directly where applicable

We were calling into `Range::set_start_or_end()` indirectly through
`::set_start()` and `::set_end()`, but that algorithm only calls for an
invocation whenever the start or end of a range needs to be set to a
boundary point. If an algorithm step calls for setting the node or
offset, we should directly modify the range.

The problem with calling into `::set_start_or_end()` is that this
algorithm potentially modifies _both_ the start and end of the range,
but algorithms trying to update a range's start or end often have
explicit steps to take both the start and end into account and end up
overcompensating for the start or end offset resulting in an invalid
range (e.g. with an end offset beyond a node's length).

This makes updating a range's start/end a bit more efficient and removes
a piece of ad-hoc code in CharacterData needed to make it work before.

											
										
										
											2025-05-14 12:56:03 +02:00
+								            range->set_start_offset(range->start_offset() + inserted_data_result.data.size() - count);
-												LibWeb: Update live DOM ranges on Text and CharacterData mutations

Taking care of the FIXMEs I added in earlier patches. :^)

											
										
										
											2022-03-21 20:05:25 +01:00
+								    }
-												LibWeb: Modify range start & end directly where applicable

We were calling into `Range::set_start_or_end()` indirectly through
`::set_start()` and `::set_end()`, but that algorithm only calls for an
invocation whenever the start or end of a range needs to be set to a
boundary point. If an algorithm step calls for setting the node or
offset, we should directly modify the range.

The problem with calling into `::set_start_or_end()` is that this
algorithm potentially modifies _both_ the start and end of the range,
but algorithms trying to update a range's start or end often have
explicit steps to take both the start and end into account and end up
overcompensating for the start or end offset resulting in an invalid
range (e.g. with an end offset beyond a node's length).

This makes updating a range's start/end a bit more efficient and removes
a piece of ad-hoc code in CharacterData needed to make it work before.

											
										
										
											2025-05-14 12:56:03 +02:00
+								    // 11. For each live range whose end node is node and end offset is greater than offset plus count, increase its end
 								    //     offset by data’s length and decrease it by count.
 								    for (auto* range : Range::live_ranges()) {
 								        if (range->end_container() == this && range->end_offset() > (offset + count))
 								            range->set_end_offset(range->end_offset() + inserted_data_result.data.size() - count);
-												LibWeb: Update live DOM ranges on Text and CharacterData mutations

Taking care of the FIXMEs I added in earlier patches. :^)

											
										
										
											2022-03-21 20:05:25 +01:00
+								    }
-												LibWeb: Add CharacterData.replaceData(offset, count, data)

Note that we don't queue mutation records or update live ranges yet,
I've left those as FIXMEs.

											
										
										
											2022-03-21 18:05:20 +01:00
 								    // 12. If node’s parent is non-null, then run the children changed steps for node’s parent.
 								    if (parent())
-												LibWeb: Add metadata to children update steps invocation

Currently, this metadata is only provided on the insertion steps,
though I believe it would be useful to extend to the other cases
as well. This metadata can aid in making optimizations for these
steps by providing extra context into the type of change which
was made on the child.

											
										
										
											2025-01-27 01:16:33 +13:00
+								        parent()->children_changed(nullptr);
-												LibWeb: Add CharacterData.replaceData(offset, count, data)

Note that we don't queue mutation records or update live ranges yet,
I've left those as FIXMEs.

											
										
										
											2022-03-21 18:05:20 +01:00
-												LibWeb: Don't neglect DOM range updates on CharacterData changes

Regressed in 036327332ffeca6f7d0b84645098837bd87145bf.

This commit moves the optimization a little later in replaceData(),
still avoiding relayout (the important part).

Recovers 480 points on WPT. :^)

											
										
										
											2025-02-21 11:05:59 +01:00
+								    // OPTIMIZATION: If the characters are the same, we can skip the remainder of this function.
 								    if (characters_are_the_same)
 								        return {};
-												LibWeb: Avoid more unnecessary relayouts on CharacterData text change

If the CharacterData node has no layout node when we're changing its
text, we don't need to mark the document for relayout.

This is fine, because if the node ends up getting a layout node attached
to it, we'll naturally perform relayout after that anyway.

											
										
										
											2025-03-07 21:10:16 +01:00
+								    if (auto* layout_node = this->layout_node(); layout_node && layout_node->is_text_node()) {
 								        // NOTE: Since the text node's data has changed, we need to invalidate the text for rendering.
 								        //       This ensures that the new text is reflected in layout, even if we don't end up
 								        //       doing a full layout tree rebuild.
-												LibWeb: Invalidate layout-transformed text on DOM text node change

This fixes an issue where programmatically changing the value of an
input element wasn't reflected visually.

											
										
										
											2023-08-16 11:03:00 +02:00
+								        static_cast<Layout::TextNode&>(*layout_node).invalidate_text_for_rendering();
-												LibWeb: Avoid more unnecessary relayouts on CharacterData text change

If the CharacterData node has no layout node when we're changing its
text, we don't need to mark the document for relayout.

This is fine, because if the node ends up getting a layout node attached
to it, we'll naturally perform relayout after that anyway.

											
										
										
											2025-03-07 21:10:16 +01:00
+								        // We also need to relayout.
-												LibWeb: Move "needs layout update" flag from DOM to layout tree

This is in preparation for allowing anonymous boxes to retain their
intrinsic size cache across layouts.

											
										
										
											2025-04-18 20:40:14 +02:00
+								        layout_node->set_needs_layout_update(SetNeedsLayoutReason::CharacterDataReplaceData);
-												LibWeb: Avoid more unnecessary relayouts on CharacterData text change

If the CharacterData node has no layout node when we're changing its
text, we don't need to mark the document for relayout.

This is fine, because if the node ends up getting a layout node attached
to it, we'll naturally perform relayout after that anyway.

											
										
										
											2025-03-07 21:10:16 +01:00
+								    }
-												LibWeb: Fire `input` events in `.execCommand()`

We do not fire `beforeinput` events since other browsers do not seem to
do so either.

The spec asks us to check whether a command's action modified the DOM
tree. This means adding or removing nodes and attributes, or changing
character data anywhere in the tree. We have
`Document::dom_tree_version()` for node updates, but for character data
a new version number is introduced that allows us to easily keep track
of any text changes in the entire tree.

											
										
										
											2025-01-23 10:50:00 +01:00
+								    document().bump_character_data_version();
-												LibWeb: Port text segmentation to the ICU text segmenter

											
										
										
											2024-06-19 09:02:21 -04:00
-												LibWeb: Rename CharacterData's segmenter indicate it is for graphemes

We will be adding a word segmenter as well, so this is to disambiguate
the two.

											
										
										
											2024-09-05 09:59:59 -04:00
+								    if (m_grapheme_segmenter)
 								        m_grapheme_segmenter->set_segmented_text(m_data);
-												LibWeb: Add Document helpers to move its cursor to word boundaries

This implementation is based on the same feature I added to Serenity's
TextEditor:

https://github.com/SerenityOS/serenity/pull/17477

											
										
										
											2024-09-05 12:10:25 -04:00
+								    if (m_word_segmenter)
 								        m_word_segmenter->set_segmented_text(m_data);
-												LibWeb: Port text segmentation to the ICU text segmenter

											
										
										
											2024-06-19 09:02:21 -04:00
-												LibWeb: Add CharacterData.replaceData(offset, count, data)

Note that we don't queue mutation records or update live ranges yet,
I've left those as FIXMEs.

											
										
										
											2022-03-21 18:05:20 +01:00
+								    return {};
 								}
-												LibWeb: Implement CharacterData.{append,insert,delete}Data


											
										
										
											2022-07-11 16:23:50 +01:00
+								// https://dom.spec.whatwg.org/#dom-characterdata-appenddata
-												LibWeb: Port CharacterData from DeprecatedString to String

The existing implementation has some pre-existing issues where it
incorrectly assumes that byte offsets are given through the IDL instead
of UTF-16 code units. While making these changes, leave some FIXMEs for
that.

											
										
										
											2023-09-07 21:36:05 +12:00
+								WebIDL::ExceptionOr<void> CharacterData::append_data(String const& data)
-												LibWeb: Implement CharacterData.{append,insert,delete}Data


											
										
										
											2022-07-11 16:23:50 +01:00
+								{
 								    // The appendData(data) method steps are to replace data with node this, offset this’s length, count 0, and data data.
-												LibWeb: Use UTF-16 code unit offsets and lengths in CharacterData

We were previously assuming that the input offsets and lengths were all
in raw byte offsets into a UTF-8 string. While internally our String
representation may be in UTF-8 from the external world it is seen as
UTF-16, with code unit offsets passed through, and used as the returned
length.

Beforehand, the included test included in this commit would crash
ladybird (and otherwise return wrong values).

The implementation here is very inefficient; I am sure there is a
much smarter way to write it so that we would not need a conversion
from UTF-8 to a UTF-16 string (and then back again).

Fixes: #20971

											
										
										
											2023-12-22 20:41:34 +13:00
+								    return replace_data(this->length_in_utf16_code_units(), 0, data);
-												LibWeb: Implement CharacterData.{append,insert,delete}Data


											
										
										
											2022-07-11 16:23:50 +01:00
+								}
 								// https://dom.spec.whatwg.org/#dom-characterdata-insertdata
-												LibWeb: Port CharacterData from DeprecatedString to String

The existing implementation has some pre-existing issues where it
incorrectly assumes that byte offsets are given through the IDL instead
of UTF-16 code units. While making these changes, leave some FIXMEs for
that.

											
										
										
											2023-09-07 21:36:05 +12:00
+								WebIDL::ExceptionOr<void> CharacterData::insert_data(size_t offset, String const& data)
-												LibWeb: Implement CharacterData.{append,insert,delete}Data


											
										
										
											2022-07-11 16:23:50 +01:00
+								{
 								    // The insertData(offset, data) method steps are to replace data with node this, offset offset, count 0, and data data.
 								    return replace_data(offset, 0, data);
 								}
 								// https://dom.spec.whatwg.org/#dom-characterdata-deletedata
-												LibWeb: Move ExceptionOr from DOM/ to WebIDL/

This is a concept fully defined in the Web IDL spec and doesn't belong
in the DOM directory/namespace - not even DOMException, despite the name
:^)

											
										
										
											2022-09-25 17:03:42 +01:00
+								WebIDL::ExceptionOr<void> CharacterData::delete_data(size_t offset, size_t count)
-												LibWeb: Implement CharacterData.{append,insert,delete}Data


											
										
										
											2022-07-11 16:23:50 +01:00
+								{
 								    // The deleteData(offset, count) method steps are to replace data with node this, offset offset, count count, and data the empty string.
-												LibWeb: Port CharacterData from DeprecatedString to String

The existing implementation has some pre-existing issues where it
incorrectly assumes that byte offsets are given through the IDL instead
of UTF-16 code units. While making these changes, leave some FIXMEs for
that.

											
										
										
											2023-09-07 21:36:05 +12:00
+								    return replace_data(offset, count, String {});
-												LibWeb: Implement CharacterData.{append,insert,delete}Data


											
										
										
											2022-07-11 16:23:50 +01:00
+								}
-												LibWeb: Move initial creation of Unicode segmenters to the Document

The expensive part of creating a segmenter is doing the locale and UCD
data lookups at creation time. Instead of doing this once per text node,
cache the segmenters on the document, and clone them as needed (cloning
is much, much cheaper).

On a profile loading Ladybird's GitHub repo, the following hot methods
changed as follows:

    ChunkIterator ctor: 6.08% -> 0.21%
    Segmenter factory:  5.86% ->    0%
    Segmenter clone:    N/A   -> 0.09%

											
										
										
											2024-09-22 10:03:23 -04:00
+								Unicode::Segmenter& CharacterData::grapheme_segmenter() const
-												LibWeb: Port text segmentation to the ICU text segmenter

											
										
										
											2024-06-19 09:02:21 -04:00
+								{
-												LibWeb: Rename CharacterData's segmenter indicate it is for graphemes

We will be adding a word segmenter as well, so this is to disambiguate
the two.

											
										
										
											2024-09-05 09:59:59 -04:00
+								    if (!m_grapheme_segmenter) {
-												LibWeb: Move initial creation of Unicode segmenters to the Document

The expensive part of creating a segmenter is doing the locale and UCD
data lookups at creation time. Instead of doing this once per text node,
cache the segmenters on the document, and clone them as needed (cloning
is much, much cheaper).

On a profile loading Ladybird's GitHub repo, the following hot methods
changed as follows:

    ChunkIterator ctor: 6.08% -> 0.21%
    Segmenter factory:  5.86% ->    0%
    Segmenter clone:    N/A   -> 0.09%

											
										
										
											2024-09-22 10:03:23 -04:00
+								        m_grapheme_segmenter = document().grapheme_segmenter().clone();
-												LibWeb: Rename CharacterData's segmenter indicate it is for graphemes

We will be adding a word segmenter as well, so this is to disambiguate
the two.

											
										
										
											2024-09-05 09:59:59 -04:00
+								        m_grapheme_segmenter->set_segmented_text(m_data);
-												LibWeb: Port text segmentation to the ICU text segmenter

											
										
										
											2024-06-19 09:02:21 -04:00
+								    }
-												LibWeb: Rename CharacterData's segmenter indicate it is for graphemes

We will be adding a word segmenter as well, so this is to disambiguate
the two.

											
										
										
											2024-09-05 09:59:59 -04:00
+								    return *m_grapheme_segmenter;
-												LibWeb: Port text segmentation to the ICU text segmenter

											
										
										
											2024-06-19 09:02:21 -04:00
+								}
-												LibWeb: Move initial creation of Unicode segmenters to the Document

The expensive part of creating a segmenter is doing the locale and UCD
data lookups at creation time. Instead of doing this once per text node,
cache the segmenters on the document, and clone them as needed (cloning
is much, much cheaper).

On a profile loading Ladybird's GitHub repo, the following hot methods
changed as follows:

    ChunkIterator ctor: 6.08% -> 0.21%
    Segmenter factory:  5.86% ->    0%
    Segmenter clone:    N/A   -> 0.09%

											
										
										
											2024-09-22 10:03:23 -04:00
+								Unicode::Segmenter& CharacterData::word_segmenter() const
-												LibWeb: Add Document helpers to move its cursor to word boundaries

This implementation is based on the same feature I added to Serenity's
TextEditor:

https://github.com/SerenityOS/serenity/pull/17477

											
										
										
											2024-09-05 12:10:25 -04:00
+								{
 								    if (!m_word_segmenter) {
-												LibWeb: Move initial creation of Unicode segmenters to the Document

The expensive part of creating a segmenter is doing the locale and UCD
data lookups at creation time. Instead of doing this once per text node,
cache the segmenters on the document, and clone them as needed (cloning
is much, much cheaper).

On a profile loading Ladybird's GitHub repo, the following hot methods
changed as follows:

    ChunkIterator ctor: 6.08% -> 0.21%
    Segmenter factory:  5.86% ->    0%
    Segmenter clone:    N/A   -> 0.09%

											
										
										
											2024-09-22 10:03:23 -04:00
+								        m_word_segmenter = document().word_segmenter().clone();
-												LibWeb: Add Document helpers to move its cursor to word boundaries

This implementation is based on the same feature I added to Serenity's
TextEditor:

https://github.com/SerenityOS/serenity/pull/17477

											
										
										
											2024-09-05 12:10:25 -04:00
+								        m_word_segmenter->set_segmented_text(m_data);
 								    }
 								    return *m_word_segmenter;
 								}
-												LibWeb: Move everything into the Web namespace

											
										
										
											2020-03-07 10:27:02 +01:00
+								}