ladybird/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
Andreas Kling ca97f68cb7 LibWeb: Normalize decoded HTML string parsing
Preserve leading BOMs when parsing already-decoded HTML strings, since
those strings do not go through the encoded byte decoder path.

Decoded markup from JS strings can also contain WTF-8 for lone surrogate
code units. Keep the common scalar UTF-8 path to a single validation and
copy, but replace surrogates before handing bytes to the Rust tokenizer.

Add text coverage for DOMParser and innerHTML string parsing, including
leading BOMs, text and attributes, lone high and low surrogates, and a
valid surrogate pair.
2026-05-24 10:14:17 +02:00

304 lines
9.8 KiB
C++

/*
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Debug.h>
#include <AK/FlyString.h>
#include <AK/StringBuilder.h>
#include <AK/Utf8View.h>
#include <AK/Vector.h>
#include <LibTextCodec/Decoder.h>
#include <LibWeb/HTML/Parser/HTMLToken.h>
#include <LibWeb/HTML/Parser/HTMLTokenizer.h>
#include <LibWeb/HTMLTokenizerRustFFI.h>
namespace Web::HTML {
// Expands a String into a flat list of its code points, as consumed by the
// code-point based tokenizer FFI entry points.
static Vector<u32> code_points_from_string(String const& string)
{
    Vector<u32> result;
    // The byte length is an upper bound on the code point count, since every
    // code point occupies at least one byte.
    result.ensure_capacity(string.bytes().size());

    auto const view = string.code_points();
    for (auto it = view.begin(); it != view.end(); ++it)
        result.append(*it);

    return result;
}
// Wraps a (pointer, length) pair received over the FFI in a StringView.
// A null pointer or a zero length yields a default-constructed StringView.
static StringView ffi_string_view(u8 const* ptr, size_t len)
{
    bool const has_content = ptr != nullptr && len != 0;
    return has_content ? StringView { ptr, len } : StringView {};
}
// Creates a Rust tokenizer handle from a UTF-8 byte range.
// The pointer handed across the FFI is never null: a view without backing
// storage is replaced by a pointer to an empty string literal.
static RustFfiTokenizerHandle* create_tokenizer_from_utf8(StringView utf8_bytes)
{
    auto const* characters = utf8_bytes.characters_without_null_termination();
    auto const* start = reinterpret_cast<u8 const*>(characters ? characters : "");
    return rust_html_tokenizer_create_from_utf8(start, utf8_bytes.length());
}
// Produces the String handed to the UTF-8 tokenizer path for input that has
// already been decoded.
//
// Input that validates without lonely surrogates is copied as-is. Otherwise
// the input must still validate with the default settings (VERIFY), and every
// surrogate code point is replaced with U+FFFD while all other code points,
// BOMs included, are kept.
static String decoded_string_for_utf8_tokenizer(StringView input)
{
    Utf8View view { input };

    if (!view.validate(AllowLonelySurrogates::No)) {
        // Decoded strings may come from WTF-16 JS strings. Rust's UTF-8 path
        // requires scalar-value UTF-8, so replace lone surrogates but keep BOMs.
        VERIFY(view.validate());

        StringBuilder sanitized(input.length());
        for (auto code_point : view) {
            if (is_unicode_surrogate(code_point))
                sanitized.append_code_point(AK::UnicodeUtils::REPLACEMENT_CODE_POINT);
            else
                sanitized.append_code_point(code_point);
        }
        return sanitized.to_string_without_validation();
    }

    // Common case: already validated above, so skip a second validation.
    return String::from_utf8_without_validation(input.bytes());
}
// Builds a lookup table mapping interned-name ids to FlyStrings.
//
// `count` is the number of interned names and `fetch` retrieves the bytes of
// the name with a given id (ids passed to `fetch` run from 1 to `count`).
// The returned table has `count + 1` entries; index 0 is a placeholder, and
// any id for which `fetch` reports a null pointer or zero length maps to an
// empty FlyString as well.
static Vector<FlyString> build_interned_name_table(size_t count, void (*fetch)(uint16_t, uint8_t const**, size_t*))
{
    Vector<FlyString> table;
    // Reserve room for the placeholder and all names before the first append,
    // so the reservation actually covers every element added below.
    table.ensure_capacity(count + 1);

    // Slot 0 is unused (id 0 means "not interned"); store an empty FlyString there.
    table.append(FlyString {});

    for (size_t i = 0; i < count; ++i) {
        uint8_t const* ptr = nullptr;
        size_t len = 0;
        fetch(static_cast<uint16_t>(i + 1), &ptr, &len);

        if (ptr == nullptr || len == 0) {
            // Keep the table dense so ids stay usable as indices.
            table.append(FlyString {});
            continue;
        }
        table.append(MUST(FlyString::from_utf8(StringView { ptr, len })));
    }
    return table;
}
// Resolves an interned tag-name id reported by the Rust tokenizer.
// The table is built on first use. Id 0 and ids past the end of the table
// resolve to the entry stored in slot 0.
static FlyString const& interned_rust_tag_name(uint16_t id)
{
    static Vector<FlyString> const s_table = build_interned_name_table(
        rust_html_tokenizer_interned_tag_name_count(),
        rust_html_tokenizer_interned_tag_name);

    size_t const slot = id < s_table.size() ? id : 0;
    return s_table[slot];
}
// Resolves an interned attribute-name id reported by the Rust tokenizer.
// The table is built on first use. Id 0 and ids past the end of the table
// resolve to the entry stored in slot 0.
static FlyString const& interned_rust_attr_name(uint16_t id)
{
    static Vector<FlyString> const s_table = build_interned_name_table(
        rust_html_tokenizer_interned_attr_name_count(),
        rust_html_tokenizer_interned_attr_name);

    size_t const slot = id < s_table.size() ? id : 0;
    return s_table[slot];
}
HTMLTokenizer::HTMLTokenizer()
{
m_tokenizer = create_tokenizer_from_utf8({});
rust_html_tokenizer_set_input_stream_closed(m_tokenizer, false);
}
// Destroys the Rust tokenizer handle, if there is one.
HTMLTokenizer::~HTMLTokenizer()
{
    if (!m_tokenizer)
        return;
    rust_html_tokenizer_destroy(m_tokenizer);
}
// Creates a tokenizer over a complete input.
//
// For InputType::EncodedBytes, `input` is decoded to UTF-8 with the decoder
// for `encoding` (which must exist). For any other input type, `input` is
// treated as already decoded and only has lone surrogates replaced.
// The resulting text is stored in m_source and the input stream is recorded
// as closed.
HTMLTokenizer::HTMLTokenizer(StringView input, ByteString const& encoding, InputType input_type)
{
    if (input_type != InputType::EncodedBytes) {
        m_source = decoded_string_for_utf8_tokenizer(input);
    } else {
        auto decoder = TextCodec::decoder_for(encoding);
        VERIFY(decoder.has_value());
        m_source = MUST(decoder->to_utf8(input));
    }

    m_input_stream_closed = true;
    m_tokenizer = create_tokenizer_from_utf8(m_source.bytes_as_string_view());
}
// Fetches the next token from the Rust tokenizer and converts it to an HTMLToken.
//
// `stop_at_insertion_point` is forwarded to the Rust side as a bool.
// Returns an empty Optional when rust_html_tokenizer_next_token() returns false;
// otherwise returns the converted token with its start/end positions set.
Optional<HTMLToken> HTMLTokenizer::next_token(StopAtInsertionPoint stop_at_insertion_point)
{
RustFfiToken ffi;
bool stop = stop_at_insertion_point == StopAtInsertionPoint::Yes;
// CDATA handling is always reported as not allowed from this call site.
bool cdata_allowed = false;
if (!rust_html_tokenizer_next_token(m_tokenizer, &ffi, stop, cdata_allowed))
return {};
// Map the numeric FFI token type (1..6) onto HTMLToken::Type; any other value is a bug.
HTMLToken::Type type;
switch (ffi.token_type) {
case 1:
type = HTMLToken::Type::DOCTYPE;
break;
case 2:
type = HTMLToken::Type::StartTag;
break;
case 3:
type = HTMLToken::Type::EndTag;
break;
case 4:
type = HTMLToken::Type::Comment;
break;
case 5:
type = HTMLToken::Type::Character;
break;
case 6:
type = HTMLToken::Type::EndOfFile;
break;
default:
VERIFY_NOT_REACHED();
}
HTMLToken token { type };
token.set_start_position({}, { ffi.start_line, ffi.start_column });
token.set_end_position({}, { ffi.end_line, ffi.end_column });
// Copy the type-specific payload out of the FFI struct.
switch (type) {
case HTMLToken::Type::Character:
token.set_code_point(ffi.code_point);
break;
case HTMLToken::Type::StartTag:
case HTMLToken::Type::EndTag: {
// A non-zero id refers to the interned name table; otherwise the name is carried as bytes.
if (ffi.tag_name_id != 0)
token.set_tag_name(interned_rust_tag_name(ffi.tag_name_id));
else
token.set_tag_name(MUST(FlyString::from_utf8(ffi_string_view(ffi.tag_name_ptr, ffi.tag_name_len))));
token.set_self_closing(ffi.self_closing);
for (size_t i = 0; i < ffi.attributes_len; ++i) {
auto const& ffi_attribute = ffi.attributes_ptr[i];
HTMLToken::Attribute attribute;
// Attribute names use the same interned-id-or-bytes scheme as tag names; values are always bytes.
if (ffi_attribute.name_id != 0)
attribute.local_name = interned_rust_attr_name(ffi_attribute.name_id);
else
attribute.local_name = MUST(FlyString::from_utf8(ffi_string_view(ffi_attribute.name_ptr, ffi_attribute.name_len)));
attribute.value = MUST(String::from_utf8(ffi_string_view(ffi_attribute.value_ptr, ffi_attribute.value_len)));
attribute.name_start_position = { ffi_attribute.name_start_line, ffi_attribute.name_start_column };
attribute.name_end_position = { ffi_attribute.name_end_line, ffi_attribute.name_end_column };
attribute.value_start_position = { ffi_attribute.value_start_line, ffi_attribute.value_start_column };
attribute.value_end_position = { ffi_attribute.value_end_line, ffi_attribute.value_end_column };
token.add_attribute(move(attribute));
}
// Runs once, after every attribute has been added.
token.normalize_attributes();
if (ffi.had_duplicate_attribute)
token.set_had_duplicate_attribute({});
break;
}
case HTMLToken::Type::Comment:
token.set_comment(MUST(String::from_utf8(ffi_string_view(ffi.comment_ptr, ffi.comment_len))));
break;
case HTMLToken::Type::DOCTYPE: {
auto& doctype = token.ensure_doctype_data();
// Each DOCTYPE field is copied only when the FFI token does not flag it as missing.
if (!ffi.missing_name) {
doctype.name = MUST(String::from_utf8(ffi_string_view(ffi.doctype_name_ptr, ffi.doctype_name_len)));
doctype.missing_name = false;
}
if (!ffi.missing_public_id) {
doctype.public_identifier = MUST(String::from_utf8(ffi_string_view(ffi.public_id_ptr, ffi.public_id_len)));
doctype.missing_public_identifier = false;
}
if (!ffi.missing_system_id) {
doctype.system_identifier = MUST(String::from_utf8(ffi_string_view(ffi.system_id_ptr, ffi.system_id_len)));
doctype.missing_system_identifier = false;
}
doctype.force_quirks = ffi.force_quirks;
break;
}
case HTMLToken::Type::EndOfFile:
// No payload to copy.
break;
case HTMLToken::Type::Invalid:
// The mapping above never produces Invalid.
VERIFY_NOT_REACHED();
}
return token;
}
// Forwards the "parser did run" notification to the Rust tokenizer.
// Takes a Badge<HTMLParser>, so it is meant to be called by HTMLParser.
void HTMLTokenizer::parser_did_run(Badge<HTMLParser>)
{
rust_html_tokenizer_parser_did_run(m_tokenizer);
}
// Returns a copy of the input that the Rust tokenizer reports as unparsed.
// The reported bytes must be valid UTF-8 (enforced by MUST); a null or
// zero-length report yields an empty String.
String HTMLTokenizer::unparsed_input() const
{
    uint8_t const* remaining_bytes = nullptr;
    size_t remaining_length = 0;
    rust_html_tokenizer_unparsed_input(m_tokenizer, &remaining_bytes, &remaining_length);

    auto const remaining = ffi_string_view(remaining_bytes, remaining_length);
    return MUST(String::from_utf8(remaining));
}
// Appends `input` to the Rust tokenizer's input as a sequence of code points.
// Empty input is ignored; non-empty input must pass String::from_utf8 (MUST).
void HTMLTokenizer::append_to_input_stream(StringView input)
{
    if (!input.is_empty()) {
        auto code_points = code_points_from_string(MUST(String::from_utf8(input)));
        rust_html_tokenizer_append_input(m_tokenizer, code_points.data(), code_points.size());
    }
}
// Marks the input stream as closed, both in m_input_stream_closed and on the Rust tokenizer.
void HTMLTokenizer::close_input_stream()
{
m_input_stream_closed = true;
rust_html_tokenizer_set_input_stream_closed(m_tokenizer, true);
}
// Inserts `input` into the Rust tokenizer's input at the insertion point, as a
// sequence of code points. Non-empty input must pass String::from_utf8 (MUST).
//
// Empty input is ignored, matching append_to_input_stream(). Without the
// guard, an empty code point vector's data() pointer, which may be null,
// would be handed across the FFI boundary.
void HTMLTokenizer::insert_input_at_insertion_point(StringView input)
{
    if (input.is_empty())
        return;

    auto utf8_input = MUST(String::from_utf8(input));
    auto code_points = code_points_from_string(utf8_input);
    rust_html_tokenizer_insert_input(m_tokenizer, code_points.data(), code_points.size());
}
// Closes the input stream first, then asks the Rust tokenizer to insert an EOF.
void HTMLTokenizer::insert_eof()
{
close_input_stream();
rust_html_tokenizer_insert_eof(m_tokenizer);
}
// Returns the Rust tokenizer's answer to whether an insertion point is currently defined.
bool HTMLTokenizer::is_insertion_point_defined() const
{
return rust_html_tokenizer_is_insertion_point_defined(m_tokenizer);
}
// Returns the Rust tokenizer's answer to whether the insertion point has been reached.
bool HTMLTokenizer::is_insertion_point_reached()
{
return rust_html_tokenizer_is_insertion_point_reached(m_tokenizer);
}
// Asks the Rust tokenizer to undefine its insertion point.
void HTMLTokenizer::undefine_insertion_point()
{
rust_html_tokenizer_undefine_insertion_point(m_tokenizer);
}
// Asks the Rust tokenizer to store its insertion point; see restore_insertion_point() for the counterpart call.
void HTMLTokenizer::store_insertion_point()
{
rust_html_tokenizer_store_insertion_point(m_tokenizer);
}
// Asks the Rust tokenizer to restore its insertion point; see store_insertion_point() for the counterpart call.
void HTMLTokenizer::restore_insertion_point()
{
rust_html_tokenizer_restore_insertion_point(m_tokenizer);
}
// Asks the Rust tokenizer to update its insertion point.
void HTMLTokenizer::update_insertion_point()
{
rust_html_tokenizer_update_insertion_point(m_tokenizer);
}
// Forwards an abort request to the Rust tokenizer.
void HTMLTokenizer::abort()
{
rust_html_tokenizer_abort(m_tokenizer);
}
// Switches the tokenizer to `new_state`.
// The transition is logged when TOKENIZER_TRACE_DEBUG is enabled, recorded in m_state,
// and forwarded to the Rust tokenizer as the state's uint8_t value.
void HTMLTokenizer::switch_to(State new_state)
{
dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Switch to {}", state_name(m_state), state_name(new_state));
m_state = new_state;
rust_html_tokenizer_switch_state(m_tokenizer, static_cast<uint8_t>(new_state));
}
}