ladybird/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp

/*
 * Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
 * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Debug.h>
#include <AK/FFIHelpers.h>
#include <AK/FlyString.h>
#include <AK/NeverDestroyed.h>
#include <AK/StringBuilder.h>
#include <AK/Utf8View.h>
#include <AK/Vector.h>
#include <LibTextCodec/Decoder.h>
#include <LibWeb/HTML/Parser/HTMLToken.h>
#include <LibWeb/HTML/Parser/HTMLTokenizer.h>
#include <LibWeb/HTMLTokenizerRustFFI.h>

namespace Web::HTML {

// Expands `string` into a vector holding one u32 per code point, in order.
static Vector<u32> code_points_from_string(String const& string)
{
    Vector<u32> result;
    // The byte length is an upper bound on the code point count, so reserving
    // that much means the appends below never need to grow the vector.
    result.ensure_capacity(string.bytes().size());

    auto view = string.code_points();
    for (auto it = view.begin(); it != view.end(); ++it)
        result.append(*it);
    return result;
}

// Creates a Rust tokenizer handle from `utf8_bytes`.
// If the view has no backing pointer, an empty string literal is passed in its
// place so that the FFI call never receives a null data pointer.
static RustFfiTokenizerHandle* create_tokenizer_from_utf8(StringView utf8_bytes)
{
    auto const* characters = utf8_bytes.characters_without_null_termination();
    auto const* data = reinterpret_cast<u8 const*>(characters != nullptr ? characters : "");
    return rust_html_tokenizer_create_from_utf8(data, utf8_bytes.length());
}

// Returns `input` as a String suitable for the UTF-8 tokenizer path.
// Input that validates without lonely surrogates is wrapped as-is; otherwise
// every surrogate code point is replaced with U+FFFD while all other code
// points are copied through unchanged.
static String decoded_string_for_utf8_tokenizer(StringView input)
{
    Utf8View view { input };
    bool const has_only_scalar_values = view.validate(AllowLonelySurrogates::No);

    if (!has_only_scalar_values) {
        // Decoded strings may come from WTF-16 JS strings. Rust's UTF-8 path
        // requires scalar-value UTF-8, so replace lone surrogates but keep BOMs.
        VERIFY(view.validate());

        StringBuilder sanitized(input.length());
        for (auto code_point : view) {
            if (is_unicode_surrogate(code_point))
                sanitized.append_code_point(AK::UnicodeUtils::REPLACEMENT_CODE_POINT);
            else
                sanitized.append_code_point(code_point);
        }
        return sanitized.to_string_without_validation();
    }

    return String::from_utf8_without_validation(input.bytes());
}

// Builds a lookup table from interned-name ids to FlyStrings.
//
// `count` is the number of interned names. `fetch` is called once for every id
// in 1..=count and reports the name's bytes through its pointer/length
// out-parameters. The returned vector has count + 1 entries: index 0 is a
// placeholder, and any id for which `fetch` reports a null pointer or a zero
// length maps to an empty FlyString.
static Vector<FlyString> build_interned_name_table(size_t count, void (*fetch)(uint16_t, uint8_t const**, size_t*))
{
    Vector<FlyString> table;
    // Reserve room for the whole table before the first append so that none of
    // the appends below has to grow the vector.
    table.ensure_capacity(count + 1);
    // Slot 0 is unused (id 0 means "not interned"); store an empty FlyString there.
    table.append(FlyString {});
    for (size_t i = 0; i < count; ++i) {
        uint8_t const* ptr = nullptr;
        size_t len = 0;
        // Ids are 1-based; the out-parameters stay null/0 if `fetch` writes nothing.
        fetch(static_cast<uint16_t>(i + 1), &ptr, &len);
        if (ptr == nullptr || len == 0) {
            table.append(FlyString {});
            continue;
        }
        table.append(MUST(FlyString::from_utf8(StringView { ptr, len })));
    }
    return table;
}

// Looks up the FlyString for an interned tag-name id.
// The table is built on first use and never destroyed. Id 0 and ids beyond the
// end of the table resolve to the placeholder entry at index 0.
static FlyString const& interned_rust_tag_name(uint16_t id)
{
    static NeverDestroyed<Vector<FlyString>> table { build_interned_name_table(
        rust_html_tokenizer_interned_tag_name_count(),
        rust_html_tokenizer_interned_tag_name) };

    auto const& names = *table;
    bool const is_known_id = id != 0 && id < names.size();
    if (is_known_id)
        return names[id];
    return names[0];
}

// Looks up the FlyString for an interned attribute-name id.
// The table is built on first use and never destroyed. Id 0 and ids beyond the
// end of the table resolve to the placeholder entry at index 0.
static FlyString const& interned_rust_attr_name(uint16_t id)
{
    static NeverDestroyed<Vector<FlyString>> table { build_interned_name_table(
        rust_html_tokenizer_interned_attr_name_count(),
        rust_html_tokenizer_interned_attr_name) };

    auto const& names = *table;
    bool const is_known_id = id != 0 && id < names.size();
    if (is_known_id)
        return names[id];
    return names[0];
}

HTMLTokenizer::HTMLTokenizer()
{
    m_tokenizer = create_tokenizer_from_utf8({});
    rust_html_tokenizer_set_input_stream_closed(m_tokenizer, false);
}

// Destroys the Rust tokenizer handle, if there is one.
HTMLTokenizer::~HTMLTokenizer()
{
    if (m_tokenizer == nullptr)
        return;
    rust_html_tokenizer_destroy(m_tokenizer);
}

// Creates a tokenizer over a complete input.
// For InputType::EncodedBytes the input is first decoded to UTF-8 with the
// decoder for `encoding` (which must exist); any other input type is passed
// through decoded_string_for_utf8_tokenizer(). The resulting string is stored
// in m_source, and the tokenizer is created from that stored string.
HTMLTokenizer::HTMLTokenizer(StringView input, ByteString const& encoding, InputType input_type)
{
    if (input_type != InputType::EncodedBytes) {
        m_source = decoded_string_for_utf8_tokenizer(input);
    } else {
        auto decoder = TextCodec::decoder_for(encoding);
        VERIFY(decoder.has_value());
        m_source = MUST(decoder->to_utf8(input));
    }

    m_tokenizer = create_tokenizer_from_utf8(m_source.bytes_as_string_view());
    m_input_stream_closed = true;
}

// Asks the Rust tokenizer for its next token and converts it to an HTMLToken.
//
// Returns an empty Optional when rust_html_tokenizer_next_token() reports that
// no token was produced. Otherwise the FFI token's numeric type (1..6) is
// mapped to an HTMLToken::Type, source positions are copied over, and the
// type-specific payload (code point, tag name and attributes, comment text, or
// DOCTYPE fields) is copied into the returned token.
Optional<HTMLToken> HTMLTokenizer::next_token(StopAtInsertionPoint stop_at_insertion_point)
{
    // Copy FFI byte ranges into owned AK strings; MUST aborts if conversion fails.
    auto copy_string = [](auto ptr, auto len) {
        return MUST(String::from_utf8(ffi_string_view(ptr, len)));
    };
    auto copy_fly_string = [](auto ptr, auto len) {
        return MUST(FlyString::from_utf8(ffi_string_view(ptr, len)));
    };

    RustFfiToken ffi;
    bool const should_stop = stop_at_insertion_point == StopAtInsertionPoint::Yes;
    // CDATA sections are never reported as allowed from this call site.
    bool const cdata_allowed = false;
    bool const produced_token = rust_html_tokenizer_next_token(m_tokenizer, &ffi, should_stop, cdata_allowed);
    if (!produced_token)
        return {};

    // Translate the numeric FFI token type; any other value is a hard error.
    auto const type = [&]() -> HTMLToken::Type {
        switch (ffi.token_type) {
        case 1:
            return HTMLToken::Type::DOCTYPE;
        case 2:
            return HTMLToken::Type::StartTag;
        case 3:
            return HTMLToken::Type::EndTag;
        case 4:
            return HTMLToken::Type::Comment;
        case 5:
            return HTMLToken::Type::Character;
        case 6:
            return HTMLToken::Type::EndOfFile;
        default:
            VERIFY_NOT_REACHED();
        }
    }();

    HTMLToken token { type };
    token.set_start_position({}, { ffi.start_line, ffi.start_column });
    token.set_end_position({}, { ffi.end_line, ffi.end_column });

    switch (type) {
    case HTMLToken::Type::EndOfFile:
        // No payload to copy.
        break;
    case HTMLToken::Type::Character:
        token.set_code_point(ffi.code_point);
        break;
    case HTMLToken::Type::Comment:
        token.set_comment(copy_string(ffi.comment_ptr, ffi.comment_len));
        break;
    case HTMLToken::Type::DOCTYPE: {
        auto& doctype = token.ensure_doctype_data();
        // Each field is only filled in (and its "missing" flag cleared) when
        // the FFI token does not flag it as missing.
        if (!ffi.missing_name) {
            doctype.name = copy_string(ffi.doctype_name_ptr, ffi.doctype_name_len);
            doctype.missing_name = false;
        }
        if (!ffi.missing_public_id) {
            doctype.public_identifier = copy_string(ffi.public_id_ptr, ffi.public_id_len);
            doctype.missing_public_identifier = false;
        }
        if (!ffi.missing_system_id) {
            doctype.system_identifier = copy_string(ffi.system_id_ptr, ffi.system_id_len);
            doctype.missing_system_identifier = false;
        }
        doctype.force_quirks = ffi.force_quirks;
        break;
    }
    case HTMLToken::Type::StartTag:
    case HTMLToken::Type::EndTag: {
        // A non-zero id refers to the interned name table; id 0 means the name
        // is carried inline as bytes.
        if (ffi.tag_name_id == 0)
            token.set_tag_name(copy_fly_string(ffi.tag_name_ptr, ffi.tag_name_len));
        else
            token.set_tag_name(interned_rust_tag_name(ffi.tag_name_id));

        token.set_self_closing(ffi.self_closing);

        for (size_t index = 0; index < ffi.attributes_len; ++index) {
            auto const& source = ffi.attributes_ptr[index];
            HTMLToken::Attribute converted;
            if (source.name_id == 0)
                converted.local_name = copy_fly_string(source.name_ptr, source.name_len);
            else
                converted.local_name = interned_rust_attr_name(source.name_id);
            converted.value = copy_string(source.value_ptr, source.value_len);
            converted.name_start_position = { source.name_start_line, source.name_start_column };
            converted.name_end_position = { source.name_end_line, source.name_end_column };
            converted.value_start_position = { source.value_start_line, source.value_start_column };
            converted.value_end_position = { source.value_end_line, source.value_end_column };
            token.add_attribute(move(converted));
        }
        token.normalize_attributes();

        if (ffi.had_duplicate_attribute)
            token.set_had_duplicate_attribute({});
        break;
    }
    case HTMLToken::Type::Invalid:
        VERIFY_NOT_REACHED();
    }

    return token;
}

// Notification hook callable only by HTMLParser (enforced by the Badge);
// forwards to rust_html_tokenizer_parser_did_run() for this tokenizer's handle.
void HTMLTokenizer::parser_did_run(Badge<HTMLParser>)
{
    rust_html_tokenizer_parser_did_run(m_tokenizer);
}

// Returns an owned copy of the byte range that
// rust_html_tokenizer_unparsed_input() reports for this tokenizer.
// Aborts (via MUST) if that range cannot be converted to a String.
String HTMLTokenizer::unparsed_input() const
{
    uint8_t const* remaining_bytes = nullptr;
    size_t remaining_length = 0;
    rust_html_tokenizer_unparsed_input(m_tokenizer, &remaining_bytes, &remaining_length);

    auto remaining = ffi_string_view(remaining_bytes, remaining_length);
    return MUST(String::from_utf8(remaining));
}

// Appends `input` to the tokenizer's input as a sequence of code points.
// Empty input is ignored entirely; non-empty input must convert to a String
// (MUST aborts otherwise).
void HTMLTokenizer::append_to_input_stream(StringView input)
{
    if (!input.is_empty()) {
        auto decoded = code_points_from_string(MUST(String::from_utf8(input)));
        rust_html_tokenizer_append_input(m_tokenizer, decoded.data(), decoded.size());
    }
}

// Marks the input stream as closed, both on the Rust tokenizer and in the
// C++-side m_input_stream_closed flag.
void HTMLTokenizer::close_input_stream()
{
    rust_html_tokenizer_set_input_stream_closed(m_tokenizer, true);
    m_input_stream_closed = true;
}

// Inserts `input` into the tokenizer's input at the insertion point, passing it
// to the Rust side as a sequence of code points. `input` must convert to a
// String (MUST aborts otherwise).
//
// The FFI call is still made for empty input, but never with a null data
// pointer: an empty Vector may expose a null buffer, so a dummy non-null
// pointer is substituted, mirroring what create_tokenizer_from_utf8() does for
// its byte pointer.
void HTMLTokenizer::insert_input_at_insertion_point(StringView input)
{
    auto utf8_input = MUST(String::from_utf8(input));
    auto code_points = code_points_from_string(utf8_input);

    // Only used as a valid address when there are no code points; with a
    // length of 0 it is not expected to be read.
    u32 empty_placeholder = 0;
    auto* data = code_points.data();
    if (data == nullptr)
        data = &empty_placeholder;

    rust_html_tokenizer_insert_input(m_tokenizer, data, code_points.size());
}

// Closes the input stream (see close_input_stream()) and then asks the Rust
// tokenizer to insert an end-of-file via rust_html_tokenizer_insert_eof().
void HTMLTokenizer::insert_eof()
{
    close_input_stream();
    rust_html_tokenizer_insert_eof(m_tokenizer);
}

// Returns whatever rust_html_tokenizer_is_insertion_point_defined() reports for
// this tokenizer's handle.
bool HTMLTokenizer::is_insertion_point_defined() const
{
    return rust_html_tokenizer_is_insertion_point_defined(m_tokenizer);
}

// Returns whatever rust_html_tokenizer_is_insertion_point_reached() reports for
// this tokenizer's handle.
bool HTMLTokenizer::is_insertion_point_reached()
{
    return rust_html_tokenizer_is_insertion_point_reached(m_tokenizer);
}

// Forwards to rust_html_tokenizer_undefine_insertion_point() for this
// tokenizer's handle.
void HTMLTokenizer::undefine_insertion_point()
{
    rust_html_tokenizer_undefine_insertion_point(m_tokenizer);
}

// Forwards to rust_html_tokenizer_store_insertion_point() for this tokenizer's
// handle. Presumably paired with restore_insertion_point() — confirm against
// the Rust implementation.
void HTMLTokenizer::store_insertion_point()
{
    rust_html_tokenizer_store_insertion_point(m_tokenizer);
}

// Forwards to rust_html_tokenizer_restore_insertion_point() for this
// tokenizer's handle. Presumably paired with store_insertion_point() — confirm
// against the Rust implementation.
void HTMLTokenizer::restore_insertion_point()
{
    rust_html_tokenizer_restore_insertion_point(m_tokenizer);
}

// Forwards to rust_html_tokenizer_update_insertion_point() for this tokenizer's
// handle.
void HTMLTokenizer::update_insertion_point()
{
    rust_html_tokenizer_update_insertion_point(m_tokenizer);
}

// Forwards to rust_html_tokenizer_abort() for this tokenizer's handle.
void HTMLTokenizer::abort()
{
    rust_html_tokenizer_abort(m_tokenizer);
}

// Switches the tokenizer to `new_state`: optionally logs the transition,
// passes the state to the Rust tokenizer as a uint8_t, and records it in
// m_state.
void HTMLTokenizer::switch_to(State new_state)
{
    // Log first, while m_state still holds the state being left.
    dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Switch to {}", state_name(m_state), state_name(new_state));

    auto const rust_state = static_cast<uint8_t>(new_state);
    rust_html_tokenizer_switch_state(m_tokenizer, rust_state);
    m_state = new_state;
}

}