ladybird/Libraries/LibWeb/HTML/Parser/SpeculativeHTMLParser.cpp

/*
 * Copyright (c) 2026, Aliaksandr Kalenik <kalenik.aliaksandr@gmail.com>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Assertions.h>
#include <AK/FFIHelpers.h>
#include <LibJS/Runtime/Realm.h>
#include <LibWeb/DOM/Document.h>
#include <LibWeb/Fetch/Fetching/Fetching.h>
#include <LibWeb/Fetch/Infrastructure/FetchAlgorithms.h>
#include <LibWeb/Fetch/Infrastructure/FetchController.h>
#include <LibWeb/Fetch/Infrastructure/HTTP/Requests.h>
#include <LibWeb/HTML/CORSSettingAttribute.h>
#include <LibWeb/HTML/Parser/SpeculativeHTMLParser.h>
#include <LibWeb/HTML/PotentialCORSRequest.h>
#include <LibWeb/HTML/Scripting/Environments.h>
#include <LibWeb/HTMLTokenizerRustFFI.h>

namespace Web::HTML {

GC_DEFINE_ALLOCATOR(SpeculativeHTMLParser);

GC::Ref<SpeculativeHTMLParser> SpeculativeHTMLParser::create(JS::Realm& realm, GC::Ref<DOM::Document> document, String pending_input, URL::URL base_url)
{
    return realm.create<SpeculativeHTMLParser>(document, move(pending_input), move(base_url));
}

SpeculativeHTMLParser::SpeculativeHTMLParser(GC::Ref<DOM::Document> document, String pending_input, URL::URL base_url)
    : m_document(document)
    , m_input(move(pending_input))
    , m_base_url(move(base_url))
{
}

SpeculativeHTMLParser::~SpeculativeHTMLParser() = default;

void SpeculativeHTMLParser::visit_edges(JS::Cell::Visitor& visitor)
{
    Base::visit_edges(visitor);
    visitor.visit(m_document);
}

void SpeculativeHTMLParser::stop()
{
    // https://html.spec.whatwg.org/multipage/parsing.html#stop-the-speculative-html-parser
    // 3. Throw away any pending content in speculativeParser's input stream, and discard any future content
    //    that would have been added to it.
    m_stopped = true;
}

void SpeculativeHTMLParser::run()
{
    if (m_stopped)
        return;

    auto input = m_input.bytes_as_string_view();
    auto* bytes = reinterpret_cast<u8 const*>(input.characters_without_null_termination());
    if (bytes == nullptr)
        bytes = reinterpret_cast<u8 const*>("");

    rust_html_preload_scanner_scan(bytes, input.length(), this, [](void* context, RustFfiPreloadScannerEntry const* entry) -> bool {
        auto& parser = *static_cast<SpeculativeHTMLParser*>(context);
        if (parser.m_stopped || entry == nullptr)
            return false;

        parser.process_preload_scanner_entry(*entry);
        return !parser.m_stopped;
    });
}

namespace {

Optional<Fetch::Infrastructure::Request::Destination> destination_from_preload_scanner(RustFfiPreloadScannerDestination destination)
{
    switch (destination) {
    case RustFfiPreloadScannerDestination::None:
        return {};
    case RustFfiPreloadScannerDestination::Font:
        return Fetch::Infrastructure::Request::Destination::Font;
    case RustFfiPreloadScannerDestination::Image:
        return Fetch::Infrastructure::Request::Destination::Image;
    case RustFfiPreloadScannerDestination::Script:
        return Fetch::Infrastructure::Request::Destination::Script;
    case RustFfiPreloadScannerDestination::Style:
        return Fetch::Infrastructure::Request::Destination::Style;
    case RustFfiPreloadScannerDestination::Track:
        return Fetch::Infrastructure::Request::Destination::Track;
    }
    VERIFY_NOT_REACHED();
}

CORSSettingAttribute cors_setting_from_preload_scanner(RustFfiPreloadScannerCorsSetting cors_setting)
{
    switch (cors_setting) {
    case RustFfiPreloadScannerCorsSetting::NoCors:
        return CORSSettingAttribute::NoCORS;
    case RustFfiPreloadScannerCorsSetting::Anonymous:
        return CORSSettingAttribute::Anonymous;
    case RustFfiPreloadScannerCorsSetting::UseCredentials:
        return CORSSettingAttribute::UseCredentials;
    }
    VERIFY_NOT_REACHED();
}

void issue_speculative_fetch(JS::Realm& realm, DOM::Document& document, URL::URL url, Optional<Fetch::Infrastructure::Request::Destination> destination, CORSSettingAttribute cors_setting)
{
    auto& vm = realm.vm();
    auto request = create_potential_CORS_request(vm, url, destination, cors_setting);
    request->set_client(&document.relevant_settings_object());

    Fetch::Infrastructure::FetchAlgorithms::Input fetch_algorithms_input {};
    auto algorithms = Fetch::Infrastructure::FetchAlgorithms::create(vm, move(fetch_algorithms_input));

    // The fetch stays alive via ResourceLoader's GC::Root callbacks for the duration of the
    // network request, so we don't need to retain the FetchController.
    (void)Fetch::Fetching::fetch(realm, request, algorithms);
}

}

void SpeculativeHTMLParser::process_preload_scanner_entry(RustFfiPreloadScannerEntry const& entry)
{
    auto url_string = ffi_string_view(entry.url_ptr, entry.url_len);
    if (url_string.is_empty())
        return;

    // https://html.spec.whatwg.org/multipage/parsing.html#speculative-fetch
    switch (entry.action) {
    case RustFfiPreloadScannerAction::Base:
        // 1. If the speculative HTML parser encounters one of the following elements, then act as if that
        //    element is processed for the purpose of its effect on subsequent speculative fetches.
        //    - A base element.
        if (auto parsed = m_document->encoding_parse_url(url_string); parsed.has_value())
            m_base_url = parsed.release_value();
        return;

    case RustFfiPreloadScannerAction::Fetch:
        break;
    }

    // FIXME: A meta element whose http-equiv attribute is in the Content security policy state.
    // FIXME: A meta element whose name attribute is referrer.
    // FIXME: A meta element whose name attribute is viewport.

    // 2. Let url be the URL that element would fetch if it was processed normally. If there is no such
    //    URL or if it is the empty string, then do nothing.
    // We resolve URLs against the speculative parser's tracked base_url (which may have been updated
    // by an earlier speculative <base href>); this is why we use complete_url here rather than
    // document.encoding_parse_url, which would resolve against the document's base instead.
    auto url = m_base_url.complete_url(url_string);
    if (!url.has_value())
        return;

    // 3. Otherwise, if url is already in the list of speculative fetch URLs, then do nothing.
    if (m_document->has_speculative_fetch_url(*url))
        return;

    // 4. Otherwise, fetch url as if the element was processed normally, and add url to the list of
    //    speculative fetch URLs.
    m_document->add_speculative_fetch_url(*url);
    issue_speculative_fetch(m_document->realm(), *m_document, *url, destination_from_preload_scanner(entry.destination), cors_setting_from_preload_scanner(entry.cors_setting));
}

}