// File: ladybird/Libraries/LibWeb/HTML/Parser/HTMLParser.h
// (184 lines, 6.7 KiB in the repository view this copy was taken from)

/*
* Copyright (c) 2020-2022, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <LibGfx/Color.h>
#include <LibJS/Heap/Cell.h>
#include <LibWeb/DOM/FragmentSerializationMode.h>
// Revision timestamp from the repository history view: 2025-07-19 19:35:33 -07:00
#include <LibWeb/Export.h>
#include <LibWeb/HTML/Parser/HTMLTokenizer.h>
#include <LibWeb/HTML/Parser/ParserScriptingMode.h>
#include <LibWeb/MimeSniff/MimeType.h>
/*
 * Revision note (2026-03-28 09:39:51 +01:00):
 *
 * LibWeb: Replace spin_until in HTMLParser::the_end() with state machine
 *
 * HTMLParser::the_end() had three spin_until calls that blocked the event
 * loop: step 5 (deferred scripts), step 7 (ASAP scripts), and step 8 (load
 * event delay). This replaces them with an HTMLParserEndState state machine
 * that progresses asynchronously via callbacks.
 *
 * The state machine has three phases matching the three spin_until calls:
 * - WaitingForDeferredScripts: loops executing ready deferred scripts
 * - WaitingForASAPScripts: waits for ASAP script lists to empty
 * - WaitingForLoadEventDelay: waits for nothing to delay the load event
 *
 * Notification triggers re-evaluate the state machine when conditions
 * change: HTMLScriptElement::mark_as_ready, stylesheet unblocking in
 * StyleElementBase/HTMLLinkElement, did_stop_being_active_document, and
 * DocumentLoadEventDelayer decrements. NavigableContainer state changes
 * (session history readiness, content navigable cleared, lazy load flag)
 * also trigger re-evaluation of the load event delay check.
 *
 * Key design decisions and why:
 *
 * 1. Microtask checkpoint in schedule_progress_check(): The old spin_until
 *    called perform_a_microtask_checkpoint() before checking conditions.
 *    This is critical because HTMLImageElement::update_the_image_data step 8
 *    queues a microtask that creates the DocumentLoadEventDelayer. Without
 *    the checkpoint, check_progress() would see zero delayers and complete
 *    before images start delaying the load event.
 *
 * 2. deferred_invoke in schedule_progress_check(): I tried Core::Timer (0ms),
 *    queue_global_task, and synchronous calls. Timers caused
 *    non-deterministic ordering with the HTML event loop's task processing
 *    timer, leading to image layout tests failing (wrong subtest pass/fail
 *    patterns). Synchronous calls fired too early during image load
 *    processing before dimensions were set, causing 0-height images in
 *    layout tests. queue_global_task had task ordering issues with the
 *    session history traversal queue. deferred_invoke runs after the current
 *    callback returns but within the same event loop pump, giving the right
 *    balance.
 *
 * 3. Navigation load event guard (m_navigation_load_event_guard): During
 *    cross-document navigation, finalize_a_cross_document_navigation step 2
 *    calls set_delaying_load_events(false) before the session history
 *    traversal activates the new document. This creates a transient state
 *    where the parent's load event delay check sees the about:blank (which
 *    has ready_for_post_load_tasks=true) as the active document and
 *    completes prematurely.
 */
#include <LibWeb/Platform/Timer.h>
// Forward declaration only: this header never sees the definition and uses the
// type solely through a pointer (HTMLParser::m_rust_parser).
struct RustFfiHtmlParserHandle;
// Forward declaration so HTMLParser can take SVG::SVGScriptElement by reference
// in its *_rust_parser hooks without including the SVG element header here.
namespace Web::SVG {
class SVGScriptElement;
}
namespace Web::HTML {
// Forward declarations: both types are only used by reference or through
// GC::Ptr in the declarations below, so their full definitions are not needed.
class HTMLScriptElement;
class HTMLFormElement;
// Revision timestamp from the repository history view: 2025-07-19 19:35:33 -07:00
// Garbage-collected HTML parser bound to a DOM::Document.
//
// The parser owns an HTMLTokenizer and, optionally, a raw handle to a Rust-side
// parser (m_rust_parser). The *_rust_parser member functions are the hooks that
// side uses; their exact semantics live in the implementation file.
//
// OVERRIDES_FINALIZE is set and finalize() is overridden.
// NOTE(review): presumably this is where the Rust handle is released; confirm
// against the .cpp.
class WEB_API HTMLParser final : public JS::Cell {
    GC_CELL(HTMLParser, JS::Cell);
    GC_DECLARE_ALLOCATOR(HTMLParser);

public:
    static constexpr bool OVERRIDES_FINALIZE = true;

    virtual ~HTMLParser() override;

    // Factory functions. Each returns a freshly allocated parser for the given document.
    // NOTE(review): the behavioral differences between the variants (script-created parser,
    // open input stream, encoding sniffing, already-decoded input) are inferred from the
    // names and parameters only; confirm against the .cpp.
    static GC::Ref<HTMLParser> create_for_scripting(DOM::Document&);
    static GC::Ref<HTMLParser> create_with_open_input_stream(DOM::Document&);
    static GC::Ref<HTMLParser> create_with_uncertain_encoding(DOM::Document&, ByteBuffer const& input, Optional<MimeSniff::MimeType> maybe_mime_type = {});
    static GC::Ref<HTMLParser> create(DOM::Document&, StringView input, ParserScriptingMode, StringView encoding);
    static GC::Ref<HTMLParser> create_for_decoded_string(DOM::Document&, StringView input, ParserScriptingMode, StringView encoding);

    // Entry points for running the parser. All of them default to not stopping at the
    // tokenizer's insertion point.
    void run(HTMLTokenizer::StopAtInsertionPoint = HTMLTokenizer::StopAtInsertionPoint::No);
    void run(URL::URL const&, HTMLTokenizer::StopAtInsertionPoint = HTMLTokenizer::StopAtInsertionPoint::No);
    void run_until_completion(HTMLTokenizer::StopAtInsertionPoint = HTMLTokenizer::StopAtInsertionPoint::No);

    void pop_all_open_elements();

    // Static so it can be invoked with only a document; the parser argument is optional
    // and defaults to null. The asynchronous part of this work is modelled by
    // HTMLParserEndState, declared later in this header.
    static void the_end(GC::Ref<DOM::Document>, GC::Ptr<HTMLParser> = nullptr);

    DOM::Document& document();

    enum class AllowDeclarativeShadowRoots {
        No,
        Yes,
    };
    // Parses `markup` in the context of `context_element` and returns the resulting nodes,
    // or an exception. Declarative shadow roots are disallowed and scripting is inert
    // unless the caller overrides the defaults.
    static WebIDL::ExceptionOr<Vector<GC::Root<DOM::Node>>> parse_html_fragment(DOM::Element& context_element, StringView markup, AllowDeclarativeShadowRoots = AllowDeclarativeShadowRoots::No, ParserScriptingMode = ParserScriptingMode::Inert);

    enum class SerializableShadowRoots {
        No,
        Yes,
    };
    // Serializes a node to an HTML string. The serialization mode defaults to Inner.
    static String serialize_html_fragment(DOM::Node const&, SerializableShadowRoots, ReadonlySpan<GC::Ref<DOM::ShadowRoot>>, DOM::FragmentSerializationMode = DOM::FragmentSerializationMode::Inner);

    // Direct access to the tokenizer owned by this parser.
    HTMLTokenizer& tokenizer() { return m_tokenizer; }

    // Hooks used by the Rust parser integration (see m_rust_parser).
    // NOTE(review): documented from names and signatures only; confirm details in the .cpp.
    void configure_element_created_by_rust_parser(DOM::Element&);
    GC::Ref<DOM::Element> create_element_for_rust_parser(HTMLToken const&, Optional<FlyString> const& namespace_, DOM::Node& intended_parent, bool had_duplicate_attribute, GC::Ptr<HTMLFormElement>, bool has_template_element_on_stack);
    void prepare_svg_script_for_rust_parser(SVG::SVGScriptElement&, size_t source_line_number);
    void set_script_source_line_from_rust_parser(DOM::Element&, size_t source_line_number);
    void mark_script_already_started_from_rust_parser(HTMLScriptElement&);
    void stop_parsing_from_rust_parser();
    bool process_script_end_tag_from_rust_parser(HTMLScriptElement&);
    bool process_svg_script_end_tag_from_rust_parser(SVG::SVGScriptElement&);

    // https://html.spec.whatwg.org/multipage/parsing.html#abort-a-parser
    void abort();

    // Read-only views of the parser's state flags.
    bool aborted() const { return m_aborted; }
    bool stopped() const { return m_stop_parsing; }
    bool is_paused() const { return m_parser_pause_flag; }
    bool is_script_created() const { return m_script_created; }

    size_t script_nesting_level() const { return m_script_nesting_level; }

    void schedule_resume_check();

    // Stores a callback in m_post_parse_action, replacing any previously stored one.
    void set_post_parse_action(Function<void()> action) { m_post_parse_action = move(action); }
    // Test-only shim that forwards to the private invoke_post_parse_action().
    void invoke_post_parse_action_for_testing() { invoke_post_parse_action(); }

private:
    // Tag type that selects the script-created-parser constructor overload.
    enum class ScriptCreatedParser {
        No,
        Yes,
    };

    HTMLParser(DOM::Document&, ParserScriptingMode, StringView input, StringView encoding, HTMLTokenizer::InputType = HTMLTokenizer::InputType::EncodedBytes);
    HTMLParser(DOM::Document&, ParserScriptingMode, ScriptCreatedParser);

    virtual void visit_edges(Cell::Visitor&) override;
    virtual void initialize(JS::Realm&) override;
    virtual void finalize() override;

    // Only raises the flag; stopped() reports it.
    void stop_parsing() { m_stop_parsing = true; }

    // https://html.spec.whatwg.org/multipage/parsing.html#start-the-speculative-html-parser
    void start_the_speculative_html_parser();

    // https://html.spec.whatwg.org/multipage/parsing.html#stop-the-speculative-html-parser
    void stop_the_speculative_html_parser();

    GC::Ref<DOM::Element> create_element_for(HTMLToken const&, Optional<FlyString> const& namespace_, DOM::Node& intended_parent);

    void increment_script_nesting_level();
    void decrement_script_nesting_level();

    void resume_after_parser_blocking_script();
    void invoke_post_parse_action();

    HTMLTokenizer m_tokenizer;

    // Raw, non-GC pointer to the Rust-side parser state; null when there is none.
    RustFfiHtmlParserHandle* m_rust_parser { nullptr };

    bool m_parsing_fragment { false };

    // https://html.spec.whatwg.org/multipage/parsing.html#scripting-mode
    ParserScriptingMode m_scripting_mode {};

    bool m_script_created { false };
    bool m_aborted { false };
    bool m_parser_pause_flag { false };
    bool m_stop_parsing { false };
    // NOTE(review): presumably set while a check requested via schedule_resume_check()
    // is outstanding; confirm against the .cpp.
    bool m_resume_check_pending { false };
    size_t m_script_nesting_level { 0 };
    Function<void()> m_post_parse_action;

    JS::Realm& realm();

    GC::Ptr<DOM::Document> m_document;
    GC::Ptr<HTMLFormElement> m_form_element;
    GC::Ptr<DOM::Element> m_context_element;

    // https://html.spec.whatwg.org/multipage/parsing.html#active-speculative-html-parser
    GC::Ptr<SpeculativeHTMLParser> m_active_speculative_html_parser;
};
/*
 * Revision note (2026-03-28 09:39:51 +01:00):
 *
 * LibWeb: Replace spin_until in HTMLParser::the_end() with state machine
 *
 * HTMLParser::the_end() had three spin_until calls that blocked the event
 * loop: step 5 (deferred scripts), step 7 (ASAP scripts), and step 8 (load
 * event delay). This replaces them with an HTMLParserEndState state machine
 * that progresses asynchronously via callbacks.
 *
 * The state machine has three phases matching the three spin_until calls:
 * - WaitingForDeferredScripts: loops executing ready deferred scripts
 * - WaitingForASAPScripts: waits for ASAP script lists to empty
 * - WaitingForLoadEventDelay: waits for nothing to delay the load event
 *
 * Notification triggers re-evaluate the state machine when conditions
 * change: HTMLScriptElement::mark_as_ready, stylesheet unblocking in
 * StyleElementBase/HTMLLinkElement, did_stop_being_active_document, and
 * DocumentLoadEventDelayer decrements. NavigableContainer state changes
 * (session history readiness, content navigable cleared, lazy load flag)
 * also trigger re-evaluation of the load event delay check.
 *
 * Key design decisions and why:
 *
 * 1. Microtask checkpoint in schedule_progress_check(): The old spin_until
 *    called perform_a_microtask_checkpoint() before checking conditions.
 *    This is critical because HTMLImageElement::update_the_image_data step 8
 *    queues a microtask that creates the DocumentLoadEventDelayer. Without
 *    the checkpoint, check_progress() would see zero delayers and complete
 *    before images start delaying the load event.
 *
 * 2. deferred_invoke in schedule_progress_check(): I tried Core::Timer (0ms),
 *    queue_global_task, and synchronous calls. Timers caused
 *    non-deterministic ordering with the HTML event loop's task processing
 *    timer, leading to image layout tests failing (wrong subtest pass/fail
 *    patterns). Synchronous calls fired too early during image load
 *    processing before dimensions were set, causing 0-height images in
 *    layout tests. queue_global_task had task ordering issues with the
 *    session history traversal queue. deferred_invoke runs after the current
 *    callback returns but within the same event loop pump, giving the right
 *    balance.
 *
 * 3. Navigation load event guard (m_navigation_load_event_guard): During
 *    cross-document navigation, finalize_a_cross_document_navigation step 2
 *    calls set_delaying_load_events(false) before the session history
 *    traversal activates the new document. This creates a transient state
 *    where the parent's load event delay check sees the about:blank (which
 *    has ready_for_post_load_tasks=true) as the active document and
 *    completes prematurely.
 */
// Garbage-collected state machine for the asynchronous tail of HTMLParser::the_end().
// It is created for a document and an optional parser, and moves through the Phase
// values below until it reaches Completed.
// NOTE(review): the transition logic lives in the implementation file; the phase
// descriptions here follow the change description that introduced this class.
class HTMLParserEndState final : public JS::Cell {
GC_CELL(HTMLParserEndState, JS::Cell);
GC_DECLARE_ALLOCATOR(HTMLParserEndState);
public:
// Allocates a new state machine. The parser argument may be null.
static GC::Ref<HTMLParserEndState> create(GC::Ref<DOM::Document>, GC::Ptr<HTMLParser>);
// Public entry point for asking the state machine to re-evaluate its current phase.
// NOTE(review): per the change description this runs a microtask checkpoint and then
// defers check_progress() via deferred_invoke; confirm against the .cpp.
void schedule_progress_check();
private:
// Phases, in the order they are listed in the change description:
// - WaitingForDeferredScripts: executing deferred scripts as they become ready.
// - WaitingForASAPScripts: waiting for the ASAP script lists to become empty.
// - WaitingForLoadEventDelay: waiting until nothing delays the load event.
// - Completed: terminal value.
enum class Phase {
WaitingForDeferredScripts,
WaitingForASAPScripts,
WaitingForLoadEventDelay,
Completed,
};
HTMLParserEndState(GC::Ref<DOM::Document>, GC::Ptr<HTMLParser>);
virtual void visit_edges(Cell::Visitor&) override;
void check_progress();
void advance_to_asap_scripts_phase();
void complete();
// A new state machine starts in the deferred-scripts phase.
Phase m_phase { Phase::WaitingForDeferredScripts };
// NOTE(review): presumably true while a scheduled progress check has not run yet, so
// that repeated schedule_progress_check() calls coalesce; confirm against the .cpp.
bool m_check_pending { false };
// The document is always present (GC::Ref); the parser may be null (GC::Ptr).
GC::Ref<DOM::Document> m_document;
GC::Ptr<HTMLParser> m_parser;
// NOTE(review): the purpose of this timer is not visible from the header; confirm
// how it is armed and what it guards against in the .cpp.
GC::Ref<Platform::Timer> m_timeout;
};
// Free helpers that turn an attribute-style string into a CSS style value or a color.
// The RefPtr / Optional return types allow an empty result.
// NOTE(review): presumably these implement the HTML Standard's legacy "rules for parsing
// dimension values", "non-zero dimension values" and "a legacy colour value", and return
// null / empty on parse failure; only the signatures are visible here, so confirm.
RefPtr<CSS::StyleValue const> parse_dimension_value(StringView);
RefPtr<CSS::StyleValue const> parse_nonzero_dimension_value(StringView);
Optional<Color> parse_legacy_color_value(StringView);
// NOTE(review): presumably maps the `align` attribute of table child elements to a style
// value; confirm against the implementation.
RefPtr<CSS::StyleValue const> parse_table_child_element_align_value(StringView);
}