2020-05-24 00:14:23 +02:00
|
|
|
/*
|
2024-10-04 13:19:50 +02:00
|
|
|
* Copyright (c) 2020-2022, Andreas Kling <andreas@ladybird.org>
|
2020-05-24 00:14:23 +02:00
|
|
|
*
|
2021-04-22 01:24:48 -07:00
|
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
2020-05-24 00:14:23 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
2023-05-28 15:04:40 +12:00
|
|
|
#include <LibGfx/Color.h>
|
2022-10-17 10:46:11 +02:00
|
|
|
#include <LibJS/Heap/Cell.h>
|
2026-02-11 09:20:13 +01:00
|
|
|
#include <LibWeb/DOM/FragmentSerializationMode.h>
|
2025-07-19 19:35:33 -07:00
|
|
|
#include <LibWeb/Export.h>
|
2020-07-28 19:18:23 +02:00
|
|
|
#include <LibWeb/HTML/Parser/HTMLTokenizer.h>
|
2026-04-12 13:31:30 +02:00
|
|
|
#include <LibWeb/HTML/Parser/ParserScriptingMode.h>
|
2024-10-20 19:39:50 +11:00
|
|
|
#include <LibWeb/MimeSniff/MimeType.h>
|
LibWeb: Replace spin_until in HTMLParser::the_end() with state machine
HTMLParser::the_end() had three spin_until calls that blocked the event
loop: step 5 (deferred scripts), step 7 (ASAP scripts), and step 8
(load event delay). This replaces them with an HTMLParserEndState state
machine that progresses asynchronously via callbacks.
The state machine has three phases matching the three spin_until calls:
- WaitingForDeferredScripts: loops executing ready deferred scripts
- WaitingForASAPScripts: waits for ASAP script lists to empty
- WaitingForLoadEventDelay: waits for nothing to delay the load event
Notification triggers re-evaluate the state machine when conditions
change: HTMLScriptElement::mark_as_ready, stylesheet unblocking in
StyleElementBase/HTMLLinkElement, did_stop_being_active_document, and
DocumentLoadEventDelayer decrements. NavigableContainer state changes
(session history readiness, content navigable cleared, lazy load flag)
also trigger re-evaluation of the load event delay check.
Key design decisions and why:
1. Microtask checkpoint in schedule_progress_check(): The old spin_until
called perform_a_microtask_checkpoint() before checking conditions.
This is critical because HTMLImageElement::update_the_image_data step
8 queues a microtask that creates the DocumentLoadEventDelayer.
Without the checkpoint, check_progress() would see zero delayers and
complete before images start delaying the load event.
2. deferred_invoke in schedule_progress_check():
I tried Core::Timer (0ms), queue_global_task, and synchronous calls.
Timers caused non-deterministic ordering with the HTML event loop's
task processing timer, leading to image layout tests failing (wrong
subtest pass/fail patterns). Synchronous calls fired too early during
image load processing before dimensions were set, causing 0-height
images in layout tests. queue_global_task had task ordering issues
with the session history traversal queue. deferred_invoke runs after
the current callback returns but within the same event loop pump,
giving the right balance.
3. Navigation load event guard (m_navigation_load_event_guard): During
cross-document navigation, finalize_a_cross_document_navigation step
2 calls set_delaying_load_events(false) before the session history
traversal activates the new document. This creates a transient state
where the parent's load event delay check sees the about:blank (which
has ready_for_post_load_tasks=true) as the active document and
completes prematurely.
2026-03-28 09:39:51 +01:00
|
|
|
#include <LibWeb/Platform/Timer.h>
|
2020-05-24 00:14:23 +02:00
|
|
|
|
2026-05-16 13:47:49 +02:00
|
|
|
// Forward declaration only; this header never sees the struct's definition.
// HTMLParser stores a raw pointer to it in m_rust_parser.
struct RustFfiHtmlParserHandle;
|
|
|
|
|
|
LibWeb: Complete Rust HTML tree construction
Finish the Rust implementation of the spec tree-construction algorithms
needed by the LibWeb test suite. Add the remaining table modes, foster
parenting, scope helpers, adoption agency handling, ruby/list/form and
select cases, frameset state, foreign-content edge cases, and parser
host callbacks.
Preserve behavior that depends on the C++ DOM integration, including
parser-created custom element reactions, fragment quirks mode, arbitrary
fragment namespaces, template fragment mode, fragment form ownership,
MathML annotation-xml boundaries, contextual fragment scripts, parser
script source positions, document.close() parser state, void-element
insertion, and duplicate attribute tracking.
Add focused tests for the parser edge cases that are easy to regress at
the boundary between the Rust tree builder and the C++ DOM host.
2026-05-15 21:56:35 +02:00
|
|
|
namespace Web::SVG {
|
|
|
|
|
|
|
|
|
|
// Forward declaration: HTMLParser takes SVG::SVGScriptElement& in
// prepare_svg_script_for_rust_parser() and process_svg_script_end_tag_from_rust_parser().
class SVGScriptElement;
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
2020-07-28 18:20:36 +02:00
|
|
|
namespace Web::HTML {
|
|
|
|
|
|
2026-05-15 20:57:22 +02:00
|
|
|
// Forward declaration: only used by reference in HTMLParser's script callbacks.
class HTMLScriptElement;
|
2026-05-16 13:47:49 +02:00
|
|
|
// Forward declaration: HTMLParser holds a GC::Ptr<HTMLFormElement> (m_form_element)
// and passes one to create_element_for_rust_parser().
class HTMLFormElement;
|
2020-05-24 00:14:23 +02:00
|
|
|
|
2025-07-19 19:35:33 -07:00
|
|
|
// HTML parser object, declared as a JS::Cell via GC_CELL / GC_DECLARE_ALLOCATOR.
// The public surface consists of:
//   - static factories (create*),
//   - run()/abort() style controls and state queries,
//   - static fragment parse/serialize helpers,
//   - host callbacks named *_for_rust_parser / *_from_rust_parser, which the commit notes
//     describe as the boundary between the Rust tree builder and the C++ DOM host.
class WEB_API HTMLParser final : public JS::Cell {
|
2024-11-15 04:01:23 +13:00
|
|
|
GC_CELL(HTMLParser, JS::Cell);
|
|
|
|
|
GC_DECLARE_ALLOCATOR(HTMLParser);
|
2022-10-17 10:46:11 +02:00
|
|
|
|
2020-05-24 00:14:23 +02:00
|
|
|
public:
|
2026-05-15 20:29:23 +02:00
|
|
|
// Declared together with the finalize() override in the private section.
// NOTE(review): presumably the GC reads this constant to decide whether finalize() must be
// called for this cell type — confirm against the GC::Cell contract.
static constexpr bool OVERRIDES_FINALIZE = true;
|
|
|
|
|
|
|
|
|
|
// Virtual destructor, defined out of line.
virtual ~HTMLParser() override;
|
2020-05-24 00:14:23 +02:00
|
|
|
|
2024-11-15 04:01:23 +13:00
|
|
|
// Factory that takes only the target document (no input text or encoding).
// NOTE(review): presumably produces the script-created parser reported by is_script_created() — confirm.
static GC::Ref<HTMLParser> create_for_scripting(DOM::Document&);
|
2026-04-28 19:47:49 +02:00
|
|
|
// Factory that takes only the target document; no input is supplied at creation time.
// NOTE(review): presumably input is fed to the tokenizer later — confirm in the implementation.
static GC::Ref<HTMLParser> create_with_open_input_stream(DOM::Document&);
|
2024-11-15 04:01:23 +13:00
|
|
|
// Factory for raw bytes whose encoding is not supplied by the caller.
// `maybe_mime_type` is optional and defaults to empty.
// NOTE(review): how the encoding is determined is not visible in this header.
static GC::Ref<HTMLParser> create_with_uncertain_encoding(DOM::Document&, ByteBuffer const& input, Optional<MimeSniff::MimeType> maybe_mime_type = {});
|
2026-05-16 13:35:37 +02:00
|
|
|
// Factory taking the input text, the parser scripting mode, and an encoding name.
static GC::Ref<HTMLParser> create(DOM::Document&, StringView input, ParserScriptingMode, StringView encoding);
|
2026-05-24 09:07:05 +02:00
|
|
|
// Factory with the same parameter list as create().
// NOTE(review): presumably passes a non-default HTMLTokenizer::InputType to the private
// constructor (whose default is EncodedBytes) — confirm.
static GC::Ref<HTMLParser> create_for_decoded_string(DOM::Document&, StringView input, ParserScriptingMode, StringView encoding);
|
2021-05-12 10:47:12 +02:00
|
|
|
|
2024-02-18 12:45:53 -05:00
|
|
|
// Runs the parser. The argument defaults to StopAtInsertionPoint::No.
void run(HTMLTokenizer::StopAtInsertionPoint = HTMLTokenizer::StopAtInsertionPoint::No);
|
2025-08-29 13:02:52 +01:00
|
|
|
// Overload of run() that additionally takes a URL.
// NOTE(review): presumably the URL is applied to the document before parsing — confirm.
void run(URL::URL const&, HTMLTokenizer::StopAtInsertionPoint = HTMLTokenizer::StopAtInsertionPoint::No);
|
2026-04-28 19:49:17 +02:00
|
|
|
// Same parameter and default as run().
// NOTE(review): how "completion" differs from a plain run() is defined in the implementation only.
void run_until_completion(HTMLTokenizer::StopAtInsertionPoint = HTMLTokenizer::StopAtInsertionPoint::No);
|
LibWeb: Complete Rust HTML tree construction
Finish the Rust implementation of the spec tree-construction algorithms
needed by the LibWeb test suite. Add the remaining table modes, foster
parenting, scope helpers, adoption agency handling, ruby/list/form and
select cases, frameset state, foreign-content edge cases, and parser
host callbacks.
Preserve behavior that depends on the C++ DOM integration, including
parser-created custom element reactions, fragment quirks mode, arbitrary
fragment namespaces, template fragment mode, fragment form ownership,
MathML annotation-xml boundaries, contextual fragment scripts, parser
script source positions, document.close() parser state, void-element
insertion, and duplicate attribute tracking.
Add focused tests for the parser edge cases that are easy to regress at
the boundary between the Rust tree builder and the C++ DOM host.
2026-05-15 21:56:35 +02:00
|
|
|
// Pops every element from the parser's stack of open elements.
// NOTE(review): the stack itself is not a member of this class in this header; presumably it
// lives behind m_rust_parser — confirm.
void pop_all_open_elements();
|
2020-05-24 00:14:23 +02:00
|
|
|
|
2024-11-15 04:01:23 +13:00
|
|
|
// End-of-parsing steps. Static, and the parser argument is optional (defaults to nullptr),
// so it can be invoked with only a document.
// Per the commit notes, the three waits that used to be spin_until calls (deferred scripts,
// ASAP scripts, load event delay) are now driven asynchronously by HTMLParserEndState.
static void the_end(GC::Ref<DOM::Document>, GC::Ptr<HTMLParser> = nullptr);
|
2023-12-19 12:51:34 +00:00
|
|
|
|
2020-07-26 19:37:56 +02:00
|
|
|
// Returns the document this parser operates on, as a mutable reference.
// NOTE(review): m_document is a nullable GC::Ptr; presumably it is always set by the time this is called.
DOM::Document& document();
|
2024-06-25 20:55:58 +01:00
|
|
|
// Two-state flag accepted by parse_html_fragment(), which defaults it to No.
enum class AllowDeclarativeShadowRoots {
|
|
|
|
|
No,
|
|
|
|
|
Yes,
|
|
|
|
|
};
|
2026-04-12 13:31:30 +02:00
|
|
|
// Parses `markup` as an HTML fragment relative to `context_element`.
// Returns the resulting nodes as GC roots, or a WebIDL exception.
// Defaults: declarative shadow roots are not allowed, and the scripting mode is Inert.
static WebIDL::ExceptionOr<Vector<GC::Root<DOM::Node>>> parse_html_fragment(DOM::Element& context_element, StringView markup, AllowDeclarativeShadowRoots = AllowDeclarativeShadowRoots::No, ParserScriptingMode = ParserScriptingMode::Inert);
|
2025-09-10 23:06:59 +02:00
|
|
|
|
2024-06-25 10:49:54 +02:00
|
|
|
// Two-state flag passed to serialize_html_fragment(); it has no default there, so callers
// must choose explicitly.
enum class SerializableShadowRoots {
|
|
|
|
|
No,
|
|
|
|
|
Yes,
|
|
|
|
|
};
|
2026-05-20 20:58:46 +02:00
|
|
|
// Serializes `node` to an HTML string.
// Takes a SerializableShadowRoots flag and a span of shadow roots; the serialization mode
// defaults to FragmentSerializationMode::Inner.
// NOTE(review): the exact role of the shadow-root span is not visible in this header.
static String serialize_html_fragment(DOM::Node const&, SerializableShadowRoots, ReadonlySpan<GC::Ref<DOM::ShadowRoot>>, DOM::FragmentSerializationMode = DOM::FragmentSerializationMode::Inner);
|
2020-06-25 23:42:08 +02:00
|
|
|
|
2022-02-19 15:58:21 +01:00
|
|
|
// Mutable access to the embedded tokenizer (m_tokenizer).
HTMLTokenizer& tokenizer() { return m_tokenizer; }
|
|
|
|
|
|
2026-05-15 20:57:22 +02:00
|
|
|
// Host callback for the Rust tree builder: applies C++-side setup to an element it created.
// NOTE(review): which properties are configured is only visible in the implementation.
void configure_element_created_by_rust_parser(DOM::Element&);
|
|
|
|
|
// Element-creation callback for the Rust tree builder.
// Compared with the private create_element_for(), it additionally receives:
//   - whether the token had a duplicate attribute,
//   - a (nullable) form element pointer,
//   - whether a template element is on the stack of open elements.
// NOTE(review): presumably these are passed in because that state is tracked on the Rust side — confirm.
GC::Ref<DOM::Element> create_element_for_rust_parser(HTMLToken const&, Optional<FlyString> const& namespace_, DOM::Node& intended_parent, bool had_duplicate_attribute, GC::Ptr<HTMLFormElement>, bool has_template_element_on_stack);
|
LibWeb: Complete Rust HTML tree construction
Finish the Rust implementation of the spec tree-construction algorithms
needed by the LibWeb test suite. Add the remaining table modes, foster
parenting, scope helpers, adoption agency handling, ruby/list/form and
select cases, frameset state, foreign-content edge cases, and parser
host callbacks.
Preserve behavior that depends on the C++ DOM integration, including
parser-created custom element reactions, fragment quirks mode, arbitrary
fragment namespaces, template fragment mode, fragment form ownership,
MathML annotation-xml boundaries, contextual fragment scripts, parser
script source positions, document.close() parser state, void-element
insertion, and duplicate attribute tracking.
Add focused tests for the parser edge cases that are easy to regress at
the boundary between the Rust tree builder and the C++ DOM host.
2026-05-15 21:56:35 +02:00
|
|
|
// Host callback: prepares an SVG script element on behalf of the Rust tree builder and
// receives the script's source line number.
// NOTE(review): whether the line number is 0- or 1-based is not visible here.
void prepare_svg_script_for_rust_parser(SVG::SVGScriptElement&, size_t source_line_number);
|
|
|
|
|
// Host callback: records the source line number reported by the Rust parser for an element.
// The commit notes list "parser script source positions" among the behaviors preserved across
// the Rust/C++ boundary.
void set_script_source_line_from_rust_parser(DOM::Element&, size_t source_line_number);
|
|
|
|
|
// Host callback: lets the Rust tree builder mark an HTML script element as "already started".
void mark_script_already_started_from_rust_parser(HTMLScriptElement&);
|
|
|
|
|
// Host callback: the Rust tree builder's way to request that parsing stop.
// NOTE(review): presumably forwards to the private stop_parsing() — confirm.
void stop_parsing_from_rust_parser();
|
2026-05-15 20:57:22 +02:00
|
|
|
// Host callback: handles a </script> end tag for an HTML script element on the C++ side.
// NOTE(review): the meaning of the bool result is not visible in this header — confirm before relying on it.
bool process_script_end_tag_from_rust_parser(HTMLScriptElement&);
|
LibWeb: Complete Rust HTML tree construction
Finish the Rust implementation of the spec tree-construction algorithms
needed by the LibWeb test suite. Add the remaining table modes, foster
parenting, scope helpers, adoption agency handling, ruby/list/form and
select cases, frameset state, foreign-content edge cases, and parser
host callbacks.
Preserve behavior that depends on the C++ DOM integration, including
parser-created custom element reactions, fragment quirks mode, arbitrary
fragment namespaces, template fragment mode, fragment form ownership,
MathML annotation-xml boundaries, contextual fragment scripts, parser
script source positions, document.close() parser state, void-element
insertion, and duplicate attribute tracking.
Add focused tests for the parser edge cases that are easy to regress at
the boundary between the Rust tree builder and the C++ DOM host.
2026-05-15 21:56:35 +02:00
|
|
|
// SVG counterpart of process_script_end_tag_from_rust_parser().
// NOTE(review): the meaning of the bool result is not visible in this header — confirm.
bool process_svg_script_end_tag_from_rust_parser(SVG::SVGScriptElement&);
|
2026-05-15 20:57:22 +02:00
|
|
|
|
2022-09-20 21:08:14 +02:00
|
|
|
// https://html.spec.whatwg.org/multipage/parsing.html#abort-a-parser
|
|
|
|
|
// Aborts this parser; the resulting state is observable through aborted().
void abort();
|
|
|
|
|
|
2022-02-19 15:58:21 +01:00
|
|
|
// Returns the m_aborted flag.
bool aborted() const { return m_aborted; }
|
2022-09-20 21:08:14 +02:00
|
|
|
// Returns m_stop_parsing, which the private stop_parsing() sets to true.
bool stopped() const { return m_stop_parsing; }
|
2026-04-28 19:51:22 +02:00
|
|
|
// Returns the parser pause flag (m_parser_pause_flag).
bool is_paused() const { return m_parser_pause_flag; }
|
2026-04-28 19:47:49 +02:00
|
|
|
// Returns m_script_created.
bool is_script_created() const { return m_script_created; }
|
2022-02-19 15:58:21 +01:00
|
|
|
|
|
|
|
|
// Returns the current script nesting level counter (m_script_nesting_level).
size_t script_nesting_level() const { return m_script_nesting_level; }
|
|
|
|
|
|
2026-04-25 23:59:12 +02:00
|
|
|
// Schedules a check of whether the parser can resume.
// NOTE(review): presumably coalesced via m_resume_check_pending and ends in
// resume_after_parser_blocking_script() — confirm in the implementation.
void schedule_resume_check();
|
|
|
|
|
// Stores `action` in m_post_parse_action, replacing any callback set earlier.
// The callable is taken by value and moved into the member.
void set_post_parse_action(Function<void()> action) { m_post_parse_action = move(action); }
|
2026-05-17 09:33:22 +02:00
|
|
|
// Test-only public wrapper that forwards to the private invoke_post_parse_action().
void invoke_post_parse_action_for_testing() { invoke_post_parse_action(); }
|
2026-04-25 23:59:12 +02:00
|
|
|
|
2020-05-24 00:14:23 +02:00
|
|
|
private:
|
2026-04-28 19:47:49 +02:00
|
|
|
// Private tag passed to the (document, scripting mode, ScriptCreatedParser) constructor.
// NOTE(review): presumably Yes results in m_script_created being true — confirm.
enum class ScriptCreatedParser {
|
|
|
|
|
No,
|
|
|
|
|
Yes,
|
|
|
|
|
};
|
|
|
|
|
|
2026-05-24 09:07:05 +02:00
|
|
|
// Private constructor for a parser that is given its input up front.
// The tokenizer input type defaults to EncodedBytes.
HTMLParser(DOM::Document&, ParserScriptingMode, StringView input, StringView encoding, HTMLTokenizer::InputType = HTMLTokenizer::InputType::EncodedBytes);
|
2026-05-16 13:35:37 +02:00
|
|
|
// Private constructor for a parser created without initial input; the caller states
// explicitly whether the parser is script-created.
HTMLParser(DOM::Document&, ParserScriptingMode, ScriptCreatedParser);
|
2022-02-21 21:54:21 +01:00
|
|
|
|
2022-10-17 10:46:11 +02:00
|
|
|
// GC tracing hook (Cell override).
// NOTE(review): presumably visits the GC::Ptr members declared below — confirm every one is covered.
virtual void visit_edges(Cell::Visitor&) override;
|
2025-04-04 09:20:27 -06:00
|
|
|
// Cell override that receives the realm this parser is initialized in.
virtual void initialize(JS::Realm&) override;
|
2026-05-15 20:29:23 +02:00
|
|
|
// Cell finalization override; paired with OVERRIDES_FINALIZE = true in the public section.
// NOTE(review): presumably where the raw m_rust_parser handle is released — confirm.
virtual void finalize() override;
|
2022-10-17 10:46:11 +02:00
|
|
|
|
2020-05-28 18:55:18 +02:00
|
|
|
// Sets m_stop_parsing; stopped() reports true from then on. Nothing in this header resets it.
void stop_parsing() { m_stop_parsing = true; }
|
|
|
|
|
|
2026-04-26 03:21:39 +02:00
|
|
|
// https://html.spec.whatwg.org/multipage/parsing.html#start-the-speculative-html-parser
|
|
|
|
|
// Counterpart of stop_the_speculative_html_parser().
// NOTE(review): presumably populates m_active_speculative_html_parser — confirm.
void start_the_speculative_html_parser();
|
|
|
|
|
// https://html.spec.whatwg.org/multipage/parsing.html#stop-the-speculative-html-parser
|
|
|
|
|
// Counterpart of start_the_speculative_html_parser().
// NOTE(review): presumably clears m_active_speculative_html_parser — confirm.
void stop_the_speculative_html_parser();
|
|
|
|
|
|
2024-11-15 04:01:23 +13:00
|
|
|
// Creates a DOM element for `token` in the given namespace, with `intended_parent` as the
// node it is meant to be inserted under. Private; the Rust-facing variant is
// create_element_for_rust_parser().
GC::Ref<DOM::Element> create_element_for(HTMLToken const&, Optional<FlyString> const& namespace_, DOM::Node& intended_parent);
|
2020-05-24 22:00:46 +02:00
|
|
|
// Increments the counter exposed by script_nesting_level().
void increment_script_nesting_level();
|
|
|
|
|
// Decrements the counter exposed by script_nesting_level().
// NOTE(review): the counter is a size_t; presumably the implementation guards against
// decrementing at zero — confirm.
void decrement_script_nesting_level();
|
2020-05-30 16:22:25 +02:00
|
|
|
|
2026-04-25 23:59:12 +02:00
|
|
|
// Continues parsing once a parser-blocking script no longer blocks.
// NOTE(review): presumably reached via schedule_resume_check() — confirm.
void resume_after_parser_blocking_script();
|
|
|
|
|
// Runs the callback installed with set_post_parse_action().
// NOTE(review): behavior when no callback is set, and whether it is cleared after running,
// is defined in the implementation only.
void invoke_post_parse_action();
|
|
|
|
|
|
2020-05-24 00:14:23 +02:00
|
|
|
// Tokenizer owned by value; exposed through tokenizer().
HTMLTokenizer m_tokenizer;
|
2026-05-15 20:29:23 +02:00
|
|
|
// Raw pointer to the opaque Rust parser handle; null until assigned.
// NOTE(review): not a smart pointer, so presumably released manually in finalize() — confirm ownership.
RustFfiHtmlParserHandle* m_rust_parser { nullptr };
|
2020-05-24 00:14:23 +02:00
|
|
|
|
2020-05-24 00:49:22 +02:00
|
|
|
// Whether this parser was set up for fragment parsing; false by default.
bool m_parsing_fragment { false };
|
2022-09-23 20:43:17 +01:00
|
|
|
|
2026-04-12 13:31:30 +02:00
|
|
|
// https://html.spec.whatwg.org/multipage/parsing.html#scripting-mode
|
|
|
|
|
// Value-initialized here; both constructors take a ParserScriptingMode argument.
ParserScriptingMode m_scripting_mode {};
|
2026-04-28 19:47:49 +02:00
|
|
|
// Backs is_script_created().
bool m_script_created { false };
|
2022-09-23 20:43:17 +01:00
|
|
|
|
2020-05-27 23:01:04 +02:00
|
|
|
// Backs aborted().
bool m_aborted { false };
|
2020-05-24 22:00:46 +02:00
|
|
|
// Backs is_paused().
bool m_parser_pause_flag { false };
|
2020-05-28 18:55:18 +02:00
|
|
|
// Set by stop_parsing(); read by stopped().
bool m_stop_parsing { false };
|
2026-04-25 23:59:12 +02:00
|
|
|
// NOTE(review): presumably true while a check requested via schedule_resume_check() is
// outstanding, so repeated requests coalesce — confirm.
bool m_resume_check_pending { false };
|
2020-05-24 22:00:46 +02:00
|
|
|
// Backs script_nesting_level(); starts at 0.
size_t m_script_nesting_level { 0 };
|
2020-05-24 00:14:23 +02:00
|
|
|
|
2026-04-25 23:59:12 +02:00
|
|
|
// Callback installed by set_post_parse_action(); default-constructed (empty) until then.
Function<void()> m_post_parse_action;
|
|
|
|
|
|
2022-08-28 13:42:07 +02:00
|
|
|
// Private accessor for the JS realm this parser works in.
// NOTE(review): presumably derived from the document — confirm.
JS::Realm& realm();
|
|
|
|
|
|
2024-11-15 04:01:23 +13:00
|
|
|
// Document being parsed. A nullable GC pointer, although document() returns a reference.
GC::Ptr<DOM::Document> m_document;
|
|
|
|
|
// Nullable pointer to a form element tracked by the parser.
// NOTE(review): presumably the spec's "form element pointer" — confirm.
GC::Ptr<HTMLFormElement> m_form_element;
|
|
|
|
|
// Nullable pointer to a context element.
// NOTE(review): presumably only set for fragment parsing (see parse_html_fragment) — confirm.
GC::Ptr<DOM::Element> m_context_element;
|
2020-05-30 17:57:41 +02:00
|
|
|
|
2026-04-26 03:21:39 +02:00
|
|
|
// https://html.spec.whatwg.org/multipage/parsing.html#active-speculative-html-parser
|
|
|
|
|
// Null when no speculative parser is active.
// NOTE(review): presumably managed by start_/stop_the_speculative_html_parser() — confirm.
GC::Ptr<SpeculativeHTMLParser> m_active_speculative_html_parser;
|
2026-02-17 14:23:39 +01:00
|
|
|
};
|
2020-05-24 00:14:23 +02:00
|
|
|
|
LibWeb: Replace spin_until in HTMLParser::the_end() with state machine
HTMLParser::the_end() had three spin_until calls that blocked the event
loop: step 5 (deferred scripts), step 7 (ASAP scripts), and step 8
(load event delay). This replaces them with an HTMLParserEndState state
machine that progresses asynchronously via callbacks.
The state machine has three phases matching the three spin_until calls:
- WaitingForDeferredScripts: loops executing ready deferred scripts
- WaitingForASAPScripts: waits for ASAP script lists to empty
- WaitingForLoadEventDelay: waits for nothing to delay the load event
Notification triggers re-evaluate the state machine when conditions
change: HTMLScriptElement::mark_as_ready, stylesheet unblocking in
StyleElementBase/HTMLLinkElement, did_stop_being_active_document, and
DocumentLoadEventDelayer decrements. NavigableContainer state changes
(session history readiness, content navigable cleared, lazy load flag)
also trigger re-evaluation of the load event delay check.
Key design decisions and why:
1. Microtask checkpoint in schedule_progress_check(): The old spin_until
called perform_a_microtask_checkpoint() before checking conditions.
This is critical because HTMLImageElement::update_the_image_data step
8 queues a microtask that creates the DocumentLoadEventDelayer.
Without the checkpoint, check_progress() would see zero delayers and
complete before images start delaying the load event.
2. deferred_invoke in schedule_progress_check():
I tried Core::Timer (0ms), queue_global_task, and synchronous calls.
Timers caused non-deterministic ordering with the HTML event loop's
task processing timer, leading to image layout tests failing (wrong
subtest pass/fail patterns). Synchronous calls fired too early during
image load processing before dimensions were set, causing 0-height
images in layout tests. queue_global_task had task ordering issues
with the session history traversal queue. deferred_invoke runs after
the current callback returns but within the same event loop pump,
giving the right balance.
3. Navigation load event guard (m_navigation_load_event_guard): During
cross-document navigation, finalize_a_cross_document_navigation step
2 calls set_delaying_load_events(false) before the session history
traversal activates the new document. This creates a transient state
where the parent's load event delay check sees the about:blank (which
has ready_for_post_load_tasks=true) as the active document and
completes prematurely.
2026-03-28 09:39:51 +01:00
|
|
|
// Asynchronous state machine for the tail of HTMLParser::the_end().
// Per the commit notes, it replaces three event-loop-blocking spin_until calls with phases
// that advance via callbacks:
//   1. WaitingForDeferredScripts - loops executing ready deferred scripts
//   2. WaitingForASAPScripts     - waits for the ASAP script lists to empty
//   3. WaitingForLoadEventDelay  - waits until nothing delays the load event
// External code re-evaluates the machine by calling schedule_progress_check() when a relevant
// condition changes (script marked ready, stylesheet unblocked, load event delayer released, ...).
class HTMLParserEndState final : public JS::Cell {
|
|
|
|
|
GC_CELL(HTMLParserEndState, JS::Cell);
|
|
|
|
|
GC_DECLARE_ALLOCATOR(HTMLParserEndState);
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
// Factory. The document is required (GC::Ref); the parser may be null (GC::Ptr).
static GC::Ref<HTMLParserEndState> create(GC::Ref<DOM::Document>, GC::Ptr<HTMLParser>);
|
|
|
|
|
|
|
|
|
|
// The only public trigger for re-evaluating the state machine.
// Per the commit notes, it performs a microtask checkpoint first (so image load-event delayers
// queued as microtasks exist before conditions are checked) and defers the actual check with
// deferred_invoke rather than a timer, a queued task, or a synchronous call.
// NOTE(review): the implementation is not visible here — confirm it still matches those notes.
void schedule_progress_check();
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
// Phases of the end-of-parsing state machine, per the commit notes:
//   WaitingForDeferredScripts - loops executing ready deferred scripts
//   WaitingForASAPScripts     - waits for ASAP script lists to empty
//   WaitingForLoadEventDelay  - waits for nothing to delay the load event
//   Completed                 - last enumerator; NOTE(review): presumably terminal — confirm.
enum class Phase {
|
|
|
|
|
WaitingForDeferredScripts,
|
|
|
|
|
WaitingForASAPScripts,
|
|
|
|
|
WaitingForLoadEventDelay,
|
|
|
|
|
Completed,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Private constructor; instances are obtained through create(), which has the same parameters.
HTMLParserEndState(GC::Ref<DOM::Document>, GC::Ptr<HTMLParser>);
|
|
|
|
|
|
|
|
|
|
// GC tracing hook (Cell override).
// NOTE(review): presumably visits m_document, m_parser and m_timeout — confirm.
virtual void visit_edges(Cell::Visitor&) override;
|
|
|
|
|
|
|
|
|
|
// Evaluates the condition for the current phase. The commit notes name it as the check that
// schedule_progress_check() must not run before a microtask checkpoint, or it would "see zero
// delayers" and complete too early.
void check_progress();
|
|
|
|
|
// Transition helper named after the WaitingForASAPScripts phase.
// NOTE(review): presumably called once no deferred scripts remain — confirm.
void advance_to_asap_scripts_phase();
|
|
|
|
|
// Final step of the state machine.
// NOTE(review): presumably sets m_phase to Completed and runs the remaining end-of-parsing
// work — confirm.
void complete();
|
|
|
|
|
|
|
|
|
|
// Current phase; a new instance starts in WaitingForDeferredScripts.
Phase m_phase { Phase::WaitingForDeferredScripts };
|
|
|
|
|
// NOTE(review): presumably true while a deferred check_progress() is outstanding, so repeated
// schedule_progress_check() calls coalesce — confirm.
bool m_check_pending { false };
|
|
|
|
|
|
|
|
|
|
// Document whose end-of-parsing steps are being driven; never null (GC::Ref).
GC::Ref<DOM::Document> m_document;
|
|
|
|
|
// Parser that reached the end, if any; nullable to mirror the_end()'s optional parser argument.
GC::Ptr<HTMLParser> m_parser;
|
|
|
|
|
// Non-null timer owned by this state object.
// NOTE(review): its purpose is not visible in this header; the commit notes say timers were
// rejected for scheduling progress checks, so presumably this is a separate timeout — confirm.
GC::Ref<Platform::Timer> m_timeout;
|
|
|
|
|
};
|
|
|
|
|
|
2025-08-08 10:11:51 +01:00
|
|
|
// Free helper: parses a dimension attribute string into a CSS style value.
// NOTE(review): returns a RefPtr, so presumably null when the string does not parse — confirm.
RefPtr<CSS::StyleValue const> parse_dimension_value(StringView);
|
|
|
|
|
// Variant of parse_dimension_value() for non-zero dimensions.
// NOTE(review): presumably yields null for a zero value as well as for parse failures — confirm.
RefPtr<CSS::StyleValue const> parse_nonzero_dimension_value(StringView);
|
2024-01-16 19:04:45 +01:00
|
|
|
// Free helper: parses a legacy color attribute string; the Optional is empty when no color results.
Optional<Color> parse_legacy_color_value(StringView);
|
2026-04-30 11:15:16 +01:00
|
|
|
// Free helper: parses the align attribute value of a table child element into a CSS style value.
// NOTE(review): presumably null for unrecognized keywords — confirm.
RefPtr<CSS::StyleValue const> parse_table_child_element_align_value(StringView);
|
2022-03-26 14:29:52 +01:00
|
|
|
|
2020-05-24 00:14:23 +02:00
|
|
|
}
|