/* * Copyright (c) 2020-2025, Andreas Kling * Copyright (c) 2021, Luke Wilde * Copyright (c) 2023-2024, Shannon Booth * Copyright (c) 2025, Lorenz Ackermann * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace Web::HTML { GC_DEFINE_ALLOCATOR(HTMLParser); GC_DEFINE_ALLOCATOR(HTMLParserEndState); static inline void log_parse_error(SourceLocation const& location = SourceLocation::current()) { dbgln_if(HTML_PARSER_DEBUG, "Parse error! {}", location); } static DOM::Node& node_from_html_parser_ffi(size_t); static HTMLParser& parser_from_html_parser_ffi(void*); static RustFfiHtmlNamespace namespace_to_html_parser_ffi(Optional const&); static RustFfiHtmlAttributeNamespace attribute_namespace_to_html_parser_ffi(Optional const&); static RustFfiHtmlQuirksMode quirks_mode_to_html_parser_ffi(DOM::QuirksMode); extern "C" void ladybird_html_parser_log_parse_error(void*, u8 const*, size_t); extern "C" void ladybird_html_parser_stop_parsing(void*); extern "C" bool ladybird_html_parser_parse_errors_enabled(); extern "C" void ladybird_html_parser_visit_node(void*, size_t); extern "C" size_t ladybird_html_parser_document_node(void*); extern "C" size_t ladybird_html_parser_document_html_element(void*); extern "C" void ladybird_html_parser_set_document_quirks_mode(void*, RustFfiHtmlQuirksMode); extern "C" size_t ladybird_html_parser_create_document_type(void*, u8 const*, size_t, u8 const*, size_t, u8 const*, size_t); extern "C" size_t ladybird_html_parser_create_comment(void*, u8 const*, size_t); extern "C" void ladybird_html_parser_insert_text(size_t, size_t, u8 const*, size_t); extern "C" void ladybird_html_parser_add_missing_attribute(size_t, u8 const*, size_t, u8 const*, size_t); extern "C" void ladybird_html_parser_remove_node(size_t); extern "C" void ladybird_html_parser_handle_element_popped(size_t); extern "C" void ladybird_html_parser_prepare_svg_script(void*, size_t, size_t); extern "C" void ladybird_html_parser_set_script_source_line(void*, size_t, size_t); extern "C" void ladybird_html_parser_mark_script_already_started(void*, size_t); extern "C" size_t ladybird_html_parser_parent_node(size_t); extern "C" size_t ladybird_html_parser_create_element(void*, size_t, RustFfiHtmlNamespace, u8 const*, size_t, u8 const*, size_t, RustFfiHtmlParserAttribute const*, size_t, bool, size_t, bool); extern "C" void ladybird_html_parser_append_child(size_t, size_t); extern "C" void ladybird_html_parser_insert_node(size_t, size_t, size_t, bool); extern "C" void ladybird_html_parser_move_all_children(size_t, size_t); extern "C" size_t ladybird_html_parser_template_content(size_t); extern "C" size_t ladybird_html_parser_attach_declarative_shadow_root(size_t, RustFfiHtmlShadowRootMode, RustFfiHtmlSlotAssignmentMode, bool, bool, bool, bool); extern "C" void ladybird_html_parser_set_template_content(size_t, size_t); extern "C" bool ladybird_html_parser_allows_declarative_shadow_roots(size_t); Optional html_parser_backend_from_string(StringView backend) { if (backend == "cpp"sv) return HTMLParserBackend::Cpp; if (backend == "rust"sv) return HTMLParserBackend::Rust; return {}; } StringView html_parser_backend_name(HTMLParserBackend backend) { switch (backend) { case HTMLParserBackend::Cpp: return "cpp"sv; case HTMLParserBackend::Rust: return "rust"sv; } VERIFY_NOT_REACHED(); } HTMLParserBackend default_html_parser_backend() { static HTMLParserBackend s_backend = [] { auto* backend = getenv("LIBWEB_HTML_PARSER"); if (!backend) return HTMLParserBackend::Cpp; if (auto parsed_backend = html_parser_backend_from_string(StringView { backend, strlen(backend) }); parsed_backend.has_value()) return parsed_backend.value(); dbgln("Unknown LIBWEB_HTML_PARSER value '{}'; using cpp", backend); return HTMLParserBackend::Cpp; }(); return s_backend; } static Vector const s_quirks_public_ids = { "+//Silmaril//dtd html Pro v0r11 19970101//"sv, "-//AS//DTD HTML 3.0 asWedit + extensions//"sv, "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"sv, "-//IETF//DTD HTML 2.0 Level 1//"sv, "-//IETF//DTD HTML 2.0 Level 2//"sv, "-//IETF//DTD HTML 2.0 Strict Level 1//"sv, "-//IETF//DTD HTML 2.0 Strict Level 2//"sv, "-//IETF//DTD HTML 2.0 Strict//"sv, "-//IETF//DTD HTML 2.0//"sv, "-//IETF//DTD HTML 2.1E//"sv, "-//IETF//DTD HTML 3.0//"sv, "-//IETF//DTD HTML 3.2 Final//"sv, "-//IETF//DTD HTML 3.2//"sv, "-//IETF//DTD HTML 3//"sv, "-//IETF//DTD HTML Level 0//"sv, "-//IETF//DTD HTML Level 1//"sv, "-//IETF//DTD HTML Level 2//"sv, "-//IETF//DTD HTML Level 3//"sv, "-//IETF//DTD HTML Strict Level 0//"sv, "-//IETF//DTD HTML Strict Level 1//"sv, "-//IETF//DTD HTML Strict Level 2//"sv, "-//IETF//DTD HTML Strict Level 3//"sv, "-//IETF//DTD HTML Strict//"sv, "-//IETF//DTD HTML//"sv, "-//Metrius//DTD Metrius Presentational//"sv, "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"sv, "-//Microsoft//DTD Internet Explorer 2.0 HTML//"sv, "-//Microsoft//DTD Internet Explorer 2.0 Tables//"sv, "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"sv, "-//Microsoft//DTD Internet Explorer 3.0 HTML//"sv, "-//Microsoft//DTD Internet Explorer 3.0 Tables//"sv, "-//Netscape Comm. Corp.//DTD HTML//"sv, "-//Netscape Comm. Corp.//DTD Strict HTML//"sv, "-//O'Reilly and Associates//DTD HTML 2.0//"sv, "-//O'Reilly and Associates//DTD HTML Extended 1.0//"sv, "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"sv, "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"sv, "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//"sv, "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//"sv, "-//Spyglass//DTD HTML 2.0 Extended//"sv, "-//Sun Microsystems Corp.//DTD HotJava HTML//"sv, "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"sv, "-//W3C//DTD HTML 3 1995-03-24//"sv, "-//W3C//DTD HTML 3.2 Draft//"sv, "-//W3C//DTD HTML 3.2 Final//"sv, "-//W3C//DTD HTML 3.2//"sv, "-//W3C//DTD HTML 3.2S Draft//"sv, "-//W3C//DTD HTML 4.0 Frameset//"sv, "-//W3C//DTD HTML 4.0 Transitional//"sv, "-//W3C//DTD HTML Experimental 19960712//"sv, "-//W3C//DTD HTML Experimental 970421//"sv, "-//W3C//DTD W3 HTML//"sv, "-//W3O//DTD W3 HTML 3.0//"sv, "-//WebTechs//DTD Mozilla HTML 2.0//"sv, "-//WebTechs//DTD Mozilla HTML//"sv, }; // https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point static bool is_mathml_text_integration_point(DOM::Element const& element) { // A node is a MathML text integration point if it is one of the following elements: // - A MathML mi element // - A MathML mo element // - A MathML mn element // - A MathML ms element // - A MathML mtext element return element.local_name().is_one_of(MathML::TagNames::mi, MathML::TagNames::mo, MathML::TagNames::mn, MathML::TagNames::ms, MathML::TagNames::mtext); } // https://html.spec.whatwg.org/multipage/parsing.html#html-integration-point static bool is_html_integration_point(DOM::Element const& element) { // A node is an HTML integration point if it is one of the following elements: // - A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html" // - A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml" if (element.namespace_uri() == Namespace::MathML && element.local_name() == MathML::TagNames::annotation_xml) { auto encoding = element.attribute("encoding"_fly_string); if (encoding.has_value() && (encoding->equals_ignoring_ascii_case("text/html"sv) || encoding->equals_ignoring_ascii_case("application/xhtml+xml"sv))) return true; } // - An SVG foreignObject element // - An SVG desc element // - An SVG title element if (element.namespace_uri() == Namespace::SVG && element.local_name().is_one_of(SVG::TagNames::foreignObject, SVG::TagNames::desc, SVG::TagNames::title)) { return true; } return false; } HTMLParser::HTMLParser(DOM::Document& document, ParserScriptingMode scripting_mode, StringView input, StringView encoding, HTMLParserBackend backend) : m_tokenizer(input, encoding) , m_backend(backend) , m_scripting_mode(scripting_mode) , m_document(document) { if (m_backend == HTMLParserBackend::Rust) m_rust_parser = rust_html_parser_create(); m_tokenizer.set_parser({}, *this); m_document->set_parser({}, *this); m_stack_of_open_elements.set_on_element_popped([this](DOM::Element& element) { handle_element_popped(element); }); auto standardized_encoding = TextCodec::get_standardized_encoding(encoding); VERIFY(standardized_encoding.has_value()); m_document->set_encoding(MUST(String::from_utf8(standardized_encoding.value()))); } HTMLParser::HTMLParser(DOM::Document& document, ParserScriptingMode scripting_mode, ScriptCreatedParser script_created, HTMLParserBackend backend) : m_backend(backend) , m_scripting_mode(scripting_mode) , m_script_created(script_created == ScriptCreatedParser::Yes) , m_document(document) { if (m_backend == HTMLParserBackend::Rust) m_rust_parser = rust_html_parser_create(); m_document->set_parser({}, *this); m_tokenizer.set_parser({}, *this); m_stack_of_open_elements.set_on_element_popped([this](DOM::Element& element) { handle_element_popped(element); }); } HTMLParser::~HTMLParser() = default; void HTMLParser::finalize() { Base::finalize(); if (m_rust_parser) { rust_html_parser_destroy(m_rust_parser); m_rust_parser = nullptr; } } void HTMLParser::visit_edges(Cell::Visitor& visitor) { Base::visit_edges(visitor); visitor.visit(m_document); visitor.visit(m_head_element); visitor.visit(m_form_element); visitor.visit(m_context_element); visitor.visit(m_active_speculative_html_parser); visitor.visit(m_character_insertion_node); m_stack_of_open_elements.visit_edges(visitor); m_list_of_active_formatting_elements.visit_edges(visitor); m_tokenizer.visit_edges(visitor); if (m_rust_parser) rust_html_parser_visit_edges(m_rust_parser, &visitor); } void HTMLParser::initialize(JS::Realm& realm) { Base::initialize(realm); } void HTMLParser::run(HTMLTokenizer::StopAtInsertionPoint stop_at_insertion_point) { if (m_backend == HTMLParserBackend::Rust) { m_stop_parsing = false; for (;;) { if (m_parser_pause_flag) break; auto result = rust_html_parser_run_document( m_rust_parser, m_tokenizer.ffi_handle({}), this, m_scripting_mode != ParserScriptingMode::Disabled, stop_at_insertion_point == HTMLTokenizer::StopAtInsertionPoint::Yes); if (result == RustFfiHtmlParserRunResult::Ok) break; if (result == RustFfiHtmlParserRunResult::ExecuteScript) { auto script = rust_html_parser_take_pending_script(m_rust_parser); VERIFY(script); process_script_end_tag_from_rust_parser(as(node_from_html_parser_ffi(script))); continue; } if (result == RustFfiHtmlParserRunResult::ExecuteSvgScript) { auto script = rust_html_parser_take_pending_svg_script(m_rust_parser); VERIFY(script); if (process_svg_script_end_tag_from_rust_parser(as(node_from_html_parser_ffi(script)))) break; continue; } VERIFY_NOT_REACHED(); } m_tokenizer.parser_did_run({}); return; } m_stop_parsing = false; for (;;) { if (m_parser_pause_flag) break; auto optional_token = m_tokenizer.next_token(stop_at_insertion_point); if (!optional_token.has_value()) break; auto& token = optional_token.value(); dbgln_if(HTML_PARSER_DEBUG, "[{}] {}", insertion_mode_name(), token.to_string()); if (m_next_line_feed_can_be_ignored) { m_next_line_feed_can_be_ignored = false; if (token.is_character() && token.code_point() == '\n') { continue; } } // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction-dispatcher // As each token is emitted from the tokenizer, the user agent must follow the appropriate steps from the following list, known as the tree construction dispatcher: if (m_stack_of_open_elements.is_empty() || adjusted_current_node()->namespace_uri() == Namespace::HTML || (is_mathml_text_integration_point(*adjusted_current_node()) && token.is_start_tag() && token.tag_name() != MathML::TagNames::mglyph && token.tag_name() != MathML::TagNames::malignmark) || (is_mathml_text_integration_point(*adjusted_current_node()) && token.is_character()) || (adjusted_current_node()->namespace_uri() == Namespace::MathML && adjusted_current_node()->local_name() == MathML::TagNames::annotation_xml && token.is_start_tag() && token.tag_name() == SVG::TagNames::svg) || (is_html_integration_point(*adjusted_current_node()) && (token.is_start_tag() || token.is_character())) || token.is_end_of_file()) { // -> If the stack of open elements is empty // -> If the adjusted current node is an element in the HTML namespace // -> If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark" // -> If the adjusted current node is a MathML text integration point and the token is a character token // -> If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg" // -> If the adjusted current node is an HTML integration point and the token is a start tag // -> If the adjusted current node is an HTML integration point and the token is a character token // -> If the token is an end-of-file token // Process the token according to the rules given in the section corresponding to the current insertion mode in HTML content. process_using_the_rules_for(m_insertion_mode, token); } else { // -> Otherwise // Process the token according to the rules given in the section for parsing tokens in foreign content. process_using_the_rules_for_foreign_content(token); } if (token.is_end_of_file() && m_tokenizer.is_eof_inserted()) break; if (m_stop_parsing) { dbgln_if(HTML_PARSER_DEBUG, "Stop parsing{}! :^)", m_parsing_fragment ? " fragment" : ""); break; } } flush_character_insertions(); m_tokenizer.parser_did_run({}); } void HTMLParser::run(URL::URL const& url, HTMLTokenizer::StopAtInsertionPoint stop_at_insertion_point) { m_document->set_url(url); m_document->set_source(m_tokenizer.source()); run_until_completion(stop_at_insertion_point); } void HTMLParser::pop_all_open_elements() { if (m_backend == HTMLParserBackend::Rust) { rust_html_parser_pop_all_open_elements(m_rust_parser); return; } while (!m_stack_of_open_elements.is_empty()) (void)m_stack_of_open_elements.pop(); } void HTMLParser::configure_element_created_by_rust_parser(DOM::Element& element) { if (element.local_name() == HTML::TagNames::link && element.namespace_uri() == Namespace::HTML) { // AD-HOC: Let elements know which document they were originally parsed for. // This is used for the render-blocking logic. auto& link_element = as(element); link_element.set_parser_document({}, document()); link_element.set_was_enabled_when_created_by_parser({}, !element.has_attribute(HTML::AttributeNames::disabled)); return; } if (element.local_name() != HTML::TagNames::script || element.namespace_uri() != Namespace::HTML) return; auto& script_element = as(element); if (m_scripting_mode != ParserScriptingMode::Fragment) script_element.set_parser_document(Badge {}, document()); script_element.set_force_async(Badge {}, false); if (m_scripting_mode == ParserScriptingMode::Inert) script_element.set_already_started(Badge {}, true); } GC::Ref HTMLParser::create_element_for_rust_parser(HTMLToken const& token, Optional const& namespace_, DOM::Node& intended_parent, bool had_duplicate_attribute, GC::Ptr form_element, bool has_template_element_on_stack) { TemporaryChange> suppress_cpp_form_element { m_form_element, {} }; auto element = create_element_for(token, namespace_, intended_parent); configure_element_created_by_rust_parser(element); // AD-HOC: See AD-HOC comment on Element.m_had_duplicate_attribute_during_tokenization about why this is done. if (had_duplicate_attribute) element->set_had_duplicate_attribute_during_tokenization({}); if (form_element && !has_template_element_on_stack) { auto* html_element = as_if(*element); if (html_element && html_element->is_form_associated_element() && !html_element->is_form_associated_custom_element()) { if ((!html_element->is_listed() || !html_element->has_attribute(HTML::AttributeNames::form)) && &intended_parent.root() == &form_element->root()) { html_element->set_form(form_element.ptr()); html_element->set_parser_inserted({}); } } } return element; } bool HTMLParser::process_script_end_tag_from_rust_parser(HTMLScriptElement& script) { // If the active speculative HTML parser is null and the JavaScript execution context stack is empty, then perform a microtask checkpoint. // The active speculative HTML parser is null here; start/stop are paired around the spin_until below. auto& vm = main_thread_event_loop().vm(); if (!vm.has_running_execution_context()) perform_a_microtask_checkpoint(); // Let the old insertion point have the same value as the current insertion point. m_tokenizer.store_old_insertion_point(); // Let the insertion point be just before the next input character. m_tokenizer.update_insertion_point(); // Increment the parser's script nesting level by one. increment_script_nesting_level(); // https://w3c.github.io/trusted-types/dist/spec/#setting-slot-values-from-parser // Set script’s script text value to its child text content. script.set_string_text(script.child_text_content()); // If the active speculative HTML parser is null, then prepare the script element script. // This might cause some script to execute, which might cause new characters to be inserted into the tokenizer, // and might cause the tokenizer to output more tokens, resulting in a reentrant invocation of the parser. // The active speculative HTML parser is null here (see above). script.prepare_script(Badge {}); // Decrement the parser's script nesting level by one. decrement_script_nesting_level(); // If the parser's script nesting level is zero, then set the parser pause flag to false. if (script_nesting_level() == 0) m_parser_pause_flag = false; // Let the insertion point have the value of the old insertion point. m_tokenizer.restore_old_insertion_point(); // At this stage, if the pending parsing-blocking script is not null, then: if (document().pending_parsing_blocking_script()) { // -> If the script nesting level is not zero: if (script_nesting_level() != 0) { // Set the parser pause flag to true, m_parser_pause_flag = true; // and abort the processing of any nested invocations of the tokenizer, yielding control back to the caller. // (Tokenization will resume when the caller returns to the "outer" tree construction stage.) return true; } // -> Otherwise: // The spec's "While the pending parsing-blocking script is not null" loop and the contained "spin the event // loop" step are implemented asynchronously: pause the parser, schedule a resume check, and yield back to // the caller. The remaining steps (4-13) run from resume_after_parser_blocking_script when the script is // ready. // 3. Start the speculative HTML parser for this instance of the HTML parser. start_the_speculative_html_parser(); m_parser_pause_flag = true; schedule_resume_check(); } return m_parser_pause_flag; } void HTMLParser::prepare_svg_script_for_rust_parser(SVG::SVGScriptElement& script, size_t source_line_number) { // AD-HOC: For SVG script elements, set the parser-inserted flag before the element is inserted into the DOM. // Otherwise inserted()/attribute_changed() would invoke process_the_script_element() with the flag still unset // and bypass the parser-blocking fetch handling. // // https://html.spec.whatwg.org/multipage/parsing.html#scripting-mode // The Fragment scripting mode treats parser-inserted scripts as if they were not parser-inserted, allowing, for // example, executing scripts when applying a fragment created by createContextualFragment(). if (m_scripting_mode != ParserScriptingMode::Fragment) script.set_parser_inserted({}); script.set_source_line_number({}, source_line_number); } void HTMLParser::set_script_source_line_from_rust_parser(DOM::Element& element, size_t source_line_number) { if (auto* html_script_element = as_if(element)) { html_script_element->set_source_line_number({}, source_line_number); return; } if (auto* svg_script_element = as_if(element)) svg_script_element->set_source_line_number({}, source_line_number); } void HTMLParser::mark_script_already_started_from_rust_parser(HTMLScriptElement& script) { script.set_already_started(Badge {}, true); } void HTMLParser::stop_parsing_from_rust_parser() { stop_parsing(); } bool HTMLParser::process_svg_script_end_tag_from_rust_parser(SVG::SVGScriptElement& script) { // Let the old insertion point have the same value as the current insertion point. m_tokenizer.store_old_insertion_point(); // Let the insertion point be just before the next input character. m_tokenizer.update_insertion_point(); // Increment the parser's script nesting level by one. increment_script_nesting_level(); // Set the parser pause flag to true. m_parser_pause_flag = true; // Non-standard: Make sure the end tag if the executed script set up a new pending blocking script (e.g. via // document.write). run(); if (m_parser_pause_flag) return; invoke_post_parse_action(); } void HTMLParser::invoke_post_parse_action() { if (auto action = exchange(m_post_parse_action, nullptr)) action(); } void HTMLParser::increment_script_nesting_level() { ++m_script_nesting_level; } void HTMLParser::decrement_script_nesting_level() { VERIFY(m_script_nesting_level); --m_script_nesting_level; } // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata void HTMLParser::handle_text(HTMLToken& token) { // -> A character token if (token.is_character()) { // Insert the token's character. insert_character(token.code_point()); return; } // -> An end-of-file token if (token.is_end_of_file()) { // Parse error. log_parse_error(); // If the current node is a script element, then set its already started to true. if (current_node()->local_name() == HTML::TagNames::script) as(*current_node()).set_already_started(Badge {}, true); // Pop the current node off the stack of open elements. (void)m_stack_of_open_elements.pop(); // Switch the insertion mode to the original insertion mode and reprocess the token. m_insertion_mode = m_original_insertion_mode; process_using_the_rules_for(m_insertion_mode, token); return; } // -> An end tag whose tag name is "script" if (token.is_end_tag() && token.tag_name() == HTML::TagNames::script) { // Non-standard: Make sure the