/* * Copyright (c) 2026, the Ladybird developers. * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include namespace XML { static constexpr int MAX_XML_TREE_DEPTH = 5000; struct ParserContext { Listener* listener { nullptr }; Optional error; bool document_ended { false }; OwnPtr root_node; Node* current_node { nullptr }; Optional doctype; HashMap processing_instructions; Version version { Version::Version11 }; Vector parse_errors; Parser::Options const* options { nullptr }; bool is_xhtml_document { false }; int depth { 0 }; }; static ByteString xml_char_to_byte_string(xmlChar const* str) { if (!str) return {}; return ByteString(reinterpret_cast(str)); } static ByteString xml_char_to_byte_string(xmlChar const* str, int len) { if (!str || len <= 0) return {}; return ByteString(StringView(reinterpret_cast(str), static_cast(len))); } static StringView xml_char_to_string_view(xmlChar const* str) { if (!str) return {}; return StringView(reinterpret_cast(str), strlen(reinterpret_cast(str))); } static bool is_known_xhtml_public_id(StringView public_id) { return public_id.is_one_of( "-//W3C//DTD XHTML 1.0 Transitional//EN"sv, "-//W3C//DTD XHTML 1.1//EN"sv, "-//W3C//DTD XHTML 1.0 Strict//EN"sv, "-//W3C//DTD XHTML 1.0 Frameset//EN"sv, "-//W3C//DTD XHTML Basic 1.0//EN"sv, "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN"sv, "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"sv, "-//W3C//DTD MathML 2.0//EN"sv, "-//WAPFORUM//DTD XHTML Mobile 1.0//EN"sv, "-//WAPFORUM//DTD XHTML Mobile 1.1//EN"sv, "-//WAPFORUM//DTD XHTML Mobile 1.2//EN"sv); } static void external_subset_handler(void* ctx, xmlChar const*, xmlChar const* external_id, xmlChar const*) { auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context || !external_id) return; auto public_id = xml_char_to_string_view(external_id); if (is_known_xhtml_public_id(public_id)) context->is_xhtml_document = true; } static xmlEntity s_xhtml_entity_result; static char s_xhtml_entity_utf8_buffer[32]; static xmlEntityPtr get_entity_handler(void* ctx, xmlChar const* name) { auto* parser_ctx = static_cast(ctx); auto* predefined = xmlGetPredefinedEntity(name); if (predefined) return predefined; if (parser_ctx->myDoc) { auto* doc_entity = xmlGetDocEntity(parser_ctx->myDoc, name); if (doc_entity) return doc_entity; } auto* context = static_cast(parser_ctx->_private); if (!context || !context->is_xhtml_document) return nullptr; // For XHTML documents, resolve named character entities (e.g.,  ) using the // HTML entity table. This avoids parsing a large embedded DTD on every document // and matches the approach used by Blink and WebKit. if (!context->options || !context->options->resolve_named_html_entity) return nullptr; auto entity_name = xml_char_to_string_view(name); auto resolved = context->options->resolve_named_html_entity(entity_name); if (!resolved.has_value()) return nullptr; auto utf8_bytes = resolved->bytes_as_string_view(); if (utf8_bytes.length() >= sizeof(s_xhtml_entity_utf8_buffer)) return nullptr; (void)utf8_bytes.copy_characters_to_buffer(s_xhtml_entity_utf8_buffer, sizeof(s_xhtml_entity_utf8_buffer)); s_xhtml_entity_result = {}; s_xhtml_entity_result.type = XML_ENTITY_DECL; s_xhtml_entity_result.name = name; s_xhtml_entity_result.content = reinterpret_cast(s_xhtml_entity_utf8_buffer); s_xhtml_entity_result.length = static_cast(utf8_bytes.length()); s_xhtml_entity_result.etype = XML_INTERNAL_PREDEFINED_ENTITY; return &s_xhtml_entity_result; } static void start_document_handler(void* ctx) { auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context) return; if (parser_ctx->version) { auto version_str = xml_char_to_byte_string(parser_ctx->version); if (version_str == "1.0"sv) context->version = Version::Version10; else context->version = Version::Version11; } if (context->listener) context->listener->document_start(); } static void end_document_handler(void* ctx) { auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context) return; context->document_ended = true; if (context->listener) context->listener->document_end(); } static void start_element_ns_handler(void* ctx, xmlChar const* localname, xmlChar const* prefix, xmlChar const*, int nb_namespaces, xmlChar const** namespaces, int nb_attributes, int nb_defaulted, xmlChar const** attributes) { (void)nb_defaulted; auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context) return; if (++context->depth > MAX_XML_TREE_DEPTH) { size_t offset = 0; if (parser_ctx->input && parser_ctx->input->cur && parser_ctx->input->base) offset = static_cast(parser_ctx->input->cur - parser_ctx->input->base); ParseError parse_error { .position = LineTrackingLexer::Position { .offset = offset }, .error = ByteString("Excessive node nesting."sv), }; context->parse_errors.append(parse_error); if (context->listener) context->listener->error(parse_error); xmlStopParser(parser_ctx); return; } StringBuilder name_builder; if (prefix) { name_builder.append(xml_char_to_string_view(prefix)); name_builder.append(':'); } name_builder.append(xml_char_to_string_view(localname)); auto name = name_builder.to_byte_string(); OrderedHashMap attrs; for (int i = 0; i < nb_namespaces; i++) { auto* ns_prefix = namespaces[i * 2]; auto* ns_uri = namespaces[i * 2 + 1]; StringBuilder attr_name; if (ns_prefix) { attr_name.append("xmlns:"sv); attr_name.append(xml_char_to_string_view(ns_prefix)); } else { attr_name.append("xmlns"sv); } attrs.set(attr_name.to_byte_string(), xml_char_to_byte_string(ns_uri)); } for (int i = 0; i < nb_attributes; i++) { auto* attr_localname = attributes[i * 5 + 0]; auto* attr_prefix = attributes[i * 5 + 1]; auto* value_begin = attributes[i * 5 + 3]; auto* value_end = attributes[i * 5 + 4]; StringBuilder attr_name; if (attr_prefix) { attr_name.append(xml_char_to_string_view(attr_prefix)); attr_name.append(':'); } attr_name.append(xml_char_to_string_view(attr_localname)); auto value_len = static_cast(value_end - value_begin); auto value = xml_char_to_byte_string(value_begin, value_len); attrs.set(attr_name.to_byte_string(), value); } if (context->listener) { context->listener->element_start(name, attrs); } else { auto element = adopt_own(*new Node { .offset = {}, .content = Node::Element { name, move(attrs), {} }, .parent = context->current_node, }); auto* element_ptr = element.ptr(); if (context->current_node) { VERIFY(context->current_node->is_element()); context->current_node->content.get().children.append(move(element)); } else { context->root_node = move(element); } context->current_node = element_ptr; } } static void end_element_ns_handler(void* ctx, xmlChar const* localname, xmlChar const* prefix, xmlChar const*) { auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context) return; --context->depth; StringBuilder name_builder; if (prefix) { name_builder.append(xml_char_to_string_view(prefix)); name_builder.append(':'); } name_builder.append(xml_char_to_string_view(localname)); auto name = name_builder.to_byte_string(); if (context->listener) { context->listener->element_end(name); } else if (context->current_node) { context->current_node = context->current_node->parent; } } static void characters_handler(void* ctx, xmlChar const* ch, int len) { auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context) return; auto text = StringView(reinterpret_cast(ch), static_cast(len)); if (context->listener) { context->listener->text(text); } else if (context->current_node && context->current_node->is_element()) { auto& children = context->current_node->content.get().children; if (!children.is_empty() && children.last()->is_text()) { children.last()->content.get().builder.append(text); } else { Node::Text text_content; text_content.builder.append(text); auto text_node = adopt_own(*new Node { .offset = {}, .content = move(text_content), .parent = context->current_node, }); children.append(move(text_node)); } } } static void cdata_block_handler(void* ctx, xmlChar const* value, int len) { auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context) return; auto text = StringView(reinterpret_cast(value), static_cast(len)); if (context->listener) { context->listener->cdata_section(text); } else if (context->current_node && context->current_node->is_element()) { auto& children = context->current_node->content.get().children; Node::Text text_content; text_content.builder.append(text); auto text_node = adopt_own(*new Node { .offset = {}, .content = move(text_content), .parent = context->current_node, }); children.append(move(text_node)); } } static void comment_handler(void* ctx, xmlChar const* value) { auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context) return; auto comment_text = xml_char_to_byte_string(value); if (context->listener) { context->listener->comment(comment_text); } else if (context->current_node && context->current_node->is_element()) { auto& children = context->current_node->content.get().children; auto comment_node = adopt_own(*new Node { .offset = {}, .content = Node::Comment { comment_text }, .parent = context->current_node, }); children.append(move(comment_node)); } } static void processing_instruction_handler(void* ctx, xmlChar const* target, xmlChar const* data) { auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context) return; auto target_str = xml_char_to_byte_string(target); auto data_str = xml_char_to_byte_string(data); if (context->listener) { context->listener->processing_instruction(target_str, data_str); } else { context->processing_instructions.set(target_str, data_str); } } static void internal_subset_handler(void* ctx, xmlChar const* name, xmlChar const* external_id, xmlChar const* system_id) { auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context) return; Doctype doctype; doctype.type = xml_char_to_byte_string(name); if (external_id || system_id) { ExternalID ext_id; if (external_id) ext_id.public_id = PublicID { xml_char_to_byte_string(external_id) }; ext_id.system_id = SystemID { xml_char_to_byte_string(system_id) }; doctype.external_id = move(ext_id); } context->doctype = move(doctype); if (context->listener) context->listener->set_doctype(context->doctype.value()); } static void structured_error_handler(void* ctx, xmlError const* error) { auto* parser_ctx = static_cast(ctx); auto* context = static_cast(parser_ctx->_private); if (!context || !error) return; size_t offset = 0; if (parser_ctx->input && parser_ctx->input->cur && parser_ctx->input->base) offset = static_cast(parser_ctx->input->cur - parser_ctx->input->base); ParseError parse_error { .position = LineTrackingLexer::Position { .offset = offset, .line = error->line > 0 ? static_cast(error->line) : 0, .column = error->int2 > 0 ? static_cast(error->int2) : 0, }, .error = ByteString(error->message ? StringView(error->message, strlen(error->message)).trim_whitespace() : "Unknown error"sv), }; context->parse_errors.append(parse_error); if (context->listener) context->listener->error(parse_error); if (!context->error.has_value()) context->error = move(parse_error); } static xmlSAXHandler create_sax_handler(bool preserve_comments, bool resolve_html_entities) { xmlSAXHandler handler = {}; handler.initialized = XML_SAX2_MAGIC; handler.startDocument = start_document_handler; handler.endDocument = end_document_handler; handler.startElementNs = start_element_ns_handler; handler.endElementNs = end_element_ns_handler; handler.characters = characters_handler; handler.cdataBlock = cdata_block_handler; handler.processingInstruction = processing_instruction_handler; handler.internalSubset = internal_subset_handler; handler.serror = structured_error_handler; if (preserve_comments) handler.comment = comment_handler; if (resolve_html_entities) { handler.externalSubset = external_subset_handler; handler.getEntity = get_entity_handler; } return handler; } ErrorOr Parser::parse_with_listener(Listener& listener) { auto source_result = listener.set_source(ByteString(m_source)); if (source_result.is_error()) return ParseError { {}, ByteString("Failed to set source") }; ParserContext context; context.listener = &listener; context.options = &m_options; bool resolve_html_entities = static_cast(m_options.resolve_named_html_entity); auto sax_handler = create_sax_handler(m_options.preserve_comments, resolve_html_entities); int options = XML_PARSE_NONET | XML_PARSE_NOWARNING; if (!m_options.preserve_cdata) options |= XML_PARSE_NOCDATA; auto* parser_ctx = xmlCreatePushParserCtxt(&sax_handler, nullptr, nullptr, 0, nullptr); if (!parser_ctx) return ParseError { {}, ByteString("Failed to create parser context") }; parser_ctx->_private = &context; xmlCtxtUseOptions(parser_ctx, options); xmlSwitchEncoding(parser_ctx, XML_CHAR_ENCODING_UTF8); auto result = xmlParseChunk(parser_ctx, m_source.characters_without_null_termination(), static_cast(m_source.length()), 1); bool well_formed = parser_ctx->wellFormed; xmlFreeParserCtxt(parser_ctx); m_parse_errors = move(context.parse_errors); if (!context.document_ended) listener.document_end(); if (context.error.has_value() && m_options.treat_errors_as_fatal) return context.error.release_value(); if (result != 0 || !well_formed) { if (!m_parse_errors.is_empty()) return m_parse_errors.first(); return ParseError { {}, ByteString("XML parsing failed") }; } return {}; } ErrorOr Parser::parse() { ParserContext context; context.options = &m_options; bool resolve_html_entities = static_cast(m_options.resolve_named_html_entity); auto sax_handler = create_sax_handler(m_options.preserve_comments, resolve_html_entities); int options = XML_PARSE_NONET | XML_PARSE_NOWARNING; if (!m_options.preserve_cdata) options |= XML_PARSE_NOCDATA; auto* parser_ctx = xmlCreatePushParserCtxt(&sax_handler, nullptr, nullptr, 0, nullptr); if (!parser_ctx) return ParseError { {}, ByteString("Failed to create parser context") }; parser_ctx->_private = &context; xmlCtxtUseOptions(parser_ctx, options); xmlSwitchEncoding(parser_ctx, XML_CHAR_ENCODING_UTF8); auto result = xmlParseChunk(parser_ctx, m_source.characters_without_null_termination(), static_cast(m_source.length()), 1); bool well_formed = parser_ctx->wellFormed; xmlFreeParserCtxt(parser_ctx); m_parse_errors = move(context.parse_errors); if (context.error.has_value() && m_options.treat_errors_as_fatal) return context.error.release_value(); if (result != 0 || !well_formed) { if (!m_parse_errors.is_empty()) return m_parse_errors.first(); return ParseError { {}, ByteString("XML parsing failed") }; } if (!context.root_node) return ParseError { {}, ByteString("No root element") }; return Document(context.root_node.release_nonnull(), move(context.doctype), move(context.processing_instructions), context.version); } }