mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2026-04-19 10:20:22 +00:00
For XHTML documents, resolve named character entities (e.g., ) using the HTML entity table via a getEntity SAX callback. This avoids parsing a large embedded DTD on every document and matches the approach used by Blink and WebKit. This also removes the now-unused DTD infrastructure: - Remove resolve_external_resource callback from Parser::Options - Remove resolve_xml_resource() function and its ~60KB embedded DTD - Remove all call sites passing the unused callback
539 lines
18 KiB
C++
539 lines
18 KiB
C++
/*
|
|
* Copyright (c) 2026, the Ladybird developers.
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#include <AK/StringBuilder.h>
|
|
#include <LibXML/Parser/Parser.h>
|
|
|
|
#include <libxml/encoding.h>
|
|
#include <libxml/parser.h>
|
|
#include <libxml/parserInternals.h>
|
|
#include <libxml/xmlerror.h>
|
|
|
|
namespace XML {
|
|
|
|
static constexpr int MAX_XML_TREE_DEPTH = 5000;
|
|
|
|
struct ParserContext {
|
|
Listener* listener { nullptr };
|
|
Optional<ParseError> error;
|
|
bool document_ended { false };
|
|
|
|
OwnPtr<Node> root_node;
|
|
Node* current_node { nullptr };
|
|
Optional<Doctype> doctype;
|
|
HashMap<Name, ByteString> processing_instructions;
|
|
Version version { Version::Version11 };
|
|
|
|
Vector<ParseError> parse_errors;
|
|
|
|
Parser::Options const* options { nullptr };
|
|
bool is_xhtml_document { false };
|
|
int depth { 0 };
|
|
};
|
|
|
|
static ByteString xml_char_to_byte_string(xmlChar const* str)
|
|
{
|
|
if (!str)
|
|
return {};
|
|
return ByteString(reinterpret_cast<char const*>(str));
|
|
}
|
|
|
|
static ByteString xml_char_to_byte_string(xmlChar const* str, int len)
|
|
{
|
|
if (!str || len <= 0)
|
|
return {};
|
|
return ByteString(StringView(reinterpret_cast<char const*>(str), static_cast<size_t>(len)));
|
|
}
|
|
|
|
static StringView xml_char_to_string_view(xmlChar const* str)
|
|
{
|
|
if (!str)
|
|
return {};
|
|
return StringView(reinterpret_cast<char const*>(str), strlen(reinterpret_cast<char const*>(str)));
|
|
}
|
|
|
|
static bool is_known_xhtml_public_id(StringView public_id)
|
|
{
|
|
return public_id.is_one_of(
|
|
"-//W3C//DTD XHTML 1.0 Transitional//EN"sv,
|
|
"-//W3C//DTD XHTML 1.1//EN"sv,
|
|
"-//W3C//DTD XHTML 1.0 Strict//EN"sv,
|
|
"-//W3C//DTD XHTML 1.0 Frameset//EN"sv,
|
|
"-//W3C//DTD XHTML Basic 1.0//EN"sv,
|
|
"-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN"sv,
|
|
"-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"sv,
|
|
"-//W3C//DTD MathML 2.0//EN"sv,
|
|
"-//WAPFORUM//DTD XHTML Mobile 1.0//EN"sv,
|
|
"-//WAPFORUM//DTD XHTML Mobile 1.1//EN"sv,
|
|
"-//WAPFORUM//DTD XHTML Mobile 1.2//EN"sv);
|
|
}
|
|
|
|
static void external_subset_handler(void* ctx, xmlChar const*, xmlChar const* external_id, xmlChar const*)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context || !external_id)
|
|
return;
|
|
|
|
auto public_id = xml_char_to_string_view(external_id);
|
|
if (is_known_xhtml_public_id(public_id))
|
|
context->is_xhtml_document = true;
|
|
}
|
|
|
|
static xmlEntity s_xhtml_entity_result;
|
|
static char s_xhtml_entity_utf8_buffer[32];
|
|
|
|
static xmlEntityPtr get_entity_handler(void* ctx, xmlChar const* name)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
|
|
auto* predefined = xmlGetPredefinedEntity(name);
|
|
if (predefined)
|
|
return predefined;
|
|
|
|
if (parser_ctx->myDoc) {
|
|
auto* doc_entity = xmlGetDocEntity(parser_ctx->myDoc, name);
|
|
if (doc_entity)
|
|
return doc_entity;
|
|
}
|
|
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context || !context->is_xhtml_document)
|
|
return nullptr;
|
|
|
|
// For XHTML documents, resolve named character entities (e.g., ) using the
|
|
// HTML entity table. This avoids parsing a large embedded DTD on every document
|
|
// and matches the approach used by Blink and WebKit.
|
|
if (!context->options || !context->options->resolve_named_html_entity)
|
|
return nullptr;
|
|
|
|
auto entity_name = xml_char_to_string_view(name);
|
|
auto resolved = context->options->resolve_named_html_entity(entity_name);
|
|
if (!resolved.has_value())
|
|
return nullptr;
|
|
|
|
auto utf8_bytes = resolved->bytes_as_string_view();
|
|
if (utf8_bytes.length() >= sizeof(s_xhtml_entity_utf8_buffer))
|
|
return nullptr;
|
|
|
|
(void)utf8_bytes.copy_characters_to_buffer(s_xhtml_entity_utf8_buffer, sizeof(s_xhtml_entity_utf8_buffer));
|
|
|
|
s_xhtml_entity_result = {};
|
|
s_xhtml_entity_result.type = XML_ENTITY_DECL;
|
|
s_xhtml_entity_result.name = name;
|
|
s_xhtml_entity_result.content = reinterpret_cast<xmlChar*>(s_xhtml_entity_utf8_buffer);
|
|
s_xhtml_entity_result.length = static_cast<int>(utf8_bytes.length());
|
|
s_xhtml_entity_result.etype = XML_INTERNAL_PREDEFINED_ENTITY;
|
|
|
|
return &s_xhtml_entity_result;
|
|
}
|
|
|
|
static void start_document_handler(void* ctx)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context)
|
|
return;
|
|
|
|
if (parser_ctx->version) {
|
|
auto version_str = xml_char_to_byte_string(parser_ctx->version);
|
|
if (version_str == "1.0"sv)
|
|
context->version = Version::Version10;
|
|
else
|
|
context->version = Version::Version11;
|
|
}
|
|
|
|
if (context->listener)
|
|
context->listener->document_start();
|
|
}
|
|
|
|
static void end_document_handler(void* ctx)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context)
|
|
return;
|
|
|
|
context->document_ended = true;
|
|
if (context->listener)
|
|
context->listener->document_end();
|
|
}
|
|
|
|
static void start_element_ns_handler(void* ctx, xmlChar const* localname, xmlChar const* prefix,
|
|
xmlChar const*, int nb_namespaces, xmlChar const** namespaces,
|
|
int nb_attributes, int nb_defaulted, xmlChar const** attributes)
|
|
{
|
|
(void)nb_defaulted;
|
|
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context)
|
|
return;
|
|
|
|
if (++context->depth > MAX_XML_TREE_DEPTH) {
|
|
size_t offset = 0;
|
|
if (parser_ctx->input && parser_ctx->input->cur && parser_ctx->input->base)
|
|
offset = static_cast<size_t>(parser_ctx->input->cur - parser_ctx->input->base);
|
|
|
|
ParseError parse_error {
|
|
.position = LineTrackingLexer::Position { .offset = offset },
|
|
.error = ByteString("Excessive node nesting."sv),
|
|
};
|
|
context->parse_errors.append(parse_error);
|
|
|
|
if (context->listener)
|
|
context->listener->error(parse_error);
|
|
|
|
xmlStopParser(parser_ctx);
|
|
return;
|
|
}
|
|
|
|
StringBuilder name_builder;
|
|
if (prefix) {
|
|
name_builder.append(xml_char_to_string_view(prefix));
|
|
name_builder.append(':');
|
|
}
|
|
name_builder.append(xml_char_to_string_view(localname));
|
|
auto name = name_builder.to_byte_string();
|
|
|
|
OrderedHashMap<Name, ByteString> attrs;
|
|
|
|
for (int i = 0; i < nb_namespaces; i++) {
|
|
auto* ns_prefix = namespaces[i * 2];
|
|
auto* ns_uri = namespaces[i * 2 + 1];
|
|
|
|
StringBuilder attr_name;
|
|
if (ns_prefix) {
|
|
attr_name.append("xmlns:"sv);
|
|
attr_name.append(xml_char_to_string_view(ns_prefix));
|
|
} else {
|
|
attr_name.append("xmlns"sv);
|
|
}
|
|
attrs.set(attr_name.to_byte_string(), xml_char_to_byte_string(ns_uri));
|
|
}
|
|
|
|
for (int i = 0; i < nb_attributes; i++) {
|
|
auto* attr_localname = attributes[i * 5 + 0];
|
|
auto* attr_prefix = attributes[i * 5 + 1];
|
|
auto* value_begin = attributes[i * 5 + 3];
|
|
auto* value_end = attributes[i * 5 + 4];
|
|
|
|
StringBuilder attr_name;
|
|
if (attr_prefix) {
|
|
attr_name.append(xml_char_to_string_view(attr_prefix));
|
|
attr_name.append(':');
|
|
}
|
|
attr_name.append(xml_char_to_string_view(attr_localname));
|
|
|
|
auto value_len = static_cast<int>(value_end - value_begin);
|
|
auto value = xml_char_to_byte_string(value_begin, value_len);
|
|
attrs.set(attr_name.to_byte_string(), value);
|
|
}
|
|
|
|
if (context->listener) {
|
|
context->listener->element_start(name, attrs);
|
|
} else {
|
|
auto element = adopt_own(*new Node {
|
|
.offset = {},
|
|
.content = Node::Element { name, move(attrs), {} },
|
|
.parent = context->current_node,
|
|
});
|
|
|
|
auto* element_ptr = element.ptr();
|
|
|
|
if (context->current_node) {
|
|
VERIFY(context->current_node->is_element());
|
|
context->current_node->content.get<Node::Element>().children.append(move(element));
|
|
} else {
|
|
context->root_node = move(element);
|
|
}
|
|
|
|
context->current_node = element_ptr;
|
|
}
|
|
}
|
|
|
|
static void end_element_ns_handler(void* ctx, xmlChar const* localname, xmlChar const* prefix, xmlChar const*)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context)
|
|
return;
|
|
|
|
--context->depth;
|
|
|
|
StringBuilder name_builder;
|
|
if (prefix) {
|
|
name_builder.append(xml_char_to_string_view(prefix));
|
|
name_builder.append(':');
|
|
}
|
|
name_builder.append(xml_char_to_string_view(localname));
|
|
auto name = name_builder.to_byte_string();
|
|
|
|
if (context->listener) {
|
|
context->listener->element_end(name);
|
|
} else if (context->current_node) {
|
|
context->current_node = context->current_node->parent;
|
|
}
|
|
}
|
|
|
|
static void characters_handler(void* ctx, xmlChar const* ch, int len)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context)
|
|
return;
|
|
|
|
auto text = StringView(reinterpret_cast<char const*>(ch), static_cast<size_t>(len));
|
|
|
|
if (context->listener) {
|
|
context->listener->text(text);
|
|
} else if (context->current_node && context->current_node->is_element()) {
|
|
auto& children = context->current_node->content.get<Node::Element>().children;
|
|
if (!children.is_empty() && children.last()->is_text()) {
|
|
children.last()->content.get<Node::Text>().builder.append(text);
|
|
} else {
|
|
Node::Text text_content;
|
|
text_content.builder.append(text);
|
|
auto text_node = adopt_own(*new Node {
|
|
.offset = {},
|
|
.content = move(text_content),
|
|
.parent = context->current_node,
|
|
});
|
|
children.append(move(text_node));
|
|
}
|
|
}
|
|
}
|
|
|
|
static void cdata_block_handler(void* ctx, xmlChar const* value, int len)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context)
|
|
return;
|
|
|
|
auto text = StringView(reinterpret_cast<char const*>(value), static_cast<size_t>(len));
|
|
|
|
if (context->listener) {
|
|
context->listener->cdata_section(text);
|
|
} else if (context->current_node && context->current_node->is_element()) {
|
|
auto& children = context->current_node->content.get<Node::Element>().children;
|
|
Node::Text text_content;
|
|
text_content.builder.append(text);
|
|
auto text_node = adopt_own(*new Node {
|
|
.offset = {},
|
|
.content = move(text_content),
|
|
.parent = context->current_node,
|
|
});
|
|
children.append(move(text_node));
|
|
}
|
|
}
|
|
|
|
static void comment_handler(void* ctx, xmlChar const* value)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context)
|
|
return;
|
|
|
|
auto comment_text = xml_char_to_byte_string(value);
|
|
|
|
if (context->listener) {
|
|
context->listener->comment(comment_text);
|
|
} else if (context->current_node && context->current_node->is_element()) {
|
|
auto& children = context->current_node->content.get<Node::Element>().children;
|
|
auto comment_node = adopt_own(*new Node {
|
|
.offset = {},
|
|
.content = Node::Comment { comment_text },
|
|
.parent = context->current_node,
|
|
});
|
|
children.append(move(comment_node));
|
|
}
|
|
}
|
|
|
|
static void processing_instruction_handler(void* ctx, xmlChar const* target, xmlChar const* data)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context)
|
|
return;
|
|
|
|
auto target_str = xml_char_to_byte_string(target);
|
|
auto data_str = xml_char_to_byte_string(data);
|
|
|
|
if (context->listener) {
|
|
context->listener->processing_instruction(target_str, data_str);
|
|
} else {
|
|
context->processing_instructions.set(target_str, data_str);
|
|
}
|
|
}
|
|
|
|
static void internal_subset_handler(void* ctx, xmlChar const* name, xmlChar const* external_id, xmlChar const* system_id)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context)
|
|
return;
|
|
|
|
Doctype doctype;
|
|
doctype.type = xml_char_to_byte_string(name);
|
|
|
|
if (external_id || system_id) {
|
|
ExternalID ext_id;
|
|
if (external_id)
|
|
ext_id.public_id = PublicID { xml_char_to_byte_string(external_id) };
|
|
ext_id.system_id = SystemID { xml_char_to_byte_string(system_id) };
|
|
doctype.external_id = move(ext_id);
|
|
}
|
|
|
|
context->doctype = move(doctype);
|
|
|
|
if (context->listener)
|
|
context->listener->set_doctype(context->doctype.value());
|
|
}
|
|
|
|
static void structured_error_handler(void* ctx, xmlError const* error)
|
|
{
|
|
auto* parser_ctx = static_cast<xmlParserCtxtPtr>(ctx);
|
|
auto* context = static_cast<ParserContext*>(parser_ctx->_private);
|
|
if (!context || !error)
|
|
return;
|
|
|
|
size_t offset = 0;
|
|
if (parser_ctx->input && parser_ctx->input->cur && parser_ctx->input->base)
|
|
offset = static_cast<size_t>(parser_ctx->input->cur - parser_ctx->input->base);
|
|
|
|
ParseError parse_error {
|
|
.position = LineTrackingLexer::Position {
|
|
.offset = offset,
|
|
.line = error->line > 0 ? static_cast<size_t>(error->line) : 0,
|
|
.column = error->int2 > 0 ? static_cast<size_t>(error->int2) : 0,
|
|
},
|
|
.error = ByteString(error->message ? StringView(error->message, strlen(error->message)).trim_whitespace() : "Unknown error"sv),
|
|
};
|
|
|
|
context->parse_errors.append(parse_error);
|
|
|
|
if (context->listener)
|
|
context->listener->error(parse_error);
|
|
|
|
if (!context->error.has_value())
|
|
context->error = move(parse_error);
|
|
}
|
|
|
|
static xmlSAXHandler create_sax_handler(bool preserve_comments, bool resolve_html_entities)
|
|
{
|
|
xmlSAXHandler handler = {};
|
|
handler.initialized = XML_SAX2_MAGIC;
|
|
handler.startDocument = start_document_handler;
|
|
handler.endDocument = end_document_handler;
|
|
handler.startElementNs = start_element_ns_handler;
|
|
handler.endElementNs = end_element_ns_handler;
|
|
handler.characters = characters_handler;
|
|
handler.cdataBlock = cdata_block_handler;
|
|
handler.processingInstruction = processing_instruction_handler;
|
|
handler.internalSubset = internal_subset_handler;
|
|
handler.serror = structured_error_handler;
|
|
if (preserve_comments)
|
|
handler.comment = comment_handler;
|
|
if (resolve_html_entities) {
|
|
handler.externalSubset = external_subset_handler;
|
|
handler.getEntity = get_entity_handler;
|
|
}
|
|
return handler;
|
|
}
|
|
|
|
ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener)
|
|
{
|
|
auto source_result = listener.set_source(ByteString(m_source));
|
|
if (source_result.is_error())
|
|
return ParseError { {}, ByteString("Failed to set source") };
|
|
|
|
ParserContext context;
|
|
context.listener = &listener;
|
|
context.options = &m_options;
|
|
|
|
bool resolve_html_entities = static_cast<bool>(m_options.resolve_named_html_entity);
|
|
auto sax_handler = create_sax_handler(m_options.preserve_comments, resolve_html_entities);
|
|
|
|
int options = XML_PARSE_NONET | XML_PARSE_NOWARNING;
|
|
if (!m_options.preserve_cdata)
|
|
options |= XML_PARSE_NOCDATA;
|
|
|
|
auto* parser_ctx = xmlCreatePushParserCtxt(&sax_handler, nullptr, nullptr, 0, nullptr);
|
|
if (!parser_ctx)
|
|
return ParseError { {}, ByteString("Failed to create parser context") };
|
|
|
|
parser_ctx->_private = &context;
|
|
xmlCtxtUseOptions(parser_ctx, options);
|
|
|
|
xmlSwitchEncoding(parser_ctx, XML_CHAR_ENCODING_UTF8);
|
|
|
|
auto result = xmlParseChunk(parser_ctx, m_source.characters_without_null_termination(), static_cast<int>(m_source.length()), 1);
|
|
|
|
bool well_formed = parser_ctx->wellFormed;
|
|
xmlFreeParserCtxt(parser_ctx);
|
|
|
|
m_parse_errors = move(context.parse_errors);
|
|
|
|
if (!context.document_ended)
|
|
listener.document_end();
|
|
|
|
if (context.error.has_value() && m_options.treat_errors_as_fatal)
|
|
return context.error.release_value();
|
|
|
|
if (result != 0 || !well_formed) {
|
|
if (!m_parse_errors.is_empty())
|
|
return m_parse_errors.first();
|
|
return ParseError { {}, ByteString("XML parsing failed") };
|
|
}
|
|
|
|
return {};
|
|
}
|
|
|
|
ErrorOr<Document, ParseError> Parser::parse()
|
|
{
|
|
ParserContext context;
|
|
context.options = &m_options;
|
|
|
|
bool resolve_html_entities = static_cast<bool>(m_options.resolve_named_html_entity);
|
|
auto sax_handler = create_sax_handler(m_options.preserve_comments, resolve_html_entities);
|
|
|
|
int options = XML_PARSE_NONET | XML_PARSE_NOWARNING;
|
|
if (!m_options.preserve_cdata)
|
|
options |= XML_PARSE_NOCDATA;
|
|
|
|
auto* parser_ctx = xmlCreatePushParserCtxt(&sax_handler, nullptr, nullptr, 0, nullptr);
|
|
if (!parser_ctx)
|
|
return ParseError { {}, ByteString("Failed to create parser context") };
|
|
|
|
parser_ctx->_private = &context;
|
|
xmlCtxtUseOptions(parser_ctx, options);
|
|
|
|
xmlSwitchEncoding(parser_ctx, XML_CHAR_ENCODING_UTF8);
|
|
|
|
auto result = xmlParseChunk(parser_ctx, m_source.characters_without_null_termination(), static_cast<int>(m_source.length()), 1);
|
|
|
|
bool well_formed = parser_ctx->wellFormed;
|
|
xmlFreeParserCtxt(parser_ctx);
|
|
|
|
m_parse_errors = move(context.parse_errors);
|
|
|
|
if (context.error.has_value() && m_options.treat_errors_as_fatal)
|
|
return context.error.release_value();
|
|
|
|
if (result != 0 || !well_formed) {
|
|
if (!m_parse_errors.is_empty())
|
|
return m_parse_errors.first();
|
|
return ParseError { {}, ByteString("XML parsing failed") };
|
|
}
|
|
|
|
if (!context.root_node)
|
|
return ParseError { {}, ByteString("No root element") };
|
|
|
|
return Document(context.root_node.release_nonnull(), move(context.doctype), move(context.processing_instructions), context.version);
|
|
}
|
|
|
|
}
|