2020-05-24 00:14:23 +02:00
|
|
|
/*
|
2024-10-04 13:19:50 +02:00
|
|
|
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
|
2020-05-24 00:14:23 +02:00
|
|
|
*
|
2021-04-22 01:24:48 -07:00
|
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
2020-05-24 00:14:23 +02:00
|
|
|
*/
|
|
|
|
|
2024-09-30 17:52:30 -06:00
|
|
|
#include <AK/HashTable.h>
|
2020-07-28 19:18:23 +02:00
|
|
|
#include <LibWeb/HTML/Parser/HTMLToken.h>
|
2020-05-24 00:14:23 +02:00
|
|
|
|
2020-07-28 18:20:36 +02:00
|
|
|
namespace Web::HTML {
|
2020-05-24 00:14:23 +02:00
|
|
|
|
2023-11-05 11:45:55 +13:00
|
|
|
String HTMLToken::to_string() const
{
    // Builds a debugging description of this token: the type name first, then the
    // type-specific payload (if any), and finally the source position(s) after an '@'.
    auto const token_type = type();
    StringBuilder description;

    // Every token starts with its type name; DOCTYPE tokens also print their name right away.
    switch (token_type) {
    case Type::DOCTYPE:
        description.append("DOCTYPE"sv);
        description.append(" { name: '"sv);
        description.append(doctype_data().name);
        description.append("' }"sv);
        break;
    case Type::StartTag:
        description.append("StartTag"sv);
        break;
    case Type::EndTag:
        description.append("EndTag"sv);
        break;
    case Type::Comment:
        description.append("Comment"sv);
        break;
    case Type::Character:
        description.append("Character"sv);
        break;
    case Type::EndOfFile:
        description.append("EndOfFile"sv);
        break;
    case Type::Invalid:
        VERIFY_NOT_REACHED();
    }

    // Start and end tags share one payload format: the tag name followed by each attribute
    // rendered as local_name="value" and separated by spaces.
    bool const is_tag_token = token_type == Type::StartTag || token_type == Type::EndTag;
    if (is_tag_token) {
        description.append(" { name: '"sv);
        description.append(tag_name());
        description.append("', { "sv);
        for_each_attribute([&description](auto& attribute) {
            description.append(attribute.local_name);
            description.append("=\""sv);
            description.append(attribute.value);
            description.append("\" "sv);
            return IterationDecision::Continue;
        });
        description.append("} }"sv);
    }

    // Comment tokens print their comment text.
    if (is_comment()) {
        description.append(" { data: '"sv);
        description.append(comment());
        description.append("' }"sv);
    }

    // Character tokens print their single code point.
    if (is_character()) {
        description.append(" { data: '"sv);
        description.append_code_point(code_point());
        description.append("' }"sv);
    }

    // Character tokens report only their start position; every other token reports a
    // start-end range.
    if (token_type == Type::Character)
        description.appendff("@{}:{}", m_start_position.line, m_start_position.column);
    else
        description.appendff("@{}:{}-{}:{}", m_start_position.line, m_start_position.column, m_end_position.line, m_end_position.column);

    return MUST(description.to_string());
}
|
|
|
|
|
2024-09-30 17:52:30 -06:00
|
|
|
void HTMLToken::normalize_attributes()
|
|
|
|
{
|
|
|
|
// From AttributeNameState: https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
|
|
|
|
//
|
|
|
|
// When the user agent leaves the attribute name state (and before emitting the tag token, if appropriate),
|
|
|
|
// the complete attribute's name must be compared to the other attributes on the same token;
|
|
|
|
// if there is already an attribute on the token with the exact same name, then this is a duplicate-attribute
|
|
|
|
// parse error and the new attribute must be removed from the token.
|
|
|
|
|
|
|
|
// NOTE: If an attribute is so removed from a token, it, and the value that gets associated with it, if any,
|
|
|
|
// are never subsequently used by the parser, and are therefore effectively discarded. Removing the attribute
|
|
|
|
// in this way does not change its status as the "current attribute" for the purposes of the tokenizer, however.
|
|
|
|
|
|
|
|
HashTable<FlyString> seen_attributes;
|
|
|
|
auto* ptr = tag_attributes();
|
|
|
|
if (!ptr)
|
|
|
|
return;
|
|
|
|
auto& tag_attributes = *ptr;
|
|
|
|
for (size_t i = 0; i < tag_attributes.size(); ++i) {
|
|
|
|
auto& attribute = tag_attributes[i];
|
|
|
|
if (seen_attributes.set(attribute.local_name, AK::HashSetExistingEntryBehavior::Keep) == AK::HashSetResult::KeptExistingEntry) {
|
|
|
|
// This is a duplicate attribute, remove it.
|
|
|
|
tag_attributes.remove(i);
|
|
|
|
--i;
|
2024-12-02 12:33:52 +00:00
|
|
|
m_had_duplicate_attribute = true;
|
2024-09-30 17:52:30 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-24 00:14:23 +02:00
|
|
|
}
|