2020-05-22 21:46:13 +02:00
|
|
|
/*
|
2024-10-04 13:19:50 +02:00
|
|
|
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
|
2022-02-15 18:45:10 +00:00
|
|
|
* Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
|
2020-05-22 21:46:13 +02:00
|
|
|
*
|
2021-04-22 01:24:48 -07:00
|
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
2020-05-22 21:46:13 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
LibWeb: Replace the HTML tokenizer with Rust
Replace the C++ HTML tokenizer with a Rust implementation behind the
existing HTMLTokenizer API.
Keep the parser-facing integration points for streaming input,
insertion points, document.write(), EOF insertion, parser aborts,
speculative parser input, and last start tag tracking. The generated
FFI handle stays an implementation detail of HTMLTokenizer, so callers
keep a single tokenizer class.
Preserve duplicate attributes through FFI so C++ token normalization can
record the duplicate-attribute signal used by CSP nonce checks. Keep
bulk tag-name and attribute scans capped at the active insertion point
so streamed parser input is spliced at the right offset.
Use generated DAFSA tables for named character references and intern
common tag and attribute names to reduce FFI marshalling overhead. This
also fixes attribute name source positions, nested old insertion points,
and aborted fast-path handling.
TestHTMLTokenizer covers duplicate attributes and insertion points in
fast tag-name, attribute-name, and quoted-value scans. A CSP text test
covers duplicate nonce attributes on parser-created script elements.
The tokenizer dump fixtures still match, TestHTMLTokenizer passes, and
the full release test-web run passes with 6981 tests and 226 skipped.
2026-05-15 15:13:43 +02:00
|
|
|
#include <AK/String.h>
|
2020-05-22 21:46:13 +02:00
|
|
|
#include <AK/StringView.h>
|
|
|
|
|
#include <AK/Types.h>
|
2025-07-19 19:35:33 -07:00
|
|
|
#include <LibWeb/Export.h>
|
2020-07-28 19:18:23 +02:00
|
|
|
#include <LibWeb/HTML/Parser/HTMLToken.h>
|
2020-05-22 21:46:13 +02:00
|
|
|
|
LibWeb: Replace the HTML tokenizer with Rust
Replace the C++ HTML tokenizer with a Rust implementation behind the
existing HTMLTokenizer API.
Keep the parser-facing integration points for streaming input,
insertion points, document.write(), EOF insertion, parser aborts,
speculative parser input, and last start tag tracking. The generated
FFI handle stays an implementation detail of HTMLTokenizer, so callers
keep a single tokenizer class.
Preserve duplicate attributes through FFI so C++ token normalization can
record the duplicate-attribute signal used by CSP nonce checks. Keep
bulk tag-name and attribute scans capped at the active insertion point
so streamed parser input is spliced at the right offset.
Use generated DAFSA tables for named character references and intern
common tag and attribute names to reduce FFI marshalling overhead. This
also fixes attribute name source positions, nested old insertion points,
and aborted fast-path handling.
TestHTMLTokenizer covers duplicate attributes and insertion points in
fast tag-name, attribute-name, and quoted-value scans. A CSP text test
covers duplicate nonce attributes on parser-created script elements.
The tokenizer dump fixtures still match, TestHTMLTokenizer passes, and
the full release test-web run passes with 6981 tests and 226 skipped.
2026-05-15 15:13:43 +02:00
|
|
|
// Opaque handle type for the Rust tokenizer behind HTMLTokenizer.
// It is only forward-declared here and only ever used through a pointer in this header.
struct RustFfiTokenizerHandle;
|
|
|
|
|
|
2020-07-28 18:20:36 +02:00
|
|
|
namespace Web::HTML {
|
|
|
|
|
|
2026-05-16 13:47:49 +02:00
|
|
|
// Forward declaration: this header only names HTMLParser as the Badge<> parameter type
// of HTMLTokenizer::parser_did_run() and HTMLTokenizer::ffi_handle().
class HTMLParser;
|
|
|
|
|
|
2020-05-22 21:46:13 +02:00
|
|
|
// X-macro listing every tokenizer state, in declaration order.
// Usage: #define __ENUMERATE_TOKENIZER_STATE(state) to whatever should be generated per state,
// expand ENUMERATE_TOKENIZER_STATES, then #undef __ENUMERATE_TOKENIZER_STATE again.
// HTMLTokenizer expands it twice: once for the State enumerators and once for state_name().
// The order determines the enumerator values, so append or reorder with care.
#define ENUMERATE_TOKENIZER_STATES \
    __ENUMERATE_TOKENIZER_STATE(Data) \
    __ENUMERATE_TOKENIZER_STATE(RCDATA) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXT) \
    __ENUMERATE_TOKENIZER_STATE(ScriptData) \
    __ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \
    __ENUMERATE_TOKENIZER_STATE(TagOpen) \
    __ENUMERATE_TOKENIZER_STATE(EndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(TagName) \
    __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \
    __ENUMERATE_TOKENIZER_STATE(AttributeName) \
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \
    __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \
    __ENUMERATE_TOKENIZER_STATE(BogusComment) \
    __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \
    __ENUMERATE_TOKENIZER_STATE(CommentStart) \
    __ENUMERATE_TOKENIZER_STATE(CommentStartDash) \
    __ENUMERATE_TOKENIZER_STATE(Comment) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentEndDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentEnd) \
    __ENUMERATE_TOKENIZER_STATE(CommentEndBang) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPE) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \
    __ENUMERATE_TOKENIZER_STATE(CDATASection) \
    __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \
    __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \
    __ENUMERATE_TOKENIZER_STATE(CharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)
|
|
|
|
|
|
2025-07-19 19:35:33 -07:00
|
|
|
class WEB_API HTMLTokenizer {
|
2020-05-22 21:46:13 +02:00
|
|
|
public:
|
2022-02-19 15:58:21 +01:00
|
|
|
explicit HTMLTokenizer();
|
2023-12-16 17:49:34 +03:30
|
|
|
explicit HTMLTokenizer(StringView input, ByteString const& encoding);
|
LibWeb: Replace the HTML tokenizer with Rust
Replace the C++ HTML tokenizer with a Rust implementation behind the
existing HTMLTokenizer API.
Keep the parser-facing integration points for streaming input,
insertion points, document.write(), EOF insertion, parser aborts,
speculative parser input, and last start tag tracking. The generated
FFI handle stays an implementation detail of HTMLTokenizer, so callers
keep a single tokenizer class.
Preserve duplicate attributes through FFI so C++ token normalization can
record the duplicate-attribute signal used by CSP nonce checks. Keep
bulk tag-name and attribute scans capped at the active insertion point
so streamed parser input is spliced at the right offset.
Use generated DAFSA tables for named character references and intern
common tag and attribute names to reduce FFI marshalling overhead. This
also fixes attribute name source positions, nested old insertion points,
and aborted fast-path handling.
TestHTMLTokenizer covers duplicate attributes and insertion points in
fast tag-name, attribute-name, and quoted-value scans. A CSP text test
covers duplicate nonce attributes on parser-created script elements.
The tokenizer dump fixtures still match, TestHTMLTokenizer passes, and
the full release test-web run passes with 6981 tests and 226 skipped.
2026-05-15 15:13:43 +02:00
|
|
|
~HTMLTokenizer();
|
2020-05-22 21:46:13 +02:00
|
|
|
|
2020-05-24 20:24:43 +02:00
|
|
|
enum class State {
|
|
|
|
|
#define __ENUMERATE_TOKENIZER_STATE(state) state,
|
|
|
|
|
ENUMERATE_TOKENIZER_STATES
|
|
|
|
|
#undef __ENUMERATE_TOKENIZER_STATE
|
|
|
|
|
};
|
|
|
|
|
|
2024-02-18 12:45:53 -05:00
|
|
|
enum class StopAtInsertionPoint {
|
|
|
|
|
No,
|
|
|
|
|
Yes,
|
|
|
|
|
};
|
|
|
|
|
Optional<HTMLToken> next_token(StopAtInsertionPoint = StopAtInsertionPoint::No);
|
2020-05-22 21:46:13 +02:00
|
|
|
|
LibWeb: Replace the HTML tokenizer with Rust
Replace the C++ HTML tokenizer with a Rust implementation behind the
existing HTMLTokenizer API.
Keep the parser-facing integration points for streaming input,
insertion points, document.write(), EOF insertion, parser aborts,
speculative parser input, and last start tag tracking. The generated
FFI handle stays an implementation detail of HTMLTokenizer, so callers
keep a single tokenizer class.
Preserve duplicate attributes through FFI so C++ token normalization can
record the duplicate-attribute signal used by CSP nonce checks. Keep
bulk tag-name and attribute scans capped at the active insertion point
so streamed parser input is spliced at the right offset.
Use generated DAFSA tables for named character references and intern
common tag and attribute names to reduce FFI marshalling overhead. This
also fixes attribute name source positions, nested old insertion points,
and aborted fast-path handling.
TestHTMLTokenizer covers duplicate attributes and insertion points in
fast tag-name, attribute-name, and quoted-value scans. A CSP text test
covers duplicate nonce attributes on parser-created script elements.
The tokenizer dump fixtures still match, TestHTMLTokenizer passes, and
the full release test-web run passes with 6981 tests and 226 skipped.
2026-05-15 15:13:43 +02:00
|
|
|
void switch_to(State new_state);
|
2020-05-24 20:24:43 +02:00
|
|
|
|
2025-05-10 11:28:35 +02:00
|
|
|
auto const& source() const { return m_source; }
|
2020-05-28 12:35:19 +02:00
|
|
|
|
2026-04-26 03:21:39 +02:00
|
|
|
String unparsed_input() const;
|
|
|
|
|
|
2026-04-28 18:04:17 +02:00
|
|
|
void append_to_input_stream(StringView input);
|
|
|
|
|
void close_input_stream();
|
|
|
|
|
bool is_input_stream_closed() const { return m_input_stream_closed; }
|
2023-09-12 23:16:10 +12:00
|
|
|
void insert_input_at_insertion_point(StringView input);
|
2022-02-19 15:58:21 +01:00
|
|
|
void insert_eof();
|
|
|
|
|
|
LibWeb: Replace the HTML tokenizer with Rust
Replace the C++ HTML tokenizer with a Rust implementation behind the
existing HTMLTokenizer API.
Keep the parser-facing integration points for streaming input,
insertion points, document.write(), EOF insertion, parser aborts,
speculative parser input, and last start tag tracking. The generated
FFI handle stays an implementation detail of HTMLTokenizer, so callers
keep a single tokenizer class.
Preserve duplicate attributes through FFI so C++ token normalization can
record the duplicate-attribute signal used by CSP nonce checks. Keep
bulk tag-name and attribute scans capped at the active insertion point
so streamed parser input is spliced at the right offset.
Use generated DAFSA tables for named character references and intern
common tag and attribute names to reduce FFI marshalling overhead. This
also fixes attribute name source positions, nested old insertion points,
and aborted fast-path handling.
TestHTMLTokenizer covers duplicate attributes and insertion points in
fast tag-name, attribute-name, and quoted-value scans. A CSP text test
covers duplicate nonce attributes on parser-created script elements.
The tokenizer dump fixtures still match, TestHTMLTokenizer passes, and
the full release test-web run passes with 6981 tests and 226 skipped.
2026-05-15 15:13:43 +02:00
|
|
|
bool is_insertion_point_defined() const;
|
|
|
|
|
bool is_insertion_point_reached();
|
|
|
|
|
void undefine_insertion_point();
|
|
|
|
|
void store_insertion_point();
|
|
|
|
|
void restore_insertion_point();
|
|
|
|
|
void store_old_insertion_point() { store_insertion_point(); }
|
|
|
|
|
void restore_old_insertion_point() { restore_insertion_point(); }
|
|
|
|
|
void update_insertion_point();
|
2022-02-19 15:58:21 +01:00
|
|
|
|
2022-09-20 21:08:14 +02:00
|
|
|
// This permanently cuts off the tokenizer input stream.
|
LibWeb: Replace the HTML tokenizer with Rust
Replace the C++ HTML tokenizer with a Rust implementation behind the
existing HTMLTokenizer API.
Keep the parser-facing integration points for streaming input,
insertion points, document.write(), EOF insertion, parser aborts,
speculative parser input, and last start tag tracking. The generated
FFI handle stays an implementation detail of HTMLTokenizer, so callers
keep a single tokenizer class.
Preserve duplicate attributes through FFI so C++ token normalization can
record the duplicate-attribute signal used by CSP nonce checks. Keep
bulk tag-name and attribute scans capped at the active insertion point
so streamed parser input is spliced at the right offset.
Use generated DAFSA tables for named character references and intern
common tag and attribute names to reduce FFI marshalling overhead. This
also fixes attribute name source positions, nested old insertion points,
and aborted fast-path handling.
TestHTMLTokenizer covers duplicate attributes and insertion points in
fast tag-name, attribute-name, and quoted-value scans. A CSP text test
covers duplicate nonce attributes on parser-created script elements.
The tokenizer dump fixtures still match, TestHTMLTokenizer passes, and
the full release test-web run passes with 6981 tests and 226 skipped.
2026-05-15 15:13:43 +02:00
|
|
|
void abort();
|
2022-09-20 21:08:14 +02:00
|
|
|
|
2025-10-23 21:45:00 +02:00
|
|
|
void parser_did_run(Badge<HTMLParser>);
|
2026-05-15 20:39:06 +02:00
|
|
|
RustFfiTokenizerHandle* ffi_handle(Badge<HTMLParser>) { return m_tokenizer; }
|
2025-10-23 21:45:00 +02:00
|
|
|
|
2020-05-22 21:46:13 +02:00
|
|
|
private:
|
2021-07-12 12:44:21 +02:00
|
|
|
static char const* state_name(State state)
|
2020-05-22 21:46:13 +02:00
|
|
|
{
|
|
|
|
|
switch (state) {
|
|
|
|
|
#define __ENUMERATE_TOKENIZER_STATE(state) \
|
|
|
|
|
case State::state: \
|
|
|
|
|
return #state;
|
|
|
|
|
ENUMERATE_TOKENIZER_STATES
|
|
|
|
|
#undef __ENUMERATE_TOKENIZER_STATE
|
|
|
|
|
};
|
2021-02-23 20:42:32 +01:00
|
|
|
VERIFY_NOT_REACHED();
|
2020-05-22 21:46:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
State m_state { State::Data };
|
2025-05-10 11:28:35 +02:00
|
|
|
String m_source;
|
2026-04-28 18:04:17 +02:00
|
|
|
bool m_input_stream_closed { false };
|
2022-09-20 21:08:14 +02:00
|
|
|
|
LibWeb: Replace the HTML tokenizer with Rust
Replace the C++ HTML tokenizer with a Rust implementation behind the
existing HTMLTokenizer API.
Keep the parser-facing integration points for streaming input,
insertion points, document.write(), EOF insertion, parser aborts,
speculative parser input, and last start tag tracking. The generated
FFI handle stays an implementation detail of HTMLTokenizer, so callers
keep a single tokenizer class.
Preserve duplicate attributes through FFI so C++ token normalization can
record the duplicate-attribute signal used by CSP nonce checks. Keep
bulk tag-name and attribute scans capped at the active insertion point
so streamed parser input is spliced at the right offset.
Use generated DAFSA tables for named character references and intern
common tag and attribute names to reduce FFI marshalling overhead. This
also fixes attribute name source positions, nested old insertion points,
and aborted fast-path handling.
TestHTMLTokenizer covers duplicate attributes and insertion points in
fast tag-name, attribute-name, and quoted-value scans. A CSP text test
covers duplicate nonce attributes on parser-created script elements.
The tokenizer dump fixtures still match, TestHTMLTokenizer passes, and
the full release test-web run passes with 6981 tests and 226 skipped.
2026-05-15 15:13:43 +02:00
|
|
|
RustFfiTokenizerHandle* m_tokenizer { nullptr };
|
2020-05-22 21:46:13 +02:00
|
|
|
};
|
2020-05-27 23:01:04 +02:00
|
|
|
|
2020-05-22 21:46:13 +02:00
|
|
|
}
|