2020-05-22 21:46:13 +02:00
/*
* Copyright ( c ) 2020 , Andreas Kling < kling @ serenityos . org >
2022-02-15 18:52:45 +00:00
* Copyright ( c ) 2022 , Linus Groh < linusg @ serenityos . org >
2020-05-22 21:46:13 +02:00
*
2021-04-22 01:24:48 -07:00
* SPDX - License - Identifier : BSD - 2 - Clause
2020-05-22 21:46:13 +02:00
*/
2021-06-01 21:18:08 +02:00
# include <AK/CharacterTypes.h>
2021-01-17 16:57:17 +01:00
# include <AK/Debug.h>
2021-04-24 20:22:30 -07:00
# include <AK/SourceLocation.h>
2020-05-28 12:35:19 +02:00
# include <LibTextCodec/Decoder.h>
2020-07-28 19:18:23 +02:00
# include <LibWeb/HTML/Parser/Entities.h>
2022-02-15 18:52:45 +00:00
# include <LibWeb/HTML/Parser/HTMLParser.h>
2020-07-28 19:18:23 +02:00
# include <LibWeb/HTML/Parser/HTMLToken.h>
# include <LibWeb/HTML/Parser/HTMLTokenizer.h>
2022-02-15 18:52:45 +00:00
# include <LibWeb/Namespace.h>
2020-10-09 11:25:10 -04:00
# include <string.h>
2020-05-22 21:46:13 +02:00
2020-07-28 18:20:36 +02:00
namespace Web : : HTML {
2020-05-24 00:14:23 +02:00
# pragma GCC diagnostic ignored "-Wunused-label"
2020-05-27 16:16:23 +02:00
// Reads the next code point from the input into `current_input_character`, a
// variable that must exist where the macro is expanded. The terminating semicolon
// is part of the macro itself.
#define CONSUME_NEXT_INPUT_CHARACTER \
    current_input_character = next_code_point();

// Switches to `new_state` after asserting that m_current_builder holds no pending
// text. Use SWITCH_TO_WITH_UNCLEAN_BUILDER when the builder contents are meant to
// be carried into the next state.
#define SWITCH_TO(new_state) \
    do { \
        VERIFY(m_current_builder.is_empty()); \
        SWITCH_TO_WITH_UNCLEAN_BUILDER(new_state); \
    } while (false)

// Reports the transition via will_switch_to(), stores the new state, consumes the
// next input character and jumps to the label named `new_state`. The builder is
// not inspected.
#define SWITCH_TO_WITH_UNCLEAN_BUILDER(new_state) \
    do { \
        will_switch_to(State::new_state); \
        m_state = State::new_state; \
        CONSUME_NEXT_INPUT_CHARACTER; \
        goto new_state; \
    } while (false)

// Jumps to the label named `new_state` WITHOUT consuming input, so that state sees
// the same `current_input_character` again.
#define RECONSUME_IN(new_state) \
    do { \
        will_reconsume_in(State::new_state); \
        m_state = State::new_state; \
        goto new_state; \
    } while (false)

// Makes m_return_state the current state and restarts from the _StartOfFunction
// label.
#define SWITCH_TO_RETURN_STATE \
    do { \
        will_switch_to(m_return_state); \
        m_state = m_return_state; \
        goto _StartOfFunction; \
    } while (false)

// Like SWITCH_TO_RETURN_STATE, but reports the change via will_reconsume_in() and,
// when a character was actually read (i.e. not at end of input), first calls
// restore_to(m_prev_utf8_iterator).
// NOTE(review): presumably this rewinds the input so the character is read again
// after the jump - confirm against restore_to().
#define RECONSUME_IN_RETURN_STATE \
    do { \
        will_reconsume_in(m_return_state); \
        m_state = m_return_state; \
        if (current_input_character.has_value()) \
            restore_to(m_prev_utf8_iterator); \
        goto _StartOfFunction; \
    } while (false)

// Stores `new_state` as the current state, then enqueues m_current_token and
// returns the token at the front of the queue. Asserts that m_current_builder is
// empty first.
#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
    do { \
        VERIFY(m_current_builder.is_empty()); \
        will_switch_to(State::new_state); \
        m_state = State::new_state; \
        will_emit(m_current_token); \
        m_queued_tokens.enqueue(move(m_current_token)); \
        return m_queued_tokens.dequeue(); \
    } while (false)

// Enqueues a character token for `code_point` and then jumps to the label named
// `new_state` without consuming input.
#define EMIT_CHARACTER_AND_RECONSUME_IN(code_point, new_state) \
    do { \
        m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); \
        will_reconsume_in(State::new_state); \
        m_state = State::new_state; \
        goto new_state; \
    } while (false)
2020-05-24 00:14:23 +02:00
2021-07-14 23:33:12 +02:00
// Drains m_temporary_buffer one code point at a time. For each code point,
// consumed_as_part_of_an_attribute() decides where it goes: into
// m_current_builder when it returns true, otherwise into a freshly created
// character token that is enqueued.
#define FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE \
    do { \
        for (auto buffered_code_point : m_temporary_buffer) { \
            if (consumed_as_part_of_an_attribute()) { \
                m_current_builder.append_code_point(buffered_code_point); \
            } else { \
                create_new_token(HTMLToken::Type::Character); \
                m_current_token.set_code_point(buffered_code_point); \
                m_queued_tokens.enqueue(move(m_current_token)); \
            } \
        } \
    } while (false)

// Calls restore_to(m_prev_utf8_iterator), the iterator value skip() saved before
// its most recent advance.
#define DONT_CONSUME_NEXT_INPUT_CHARACTER \
    do { \
        restore_to(m_prev_utf8_iterator); \
    } while (false)
2020-05-22 21:46:13 +02:00
2020-08-05 16:31:20 -04:00
// Matcher macros. Each expands to an `if` header and is followed by a braced body
// at the expansion site. All of them inspect the local `current_input_character`,
// which holds no value at end of input.

// Matches when a character is present and equals `code_point`.
#define ON(code_point) \
    if (current_input_character.has_value() && current_input_character.value() == code_point)

// Matches when the input is exhausted.
#define ON_EOF \
    if (!current_input_character.has_value())

#define ON_ASCII_ALPHA \
    if (current_input_character.has_value() && is_ascii_alpha(current_input_character.value()))

#define ON_ASCII_ALPHANUMERIC \
    if (current_input_character.has_value() && is_ascii_alphanumeric(current_input_character.value()))

#define ON_ASCII_UPPER_ALPHA \
    if (current_input_character.has_value() && is_ascii_upper_alpha(current_input_character.value()))

#define ON_ASCII_LOWER_ALPHA \
    if (current_input_character.has_value() && is_ascii_lower_alpha(current_input_character.value()))

#define ON_ASCII_DIGIT \
    if (current_input_character.has_value() && is_ascii_digit(current_input_character.value()))

#define ON_ASCII_HEX_DIGIT \
    if (current_input_character.has_value() && is_ascii_hex_digit(current_input_character.value()))

// Matches tab, line feed, form feed and space.
#define ON_WHITESPACE \
    if (current_input_character.has_value() && is_ascii(current_input_character.value()) && "\t\n\f "sv.contains(current_input_character.value()))

// Unconditional fallback; place it after all other matchers of a state.
#define ANYTHING_ELSE if (true)
2021-05-23 12:39:00 +02:00
// Emits the end-of-file token at most once. If m_has_emitted_eof is already set,
// returns `{}`. Otherwise it sets the flag, creates and enqueues an EndOfFile
// token, and returns the token at the front of the queue.
#define EMIT_EOF \
    do { \
        if (m_has_emitted_eof) \
            return {}; \
        m_has_emitted_eof = true; \
        create_new_token(HTMLToken::Type::EndOfFile); \
        will_emit(m_current_token); \
        m_queued_tokens.enqueue(move(m_current_token)); \
        return m_queued_tokens.dequeue(); \
    } while (false)

// Enqueues m_current_token and returns the token at the front of the queue.
// Asserts that m_current_builder has been consumed beforehand.
#define EMIT_CURRENT_TOKEN \
    do { \
        VERIFY(m_current_builder.is_empty()); \
        will_emit(m_current_token); \
        m_queued_tokens.enqueue(move(m_current_token)); \
        return m_queued_tokens.dequeue(); \
    } while (false)

// Creates a character token carrying `code_point`, enqueues it and returns the
// token at the front of the queue.
#define EMIT_CHARACTER(code_point) \
    do { \
        create_new_token(HTMLToken::Type::Character); \
        m_current_token.set_code_point(code_point); \
        m_queued_tokens.enqueue(move(m_current_token)); \
        return m_queued_tokens.dequeue(); \
    } while (false)

// EMIT_CHARACTER for the character just read. It must not be used at end of
// input, because it calls value() on `current_input_character`. The terminating
// semicolon is part of the macro itself.
#define EMIT_CURRENT_CHARACTER \
    EMIT_CHARACTER(current_input_character.value());

// Stores `new_state` as the current state and then emits `code_point` as a
// character token.
#define SWITCH_TO_AND_EMIT_CHARACTER(code_point, new_state) \
    do { \
        will_switch_to(State::new_state); \
        m_state = State::new_state; \
        EMIT_CHARACTER(code_point); \
    } while (false)

// SWITCH_TO_AND_EMIT_CHARACTER for the character just read.
#define SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(new_state) \
    SWITCH_TO_AND_EMIT_CHARACTER(current_input_character.value(), new_state)
2020-05-22 21:46:13 +02:00
// Opens the handler for `state`. It emits both a goto label named `state` (the
// jump target of the SWITCH_TO / RECONSUME_IN family) and the matching
// `case State::state:` of the enclosing switch, then opens three nested scopes
// that END_STATE closes again.
#define BEGIN_STATE(state) \
    state: \
    case State::state: { \
        { \
            {

// Closes the scopes opened by BEGIN_STATE. Every path through a state handler is
// expected to leave it (return, goto or continue) before this point, so reaching
// it trips VERIFY_NOT_REACHED().
#define END_STATE \
    VERIFY_NOT_REACHED(); \
    break; \
    } \
    } \
    }
2020-05-22 21:46:13 +02:00
2021-07-12 12:44:21 +02:00
// Logs a tokenization parse error when TOKENIZER_TRACE_DEBUG is enabled.
// `location` defaults to the call site, so callers normally pass no argument.
static inline void log_parse_error(SourceLocation const& location = SourceLocation::current())
{
    dbgln_if(TOKENIZER_TRACE_DEBUG, "Parse error (tokenization) {}", location);
}
2020-08-05 16:31:20 -04:00
// Consumes and returns the next code point of the input, or an empty Optional once
// the input is exhausted. Newlines are normalized on the fly:
// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
// https://infra.spec.whatwg.org/#normalize-newlines
Optional<u32> HTMLTokenizer::next_code_point()
{
    if (m_utf8_iterator == m_utf8_view.end())
        return {};

    auto const starts_with_cr = peek_code_point(0).value_or(0) == '\r';
    // A CR LF pair collapses into a single LF, so both code points are consumed at once.
    auto const is_crlf_pair = starts_with_cr && peek_code_point(1).value_or(0) == '\n';

    skip(is_crlf_pair ? 2 : 1);

    // Any CR (paired or lone) is reported as LF; everything else is reported as-is,
    // read back from the iterator position skip() saved before advancing.
    u32 const code_point = starts_with_cr ? '\n' : *m_prev_utf8_iterator;

    dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", code_point);
    return code_point;
}
2021-05-20 23:11:41 +04:30
void HTMLTokenizer : : skip ( size_t count )
{
2022-06-02 01:03:44 +02:00
if ( ! m_source_positions . is_empty ( ) )
m_source_positions . append ( m_source_positions . last ( ) ) ;
2021-05-20 23:11:41 +04:30
for ( size_t i = 0 ; i < count ; + + i ) {
2021-05-21 11:03:31 +04:30
m_prev_utf8_iterator = m_utf8_iterator ;
2021-05-20 23:11:41 +04:30
auto code_point = * m_utf8_iterator ;
2022-06-02 01:03:44 +02:00
if ( ! m_source_positions . is_empty ( ) ) {
if ( code_point = = ' \n ' ) {
m_source_positions . last ( ) . column = 0 ;
m_source_positions . last ( ) . line + + ;
} else {
m_source_positions . last ( ) . column + + ;
}
2021-05-20 23:11:41 +04:30
}
+ + m_utf8_iterator ;
}
}
2020-08-05 16:31:20 -04:00
// Returns the code point `offset` positions ahead of the current input position
// without consuming anything (offset 0 is the next code point to be read).
// Returns an empty Optional if that position lies at or beyond the end of input.
Optional<u32> HTMLTokenizer::peek_code_point(size_t offset) const
{
    auto const end_of_input = m_utf8_view.end();

    auto cursor = m_utf8_iterator;
    for (size_t remaining = offset; remaining > 0 && cursor != end_of_input; --remaining)
        ++cursor;

    if (cursor == end_of_input)
        return {};
    return *cursor;
}
2021-06-04 11:31:43 +02:00
// Returns the source position recorded `n` entries before the most recent one in
// m_source_positions (n == 0 yields the most recent entry). If fewer than n + 1
// positions have been recorded, the request is logged under TOKENIZER_TRACE_DEBUG
// and the position { 0, 0 } is returned instead.
HTMLToken::Position HTMLTokenizer::nth_last_position(size_t n)
{
    // Compare as `n >= size` rather than `n + 1 > size`: the latter wraps around to
    // 0 for the largest size_t value and would let an out-of-range index through.
    if (n >= m_source_positions.size()) {
        dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer::nth_last_position) Invalid position requested: {}th-last of {}. Returning (0-0).", n, m_source_positions.size());
        return HTMLToken::Position { 0, 0 };
    }
    return m_source_positions.at(m_source_positions.size() - 1 - n);
}
2020-05-24 00:14:23 +02:00
Optional < HTMLToken > HTMLTokenizer : : next_token ( )
2020-05-22 21:46:13 +02:00
{
2022-06-02 01:03:44 +02:00
if ( ! m_source_positions . is_empty ( ) ) {
2021-05-20 23:15:33 +04:30
auto last_position = m_source_positions . last ( ) ;
2022-02-18 23:49:51 +01:00
m_source_positions . clear_with_capacity ( ) ;
2021-05-20 23:15:33 +04:30
m_source_positions . append ( move ( last_position ) ) ;
}
2020-05-27 16:16:23 +02:00
_StartOfFunction :
2020-05-26 15:50:05 +02:00
if ( ! m_queued_tokens . is_empty ( ) )
return m_queued_tokens . dequeue ( ) ;
2020-05-22 21:46:13 +02:00
for ( ; ; ) {
2020-08-05 16:31:20 -04:00
auto current_input_character = next_code_point ( ) ;
2020-05-22 21:46:13 +02:00
switch ( m_state ) {
2022-02-15 18:12:15 +00:00
// 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
2020-05-22 21:46:13 +02:00
BEGIN_STATE ( Data )
{
ON ( ' & ' )
{
m_return_state = State : : Data ;
SWITCH_TO ( CharacterReference ) ;
}
ON ( ' < ' )
{
SWITCH_TO ( TagOpen ) ;
}
2020-05-28 00:28:32 +02:00
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-05-28 00:28:32 +02:00
EMIT_CURRENT_CHARACTER ;
}
2020-05-22 21:46:13 +02:00
ON_EOF
{
2020-05-24 00:14:23 +02:00
EMIT_EOF ;
2020-05-22 21:46:13 +02:00
}
ANYTHING_ELSE
{
2020-05-24 20:24:43 +02:00
EMIT_CURRENT_CHARACTER ;
2020-05-22 21:46:13 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.6 Tag open state, https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
2020-05-22 21:46:13 +02:00
BEGIN_STATE ( TagOpen )
{
ON ( ' ! ' )
{
SWITCH_TO ( MarkupDeclarationOpen ) ;
}
ON ( ' / ' )
{
SWITCH_TO ( EndTagOpen ) ;
}
ON_ASCII_ALPHA
{
create_new_token ( HTMLToken : : Type : : StartTag ) ;
RECONSUME_IN ( TagName ) ;
}
2020-05-23 18:43:09 +02:00
ON ( ' ? ' )
2020-05-28 00:28:32 +02:00
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-05-28 00:28:32 +02:00
create_new_token ( HTMLToken : : Type : : Comment ) ;
2021-07-15 15:53:54 +02:00
m_current_token . set_start_position ( { } , nth_last_position ( 2 ) ) ;
2020-05-28 00:28:32 +02:00
RECONSUME_IN ( BogusComment ) ;
}
ON_EOF
2020-05-23 18:43:09 +02:00
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:47:07 +02:00
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
EMIT_EOF ;
2020-05-23 18:43:09 +02:00
}
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-11 05:00:45 +01:00
EMIT_CHARACTER_AND_RECONSUME_IN ( ' < ' , Data ) ;
2020-05-23 18:43:09 +02:00
}
2020-05-22 21:46:13 +02:00
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
2020-05-22 21:46:13 +02:00
BEGIN_STATE ( TagName )
{
2020-05-23 01:19:42 +02:00
ON_WHITESPACE
{
2021-07-14 23:37:48 +02:00
m_current_token . set_tag_name ( consume_current_builder ( ) ) ;
2021-07-15 15:53:54 +02:00
m_current_token . set_end_position ( { } , nth_last_position ( 1 ) ) ;
2020-05-23 01:19:42 +02:00
SWITCH_TO ( BeforeAttributeName ) ;
}
ON ( ' / ' )
{
2021-07-14 23:37:48 +02:00
m_current_token . set_tag_name ( consume_current_builder ( ) ) ;
2021-07-15 15:53:54 +02:00
m_current_token . set_end_position ( { } , nth_last_position ( 0 ) ) ;
2020-05-23 01:19:42 +02:00
SWITCH_TO ( SelfClosingStartTag ) ;
}
2020-05-22 21:46:13 +02:00
ON ( ' > ' )
{
2021-07-14 23:37:48 +02:00
m_current_token . set_tag_name ( consume_current_builder ( ) ) ;
2021-07-15 15:53:54 +02:00
m_current_token . set_end_position ( { } , nth_last_position ( 1 ) ) ;
2020-05-24 00:14:23 +02:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-22 21:46:13 +02:00
}
2020-05-28 00:28:32 +02:00
ON_ASCII_UPPER_ALPHA
{
2021-05-23 09:16:07 +02:00
m_current_builder . append_code_point ( to_ascii_lowercase ( current_input_character . value ( ) ) ) ;
2021-07-15 15:53:54 +02:00
m_current_token . set_end_position ( { } , nth_last_position ( 0 ) ) ;
2020-05-28 00:28:32 +02:00
continue ;
}
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 09:16:07 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2021-07-15 15:53:54 +02:00
m_current_token . set_end_position ( { } , nth_last_position ( 0 ) ) ;
2020-05-28 00:28:32 +02:00
continue ;
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 15:53:54 +02:00
m_current_token . set_end_position ( { } , nth_last_position ( 0 ) ) ;
2020-05-28 00:28:32 +02:00
EMIT_EOF ;
}
2020-05-22 21:46:13 +02:00
ANYTHING_ELSE
{
2021-05-23 09:16:07 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2021-07-15 15:53:54 +02:00
m_current_token . set_end_position ( { } , nth_last_position ( 0 ) ) ;
2020-05-22 21:46:13 +02:00
continue ;
}
}
2020-05-23 10:33:23 +02:00
END_STATE
2020-05-22 21:46:13 +02:00
2022-02-15 18:12:15 +00:00
// 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
2020-05-22 21:46:13 +02:00
BEGIN_STATE ( EndTagOpen )
{
ON_ASCII_ALPHA
{
create_new_token ( HTMLToken : : Type : : EndTag ) ;
RECONSUME_IN ( TagName ) ;
}
2020-05-28 00:28:32 +02:00
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-05-28 00:28:32 +02:00
SWITCH_TO ( Data ) ;
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:47:07 +02:00
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' / ' ) ) ;
EMIT_EOF ;
2020-05-28 00:28:32 +02:00
}
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-05-28 00:28:32 +02:00
create_new_token ( HTMLToken : : Type : : Comment ) ;
RECONSUME_IN ( BogusComment ) ;
}
2020-05-22 21:46:13 +02:00
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.42 Markup declaration open state, https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2020-05-22 21:46:13 +02:00
BEGIN_STATE ( MarkupDeclarationOpen )
{
DONT_CONSUME_NEXT_INPUT_CHARACTER ;
2020-05-25 19:22:23 +02:00
if ( consume_next_if_match ( " -- " ) ) {
2020-05-23 01:54:26 +02:00
create_new_token ( HTMLToken : : Type : : Comment ) ;
2021-07-29 15:36:10 +02:00
m_current_token . set_start_position ( { } , nth_last_position ( 3 ) ) ;
2020-05-23 01:54:26 +02:00
SWITCH_TO ( CommentStart ) ;
}
2020-05-25 19:22:23 +02:00
if ( consume_next_if_match ( " DOCTYPE " , CaseSensitivity : : CaseInsensitive ) ) {
2020-05-22 21:46:13 +02:00
SWITCH_TO ( DOCTYPE ) ;
}
2020-06-04 21:06:54 +02:00
if ( consume_next_if_match ( " [CDATA[ " ) ) {
2022-02-15 18:52:45 +00:00
// We keep the parser optional so that syntax highlighting can be lexer-only.
// The parser registers itself with the lexer it creates.
if ( m_parser ! = nullptr & & m_parser - > adjusted_current_node ( ) . namespace_ ( ) ! = Namespace : : HTML ) {
SWITCH_TO ( CDATASection ) ;
} else {
create_new_token ( HTMLToken : : Type : : Comment ) ;
m_current_builder . append ( " [CDATA[ " ) ;
SWITCH_TO_WITH_UNCLEAN_BUILDER ( BogusComment ) ;
}
2020-06-04 21:06:54 +02:00
}
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-04 21:06:54 +02:00
create_new_token ( HTMLToken : : Type : : Comment ) ;
SWITCH_TO ( BogusComment ) ;
}
2020-05-22 21:46:13 +02:00
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.41 Bogus comment state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
2020-05-28 00:28:32 +02:00
BEGIN_STATE ( BogusComment )
{
ON ( ' > ' )
{
2021-07-14 23:32:18 +02:00
m_current_token . set_comment ( consume_current_builder ( ) ) ;
2020-06-04 21:06:54 +02:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-28 00:28:32 +02:00
}
ON_EOF
{
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-04 21:06:54 +02:00
EMIT_EOF ;
2020-05-28 00:28:32 +02:00
}
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 09:31:40 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-04 21:06:54 +02:00
continue ;
2020-05-28 00:28:32 +02:00
}
ANYTHING_ELSE
{
2021-05-23 09:31:40 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-06-04 21:06:54 +02:00
continue ;
2020-05-28 00:28:32 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.53 DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
2020-05-22 21:46:13 +02:00
BEGIN_STATE ( DOCTYPE )
{
ON_WHITESPACE
{
SWITCH_TO ( BeforeDOCTYPEName ) ;
}
2020-05-23 19:56:07 +02:00
ON ( ' > ' )
{
RECONSUME_IN ( BeforeDOCTYPEName ) ;
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-11 05:00:45 +01:00
create_new_token ( HTMLToken : : Type : : DOCTYPE ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-23 19:56:07 +02:00
}
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-11 05:00:45 +01:00
RECONSUME_IN ( BeforeDOCTYPEName ) ;
2020-05-23 19:56:07 +02:00
}
2020-05-22 21:46:13 +02:00
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.54 Before DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
2020-05-22 21:46:13 +02:00
BEGIN_STATE ( BeforeDOCTYPEName )
{
ON_WHITESPACE
{
continue ;
}
2020-05-23 19:56:07 +02:00
ON_ASCII_UPPER_ALPHA
{
create_new_token ( HTMLToken : : Type : : DOCTYPE ) ;
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( to_ascii_lowercase ( current_input_character . value ( ) ) ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_name = false ;
2021-05-23 08:20:03 +02:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( DOCTYPEName ) ;
2020-05-23 19:56:07 +02:00
}
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-11 05:00:45 +01:00
create_new_token ( HTMLToken : : Type : : DOCTYPE ) ;
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_name = false ;
2021-05-23 08:20:03 +02:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( DOCTYPEName ) ;
2020-05-23 19:56:07 +02:00
}
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-11 05:00:45 +01:00
create_new_token ( HTMLToken : : Type : : DOCTYPE ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-23 19:56:07 +02:00
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-11 05:00:45 +01:00
create_new_token ( HTMLToken : : Type : : DOCTYPE ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-23 19:56:07 +02:00
}
2020-05-22 21:46:13 +02:00
ANYTHING_ELSE
{
create_new_token ( HTMLToken : : Type : : DOCTYPE ) ;
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_name = false ;
2021-05-23 08:20:03 +02:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( DOCTYPEName ) ;
2020-05-22 21:46:13 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.55 DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
2020-05-22 21:46:13 +02:00
BEGIN_STATE ( DOCTYPEName )
{
2020-05-23 19:56:07 +02:00
ON_WHITESPACE
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . name = consume_current_builder ( ) ;
2020-05-23 19:56:07 +02:00
SWITCH_TO ( AfterDOCTYPEName ) ;
}
2020-05-22 21:46:13 +02:00
ON ( ' > ' )
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . name = consume_current_builder ( ) ;
2020-05-24 00:14:23 +02:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-22 21:46:13 +02:00
}
2020-05-23 19:56:07 +02:00
ON_ASCII_UPPER_ALPHA
{
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( to_ascii_lowercase ( current_input_character . value ( ) ) ) ;
2020-06-11 05:00:45 +01:00
continue ;
2020-05-23 19:56:07 +02:00
}
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-11 05:00:45 +01:00
continue ;
2020-05-23 19:56:07 +02:00
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-23 19:56:07 +02:00
}
2020-05-22 21:46:13 +02:00
ANYTHING_ELSE
{
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-22 21:46:13 +02:00
continue ;
}
2020-05-23 19:56:07 +02:00
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.56 After DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
2020-05-23 19:56:07 +02:00
BEGIN_STATE ( AfterDOCTYPEName )
2020-05-25 19:50:44 +02:00
{
ON_WHITESPACE
{
continue ;
}
ON ( ' > ' )
{
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
ANYTHING_ELSE
{
2021-06-01 21:18:08 +02:00
if ( to_ascii_uppercase ( current_input_character . value ( ) ) = = ' P ' & & consume_next_if_match ( " UBLIC " , CaseSensitivity : : CaseInsensitive ) ) {
2020-05-25 19:50:44 +02:00
SWITCH_TO ( AfterDOCTYPEPublicKeyword ) ;
}
2021-06-01 21:18:08 +02:00
if ( to_ascii_uppercase ( current_input_character . value ( ) ) = = ' S ' & & consume_next_if_match ( " YSTEM " , CaseSensitivity : : CaseInsensitive ) ) {
2020-05-25 19:50:44 +02:00
SWITCH_TO ( AfterDOCTYPESystemKeyword ) ;
}
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
RECONSUME_IN ( BogusDOCTYPE ) ;
2020-05-25 19:50:44 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.57 After DOCTYPE public keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
2020-05-25 19:50:44 +02:00
BEGIN_STATE ( AfterDOCTYPEPublicKeyword )
{
ON_WHITESPACE
{
SWITCH_TO ( BeforeDOCTYPEPublicIdentifier ) ;
}
ON ( ' " ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_public_identifier = false ;
2020-06-11 05:00:45 +01:00
SWITCH_TO ( DOCTYPEPublicIdentifierDoubleQuoted ) ;
2020-05-25 19:50:44 +02:00
}
ON ( ' \' ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_public_identifier = false ;
2020-06-11 05:00:45 +01:00
SWITCH_TO ( DOCTYPEPublicIdentifierSingleQuoted ) ;
2020-05-25 19:50:44 +02:00
}
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-25 19:50:44 +02:00
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
RECONSUME_IN ( BogusDOCTYPE ) ;
2020-05-25 19:50:44 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.63 After DOCTYPE system keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
2020-05-25 19:50:44 +02:00
BEGIN_STATE ( AfterDOCTYPESystemKeyword )
{
ON_WHITESPACE
{
SWITCH_TO ( BeforeDOCTYPESystemIdentifier ) ;
}
ON ( ' " ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . system_identifier = { } ;
m_current_token . ensure_doctype_data ( ) . missing_system_identifier = false ;
2020-06-11 05:00:45 +01:00
SWITCH_TO ( DOCTYPESystemIdentifierDoubleQuoted ) ;
2020-05-25 19:50:44 +02:00
}
ON ( ' \' ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . system_identifier = { } ;
m_current_token . ensure_doctype_data ( ) . missing_system_identifier = false ;
2020-06-11 05:00:45 +01:00
SWITCH_TO ( DOCTYPESystemIdentifierSingleQuoted ) ;
2020-05-25 19:50:44 +02:00
}
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-25 19:50:44 +02:00
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
RECONSUME_IN ( BogusDOCTYPE ) ;
2020-05-25 19:50:44 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.58 Before DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
2020-05-25 19:50:44 +02:00
BEGIN_STATE ( BeforeDOCTYPEPublicIdentifier )
{
ON_WHITESPACE
{
continue ;
}
ON ( ' " ' )
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_public_identifier = false ;
2020-05-25 19:50:44 +02:00
SWITCH_TO ( DOCTYPEPublicIdentifierDoubleQuoted ) ;
}
ON ( ' \' ' )
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_public_identifier = false ;
2020-05-25 19:50:44 +02:00
SWITCH_TO ( DOCTYPEPublicIdentifierSingleQuoted ) ;
}
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-25 19:50:44 +02:00
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
RECONSUME_IN ( BogusDOCTYPE ) ;
2020-05-25 19:50:44 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.64 Before DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
2020-05-25 19:50:44 +02:00
BEGIN_STATE ( BeforeDOCTYPESystemIdentifier )
{
ON_WHITESPACE
{
continue ;
}
ON ( ' " ' )
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_system_identifier = false ;
2020-05-25 19:50:44 +02:00
SWITCH_TO ( DOCTYPESystemIdentifierDoubleQuoted ) ;
}
ON ( ' \' ' )
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_system_identifier = false ;
2020-05-25 19:50:44 +02:00
SWITCH_TO ( DOCTYPESystemIdentifierSingleQuoted ) ;
}
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-25 19:50:44 +02:00
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
RECONSUME_IN ( BogusDOCTYPE ) ;
2020-05-25 19:50:44 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.59 DOCTYPE public identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
2020-05-25 19:50:44 +02:00
// DOCTYPE public identifier (double-quoted): accumulates the identifier text in
// m_current_builder until the closing quote, then stores it on the DOCTYPE token.
BEGIN_STATE ( DOCTYPEPublicIdentifierDoubleQuoted )
{
// Closing quote: the identifier is complete.
ON ( ' " ' )
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . public_identifier = consume_current_builder ( ) ;
2020-05-25 19:50:44 +02:00
SWITCH_TO ( AfterDOCTYPEPublicIdentifier ) ;
}
// Spec: unexpected-null-character. NUL is replaced by U+FFFD in the identifier.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-11 05:00:45 +01:00
continue ;
2020-05-25 19:50:44 +02:00
}
// Spec: abrupt-doctype-public-identifier. Keep what was read so far and force quirks.
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . public_identifier = consume_current_builder ( ) ;
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-25 19:50:44 +02:00
}
// Spec: eof-in-doctype. The emitted DOCTYPE must still carry the characters read
// so far (the spec appends them to the token as they are consumed), so move the
// builder contents into the token before queueing it. This also leaves the
// builder empty, matching the '>' handler above.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
m_current_token . ensure_doctype_data ( ) . public_identifier = consume_current_builder ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
// Any other character is part of the identifier.
ANYTHING_ELSE
{
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-25 19:50:44 +02:00
continue ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.60 DOCTYPE public identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
2020-05-25 19:50:44 +02:00
// DOCTYPE public identifier (single-quoted): accumulates the identifier text in
// m_current_builder until the closing apostrophe, then stores it on the DOCTYPE token.
BEGIN_STATE ( DOCTYPEPublicIdentifierSingleQuoted )
{
// Closing apostrophe: the identifier is complete.
ON ( ' \' ' )
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . public_identifier = consume_current_builder ( ) ;
2020-05-25 19:50:44 +02:00
SWITCH_TO ( AfterDOCTYPEPublicIdentifier ) ;
}
// Spec: unexpected-null-character. NUL is replaced by U+FFFD in the identifier.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-11 05:00:45 +01:00
continue ;
2020-05-25 19:50:44 +02:00
}
// Spec: abrupt-doctype-public-identifier. Keep what was read so far and force quirks.
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . public_identifier = consume_current_builder ( ) ;
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-25 19:50:44 +02:00
}
// Spec: eof-in-doctype. The emitted DOCTYPE must still carry the characters read
// so far (the spec appends them to the token as they are consumed), so move the
// builder contents into the token before queueing it. This also leaves the
// builder empty, matching the '>' handler above.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
m_current_token . ensure_doctype_data ( ) . public_identifier = consume_current_builder ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
// Any other character is part of the identifier.
ANYTHING_ELSE
{
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-25 19:50:44 +02:00
continue ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.65 DOCTYPE system identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
2020-05-25 19:50:44 +02:00
// DOCTYPE system identifier (double-quoted): accumulates the identifier text in
// m_current_builder until the closing quote, then stores it on the DOCTYPE token.
BEGIN_STATE ( DOCTYPESystemIdentifierDoubleQuoted )
{
// Closing quote: the identifier is complete.
ON ( ' " ' )
{
2022-03-02 12:26:31 +01:00
m_current_token . ensure_doctype_data ( ) . system_identifier = consume_current_builder ( ) ;
2020-05-25 19:50:44 +02:00
SWITCH_TO ( AfterDOCTYPESystemIdentifier ) ;
}
// Spec: unexpected-null-character. NUL is replaced by U+FFFD in the identifier.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-11 05:00:45 +01:00
continue ;
2020-05-25 19:50:44 +02:00
}
// Spec: abrupt-doctype-system-identifier. Keep what was read so far and force quirks.
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2022-03-02 12:26:31 +01:00
m_current_token . ensure_doctype_data ( ) . system_identifier = consume_current_builder ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-25 19:50:44 +02:00
}
// Spec: eof-in-doctype. The emitted DOCTYPE must still carry the characters read
// so far (the spec appends them to the token as they are consumed), so move the
// builder contents into the token before queueing it. This also leaves the
// builder empty, matching the '>' handler above.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
m_current_token . ensure_doctype_data ( ) . system_identifier = consume_current_builder ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
// Any other character is part of the identifier.
ANYTHING_ELSE
{
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-25 19:50:44 +02:00
continue ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.66 DOCTYPE system identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
2020-05-25 19:50:44 +02:00
// DOCTYPE system identifier (single-quoted): accumulates the identifier text in
// m_current_builder until the closing apostrophe, then stores it on the DOCTYPE token.
BEGIN_STATE ( DOCTYPESystemIdentifierSingleQuoted )
{
// Closing apostrophe: the identifier is complete.
ON ( ' \' ' )
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . system_identifier = consume_current_builder ( ) ;
2020-05-25 19:50:44 +02:00
SWITCH_TO ( AfterDOCTYPESystemIdentifier ) ;
}
// Spec: unexpected-null-character. NUL is replaced by U+FFFD in the identifier.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-11 05:00:45 +01:00
continue ;
2020-05-25 19:50:44 +02:00
}
// Spec: abrupt-doctype-system-identifier. Keep what was read so far and force quirks.
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . system_identifier = consume_current_builder ( ) ;
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-25 19:50:44 +02:00
}
// Spec: eof-in-doctype. The emitted DOCTYPE must still carry the characters read
// so far (the spec appends them to the token as they are consumed), so move the
// builder contents into the token before queueing it. This also leaves the
// builder empty, matching the '>' handler above.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
m_current_token . ensure_doctype_data ( ) . system_identifier = consume_current_builder ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
// Any other character is part of the identifier.
ANYTHING_ELSE
{
2021-05-23 08:20:03 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-25 19:50:44 +02:00
continue ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.61 After DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
2020-05-25 19:50:44 +02:00
// After DOCTYPE public identifier: decides whether a system identifier follows.
BEGIN_STATE ( AfterDOCTYPEPublicIdentifier )
{
ON_WHITESPACE
{
SWITCH_TO ( BetweenDOCTYPEPublicAndSystemIdentifiers ) ;
}
// DOCTYPE ends here without a system identifier; not an error.
ON ( ' > ' )
{
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
}
// Spec: missing-whitespace-between-doctype-public-and-system-identifiers.
// The quote still opens a system identifier, so the "missing" flag is cleared.
ON ( ' " ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_system_identifier = false ;
2020-06-11 05:00:45 +01:00
SWITCH_TO ( DOCTYPESystemIdentifierDoubleQuoted ) ;
2020-05-25 19:50:44 +02:00
}
ON ( ' \' ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_system_identifier = false ;
2020-06-11 05:00:45 +01:00
SWITCH_TO ( DOCTYPESystemIdentifierSingleQuoted ) ;
2020-05-25 19:50:44 +02:00
}
// Spec: eof-in-doctype. The DOCTYPE token is queued explicitly before EMIT_EOF.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
// Spec: missing-quote-before-doctype-system-identifier. The character is reprocessed by BogusDOCTYPE.
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
RECONSUME_IN ( BogusDOCTYPE ) ;
2020-05-25 19:50:44 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.62 Between DOCTYPE public and system identifiers state, https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
2020-05-25 19:50:44 +02:00
// Between DOCTYPE public and system identifiers: whitespace has been seen after
// the public identifier; an optional system identifier may follow.
BEGIN_STATE ( BetweenDOCTYPEPublicAndSystemIdentifiers )
{
// Further whitespace: nothing is recorded and no state switch is performed.
ON_WHITESPACE
{
continue ;
}
// DOCTYPE ends here without a system identifier; not an error.
ON ( ' > ' )
{
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
}
// An opening quote means a system identifier is present, so clear the "missing" flag.
ON ( ' " ' )
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_system_identifier = false ;
2020-05-25 19:50:44 +02:00
SWITCH_TO ( DOCTYPESystemIdentifierDoubleQuoted ) ;
}
ON ( ' \' ' )
{
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . missing_system_identifier = false ;
2020-05-25 19:50:44 +02:00
SWITCH_TO ( DOCTYPESystemIdentifierSingleQuoted ) ;
}
// Spec: eof-in-doctype. The DOCTYPE token is queued explicitly before EMIT_EOF.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-25 19:50:44 +02:00
}
// Spec: missing-quote-before-doctype-system-identifier. The character is reprocessed by BogusDOCTYPE.
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2020-06-11 05:00:45 +01:00
RECONSUME_IN ( BogusDOCTYPE ) ;
2020-05-25 19:50:44 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.67 After DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
2020-05-25 19:50:44 +02:00
// After DOCTYPE system identifier: only whitespace and the closing '>' are expected.
BEGIN_STATE ( AfterDOCTYPESystemIdentifier )
2020-05-23 19:56:07 +02:00
{
// Whitespace: nothing is recorded and no state switch is performed.
ON_WHITESPACE
{
continue ;
}
ON ( ' > ' )
{
2020-05-24 00:14:23 +02:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-23 19:56:07 +02:00
}
// Spec: eof-in-doctype. The DOCTYPE token is queued explicitly before EMIT_EOF.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-15 18:58:11 +02:00
m_current_token . ensure_doctype_data ( ) . force_quirks = true ;
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
2020-05-23 19:56:07 +02:00
}
// Spec: unexpected-character-after-doctype-system-identifier.
// Unlike the other DOCTYPE error paths, force_quirks is deliberately NOT set here (per the spec).
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-11 05:00:45 +01:00
RECONSUME_IN ( BogusDOCTYPE ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.68 Bogus DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
2020-06-11 05:00:45 +01:00
// Bogus DOCTYPE: discards input up to the closing '>' (or EOF), then emits the
// DOCTYPE token as it stands.
BEGIN_STATE ( BogusDOCTYPE )
{
ON ( ' > ' )
{
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
}
// Spec: unexpected-null-character. The NUL is reported but otherwise dropped.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-11 05:00:45 +01:00
continue ;
}
// No parse error is logged here; the DOCTYPE token is queued explicitly before EMIT_EOF.
ON_EOF
{
2021-05-23 12:39:00 +02:00
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 05:00:45 +01:00
EMIT_EOF ;
}
// Every other character is dropped.
ANYTHING_ELSE
{
continue ;
2020-05-23 19:56:07 +02:00
}
2020-05-22 21:46:13 +02:00
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
2020-05-23 01:19:42 +02:00
// Before attribute name: skips whitespace inside a tag and starts a new attribute
// on the first character that can begin a name.
BEGIN_STATE ( BeforeAttributeName )
{
ON_WHITESPACE
{
continue ;
}
// '/', '>' and EOF are all handled by AfterAttributeName.
ON ( ' / ' )
{
2021-07-14 23:53:11 +02:00
// If an attribute already exists, update its recorded name end position first.
if ( m_current_token . has_attributes ( ) )
m_current_token . last_attribute ( ) . name_end_position = nth_last_position ( 1 ) ;
2020-05-23 01:19:42 +02:00
RECONSUME_IN ( AfterAttributeName ) ;
}
ON ( ' > ' )
{
RECONSUME_IN ( AfterAttributeName ) ;
}
ON_EOF
{
RECONSUME_IN ( AfterAttributeName ) ;
}
// Spec: unexpected-equals-sign-before-attribute-name. The '=' becomes the first
// character of a new attribute's name, so the builder is intentionally non-empty
// when switching (hence SWITCH_TO_WITH_UNCLEAN_BUILDER).
ON ( ' = ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-14 23:17:35 +02:00
HTMLToken : : Attribute new_attribute ;
2021-05-20 23:11:41 +04:30
new_attribute . name_start_position = nth_last_position ( 1 ) ;
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2021-07-14 23:53:11 +02:00
m_current_token . add_attribute ( move ( new_attribute ) ) ;
2021-05-23 08:50:48 +02:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( AttributeName ) ;
2020-05-23 01:19:42 +02:00
}
// Start a new attribute and let AttributeName process this same character.
ANYTHING_ELSE
{
2021-07-14 23:17:35 +02:00
HTMLToken : : Attribute new_attribute ;
2021-05-20 23:11:41 +04:30
new_attribute . name_start_position = nth_last_position ( 1 ) ;
2021-07-14 23:53:11 +02:00
m_current_token . add_attribute ( move ( new_attribute ) ) ;
2020-05-23 01:19:42 +02:00
RECONSUME_IN ( AttributeName ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.40 Self-closing start tag state, https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
2020-05-23 01:19:42 +02:00
// Self-closing start tag: a '/' was seen inside a tag; only '>' completes it.
BEGIN_STATE ( SelfClosingStartTag )
{
2020-05-27 18:07:51 +02:00
// "/>" : mark the tag as self-closing and emit it.
ON ( ' > ' )
{
2021-07-14 23:37:48 +02:00
m_current_token . set_self_closing ( true ) ;
2020-05-30 11:30:45 +02:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-27 18:07:51 +02:00
}
// Spec: eof-in-tag. Only EOF is emitted; the unfinished tag token is not queued.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-01 12:37:28 +02:00
EMIT_EOF ;
2020-05-27 18:07:51 +02:00
}
// Spec: unexpected-solidus-in-tag. The character is reprocessed as the start of an attribute.
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-01 12:37:28 +02:00
RECONSUME_IN ( BeforeAttributeName ) ;
2020-05-27 18:07:51 +02:00
}
2020-05-23 01:19:42 +02:00
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
2020-05-23 01:19:42 +02:00
// Attribute name: accumulates the (lower-cased) attribute name in m_current_builder
// and stores it in the last attribute's local_name when the name ends.
BEGIN_STATE ( AttributeName )
{
// Whitespace, '/', '>' and EOF all end the name; AfterAttributeName reprocesses the character.
ON_WHITESPACE
{
2021-07-14 23:53:11 +02:00
m_current_token . last_attribute ( ) . local_name = consume_current_builder ( ) ;
2020-05-23 01:19:42 +02:00
RECONSUME_IN ( AfterAttributeName ) ;
}
ON ( ' / ' )
{
2021-07-14 23:53:11 +02:00
m_current_token . last_attribute ( ) . local_name = consume_current_builder ( ) ;
2020-05-23 01:19:42 +02:00
RECONSUME_IN ( AfterAttributeName ) ;
}
ON ( ' > ' )
{
2021-07-14 23:53:11 +02:00
m_current_token . last_attribute ( ) . local_name = consume_current_builder ( ) ;
2020-05-23 01:19:42 +02:00
RECONSUME_IN ( AfterAttributeName ) ;
}
ON_EOF
{
2021-07-14 23:53:11 +02:00
m_current_token . last_attribute ( ) . local_name = consume_current_builder ( ) ;
2020-05-23 01:19:42 +02:00
RECONSUME_IN ( AfterAttributeName ) ;
}
// '=' ends the name and introduces the value; the name end position is recorded here.
ON ( ' = ' )
{
2021-07-14 23:53:11 +02:00
m_current_token . last_attribute ( ) . name_end_position = nth_last_position ( 1 ) ;
m_current_token . last_attribute ( ) . local_name = consume_current_builder ( ) ;
2020-05-23 01:19:42 +02:00
SWITCH_TO ( BeforeAttributeValue ) ;
}
2020-06-26 22:41:35 +01:00
// Attribute names are stored in ASCII lowercase.
ON_ASCII_UPPER_ALPHA
{
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( to_ascii_lowercase ( current_input_character . value ( ) ) ) ;
2020-06-26 22:41:35 +01:00
continue ;
}
// Spec: unexpected-null-character. NUL is replaced by U+FFFD in the name.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-26 22:41:35 +01:00
continue ;
}
// Spec: unexpected-character-in-attribute-name for '"', '\'' and '<'. After
// logging, they are appended like any other character via the shared label.
ON ( ' " ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:41:35 +01:00
goto AnythingElseAttributeName ;
}
ON ( ' \' ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:41:35 +01:00
goto AnythingElseAttributeName ;
}
ON ( ' < ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:41:35 +01:00
goto AnythingElseAttributeName ;
}
2020-05-23 01:19:42 +02:00
ANYTHING_ELSE
{
2020-06-26 22:41:35 +01:00
AnythingElseAttributeName :
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-23 01:19:42 +02:00
continue ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
2020-05-23 01:19:42 +02:00
// After attribute name: the name has ended; decides between a value ('='), the end
// of the tag, or the start of another attribute.
BEGIN_STATE ( AfterAttributeName )
{
2020-05-27 18:27:32 +02:00
ON_WHITESPACE
{
continue ;
}
ON ( ' / ' )
{
SWITCH_TO ( SelfClosingStartTag ) ;
}
ON ( ' = ' )
{
2021-07-14 23:53:11 +02:00
m_current_token . last_attribute ( ) . name_end_position = nth_last_position ( 1 ) ;
2020-05-27 18:27:32 +02:00
SWITCH_TO ( BeforeAttributeValue ) ;
}
ON ( ' > ' )
{
2020-06-04 11:59:14 +02:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-27 18:27:32 +02:00
}
// Spec: eof-in-tag. Only EOF is emitted; the unfinished tag token is not queued.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:47:07 +02:00
EMIT_EOF ;
2020-05-27 18:27:32 +02:00
}
// A new attribute begins; AttributeName reprocesses this same character.
ANYTHING_ELSE
{
2021-07-14 23:53:11 +02:00
m_current_token . add_attribute ( { } ) ;
2022-06-02 01:03:44 +02:00
// Guarded because m_source_positions may be empty (last() is only called when it is not).
if ( ! m_source_positions . is_empty ( ) )
m_current_token . last_attribute ( ) . name_start_position = m_source_positions . last ( ) ;
2020-05-27 18:27:32 +02:00
RECONSUME_IN ( AttributeName ) ;
}
2020-05-23 01:19:42 +02:00
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
2020-05-23 01:19:42 +02:00
// Before attribute value: an '=' was seen; picks the quoted or unquoted value state.
BEGIN_STATE ( BeforeAttributeValue )
{
2021-07-14 23:53:11 +02:00
// Written before the character is dispatched, so it runs for every character
// handled in this state and the last write before leaving the state is the one kept.
m_current_token . last_attribute ( ) . value_start_position = nth_last_position ( 1 ) ;
2020-05-23 01:19:42 +02:00
ON_WHITESPACE
{
continue ;
}
ON ( ' " ' )
{
SWITCH_TO ( AttributeValueDoubleQuoted ) ;
}
ON ( ' \' ' )
{
SWITCH_TO ( AttributeValueSingleQuoted ) ;
}
// Spec: missing-attribute-value. The tag is emitted with whatever value the attribute already holds.
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-04 11:59:14 +02:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-23 01:19:42 +02:00
}
// Anything else is the first character of an unquoted value.
ANYTHING_ELSE
{
RECONSUME_IN ( AttributeValueUnquoted ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
2020-05-23 01:19:42 +02:00
// Attribute value (double-quoted): accumulates the value in m_current_builder
// until the closing quote.
BEGIN_STATE ( AttributeValueDoubleQuoted )
{
ON ( ' " ' )
{
2021-07-14 23:53:11 +02:00
m_current_token . last_attribute ( ) . value = consume_current_builder ( ) ;
2020-05-23 01:19:42 +02:00
SWITCH_TO ( AfterAttributeValueQuoted ) ;
}
// Character reference inside the value: remember where to return to. The builder
// holds the partial value, hence SWITCH_TO_WITH_UNCLEAN_BUILDER.
ON ( ' & ' )
{
m_return_state = State : : AttributeValueDoubleQuoted ;
2021-07-24 23:15:47 +01:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CharacterReference ) ;
2020-05-23 01:19:42 +02:00
}
// Spec: unexpected-null-character. NUL is replaced by U+FFFD in the value.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-26 22:41:35 +01:00
continue ;
2020-05-23 01:19:42 +02:00
}
// Spec: eof-in-tag. Only EOF is emitted; the unfinished tag token is not queued.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-27 12:05:21 +02:00
EMIT_EOF ;
2020-05-23 01:19:42 +02:00
}
ANYTHING_ELSE
{
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-23 01:19:42 +02:00
continue ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
2020-05-23 01:19:42 +02:00
// Attribute value (single-quoted): accumulates the value in m_current_builder
// until the closing apostrophe.
BEGIN_STATE ( AttributeValueSingleQuoted )
{
ON ( ' \' ' )
{
2021-07-14 23:53:11 +02:00
m_current_token . last_attribute ( ) . value = consume_current_builder ( ) ;
2020-05-23 01:19:42 +02:00
SWITCH_TO ( AfterAttributeValueQuoted ) ;
}
// Character reference inside the value: remember where to return to. The builder
// holds the partial value, hence SWITCH_TO_WITH_UNCLEAN_BUILDER.
ON ( ' & ' )
{
m_return_state = State : : AttributeValueSingleQuoted ;
2021-07-24 23:15:47 +01:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CharacterReference ) ;
2020-05-23 01:19:42 +02:00
}
// Spec: unexpected-null-character. NUL is replaced by U+FFFD in the value.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-26 22:41:35 +01:00
continue ;
2020-05-23 01:19:42 +02:00
}
// Spec: eof-in-tag. Only EOF is emitted; the unfinished tag token is not queued.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-27 12:05:21 +02:00
EMIT_EOF ;
2020-05-23 01:19:42 +02:00
}
ANYTHING_ELSE
{
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-23 01:19:42 +02:00
continue ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
2020-05-23 01:19:42 +02:00
// Attribute value (unquoted): accumulates the value in m_current_builder until
// whitespace or '>' ends it.
BEGIN_STATE ( AttributeValueUnquoted )
{
// Whitespace ends the value; the value text and its end position are stored.
ON_WHITESPACE
{
2021-07-14 23:53:11 +02:00
m_current_token . last_attribute ( ) . value = consume_current_builder ( ) ;
2021-12-10 17:17:12 +00:00
m_current_token . last_attribute ( ) . value_end_position = nth_last_position ( 1 ) ;
2020-05-23 01:19:42 +02:00
SWITCH_TO ( BeforeAttributeName ) ;
}
// Character reference inside the value: remember where to return to. The builder
// holds the partial value, hence SWITCH_TO_WITH_UNCLEAN_BUILDER.
ON ( ' & ' )
{
m_return_state = State : : AttributeValueUnquoted ;
2021-07-24 23:15:47 +01:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CharacterReference ) ;
2020-05-23 01:19:42 +02:00
}
// '>' ends both the value and the tag.
ON ( ' > ' )
{
2021-07-14 23:53:11 +02:00
m_current_token . last_attribute ( ) . value = consume_current_builder ( ) ;
m_current_token . last_attribute ( ) . value_end_position = nth_last_position ( 1 ) ;
2020-05-24 00:14:23 +02:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-23 01:19:42 +02:00
}
// Spec: unexpected-null-character. NUL is replaced by U+FFFD in the value.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-26 22:41:35 +01:00
continue ;
}
// Spec: unexpected-character-in-unquoted-attribute-value for '"', '\'', '<', '='
// and '`'. After logging, they are appended like any other character via the shared label.
ON ( ' " ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:41:35 +01:00
goto AnythingElseAttributeValueUnquoted ;
}
ON ( ' \' ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:41:35 +01:00
goto AnythingElseAttributeValueUnquoted ;
}
ON ( ' < ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:41:35 +01:00
goto AnythingElseAttributeValueUnquoted ;
}
ON ( ' = ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:41:35 +01:00
goto AnythingElseAttributeValueUnquoted ;
}
ON ( ' ` ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:41:35 +01:00
goto AnythingElseAttributeValueUnquoted ;
2020-05-23 01:19:42 +02:00
}
// Spec: eof-in-tag. Only EOF is emitted; the unfinished tag token is not queued.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-27 12:05:21 +02:00
EMIT_EOF ;
2020-05-23 01:19:42 +02:00
}
ANYTHING_ELSE
{
2020-06-26 22:41:35 +01:00
AnythingElseAttributeValueUnquoted :
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-23 01:19:42 +02:00
continue ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
2020-05-23 01:19:42 +02:00
// After attribute value (quoted): a closing quote was just consumed; expects
// whitespace, '/' or '>'.
BEGIN_STATE ( AfterAttributeValueQuoted )
{
2021-07-14 23:53:11 +02:00
// Recorded before the character is dispatched, i.e. for whichever character follows the quote.
m_current_token . last_attribute ( ) . value_end_position = nth_last_position ( 1 ) ;
2020-05-23 01:19:42 +02:00
ON_WHITESPACE
{
SWITCH_TO ( BeforeAttributeName ) ;
}
ON ( ' / ' )
{
SWITCH_TO ( SelfClosingStartTag ) ;
}
ON ( ' > ' )
{
2020-05-24 00:14:23 +02:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-23 01:19:42 +02:00
}
// Spec: eof-in-tag. Only EOF is emitted; the unfinished tag token is not queued.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-05-30 18:40:23 +02:00
EMIT_EOF ;
2020-05-23 01:19:42 +02:00
}
// Spec: missing-whitespace-between-attributes. The character is reprocessed as the start of an attribute.
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-05-30 18:40:23 +02:00
RECONSUME_IN ( BeforeAttributeName ) ;
2020-05-23 01:19:42 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.43 Comment start state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
2020-05-23 01:54:26 +02:00
// Comment start: first character after "<!--".
BEGIN_STATE ( CommentStart )
{
ON ( ' - ' )
{
SWITCH_TO ( CommentStartDash ) ;
}
// Spec: abrupt-closing-of-empty-comment ("<!-->"). The comment token is emitted as is.
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-11 02:09:07 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-23 01:54:26 +02:00
}
// Anything else (including EOF, which has no dedicated handler here) is handled by Comment.
ANYTHING_ELSE
{
RECONSUME_IN ( Comment ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.44 Comment start dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
2020-05-23 01:54:26 +02:00
// Comment start dash: "<!--" followed by a single '-'.
BEGIN_STATE ( CommentStartDash )
{
// "<!---" : a second dash may be the start of the comment terminator.
ON ( ' - ' )
{
2021-07-15 00:35:53 +02:00
SWITCH_TO ( CommentEnd ) ;
2020-05-23 01:54:26 +02:00
}
// Spec: abrupt-closing-of-empty-comment ("<!--->"). The comment token is emitted as is.
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-11 02:09:07 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-23 01:54:26 +02:00
}
// Spec: eof-in-comment. The spec requires the current comment token to be emitted
// before the end-of-file token. EMIT_EOF alone does not queue the current token
// (the DOCTYPE states queue theirs explicitly), so queue it here first.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 02:09:07 +01:00
EMIT_EOF ;
2020-05-23 01:54:26 +02:00
}
// The dash was ordinary comment data after all; Comment reprocesses this character.
ANYTHING_ELSE
{
2021-05-23 09:31:40 +02:00
m_current_builder . append ( ' - ' ) ;
2020-05-23 01:54:26 +02:00
RECONSUME_IN ( Comment ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.45 Comment state, https://html.spec.whatwg.org/multipage/parsing.html#comment-state
2020-05-23 01:54:26 +02:00
// Comment: accumulates comment data in m_current_builder.
BEGIN_STATE ( Comment )
{
// '<' is comment data, but may start a nested "<!--" that needs to be reported.
ON ( ' < ' )
{
2021-05-23 09:31:40 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CommentLessThanSign ) ;
2020-05-23 01:54:26 +02:00
}
// '-' may begin the "-->" terminator; it is not appended yet.
ON ( ' - ' )
{
2021-05-23 09:31:40 +02:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CommentEndDash ) ;
2020-05-23 01:54:26 +02:00
}
// Spec: unexpected-null-character. NUL is replaced by U+FFFD in the comment data.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-05-23 09:31:40 +02:00
m_current_builder . append_code_point ( 0xFFFD ) ;
2020-06-11 02:09:07 +01:00
continue ;
2020-05-23 01:54:26 +02:00
}
// Spec: eof-in-comment. The spec requires the current comment token to be emitted
// before the end-of-file token. EMIT_EOF alone does not queue the current token
// (the DOCTYPE states queue theirs explicitly), so queue it here first; otherwise
// the comment text stored just above would be discarded.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-14 23:32:18 +02:00
m_current_token . set_comment ( consume_current_builder ( ) ) ;
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 02:09:07 +01:00
EMIT_EOF ;
2020-05-23 01:54:26 +02:00
}
ANYTHING_ELSE
{
2021-05-23 09:31:40 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-23 01:54:26 +02:00
continue ;
}
}
2020-05-23 14:04:53 +02:00
END_STATE
2020-05-23 01:54:26 +02:00
2022-02-15 18:12:15 +00:00
// 13.2.5.51 Comment end state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
2020-05-23 01:54:26 +02:00
// Comment end: "--" has been seen inside a comment.
BEGIN_STATE ( CommentEnd )
{
// "-->" : the comment is complete.
ON ( ' > ' )
{
2021-07-14 23:32:18 +02:00
m_current_token . set_comment ( consume_current_builder ( ) ) ;
2020-05-24 00:14:23 +02:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-23 01:54:26 +02:00
}
ON ( ' ! ' )
{
2021-07-15 00:35:53 +02:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CommentEndBang ) ;
2020-05-23 01:54:26 +02:00
}
// "---" : one dash becomes comment data, the trailing two remain pending.
ON ( ' - ' )
{
2021-05-23 09:31:40 +02:00
m_current_builder . append ( ' - ' ) ;
2020-05-23 01:54:26 +02:00
continue ;
}
// Spec: eof-in-comment. The spec requires the current comment token to be emitted
// before the end-of-file token. EMIT_EOF alone does not queue the current token
// (the DOCTYPE states queue theirs explicitly), so queue it here first; otherwise
// the comment text stored just above would be discarded.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-14 23:32:18 +02:00
m_current_token . set_comment ( consume_current_builder ( ) ) ;
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 02:09:07 +01:00
EMIT_EOF ;
2020-05-23 01:54:26 +02:00
}
// The pending "--" was ordinary comment data; Comment reprocesses this character.
ANYTHING_ELSE
{
2022-02-18 23:02:52 +00:00
m_current_builder . append ( " -- " ) ;
2020-05-23 01:54:26 +02:00
RECONSUME_IN ( Comment ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.52 Comment end bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
2020-05-23 01:54:26 +02:00
// Comment end bang: "--!" has been seen inside a comment.
BEGIN_STATE ( CommentEndBang )
{
// "--!-" : the "--!" becomes comment data and the new dash may start a terminator.
ON ( ' - ' )
{
2021-05-23 09:31:40 +02:00
m_current_builder . append ( " --! " ) ;
2021-07-15 00:35:53 +02:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CommentEndDash ) ;
2020-05-23 01:54:26 +02:00
}
// Spec: incorrectly-closed-comment ("--!>"). The comment is still emitted.
ON ( ' > ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-14 23:32:18 +02:00
m_current_token . set_comment ( consume_current_builder ( ) ) ;
2020-06-11 02:09:07 +01:00
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-23 01:54:26 +02:00
}
// Spec: eof-in-comment. The spec requires the current comment token to be emitted
// before the end-of-file token. EMIT_EOF alone does not queue the current token
// (the DOCTYPE states queue theirs explicitly), so queue it here first; otherwise
// the comment text stored just above would be discarded.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-14 23:32:18 +02:00
m_current_token . set_comment ( consume_current_builder ( ) ) ;
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 02:09:07 +01:00
EMIT_EOF ;
2020-05-23 01:54:26 +02:00
}
// The pending "--!" was ordinary comment data; Comment reprocesses this character.
ANYTHING_ELSE
{
2021-05-23 09:31:40 +02:00
m_current_builder . append ( " --! " ) ;
2020-05-23 01:54:26 +02:00
RECONSUME_IN ( Comment ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.50 Comment end dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
2020-05-23 01:54:26 +02:00
// Comment end dash: a single '-' has been seen inside a comment.
BEGIN_STATE ( CommentEndDash )
{
ON ( ' - ' )
{
2021-05-23 09:31:40 +02:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CommentEnd ) ;
2020-05-23 01:54:26 +02:00
}
// Spec: eof-in-comment. The spec requires the current comment token to be emitted
// before the end-of-file token. EMIT_EOF alone does not queue the current token
// (the DOCTYPE states queue theirs explicitly), so queue it here first; otherwise
// the comment text stored just above would be discarded.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2021-07-14 23:32:18 +02:00
m_current_token . set_comment ( consume_current_builder ( ) ) ;
m_queued_tokens . enqueue ( move ( m_current_token ) ) ;
2020-06-11 02:09:07 +01:00
EMIT_EOF ;
2020-05-23 01:54:26 +02:00
}
// The pending '-' was ordinary comment data; Comment reprocesses this character.
ANYTHING_ELSE
{
2021-05-23 09:31:40 +02:00
m_current_builder . append ( ' - ' ) ;
2020-05-23 01:54:26 +02:00
RECONSUME_IN ( Comment ) ;
}
}
2020-05-23 14:04:53 +02:00
END_STATE
2020-05-23 01:54:26 +02:00
2022-02-15 18:12:15 +00:00
// 13.2.5.46 Comment less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
2020-05-23 01:54:26 +02:00
// Comment less-than sign: a '<' was just appended to the comment data; watches for "<!".
BEGIN_STATE ( CommentLessThanSign )
{
ON ( ' ! ' )
{
2021-05-23 09:31:40 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CommentLessThanSignBang ) ;
2020-05-23 01:54:26 +02:00
}
// Another '<' is appended and this state keeps looking for '!'.
ON ( ' < ' )
{
2021-05-23 09:31:40 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-23 01:54:26 +02:00
continue ;
}
ANYTHING_ELSE
{
RECONSUME_IN ( Comment ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.47 Comment less-than sign bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
2020-05-23 01:54:26 +02:00
// Comment less-than sign bang: "<!" seen inside a comment; watches for a following '-'.
BEGIN_STATE ( CommentLessThanSignBang )
{
// The '-' is not appended here; later states decide what to add to the comment data.
ON ( ' - ' )
{
2021-07-15 00:35:53 +02:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CommentLessThanSignBangDash ) ;
2020-05-23 01:54:26 +02:00
}
ANYTHING_ELSE
{
RECONSUME_IN ( Comment ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.48 Comment less-than sign bang dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
2020-05-23 01:54:26 +02:00
// Comment less-than sign bang dash: "<!-" seen inside a comment; watches for a second '-'.
BEGIN_STATE ( CommentLessThanSignBangDash )
{
ON ( ' - ' )
{
2021-07-15 00:35:53 +02:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( CommentLessThanSignBangDashDash ) ;
2020-05-23 01:54:26 +02:00
}
// Only one dash so far: CommentEndDash reprocesses this character with that dash pending.
ANYTHING_ELSE
{
2020-06-11 02:09:07 +01:00
RECONSUME_IN ( CommentEndDash ) ;
2020-05-23 01:54:26 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.49 Comment less-than sign bang dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
2020-05-23 01:54:26 +02:00
// Comment less-than sign bang dash dash: "<!--" seen inside a comment. All three
// branches reconsume in CommentEnd; only the last one logs a parse error.
BEGIN_STATE ( CommentLessThanSignBangDashDash )
{
// "<!-->" closes the comment without an error.
ON ( ' > ' )
{
2020-06-11 02:09:07 +01:00
RECONSUME_IN ( CommentEnd ) ;
}
ON_EOF
{
RECONSUME_IN ( CommentEnd ) ;
2020-05-23 01:54:26 +02:00
}
// Spec: nested-comment. A "<!--" appeared inside an open comment.
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-01 20:14:23 +02:00
RECONSUME_IN ( CommentEnd ) ;
2020-05-23 01:54:26 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.72 Character reference state, https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
2020-05-22 21:46:13 +02:00
// Character reference: an '&' was consumed; m_return_state says where to resume.
BEGIN_STATE ( CharacterReference )
{
2020-05-27 16:16:23 +02:00
// The temporary buffer collects the raw reference text, starting with the '&'.
m_temporary_buffer . clear ( ) ;
m_temporary_buffer . append ( ' & ' ) ;
ON_ASCII_ALPHANUMERIC
{
RECONSUME_IN ( NamedCharacterReference ) ;
}
ON ( ' # ' )
{
m_temporary_buffer . append ( current_input_character . value ( ) ) ;
2021-07-28 00:37:26 +01:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( NumericCharacterReference ) ;
2020-05-27 16:16:23 +02:00
}
// Not a reference: flush the buffered '&' and let the return state reprocess this character.
ANYTHING_ELSE
{
2020-12-29 14:42:47 +00:00
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE ;
2020-05-27 16:16:23 +02:00
RECONSUME_IN_RETURN_STATE ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
2020-05-27 16:16:23 +02:00
// Named character reference: tries to match a named entity in the input starting
// at the byte offset of m_prev_utf8_iterator.
BEGIN_STATE ( NamedCharacterReference )
{
2020-06-04 22:02:54 +02:00
size_t byte_offset = m_utf8_view . byte_offset_of ( m_prev_utf8_iterator ) ;
2022-02-18 22:12:47 +00:00
// Match against the rest of the decoded input from that offset.
auto match = HTML : : code_points_from_entity ( m_decoded_input . substring_view ( byte_offset , m_decoded_input . length ( ) - byte_offset ) ) ;
2020-05-28 11:44:19 +02:00
if ( match . has_value ( ) ) {
2021-05-20 23:11:41 +04:30
// Skips one less than the entity length — presumably because the entity's
// first character has already been consumed (TODO confirm against skip()).
skip ( match - > entity . length ( ) - 1 ) ;
2020-05-28 11:44:19 +02:00
for ( auto ch : match . value ( ) . entity )
m_temporary_buffer . append ( ch ) ;
2020-12-29 14:42:47 +00:00
// Inside an attribute, a match without a trailing ';' that is followed by '='
// or an ASCII alphanumeric is not treated as a reference: the raw text is
// flushed unchanged (the spec keeps this behaviour "for historical reasons").
if ( consumed_as_part_of_an_attribute ( ) & & ! match . value ( ) . entity . ends_with ( ' ; ' ) ) {
2020-08-05 16:31:20 -04:00
auto next_code_point = peek_code_point ( 0 ) ;
2021-06-01 21:18:08 +02:00
if ( next_code_point . has_value ( ) & & ( next_code_point . value ( ) = = ' = ' | | is_ascii_alphanumeric ( next_code_point . value ( ) ) ) ) {
2020-08-05 16:28:19 -04:00
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE ;
2020-05-28 11:44:19 +02:00
SWITCH_TO_RETURN_STATE ;
}
}
// Spec: missing-semicolon-after-character-reference.
if ( ! match . value ( ) . entity . ends_with ( ' ; ' ) ) {
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-05-28 11:44:19 +02:00
}
2021-06-12 13:24:45 +02:00
// Replace the raw reference text with the code points it stands for, then flush those.
m_temporary_buffer = match . value ( ) . code_points ;
2020-05-28 11:44:19 +02:00
2020-08-05 16:28:19 -04:00
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE ;
2020-05-28 11:44:19 +02:00
SWITCH_TO_RETURN_STATE ;
} else {
// No entity matched: flush what was buffered and continue in AmbiguousAmpersand.
2020-08-05 16:28:19 -04:00
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE ;
2020-12-29 14:42:47 +00:00
// FIXME: This should be SWITCH_TO, but we always lose the first character on this path, so just reconsume it.
// I can't wrap my head around how to do it as the spec says.
RECONSUME_IN ( AmbiguousAmpersand ) ;
2020-05-28 11:44:19 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.74 Ambiguous ampersand state, https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
2020-05-28 11:44:19 +02:00
BEGIN_STATE ( AmbiguousAmpersand )
{
// Entered (via RECONSUME_IN) from NamedCharacterReference when no entity matched.
// Alphanumerics go to the current builder when inside an attribute, otherwise through EMIT_CURRENT_CHARACTER.
ON_ASCII_ALPHANUMERIC
{
if ( consumed_as_part_of_an_attribute ( ) ) {
2021-05-23 08:50:48 +02:00
m_current_builder . append_code_point ( current_input_character . value ( ) ) ;
2020-05-28 11:44:19 +02:00
continue ;
} else {
EMIT_CURRENT_CHARACTER ;
}
}
// A ';' at this point is logged as a parse error and then reconsumed in the return state.
2020-05-27 16:16:23 +02:00
ON ( ' ; ' )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-26 22:41:35 +01:00
RECONSUME_IN_RETURN_STATE ;
2020-05-27 16:16:23 +02:00
}
// Any other input is reconsumed in the return state without logging.
ANYTHING_ELSE
{
2020-05-28 11:44:19 +02:00
RECONSUME_IN_RETURN_STATE ;
2020-05-27 16:16:23 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.75 Numeric character reference state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
2020-05-27 16:16:23 +02:00
BEGIN_STATE ( NumericCharacterReference )
{
// Every numeric reference starts from a zeroed accumulator.
m_character_reference_code = 0 ;
// 'X' or 'x' selects the hexadecimal path; the marker character is kept in the temporary buffer.
ON ( ' X ' )
{
m_temporary_buffer . append ( current_input_character . value ( ) ) ;
2021-07-28 00:37:26 +01:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( HexadecimalCharacterReferenceStart ) ;
2020-05-27 16:16:23 +02:00
}
ON ( ' x ' )
{
m_temporary_buffer . append ( current_input_character . value ( ) ) ;
2021-07-28 00:37:26 +01:00
SWITCH_TO_WITH_UNCLEAN_BUILDER ( HexadecimalCharacterReferenceStart ) ;
2020-05-27 16:16:23 +02:00
}
// Anything else is handed to the decimal path without being consumed here.
ANYTHING_ELSE
{
RECONSUME_IN ( DecimalCharacterReferenceStart ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.76 Hexadecimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
2020-05-27 16:16:23 +02:00
BEGIN_STATE ( HexadecimalCharacterReferenceStart )
{
// A hex digit starts the actual number; it is reconsumed so HexadecimalCharacterReference accumulates it.
ON_ASCII_HEX_DIGIT
{
RECONSUME_IN ( HexadecimalCharacterReference ) ;
}
// No digit after the 'x'/'X' marker: log a parse error, flush the buffered characters
// and reconsume the current input in the return state.
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-08-05 16:28:19 -04:00
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE ;
2020-06-26 22:41:35 +01:00
RECONSUME_IN_RETURN_STATE ;
2020-05-27 16:16:23 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.77 Decimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
2020-05-27 16:16:23 +02:00
BEGIN_STATE ( DecimalCharacterReferenceStart )
{
// A decimal digit starts the actual number; it is reconsumed so DecimalCharacterReference accumulates it.
ON_ASCII_DIGIT
{
RECONSUME_IN ( DecimalCharacterReference ) ;
}
// No digit present: log a parse error, flush the buffered characters
// and reconsume the current input in the return state.
ANYTHING_ELSE
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-08-05 16:28:19 -04:00
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE ;
2020-06-26 22:41:35 +01:00
RECONSUME_IN_RETURN_STATE ;
2020-05-27 16:16:23 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.78 Hexadecimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
2020-05-27 16:16:23 +02:00
BEGIN_STATE(HexadecimalCharacterReference)
{
    // Folds one hexadecimal digit into m_character_reference_code.
    // Once the code is already above 0x10ffff it is left untouched: it stays out of range,
    // so NumericCharacterReferenceEnd still rejects it, but an arbitrarily long digit
    // sequence can no longer overflow the accumulator and wrap back into the valid range.
    auto accumulate_hex_digit = [this](u32 digit_value) {
        if (m_character_reference_code > 0x10ffff)
            return;
        m_character_reference_code *= 16;
        m_character_reference_code += digit_value;
    };

    ON_ASCII_DIGIT
    {
        // '0' is 0x30.
        accumulate_hex_digit(current_input_character.value() - 0x30);
        continue;
    }
    // Decimal digits were handled above, so only A-F / a-f reach this branch.
    // Matching on hex digits (rather than on any ASCII letter) keeps G-Z / g-z out of the
    // number; those fall through to ANYTHING_ELSE and terminate the reference.
    ON_ASCII_HEX_DIGIT
    {
        // 'a' is 0x61, so subtracting 0x57 maps a-f onto 10-15.
        accumulate_hex_digit(to_ascii_lowercase(current_input_character.value()) - 0x57);
        continue;
    }
    // ';' terminates the reference and is consumed.
    ON(';')
    {
        SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReferenceEnd);
    }
    // A missing ';' is a parse error; the terminating input is reconsumed by the end state.
    ANYTHING_ELSE
    {
        log_parse_error();
        RECONSUME_IN(NumericCharacterReferenceEnd);
    }
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.79 Decimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
2020-05-27 16:16:23 +02:00
BEGIN_STATE(DecimalCharacterReference)
{
    ON_ASCII_DIGIT
    {
        // Fold the digit into the accumulator ('0' is 0x30).
        // Once the code is already above 0x10ffff it is left untouched: it stays out of range,
        // so NumericCharacterReferenceEnd still rejects it, but an arbitrarily long digit
        // sequence can no longer overflow the accumulator and wrap back into the valid range.
        if (m_character_reference_code <= 0x10ffff) {
            m_character_reference_code *= 10;
            m_character_reference_code += current_input_character.value() - 0x30;
        }
        continue;
    }
    // ';' terminates the reference and is consumed.
    ON(';')
    {
        SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReferenceEnd);
    }
    // A missing ';' is a parse error; the terminating input is reconsumed by the end state.
    ANYTHING_ELSE
    {
        log_parse_error();
        RECONSUME_IN(NumericCharacterReferenceEnd);
    }
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.80 Numeric character reference end state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
2020-05-27 16:16:23 +02:00
BEGIN_STATE(NumericCharacterReferenceEnd)
{
    DONT_CONSUME_NEXT_INPUT_CHARACTER;

    // Validate the accumulated code. Zero, values above 0x10ffff and surrogates are each
    // logged as a parse error and replaced with 0xFFFD.
    if (m_character_reference_code == 0) {
        log_parse_error();
        m_character_reference_code = 0xFFFD;
    }
    if (m_character_reference_code > 0x10ffff) {
        log_parse_error();
        m_character_reference_code = 0xFFFD;
    }
    if (is_unicode_surrogate(m_character_reference_code)) {
        log_parse_error();
        m_character_reference_code = 0xFFFD;
    }

    // Noncharacters are only logged; the code itself is kept.
    if (is_unicode_noncharacter(m_character_reference_code))
        log_parse_error();

    // 0xd and controls that are not ASCII whitespace are logged, and codes listed in the
    // table below are additionally replaced by their mapped code point.
    if (m_character_reference_code == 0xd || (is_unicode_control(m_character_reference_code) && !is_ascii_space(m_character_reference_code))) {
        log_parse_error();

        struct Remapping {
            u32 from;
            u32 to;
        };
        constexpr Remapping remappings[] = {
            { 0x80, 0x20AC },
            { 0x82, 0x201A },
            { 0x83, 0x0192 },
            { 0x84, 0x201E },
            { 0x85, 0x2026 },
            { 0x86, 0x2020 },
            { 0x87, 0x2021 },
            { 0x88, 0x02C6 },
            { 0x89, 0x2030 },
            { 0x8A, 0x0160 },
            { 0x8B, 0x2039 },
            { 0x8C, 0x0152 },
            { 0x8E, 0x017D },
            { 0x91, 0x2018 },
            { 0x92, 0x2019 },
            { 0x93, 0x201C },
            { 0x94, 0x201D },
            { 0x95, 0x2022 },
            { 0x96, 0x2013 },
            { 0x97, 0x2014 },
            { 0x98, 0x02DC },
            { 0x99, 0x2122 },
            { 0x9A, 0x0161 },
            { 0x9B, 0x203A },
            { 0x9C, 0x0153 },
            { 0x9E, 0x017E },
            { 0x9F, 0x0178 },
        };

        // At most one row can match, so stop at the first hit.
        for (auto const& remapping : remappings) {
            if (remapping.from == m_character_reference_code) {
                m_character_reference_code = remapping.to;
                break;
            }
        }
    }

    // The temporary buffer now holds just the resolved code point, which is then flushed.
    m_temporary_buffer.clear();
    m_temporary_buffer.append(m_character_reference_code);
    FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
    SWITCH_TO_RETURN_STATE;
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
2020-05-24 20:24:43 +02:00
BEGIN_STATE ( RCDATA )
{
// '&' may start a character reference; RCDATA is recorded as the state to come back to.
ON ( ' & ' )
{
m_return_state = State : : RCDATA ;
SWITCH_TO ( CharacterReference ) ;
}
// '<' may start an end tag.
ON ( ' < ' )
{
SWITCH_TO ( RCDATALessThanSign ) ;
}
// A NUL code point is logged as a parse error and 0xFFFD is emitted in its place.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-04 21:06:54 +02:00
EMIT_CHARACTER ( 0xFFFD ) ;
2020-05-24 20:24:43 +02:00
}
ON_EOF
{
EMIT_EOF ;
}
// Everything else is passed through unchanged.
ANYTHING_ELSE
{
EMIT_CURRENT_CHARACTER ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
2020-05-24 20:24:43 +02:00
BEGIN_STATE ( RCDATALessThanSign )
{
// "</" may open an end tag: start with an empty temporary buffer for its name.
ON ( ' / ' )
{
m_temporary_buffer . clear ( ) ;
SWITCH_TO ( RCDATAEndTagOpen ) ;
}
// Otherwise the '<' was plain text: emit it and reconsume the current input in RCDATA.
ANYTHING_ELSE
{
2020-06-05 12:02:30 +02:00
EMIT_CHARACTER_AND_RECONSUME_IN ( ' < ' , RCDATA ) ;
2020-05-24 20:24:43 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
2020-05-24 20:24:43 +02:00
BEGIN_STATE ( RCDATAEndTagOpen )
{
// A letter after "</" begins an end tag token; the letter is reconsumed as the first name character.
ON_ASCII_ALPHA
{
create_new_token ( HTMLToken : : Type : : EndTag ) ;
RECONSUME_IN ( RCDATAEndTagName ) ;
}
// Otherwise "</" was plain text: queue both characters and reconsume the current input in RCDATA.
ANYTHING_ELSE
{
2020-05-30 18:40:23 +02:00
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' / ' ) ) ;
RECONSUME_IN ( RCDATA ) ;
2020-05-24 20:24:43 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
2020-05-24 20:24:43 +02:00
BEGIN_STATE(RCDATAEndTagName)
{
    // Queues '<', '/' and every code point collected in m_temporary_buffer as character
    // tokens. Used whenever the would-be end tag turns out to be plain text.
    auto enqueue_end_tag_text_as_characters = [this]() {
        m_queued_tokens.enqueue(HTMLToken::make_character('<'));
        m_queued_tokens.enqueue(HTMLToken::make_character('/'));
        for (auto buffered_code_point : m_temporary_buffer)
            m_queued_tokens.enqueue(HTMLToken::make_character(buffered_code_point));
    };

    // Whitespace, '/' and '>' all finish the tag name. Only an appropriate end tag is
    // treated as a tag; otherwise its text is re-emitted and tokenization resumes in RCDATA.
    ON_WHITESPACE
    {
        m_current_token.set_tag_name(consume_current_builder());
        if (current_end_tag_token_is_appropriate())
            SWITCH_TO(BeforeAttributeName);
        enqueue_end_tag_text_as_characters();
        RECONSUME_IN(RCDATA);
    }
    ON('/')
    {
        m_current_token.set_tag_name(consume_current_builder());
        if (current_end_tag_token_is_appropriate())
            SWITCH_TO(SelfClosingStartTag);
        enqueue_end_tag_text_as_characters();
        RECONSUME_IN(RCDATA);
    }
    ON('>')
    {
        m_current_token.set_tag_name(consume_current_builder());
        if (current_end_tag_token_is_appropriate())
            SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
        enqueue_end_tag_text_as_characters();
        RECONSUME_IN(RCDATA);
    }
    // Letters extend the tag name: the builder gets the lowercased form, while the
    // temporary buffer keeps the character exactly as it appeared in the input.
    ON_ASCII_UPPER_ALPHA
    {
        m_temporary_buffer.append(current_input_character.value());
        m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
        continue;
    }
    ON_ASCII_LOWER_ALPHA
    {
        m_temporary_buffer.append(current_input_character.value());
        m_current_builder.append_code_point(current_input_character.value());
        continue;
    }
    ANYTHING_ELSE
    {
        // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
        m_current_builder.clear();
        enqueue_end_tag_text_as_characters();
        RECONSUME_IN(RCDATA);
    }
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.3 RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
2020-05-24 20:36:43 +02:00
BEGIN_STATE ( RAWTEXT )
{
// '<' may start an end tag; no character-reference branch exists in this state.
ON ( ' < ' )
{
SWITCH_TO ( RAWTEXTLessThanSign ) ;
}
// A NUL code point is logged as a parse error and 0xFFFD is emitted in its place.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-04 21:06:54 +02:00
EMIT_CHARACTER ( 0xFFFD ) ;
2020-05-24 20:36:43 +02:00
}
ON_EOF
{
EMIT_EOF ;
}
// Everything else is passed through unchanged.
ANYTHING_ELSE
{
EMIT_CURRENT_CHARACTER ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.12 RAWTEXT less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
2020-05-24 20:36:43 +02:00
BEGIN_STATE ( RAWTEXTLessThanSign )
{
// "</" may open an end tag: start with an empty temporary buffer for its name.
ON ( ' / ' )
{
m_temporary_buffer . clear ( ) ;
SWITCH_TO ( RAWTEXTEndTagOpen ) ;
}
// Otherwise the '<' was plain text: emit it and reconsume the current input in RAWTEXT.
ANYTHING_ELSE
{
2020-06-06 07:06:46 +01:00
EMIT_CHARACTER_AND_RECONSUME_IN ( ' < ' , RAWTEXT ) ;
2020-05-24 20:36:43 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.13 RAWTEXT end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
2020-05-24 20:36:43 +02:00
BEGIN_STATE ( RAWTEXTEndTagOpen )
{
// A letter after "</" begins an end tag token; the letter is reconsumed as the first name character.
ON_ASCII_ALPHA
{
create_new_token ( HTMLToken : : Type : : EndTag ) ;
RECONSUME_IN ( RAWTEXTEndTagName ) ;
}
// Otherwise "</" was plain text: queue both characters and reconsume the current input in RAWTEXT.
ANYTHING_ELSE
{
2020-05-30 16:15:16 +02:00
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' / ' ) ) ;
RECONSUME_IN ( RAWTEXT ) ;
2020-05-24 20:36:43 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.14 RAWTEXT end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
2020-05-24 20:36:43 +02:00
BEGIN_STATE(RAWTEXTEndTagName)
{
    // Queues '<', '/' and every code point collected in m_temporary_buffer as character
    // tokens. Used whenever the would-be end tag turns out to be plain text.
    auto enqueue_end_tag_text_as_characters = [this]() {
        m_queued_tokens.enqueue(HTMLToken::make_character('<'));
        m_queued_tokens.enqueue(HTMLToken::make_character('/'));
        for (auto code_point : m_temporary_buffer)
            m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
    };

    // Whitespace, '/' and '>' all finish the tag name. Only an appropriate end tag is
    // treated as a tag; otherwise its text is re-emitted and tokenization resumes in RAWTEXT.
    ON_WHITESPACE
    {
        m_current_token.set_tag_name(consume_current_builder());
        if (!current_end_tag_token_is_appropriate()) {
            enqueue_end_tag_text_as_characters();
            RECONSUME_IN(RAWTEXT);
        }
        SWITCH_TO(BeforeAttributeName);
    }
    ON('/')
    {
        m_current_token.set_tag_name(consume_current_builder());
        if (!current_end_tag_token_is_appropriate()) {
            enqueue_end_tag_text_as_characters();
            RECONSUME_IN(RAWTEXT);
        }
        SWITCH_TO(SelfClosingStartTag);
    }
    ON('>')
    {
        m_current_token.set_tag_name(consume_current_builder());
        if (!current_end_tag_token_is_appropriate()) {
            enqueue_end_tag_text_as_characters();
            RECONSUME_IN(RAWTEXT);
        }
        SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
    }
    // Letters extend the tag name: the builder gets the lowercased form, while the
    // temporary buffer keeps the character exactly as it appeared in the input.
    ON_ASCII_UPPER_ALPHA
    {
        m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
        m_temporary_buffer.append(current_input_character.value());
        continue;
    }
    ON_ASCII_LOWER_ALPHA
    {
        // append_code_point() matches the upper-case branch above and RCDATAEndTagName,
        // which append the same code point value this way.
        m_current_builder.append_code_point(current_input_character.value());
        m_temporary_buffer.append(current_input_character.value());
        continue;
    }
    ANYTHING_ELSE
    {
        // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
        m_current_builder.clear();
        enqueue_end_tag_text_as_characters();
        RECONSUME_IN(RAWTEXT);
    }
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
2020-05-24 22:00:46 +02:00
BEGIN_STATE ( ScriptData )
{
// '<' may start an end tag or an escape sequence; ScriptDataLessThanSign decides.
ON ( ' < ' )
{
SWITCH_TO ( ScriptDataLessThanSign ) ;
}
// A NUL code point is logged as a parse error and 0xFFFD is emitted in its place.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-04 21:06:54 +02:00
EMIT_CHARACTER ( 0xFFFD ) ;
2020-05-28 00:28:32 +02:00
}
ON_EOF
{
EMIT_EOF ;
}
// Everything else is passed through unchanged.
ANYTHING_ELSE
{
EMIT_CURRENT_CHARACTER ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
2020-05-28 00:28:32 +02:00
BEGIN_STATE ( PLAINTEXT )
{
// This state has no branch that switches to another state: only NUL and EOF are special-cased.
// A NUL code point is logged as a parse error and 0xFFFD is emitted in its place.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-04 21:06:54 +02:00
EMIT_CHARACTER ( 0xFFFD ) ;
2020-05-24 22:00:46 +02:00
}
ON_EOF
{
EMIT_EOF ;
}
// Everything else is passed through unchanged.
ANYTHING_ELSE
{
EMIT_CURRENT_CHARACTER ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
2020-05-24 22:00:46 +02:00
BEGIN_STATE ( ScriptDataLessThanSign )
{
// "</" may open an end tag: start with an empty temporary buffer for its name.
ON ( ' / ' )
{
m_temporary_buffer . clear ( ) ;
SWITCH_TO ( ScriptDataEndTagOpen ) ;
}
// "<!" may begin an escape sequence; both characters are still queued as text.
ON ( ' ! ' )
{
2020-05-28 18:44:17 +02:00
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' ! ' ) ) ;
SWITCH_TO ( ScriptDataEscapeStart ) ;
2020-05-24 22:00:46 +02:00
}
// Otherwise the '<' was plain text: emit it and reconsume the current input in ScriptData.
ANYTHING_ELSE
{
2020-05-26 15:50:05 +02:00
EMIT_CHARACTER_AND_RECONSUME_IN ( ' < ' , ScriptData ) ;
2020-05-24 22:00:46 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
2020-05-28 18:44:17 +02:00
BEGIN_STATE ( ScriptDataEscapeStart )
{
// First '-' after "<!": emit it and wait for the second dash.
ON ( ' - ' )
{
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CHARACTER ( ' - ' , ScriptDataEscapeStartDash ) ;
2020-05-28 18:44:17 +02:00
}
// Anything else goes back to ScriptData, which reconsumes the current input.
ANYTHING_ELSE
{
RECONSUME_IN ( ScriptData ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
2020-05-28 18:44:17 +02:00
BEGIN_STATE ( ScriptDataEscapeStartDash )
{
// Second '-' in a row: emit it and continue in ScriptDataEscapedDashDash.
ON ( ' - ' )
{
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CHARACTER ( ' - ' , ScriptDataEscapedDashDash ) ;
2020-05-28 18:44:17 +02:00
}
// Anything else goes back to ScriptData, which reconsumes the current input.
ANYTHING_ELSE
{
RECONSUME_IN ( ScriptData ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
2020-05-28 18:44:17 +02:00
BEGIN_STATE ( ScriptDataEscapedDashDash )
{
// Further dashes are emitted without leaving this state.
ON ( ' - ' )
{
EMIT_CHARACTER ( ' - ' ) ;
}
ON ( ' < ' )
{
SWITCH_TO ( ScriptDataEscapedLessThanSign ) ;
}
// '>' right after the dashes is emitted and returns to ScriptData.
ON ( ' > ' )
{
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CHARACTER ( ' > ' , ScriptData ) ;
2020-05-28 18:44:17 +02:00
}
// A NUL code point is logged as a parse error; 0xFFFD is emitted and ScriptDataEscaped takes over.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CHARACTER ( 0xFFFD , ScriptDataEscaped ) ;
2020-05-28 18:44:17 +02:00
}
// EOF in this state is logged as a parse error before EMIT_EOF.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
EMIT_EOF ;
2020-05-28 18:44:17 +02:00
}
// Any other input is emitted and ScriptDataEscaped takes over.
ANYTHING_ELSE
{
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CURRENT_CHARACTER ( ScriptDataEscaped ) ;
2020-05-28 18:44:17 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
2020-05-28 18:44:17 +02:00
BEGIN_STATE ( ScriptDataEscapedLessThanSign )
{
// "</" may open an end tag: start with an empty temporary buffer for its name.
ON ( ' / ' )
{
m_temporary_buffer . clear ( ) ;
SWITCH_TO ( ScriptDataEscapedEndTagOpen ) ;
}
// A letter after '<' is checked by ScriptDataDoubleEscapeStart; the buffer is emptied for
// the name it collects, the '<' is emitted and the letter is reconsumed there.
ON_ASCII_ALPHA
{
m_temporary_buffer . clear ( ) ;
EMIT_CHARACTER_AND_RECONSUME_IN ( ' < ' , ScriptDataDoubleEscapeStart ) ;
}
// Otherwise the '<' was plain text: emit it and reconsume the current input in ScriptDataEscaped.
ANYTHING_ELSE
{
EMIT_CHARACTER_AND_RECONSUME_IN ( ' < ' , ScriptDataEscaped ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
2020-05-28 18:44:17 +02:00
BEGIN_STATE ( ScriptDataEscapedEndTagOpen )
{
// A letter after "</" begins an end tag token; the letter is reconsumed as the first name character.
ON_ASCII_ALPHA
{
create_new_token ( HTMLToken : : Type : : EndTag ) ;
RECONSUME_IN ( ScriptDataEscapedEndTagName ) ;
}
// Otherwise "</" was plain text: queue both characters and reconsume the current input in ScriptDataEscaped.
ANYTHING_ELSE
{
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' / ' ) ) ;
RECONSUME_IN ( ScriptDataEscaped ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
2020-05-28 18:44:17 +02:00
BEGIN_STATE(ScriptDataEscapedEndTagName)
{
    // Fallback shared by every branch that abandons the would-be end tag: clear the
    // builder and queue '<', '/' and every code point collected in m_temporary_buffer
    // as character tokens.
    // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
    auto abandon_end_tag_and_enqueue_its_text = [this]() {
        m_current_builder.clear();
        m_queued_tokens.enqueue(HTMLToken::make_character('<'));
        m_queued_tokens.enqueue(HTMLToken::make_character('/'));
        for (auto code_point : m_temporary_buffer)
            m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
    };

    // Whitespace, '/' and '>' all finish the tag name. Only an appropriate end tag is
    // treated as a tag; otherwise its text is re-emitted and tokenization resumes in
    // ScriptDataEscaped.
    ON_WHITESPACE
    {
        m_current_token.set_tag_name(consume_current_builder());
        if (current_end_tag_token_is_appropriate())
            SWITCH_TO(BeforeAttributeName);
        abandon_end_tag_and_enqueue_its_text();
        RECONSUME_IN(ScriptDataEscaped);
    }
    ON('/')
    {
        m_current_token.set_tag_name(consume_current_builder());
        if (current_end_tag_token_is_appropriate())
            SWITCH_TO(SelfClosingStartTag);
        abandon_end_tag_and_enqueue_its_text();
        RECONSUME_IN(ScriptDataEscaped);
    }
    ON('>')
    {
        m_current_token.set_tag_name(consume_current_builder());
        if (current_end_tag_token_is_appropriate())
            SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
        abandon_end_tag_and_enqueue_its_text();
        RECONSUME_IN(ScriptDataEscaped);
    }
    // Letters extend the tag name: the builder gets the lowercased form, while the
    // temporary buffer keeps the character exactly as it appeared in the input.
    ON_ASCII_UPPER_ALPHA
    {
        m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
        m_temporary_buffer.append(current_input_character.value());
        continue;
    }
    ON_ASCII_LOWER_ALPHA
    {
        // append_code_point() matches the upper-case branch above, which appends the
        // same kind of code point value this way.
        m_current_builder.append_code_point(current_input_character.value());
        m_temporary_buffer.append(current_input_character.value());
        continue;
    }
    ANYTHING_ELSE
    {
        abandon_end_tag_and_enqueue_its_text();
        RECONSUME_IN(ScriptDataEscaped);
    }
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
2020-05-28 18:44:17 +02:00
BEGIN_STATE(ScriptDataDoubleEscapeStart)
{
    // True when the temporary buffer holds exactly the six code points of "script".
    auto buffer_spells_script = [this]() -> bool {
        constexpr char expected[] = { 's', 'c', 'r', 'i', 'p', 't' };
        constexpr size_t expected_length = sizeof(expected);
        if (m_temporary_buffer.size() != expected_length)
            return false;
        for (size_t index = 0; index < expected_length; ++index) {
            if (!(m_temporary_buffer[index] == expected[index]))
                return false;
        }
        return true;
    };

    // Whitespace, '/' and '>' end the collected name. The character is emitted either way;
    // only a name equal to "script" moves on to ScriptDataDoubleEscaped.
    ON_WHITESPACE
    {
        if (!buffer_spells_script())
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
        else
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
    }
    ON('/')
    {
        if (!buffer_spells_script())
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
        else
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
    }
    ON('>')
    {
        if (!buffer_spells_script())
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
        else
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
    }
    // Letters are collected lowercased in the temporary buffer and emitted as they appeared.
    ON_ASCII_UPPER_ALPHA
    {
        m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value()));
        EMIT_CURRENT_CHARACTER;
    }
    ON_ASCII_LOWER_ALPHA
    {
        m_temporary_buffer.append(current_input_character.value());
        EMIT_CURRENT_CHARACTER;
    }
    ANYTHING_ELSE
    {
        RECONSUME_IN(ScriptDataEscaped);
    }
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
2020-06-06 07:06:46 +01:00
BEGIN_STATE ( ScriptDataDoubleEscaped )
{
// '-' and '<' are emitted as text and move to the matching follow-up state.
ON ( ' - ' )
{
SWITCH_TO_AND_EMIT_CHARACTER ( ' - ' , ScriptDataDoubleEscapedDash ) ;
}
ON ( ' < ' )
{
SWITCH_TO_AND_EMIT_CHARACTER ( ' < ' , ScriptDataDoubleEscapedLessThanSign ) ;
}
// A NUL code point is logged as a parse error and 0xFFFD is emitted in its place.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
EMIT_CHARACTER ( 0xFFFD ) ;
}
// EOF in this state is logged as a parse error before EMIT_EOF.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
EMIT_EOF ;
}
// Everything else is passed through unchanged.
ANYTHING_ELSE
{
EMIT_CURRENT_CHARACTER ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
2020-06-06 07:06:46 +01:00
BEGIN_STATE ( ScriptDataDoubleEscapedDash )
{
// A second '-' is emitted and moves to ScriptDataDoubleEscapedDashDash.
ON ( ' - ' )
{
SWITCH_TO_AND_EMIT_CHARACTER ( ' - ' , ScriptDataDoubleEscapedDashDash ) ;
}
ON ( ' < ' )
{
SWITCH_TO_AND_EMIT_CHARACTER ( ' < ' , ScriptDataDoubleEscapedLessThanSign ) ;
}
// A NUL code point is logged as a parse error; 0xFFFD is emitted and ScriptDataDoubleEscaped takes over.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CHARACTER ( 0xFFFD , ScriptDataDoubleEscaped ) ;
}
// EOF in this state is logged as a parse error before EMIT_EOF.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
EMIT_EOF ;
}
// Any other input is emitted and ScriptDataDoubleEscaped takes over.
ANYTHING_ELSE
{
SWITCH_TO_AND_EMIT_CURRENT_CHARACTER ( ScriptDataDoubleEscaped ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
2020-06-06 07:06:46 +01:00
BEGIN_STATE ( ScriptDataDoubleEscapedDashDash )
{
// Further dashes are emitted without leaving this state.
ON ( ' - ' )
{
EMIT_CHARACTER ( ' - ' ) ;
}
ON ( ' < ' )
{
SWITCH_TO_AND_EMIT_CHARACTER ( ' < ' , ScriptDataDoubleEscapedLessThanSign ) ;
}
// '>' right after the dashes is emitted and returns to ScriptData.
ON ( ' > ' )
{
SWITCH_TO_AND_EMIT_CHARACTER ( ' > ' , ScriptData ) ;
}
// A NUL code point is logged as a parse error; 0xFFFD is emitted and ScriptDataDoubleEscaped takes over.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CHARACTER ( 0xFFFD , ScriptDataDoubleEscaped ) ;
}
// EOF in this state is logged as a parse error before EMIT_EOF.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
EMIT_EOF ;
}
// Any other input is emitted and ScriptDataDoubleEscaped takes over.
ANYTHING_ELSE
{
SWITCH_TO_AND_EMIT_CURRENT_CHARACTER ( ScriptDataDoubleEscaped ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
2020-06-06 07:06:46 +01:00
BEGIN_STATE ( ScriptDataDoubleEscapedLessThanSign )
{
// "</" may end the double-escaped section: empty the temporary buffer for the name that
// ScriptDataDoubleEscapeEnd collects, and emit the '/' as text.
ON ( ' / ' )
{
m_temporary_buffer . clear ( ) ;
SWITCH_TO_AND_EMIT_CHARACTER ( ' / ' , ScriptDataDoubleEscapeEnd ) ;
}
// Anything else is reconsumed in ScriptDataDoubleEscaped.
ANYTHING_ELSE
{
RECONSUME_IN ( ScriptDataDoubleEscaped ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
2020-06-06 07:06:46 +01:00
BEGIN_STATE(ScriptDataDoubleEscapeEnd)
{
    // True when the temporary buffer holds exactly the six code points of "script".
    auto buffer_spells_script = [this]() -> bool {
        constexpr char expected[] = { 's', 'c', 'r', 'i', 'p', 't' };
        constexpr size_t expected_length = sizeof(expected);
        if (m_temporary_buffer.size() != expected_length)
            return false;
        for (size_t index = 0; index < expected_length; ++index) {
            if (!(m_temporary_buffer[index] == expected[index]))
                return false;
        }
        return true;
    };

    // Whitespace, '/' and '>' end the collected name. The character is emitted either way;
    // only a name equal to "script" drops back to ScriptDataEscaped.
    ON_WHITESPACE
    {
        if (!buffer_spells_script())
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
        else
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
    }
    ON('/')
    {
        if (!buffer_spells_script())
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
        else
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
    }
    ON('>')
    {
        if (!buffer_spells_script())
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
        else
            SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
    }
    // Letters are collected lowercased in the temporary buffer and emitted as they appeared.
    ON_ASCII_UPPER_ALPHA
    {
        m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value()));
        EMIT_CURRENT_CHARACTER;
    }
    ON_ASCII_LOWER_ALPHA
    {
        m_temporary_buffer.append(current_input_character.value());
        EMIT_CURRENT_CHARACTER;
    }
    ANYTHING_ELSE
    {
        RECONSUME_IN(ScriptDataDoubleEscaped);
    }
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
2020-05-28 18:44:17 +02:00
BEGIN_STATE ( ScriptDataEscapedDash )
{
// A second '-' is emitted and moves to ScriptDataEscapedDashDash.
ON ( ' - ' )
{
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CHARACTER ( ' - ' , ScriptDataEscapedDashDash ) ;
2020-05-28 18:44:17 +02:00
}
ON ( ' < ' )
{
SWITCH_TO ( ScriptDataEscapedLessThanSign ) ;
}
// A NUL code point is logged as a parse error; 0xFFFD is emitted and ScriptDataEscaped takes over.
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CHARACTER ( 0xFFFD , ScriptDataEscaped ) ;
2020-05-28 18:44:17 +02:00
}
// EOF in this state is logged as a parse error before EMIT_EOF.
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
EMIT_EOF ;
2020-05-28 18:44:17 +02:00
}
// Any other input is emitted and ScriptDataEscaped takes over.
ANYTHING_ELSE
{
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CURRENT_CHARACTER ( ScriptDataEscaped ) ;
2020-05-28 18:44:17 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
2020-05-28 18:44:17 +02:00
BEGIN_STATE ( ScriptDataEscaped )
{
ON ( ' - ' )
{
2020-06-06 07:06:46 +01:00
SWITCH_TO_AND_EMIT_CHARACTER ( ' - ' , ScriptDataEscapedDash ) ;
2020-05-28 18:44:17 +02:00
}
ON ( ' < ' )
{
SWITCH_TO ( ScriptDataEscapedLessThanSign ) ;
}
ON ( 0 )
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
EMIT_CHARACTER ( 0xFFFD ) ;
2020-05-28 18:44:17 +02:00
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-06 07:06:46 +01:00
EMIT_EOF ;
2020-05-28 18:44:17 +02:00
}
ANYTHING_ELSE
{
EMIT_CURRENT_CHARACTER ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
2020-05-24 22:00:46 +02:00
BEGIN_STATE ( ScriptDataEndTagOpen )
{
ON_ASCII_ALPHA
{
create_new_token ( HTMLToken : : Type : : EndTag ) ;
RECONSUME_IN ( ScriptDataEndTagName ) ;
}
ANYTHING_ELSE
{
2020-06-06 07:06:46 +01:00
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' / ' ) ) ;
RECONSUME_IN ( ScriptData ) ;
2020-05-24 22:00:46 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
2020-05-24 22:00:46 +02:00
BEGIN_STATE ( ScriptDataEndTagName )
{
ON_WHITESPACE
{
2021-07-14 23:37:48 +02:00
m_current_token . set_tag_name ( consume_current_builder ( ) ) ;
2020-05-24 22:00:46 +02:00
if ( current_end_tag_token_is_appropriate ( ) )
SWITCH_TO ( BeforeAttributeName ) ;
2020-05-30 22:59:41 +02:00
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' / ' ) ) ;
2021-07-16 00:36:10 +02:00
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
m_current_builder . clear ( ) ;
2020-08-05 16:31:20 -04:00
for ( auto code_point : m_temporary_buffer )
m_queued_tokens . enqueue ( HTMLToken : : make_character ( code_point ) ) ;
2020-05-30 22:59:41 +02:00
RECONSUME_IN ( ScriptData ) ;
2020-05-24 22:00:46 +02:00
}
ON ( ' / ' )
{
2021-07-14 23:37:48 +02:00
m_current_token . set_tag_name ( consume_current_builder ( ) ) ;
2020-05-24 22:00:46 +02:00
if ( current_end_tag_token_is_appropriate ( ) )
SWITCH_TO ( SelfClosingStartTag ) ;
2020-05-30 22:59:41 +02:00
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' / ' ) ) ;
2021-07-16 00:36:10 +02:00
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
m_current_builder . clear ( ) ;
2020-08-05 16:31:20 -04:00
for ( auto code_point : m_temporary_buffer )
m_queued_tokens . enqueue ( HTMLToken : : make_character ( code_point ) ) ;
2020-05-30 22:59:41 +02:00
RECONSUME_IN ( ScriptData ) ;
2020-05-24 22:00:46 +02:00
}
ON ( ' > ' )
{
2021-07-14 23:37:48 +02:00
m_current_token . set_tag_name ( consume_current_builder ( ) ) ;
2020-05-24 22:00:46 +02:00
if ( current_end_tag_token_is_appropriate ( ) )
SWITCH_TO_AND_EMIT_CURRENT_TOKEN ( Data ) ;
2020-05-30 22:59:41 +02:00
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' / ' ) ) ;
2021-07-16 00:36:10 +02:00
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
m_current_builder . clear ( ) ;
2020-08-05 16:31:20 -04:00
for ( auto code_point : m_temporary_buffer )
m_queued_tokens . enqueue ( HTMLToken : : make_character ( code_point ) ) ;
2020-05-30 22:59:41 +02:00
RECONSUME_IN ( ScriptData ) ;
2020-05-24 22:00:46 +02:00
}
ON_ASCII_UPPER_ALPHA
{
2021-05-23 09:16:07 +02:00
m_current_builder . append_code_point ( to_ascii_lowercase ( current_input_character . value ( ) ) ) ;
2020-05-24 22:00:46 +02:00
m_temporary_buffer . append ( current_input_character . value ( ) ) ;
continue ;
}
ON_ASCII_LOWER_ALPHA
{
2021-05-23 09:16:07 +02:00
m_current_builder . append ( current_input_character . value ( ) ) ;
2020-05-24 22:00:46 +02:00
m_temporary_buffer . append ( current_input_character . value ( ) ) ;
continue ;
}
ANYTHING_ELSE
{
2020-05-30 22:59:41 +02:00
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' < ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' / ' ) ) ;
2021-07-15 01:25:34 +02:00
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
m_current_builder . clear ( ) ;
2020-08-05 16:31:20 -04:00
for ( auto code_point : m_temporary_buffer )
m_queued_tokens . enqueue ( HTMLToken : : make_character ( code_point ) ) ;
2020-05-30 22:59:41 +02:00
RECONSUME_IN ( ScriptData ) ;
2020-05-24 22:00:46 +02:00
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.69 CDATA section state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
2020-06-12 23:43:06 +01:00
BEGIN_STATE ( CDATASection )
{
ON ( ' ] ' )
{
SWITCH_TO ( CDATASectionBracket ) ;
}
ON_EOF
{
2021-04-24 20:22:30 -07:00
log_parse_error ( ) ;
2020-06-12 23:43:06 +01:00
EMIT_EOF ;
}
ANYTHING_ELSE
{
EMIT_CURRENT_CHARACTER ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.70 CDATA section bracket state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
2020-06-12 23:43:06 +01:00
BEGIN_STATE ( CDATASectionBracket )
{
ON ( ' ] ' )
{
SWITCH_TO ( CDATASectionEnd ) ;
}
ANYTHING_ELSE
{
EMIT_CHARACTER_AND_RECONSUME_IN ( ' ] ' , CDATASection ) ;
}
}
END_STATE
2022-02-15 18:12:15 +00:00
// 13.2.5.71 CDATA section end state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
2020-06-12 23:43:06 +01:00
BEGIN_STATE ( CDATASectionEnd )
{
ON ( ' ] ' )
{
EMIT_CHARACTER ( ' ] ' ) ;
}
ON ( ' > ' )
{
SWITCH_TO ( Data ) ;
}
ANYTHING_ELSE
{
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' ] ' ) ) ;
m_queued_tokens . enqueue ( HTMLToken : : make_character ( ' ] ' ) ) ;
RECONSUME_IN ( CDATASection ) ;
}
}
END_STATE
2020-05-22 21:46:13 +02:00
default :
2020-05-24 22:00:46 +02:00
TODO ( ) ;
2020-05-22 21:46:13 +02:00
}
}
}
2021-11-11 00:55:02 +01:00
bool HTMLTokenizer : : consume_next_if_match ( StringView string , CaseSensitivity case_sensitivity )
2020-05-22 21:46:13 +02:00
{
for ( size_t i = 0 ; i < string . length ( ) ; + + i ) {
2020-08-05 16:31:20 -04:00
auto code_point = peek_code_point ( i ) ;
if ( ! code_point . has_value ( ) )
2020-05-22 21:46:13 +02:00
return false ;
// FIXME: This should be more Unicode-aware.
2020-05-25 19:22:23 +02:00
if ( case_sensitivity = = CaseSensitivity : : CaseInsensitive ) {
2020-08-05 16:31:20 -04:00
if ( code_point . value ( ) < 0x80 ) {
2021-06-01 21:18:08 +02:00
if ( to_ascii_lowercase ( code_point . value ( ) ) ! = to_ascii_lowercase ( string [ i ] ) )
2020-05-25 19:22:23 +02:00
return false ;
continue ;
}
}
2020-08-05 16:31:20 -04:00
if ( code_point . value ( ) ! = ( u32 ) string [ i ] )
2020-05-22 21:46:13 +02:00
return false ;
}
2021-05-20 23:11:41 +04:30
skip ( string . length ( ) ) ;
2020-05-22 21:46:13 +02:00
return true ;
}
void HTMLTokenizer : : create_new_token ( HTMLToken : : Type type )
{
2021-07-15 15:55:46 +02:00
m_current_token = { type } ;
2021-05-20 23:11:41 +04:30
size_t offset = 0 ;
switch ( type ) {
case HTMLToken : : Type : : StartTag :
offset = 1 ;
break ;
case HTMLToken : : Type : : EndTag :
offset = 2 ;
break ;
default :
break ;
}
2021-07-15 15:53:54 +02:00
m_current_token . set_start_position ( { } , nth_last_position ( offset ) ) ;
2020-05-22 21:46:13 +02:00
}
2022-02-19 15:58:21 +01:00
// Default constructor: no caller-supplied input.
// Initializes the decoded input from a string literal, builds the UTF-8 view over it,
// points both the current and the previous iterator at the beginning of that view,
// and seeds the source position list with a single (0, 0) entry.
HTMLTokenizer : : HTMLTokenizer ( )
{
m_decoded_input = " " ;
// The view and both iterators refer to m_decoded_input, so they are set up after it.
m_utf8_view = Utf8View ( m_decoded_input ) ;
m_utf8_iterator = m_utf8_view . begin ( ) ;
m_prev_utf8_iterator = m_utf8_view . begin ( ) ;
m_source_positions . empend ( 0u , 0u ) ;
}
2021-11-11 00:55:02 +01:00
// Constructs a tokenizer over `input`, which is converted to UTF-8 using the decoder
// registered for `encoding`. Aborts (VERIFY) if no decoder exists for that encoding.
// Both UTF-8 iterators start at the beginning of the decoded input, and the source
// position list is seeded with a single (0, 0) entry.
HTMLTokenizer::HTMLTokenizer(StringView input, String const& encoding)
{
    auto* text_decoder = TextCodec::decoder_for(encoding);
    VERIFY(text_decoder);

    m_decoded_input = text_decoder->to_utf8(input);
    m_utf8_view = Utf8View(m_decoded_input);

    auto const start = m_utf8_view.begin();
    m_utf8_iterator = start;
    m_prev_utf8_iterator = start;

    m_source_positions.empend(0u, 0u);
}
2022-02-19 15:58:21 +01:00
void HTMLTokenizer : : insert_input_at_insertion_point ( String const & input )
{
auto utf8_iterator_byte_offset = m_utf8_view . byte_offset_of ( m_utf8_iterator ) ;
// FIXME: Implement a InputStream to handle insertion_point and iterators.
StringBuilder builder { } ;
builder . append ( m_decoded_input . substring ( 0 , m_insertion_point . position ) ) ;
builder . append ( input ) ;
builder . append ( m_decoded_input . substring ( m_insertion_point . position ) ) ;
m_decoded_input = builder . build ( ) ;
m_utf8_view = Utf8View ( m_decoded_input ) ;
m_utf8_iterator = m_utf8_view . iterator_at_byte_offset ( utf8_iterator_byte_offset ) ;
m_insertion_point . position + = input . length ( ) ;
}
void HTMLTokenizer : : insert_eof ( )
{
m_explicit_eof_inserted = true ;
}
bool HTMLTokenizer : : is_eof_inserted ( )
{
return m_explicit_eof_inserted ;
}
2020-05-22 21:46:13 +02:00
// Trace hook for a state switch.
// Only logs the current and the new state name when TOKENIZER_TRACE_DEBUG is enabled;
// it does not modify m_state itself.
void HTMLTokenizer : : will_switch_to ( [[maybe_unused]] State new_state )
{
2021-02-07 15:33:24 +03:30
dbgln_if ( TOKENIZER_TRACE_DEBUG , " [{}] Switch to {} " , state_name ( m_state ) , state_name ( new_state ) ) ;
2020-05-22 21:46:13 +02:00
}
// Trace hook for reconsuming in another state.
// Only logs the current and the new state name when TOKENIZER_TRACE_DEBUG is enabled;
// it does not modify m_state itself.
void HTMLTokenizer : : will_reconsume_in ( [[maybe_unused]] State new_state )
{
2021-02-07 15:33:24 +03:30
dbgln_if ( TOKENIZER_TRACE_DEBUG , " [{}] Reconsume in {} " , state_name ( m_state ) , state_name ( new_state ) ) ;
2020-05-22 21:46:13 +02:00
}
2021-09-25 23:15:48 +02:00
// Sets the tokenizer state from outside the tokenizer.
// The Badge<HTMLParser> parameter restricts callers to HTMLParser.
// Logs the transition when TOKENIZER_TRACE_DEBUG is enabled, then assigns the new state.
void HTMLTokenizer : : switch_to ( Badge < HTMLParser > , State new_state )
2020-05-24 20:24:43 +02:00
{
2021-02-07 15:33:24 +03:30
dbgln_if ( TOKENIZER_TRACE_DEBUG , " [{}] Parser switches tokenizer state to {} " , state_name ( m_state ) , state_name ( new_state ) ) ;
2020-05-24 20:24:43 +02:00
m_state = new_state ;
}
void HTMLTokenizer : : will_emit ( HTMLToken & token )
{
if ( token . is_start_tag ( ) )
2021-05-22 20:17:09 +02:00
m_last_emitted_start_tag_name = token . tag_name ( ) ;
2021-07-15 15:53:54 +02:00
token . set_end_position ( { } , nth_last_position ( 0 ) ) ;
2020-05-24 20:24:43 +02:00
}
bool HTMLTokenizer : : current_end_tag_token_is_appropriate ( ) const
{
2021-02-23 20:42:32 +01:00
VERIFY ( m_current_token . is_end_tag ( ) ) ;
2021-05-22 20:17:09 +02:00
if ( ! m_last_emitted_start_tag_name . has_value ( ) )
2020-05-24 20:24:43 +02:00
return false ;
2021-05-22 20:17:09 +02:00
return m_current_token . tag_name ( ) = = m_last_emitted_start_tag_name . value ( ) ;
2020-05-24 20:24:43 +02:00
}
2020-05-27 16:16:23 +02:00
bool HTMLTokenizer : : consumed_as_part_of_an_attribute ( ) const
{
return m_return_state = = State : : AttributeValueUnquoted | | m_return_state = = State : : AttributeValueSingleQuoted | | m_return_state = = State : : AttributeValueDoubleQuoted ;
}
2021-07-12 12:44:21 +02:00
void HTMLTokenizer : : restore_to ( Utf8CodePointIterator const & new_iterator )
2021-05-20 23:11:41 +04:30
{
2022-02-13 14:08:53 +00:00
auto diff = m_utf8_iterator - new_iterator ;
if ( diff > 0 ) {
2022-06-02 01:03:44 +02:00
for ( ssize_t i = 0 ; i < diff ; + + i ) {
if ( ! m_source_positions . is_empty ( ) )
m_source_positions . take_last ( ) ;
}
2022-02-13 14:08:53 +00:00
} else {
// Going forwards...?
TODO ( ) ;
2021-05-20 23:11:41 +04:30
}
2021-05-21 11:03:31 +04:30
m_utf8_iterator = new_iterator ;
2021-05-20 23:11:41 +04:30
}
2021-05-23 08:20:03 +02:00
// Returns the text accumulated in m_current_builder and leaves the builder empty.
String HTMLTokenizer::consume_current_builder()
{
    auto built_text = m_current_builder.to_string();
    m_current_builder.clear();
    return built_text;
}
2020-05-22 21:46:13 +02:00
}