2021-05-12 10:47:12 +02:00
/*
 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
2021-07-13 18:37:03 +01:00
# include <AK/CharacterTypes.h>
# include <AK/GenericLexer.h>
2021-05-12 10:47:12 +02:00
# include <AK/StringView.h>
# include <AK/Utf8View.h>
# include <LibTextCodec/Decoder.h>
2022-09-18 01:03:58 +02:00
# include <LibWeb/DOM/Attr.h>
2021-10-15 09:57:07 -04:00
# include <LibWeb/DOM/Document.h>
2021-05-12 10:47:12 +02:00
# include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
2022-10-01 18:14:32 +01:00
# include <LibWeb/Infra/CharacterTypes.h>
2021-05-12 10:47:12 +02:00
# include <ctype.h>
namespace Web : : HTML {
2022-04-01 20:58:27 +03:00
// Reports whether the prescan has to stop at `position`: that is the case once
// `position` reaches the 1024-byte limit or lies at/after the end of `input`.
bool prescan_should_abort(ByteBuffer const& input, size_t const& position)
{
    constexpr size_t prescan_byte_limit = 1024;

    if (position >= prescan_byte_limit)
        return true;
    return position >= input.size();
}
2022-04-01 20:58:27 +03:00
// Returns true for the bytes 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP) and 0x2F (/),
// false for every other byte.
bool prescan_is_whitespace_or_slash(u8 const& byte)
{
    switch (byte) {
    case '\t':
    case '\n':
    case '\f':
    case '\r':
    case ' ':
    case '/':
        return true;
    default:
        return false;
    }
}
2022-04-01 20:58:27 +03:00
// Advances `position` over a run of whitespace and '/' bytes.
// Returns true when `position` ends up on a byte that may still be examined,
// and false when the prescan has to abort there (see prescan_should_abort()).
bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position)
{
    for (;;) {
        if (prescan_should_abort(input, position))
            return false;
        if (!prescan_is_whitespace_or_slash(input[position]))
            return true;
        ++position;
    }
}
2021-07-13 18:37:03 +01:00
// https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
2023-12-16 17:49:34 +03:30
Optional < StringView > extract_character_encoding_from_meta_element ( ByteString const & string )
2021-07-13 18:37:03 +01:00
{
// Checking for "charset" is case insensitive, as is getting an encoding.
// Therefore, stick to lowercase from the start for simplicity.
auto lowercase_string = string . to_lowercase ( ) ;
GenericLexer lexer ( lowercase_string ) ;
for ( ; ; ) {
2022-07-11 17:32:29 +00:00
auto charset_index = lexer . remaining ( ) . find ( " charset " sv ) ;
2021-07-13 18:37:03 +01:00
if ( ! charset_index . has_value ( ) )
return { } ;
// 7 is the length of "charset".
lexer . ignore ( charset_index . value ( ) + 7 ) ;
lexer . ignore_while ( [ ] ( char c ) {
2022-10-01 18:14:32 +01:00
return Infra : : is_ascii_whitespace ( c ) ;
2021-07-13 18:37:03 +01:00
} ) ;
if ( lexer . peek ( ) ! = ' = ' )
continue ;
break ;
}
// Ignore the '='.
lexer . ignore ( ) ;
lexer . ignore_while ( [ ] ( char c ) {
2022-10-01 18:14:32 +01:00
return Infra : : is_ascii_whitespace ( c ) ;
2021-07-13 18:37:03 +01:00
} ) ;
if ( lexer . is_eof ( ) )
return { } ;
if ( lexer . consume_specific ( ' " ' ) ) {
2022-07-11 20:10:18 +00:00
auto matching_double_quote = lexer . remaining ( ) . find ( ' " ' ) ;
2021-07-13 18:37:03 +01:00
if ( ! matching_double_quote . has_value ( ) )
return { } ;
auto encoding = lexer . remaining ( ) . substring_view ( 0 , matching_double_quote . value ( ) ) ;
return TextCodec : : get_standardized_encoding ( encoding ) ;
}
if ( lexer . consume_specific ( ' \' ' ) ) {
2022-07-11 20:10:18 +00:00
auto matching_single_quote = lexer . remaining ( ) . find ( ' \' ' ) ;
2021-07-13 18:37:03 +01:00
if ( ! matching_single_quote . has_value ( ) )
return { } ;
auto encoding = lexer . remaining ( ) . substring_view ( 0 , matching_single_quote . value ( ) ) ;
return TextCodec : : get_standardized_encoding ( encoding ) ;
}
auto encoding = lexer . consume_until ( [ ] ( char c ) {
2022-10-01 18:14:32 +01:00
return Infra : : is_ascii_whitespace ( c ) | | c = = ' ; ' ;
2021-07-13 18:37:03 +01:00
} ) ;
return TextCodec : : get_standardized_encoding ( encoding ) ;
}
2023-12-28 22:49:53 +01:00
// https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing
2022-09-18 01:03:58 +02:00
JS : : GCPtr < DOM : : Attr > prescan_get_attribute ( DOM : : Document & document , ByteBuffer const & input , size_t & position )
2021-05-12 10:47:12 +02:00
{
2023-12-28 22:49:53 +01:00
// 1. If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x2F (/) then advance position to the next byte and redo this step.
2021-05-12 10:47:12 +02:00
if ( ! prescan_skip_whitespace_and_slashes ( input , position ) )
return { } ;
2023-12-28 22:49:53 +01:00
// 2. If the byte at position is 0x3E (>), then abort the get an attribute algorithm. There isn't one.
2021-05-12 10:47:12 +02:00
if ( input [ position ] = = ' > ' )
return { } ;
2023-12-28 22:49:53 +01:00
// 3. Otherwise, the byte at position is the start of the attribute name. Let attribute name and attribute value be the empty string.
// 4. Process the byte at position as follows:
2021-05-12 10:47:12 +02:00
StringBuilder attribute_name ;
while ( true ) {
2023-12-28 22:49:53 +01:00
// -> If it is 0x3D (=), and the attribute name is longer than the empty string
2021-05-12 10:47:12 +02:00
if ( input [ position ] = = ' = ' & & ! attribute_name . is_empty ( ) ) {
2023-12-28 22:49:53 +01:00
// Advance position to the next byte and jump to the step below labeled value.
2021-05-12 10:47:12 +02:00
+ + position ;
goto value ;
2023-12-28 22:49:53 +01:00
}
// -> If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
if ( input [ position ] = = ' \t ' | | input [ position ] = = ' \n ' | | input [ position ] = = ' \f ' | | input [ position ] = = ' \r ' | | input [ position ] = = ' ' ) {
// Jump to the step below labeled spaces.
2021-05-12 10:47:12 +02:00
goto spaces ;
2023-12-28 22:49:53 +01:00
}
// -> If it is 0x2F (/) or 0x3E (>)
if ( input [ position ] = = ' / ' | | input [ position ] = = ' > ' ) {
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name, its value is the empty string.
return DOM : : Attr : : create ( document , MUST ( attribute_name . to_string ( ) ) , String { } ) ;
}
// -> If it is in the range 0x41 (A) to 0x5A (Z)
if ( input [ position ] > = ' A ' & & input [ position ] < = ' Z ' ) {
// Append the code point b+0x20 to attribute name (where b is the value of the byte at position). (This converts the input to lowercase.)
2023-12-28 23:30:20 +01:00
attribute_name . append_code_point ( input [ position ] + 0x20 ) ;
2023-12-28 22:49:53 +01:00
}
// -> Anything else
else {
// Append the code point with the same value as the byte at position to attribute name.
// (It doesn't actually matter how bytes outside the ASCII range are handled here,
// since only ASCII bytes can contribute to the detection of a character encoding.)
attribute_name . append_code_point ( input [ position ] ) ;
}
// 5. Advance position to the next byte and return to the previous step.
2021-05-12 10:47:12 +02:00
+ + position ;
if ( prescan_should_abort ( input , position ) )
return { } ;
}
spaces :
2023-12-28 22:49:53 +01:00
// 6. Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
// then advance position to the next byte, then, repeat this step.
2021-05-12 10:47:12 +02:00
if ( ! prescan_skip_whitespace_and_slashes ( input , position ) )
return { } ;
2023-12-28 22:49:53 +01:00
// 7. If the byte at position is not 0x3D (=), abort the get an attribute algorithm.
// The attribute's name is the value of attribute name, its value is the empty string.
2021-05-12 10:47:12 +02:00
if ( input [ position ] ! = ' = ' )
2023-09-10 16:06:58 +12:00
return DOM : : Attr : : create ( document , MUST ( attribute_name . to_string ( ) ) , String { } ) ;
2023-12-28 22:49:53 +01:00
// 8. Advance position past the 0x3D (=) byte.
2021-05-12 10:47:12 +02:00
+ + position ;
value :
2023-12-28 22:49:53 +01:00
// 9. Value: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
// then advance position to the next byte, then, repeat this step.
2021-05-12 10:47:12 +02:00
if ( ! prescan_skip_whitespace_and_slashes ( input , position ) )
return { } ;
StringBuilder attribute_value ;
2023-12-28 22:49:53 +01:00
// 10. Process the byte at position as follows:
// -> If it is 0x22 (") or 0x27 (')
2021-05-12 10:47:12 +02:00
if ( input [ position ] = = ' " ' | | input [ position ] = = ' \' ' ) {
2023-12-28 22:49:53 +01:00
// 1. Let b be the value of the byte at position.
2021-05-12 10:47:12 +02:00
u8 quote_character = input [ position ] ;
2023-12-28 22:49:53 +01:00
// 2. Quote loop: Advance position to the next byte.
2021-05-12 10:47:12 +02:00
+ + position ;
2023-12-28 22:49:53 +01:00
2021-05-12 10:47:12 +02:00
for ( ; ! prescan_should_abort ( input , position ) ; + + position ) {
2023-12-28 22:49:53 +01:00
// 3. If the value of the byte at position is the value of b, then advance position to the next byte
// and abort the "get an attribute" algorithm.
// The attribute's name is the value of attribute name, and its value is the value of attribute value.
2021-05-12 10:47:12 +02:00
if ( input [ position ] = = quote_character )
2023-09-10 16:06:58 +12:00
return DOM : : Attr : : create ( document , MUST ( attribute_name . to_string ( ) ) , MUST ( attribute_value . to_string ( ) ) ) ;
2023-12-28 22:49:53 +01:00
// 4. Otherwise, if the value of the byte at position is in the range 0x41 (A) to 0x5A (Z),
// then append a code point to attribute value whose value is 0x20 more than the value of the byte at position.
if ( input [ position ] > = ' A ' & & input [ position ] < = ' Z ' ) {
2023-12-28 23:30:20 +01:00
attribute_value . append_code_point ( input [ position ] + 0x20 ) ;
2023-12-28 22:49:53 +01:00
}
// 5. Otherwise, append a code point to attribute value whose value is the same as the value of the byte at position.
else {
attribute_value . append_code_point ( input [ position ] ) ;
}
// 6. Return to the step above labeled quote loop.
2021-05-12 10:47:12 +02:00
}
return { } ;
2023-12-28 22:49:53 +01:00
}
// -> If it is 0x3E (>)
if ( input [ position ] = = ' > ' ) {
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name, its value is the empty string.
2023-09-10 16:06:58 +12:00
return DOM : : Attr : : create ( document , MUST ( attribute_name . to_string ( ) ) , String { } ) ;
2023-12-28 22:49:53 +01:00
}
// -> If it is in the range 0x41 (A) to 0x5A (Z)
if ( input [ position ] > = ' A ' & & input [ position ] < = ' Z ' ) {
// Append a code point b+0x20 to attribute value (where b is the value of the byte at position).
2023-12-28 23:30:20 +01:00
attribute_value . append_code_point ( input [ position ] + 0x20 ) ;
2023-12-28 22:49:53 +01:00
// Advance position to the next byte.
+ + position ;
}
// -> Anything else
else {
// Append a code point with the same value as the byte at position to attribute value.
2023-12-28 23:30:20 +01:00
attribute_value . append_code_point ( input [ position ] ) ;
2023-12-28 22:49:53 +01:00
// Advance position to the next byte.
+ + position ;
}
2021-05-12 10:47:12 +02:00
if ( prescan_should_abort ( input , position ) )
return { } ;
2023-12-28 22:49:53 +01:00
// 11. Process the byte at position as follows:
2021-05-12 10:47:12 +02:00
for ( ; ! prescan_should_abort ( input , position ) ; + + position ) {
2023-12-28 22:49:53 +01:00
// -> If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
if ( input [ position ] = = ' \t ' | | input [ position ] = = ' \n ' | | input [ position ] = = ' \f ' | | input [ position ] = = ' \r ' | | input [ position ] = = ' ' | | input [ position ] = = ' > ' ) {
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value.
2023-09-10 16:06:58 +12:00
return DOM : : Attr : : create ( document , MUST ( attribute_name . to_string ( ) ) , MUST ( attribute_value . to_string ( ) ) ) ;
2023-12-28 22:49:53 +01:00
}
// -> If it is in the range 0x41 (A) to 0x5A (Z)
if ( input [ position ] > = ' A ' & & input [ position ] < = ' Z ' ) {
// Append a code point b+0x20 to attribute value (where b is the value of the byte at position).
2023-12-28 23:30:20 +01:00
attribute_value . append_code_point ( input [ position ] + 0x20 ) ;
2023-12-28 22:49:53 +01:00
}
// -> Anything else
else {
// Append a code point with the same value as the byte at position to attribute value.
2023-12-28 23:30:20 +01:00
attribute_value . append_code_point ( input [ position ] ) ;
2023-12-28 22:49:53 +01:00
}
// 12. Advance position to the next byte and return to the previous step.
2021-05-12 10:47:12 +02:00
}
return { } ;
}
// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
2023-12-16 17:49:34 +03:30
Optional < ByteString > run_prescan_byte_stream_algorithm ( DOM : : Document & document , ByteBuffer const & input )
2021-05-12 10:47:12 +02:00
{
// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
// Detects '<?x'
if ( ! prescan_should_abort ( input , 6 ) ) {
if ( input [ 0 ] = = 0x3C & & input [ 1 ] = = 0x00 & & input [ 2 ] = = 0x3F & & input [ 3 ] = = 0x00 & & input [ 4 ] = = 0x78 & & input [ 5 ] = = 0x00 )
return " utf-16le " ;
if ( input [ 0 ] = = 0x00 & & input [ 1 ] = = 0x3C & & input [ 2 ] = = 0x00 & & input [ 4 ] = = 0x3F & & input [ 5 ] = = 0x00 & & input [ 6 ] = = 0x78 )
return " utf-16be " ;
}
for ( size_t position = 0 ; ! prescan_should_abort ( input , position ) ; + + position ) {
if ( ! prescan_should_abort ( input , position + 5 ) & & input [ position ] = = ' < ' & & input [ position + 1 ] = = ' ! '
& & input [ position + 2 ] = = ' - ' & & input [ position + 3 ] = = ' - ' ) {
position + = 2 ;
for ( ; ! prescan_should_abort ( input , position + 3 ) ; + + position ) {
if ( input [ position ] = = ' - ' & & input [ position + 1 ] = = ' - ' & & input [ position + 2 ] = = ' > ' ) {
position + = 2 ;
break ;
}
}
} else if ( ! prescan_should_abort ( input , position + 6 )
& & input [ position ] = = ' < '
& & ( input [ position + 1 ] = = ' M ' | | input [ position + 1 ] = = ' m ' )
& & ( input [ position + 2 ] = = ' E ' | | input [ position + 2 ] = = ' e ' )
& & ( input [ position + 3 ] = = ' T ' | | input [ position + 3 ] = = ' t ' )
& & ( input [ position + 4 ] = = ' A ' | | input [ position + 4 ] = = ' a ' )
& & prescan_is_whitespace_or_slash ( input [ position + 5 ] ) ) {
position + = 6 ;
2023-09-10 16:06:58 +12:00
Vector < FlyString > attribute_list { } ;
2021-05-12 10:47:12 +02:00
bool got_pragma = false ;
Optional < bool > need_pragma { } ;
2023-12-16 17:49:34 +03:30
Optional < ByteString > charset { } ;
2021-05-12 10:47:12 +02:00
while ( true ) {
2021-10-15 09:57:07 -04:00
auto attribute = prescan_get_attribute ( document , input , position ) ;
if ( ! attribute )
2021-05-12 10:47:12 +02:00
break ;
2021-10-15 09:57:07 -04:00
if ( attribute_list . contains_slow ( attribute - > name ( ) ) )
2021-05-12 10:47:12 +02:00
continue ;
2023-09-10 16:06:58 +12:00
auto const & attribute_name = attribute - > name ( ) ;
2021-10-15 09:57:07 -04:00
attribute_list . append ( attribute - > name ( ) ) ;
2021-05-12 10:47:12 +02:00
2021-07-13 18:37:03 +01:00
if ( attribute_name = = " http-equiv " ) {
2021-10-15 09:57:07 -04:00
got_pragma = attribute - > value ( ) = = " content-type " ;
2021-07-13 18:37:03 +01:00
} else if ( attribute_name = = " content " ) {
2023-12-16 17:49:34 +03:30
auto encoding = extract_character_encoding_from_meta_element ( attribute - > value ( ) . to_byte_string ( ) ) ;
2021-07-13 18:37:03 +01:00
if ( encoding . has_value ( ) & & ! charset . has_value ( ) ) {
charset = encoding . value ( ) ;
need_pragma = true ;
}
} else if ( attribute_name = = " charset " ) {
2021-10-15 09:57:07 -04:00
auto maybe_charset = TextCodec : : get_standardized_encoding ( attribute - > value ( ) ) ;
2021-05-12 10:47:12 +02:00
if ( maybe_charset . has_value ( ) ) {
2023-12-16 17:49:34 +03:30
charset = Optional < ByteString > { maybe_charset } ;
2021-05-12 10:47:12 +02:00
need_pragma = { false } ;
}
}
}
if ( ! need_pragma . has_value ( ) | | ( need_pragma . value ( ) & & ! got_pragma ) | | ! charset . has_value ( ) )
continue ;
if ( charset . value ( ) = = " UTF-16BE/LE " )
return " UTF-8 " ;
else if ( charset . value ( ) = = " x-user-defined " )
return " windows-1252 " ;
else
return charset . value ( ) ;
} else if ( ! prescan_should_abort ( input , position + 3 ) & & input [ position ] = = ' < '
& & ( ( input [ position + 1 ] = = ' / ' & & isalpha ( input [ position + 2 ] ) ) | | isalpha ( input [ position + 1 ] ) ) ) {
position + = 2 ;
prescan_skip_whitespace_and_slashes ( input , position ) ;
2021-10-15 09:57:07 -04:00
while ( prescan_get_attribute ( document , input , position ) ) { } ;
2021-05-12 10:47:12 +02:00
} else if ( ! prescan_should_abort ( input , position + 1 ) & & input [ position ] = = ' < ' & & ( input [ position + 1 ] = = ' ! ' | | input [ position + 1 ] = = ' / ' | | input [ position + 1 ] = = ' ? ' ) ) {
position + = 2 ;
while ( input [ position ] ! = ' > ' ) {
+ + position ;
if ( prescan_should_abort ( input , position ) )
return { } ;
}
} else {
// Do nothing.
}
}
return { } ;
}
// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
2023-12-16 17:49:34 +03:30
ByteString run_encoding_sniffing_algorithm ( DOM : : Document & document , ByteBuffer const & input )
2021-05-12 10:47:12 +02:00
{
if ( input . size ( ) > = 2 ) {
if ( input [ 0 ] = = 0xFE & & input [ 1 ] = = 0xFF ) {
return " UTF-16BE " ;
} else if ( input [ 0 ] = = 0xFF & & input [ 1 ] = = 0xFE ) {
return " UTF-16LE " ;
} else if ( input . size ( ) > = 3 & & input [ 0 ] = = 0xEF & & input [ 1 ] = = 0xBB & & input [ 2 ] = = 0xBF ) {
return " UTF-8 " ;
}
}
// FIXME: If the user has explicitly instructed the user agent to override the document's character
// encoding with a specific encoding.
// FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
// at any later step in this algorithm.
// FIXME: If the transport layer specifies a character encoding, and it is supported.
2021-10-15 09:57:07 -04:00
auto optional_encoding = run_prescan_byte_stream_algorithm ( document , input ) ;
2021-05-12 10:47:12 +02:00
if ( optional_encoding . has_value ( ) ) {
return optional_encoding . value ( ) ;
}
// FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
// is non-null and a child browsing context.
// FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
// when it was last visited.
if ( ! Utf8View ( StringView ( input ) ) . validate ( ) ) {
// FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
return " windows-1252 " ;
}
// NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
// "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
return " UTF-8 " ;
}
}