/* * Copyright (c) 2021, Max Wipfli * Copyright (c) 2025, Jelle Raaijmakers * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include #include #include #include namespace Web::HTML { static bool prescan_should_abort(ReadonlyBytes input, size_t const& position) { return position >= input.size() || position >= 1024; } static constexpr bool is_whitespace(u8 byte) { return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' '; } static constexpr bool is_whitespace_or_slash(u8 byte) { return is_whitespace(byte) || byte == '/'; } static constexpr bool is_whitespace_or_end_chevron(u8 byte) { return is_whitespace(byte) || byte == '>'; } static bool prescan_skip_whitespace_and_slashes(ReadonlyBytes input, size_t& position) { while (!prescan_should_abort(input, position) && is_whitespace_or_slash(input[position])) ++position; return !prescan_should_abort(input, position); } // https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element Optional extract_character_encoding_from_meta_element(ByteString const& string) { // Checking for "charset" is case insensitive, as is getting an encoding. // Therefore, stick to lowercase from the start for simplicity. auto lowercase_string = string.to_lowercase(); GenericLexer lexer(lowercase_string); for (;;) { auto charset_index = lexer.remaining().find("charset"sv); if (!charset_index.has_value()) return {}; // 7 is the length of "charset". lexer.ignore(charset_index.value() + 7); lexer.ignore_while([](char c) { return Infra::is_ascii_whitespace(c); }); if (lexer.peek() != '=') continue; break; } // Ignore the '='. lexer.ignore(); lexer.ignore_while([](char c) { return Infra::is_ascii_whitespace(c); }); if (lexer.is_eof()) return {}; if (lexer.consume_specific('"')) { auto matching_double_quote = lexer.remaining().find('"'); if (!matching_double_quote.has_value()) return {}; auto encoding = lexer.remaining().substring_view(0, matching_double_quote.value()); return TextCodec::get_standardized_encoding(encoding); } if (lexer.consume_specific('\'')) { auto matching_single_quote = lexer.remaining().find('\''); if (!matching_single_quote.has_value()) return {}; auto encoding = lexer.remaining().substring_view(0, matching_single_quote.value()); return TextCodec::get_standardized_encoding(encoding); } auto encoding = lexer.consume_until([](char c) { return Infra::is_ascii_whitespace(c) || c == ';'; }); return TextCodec::get_standardized_encoding(encoding); } // https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing GC::Ptr prescan_get_attribute(DOM::Document& document, ReadonlyBytes input, size_t& position) { // 1. If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x2F (/), // then advance position to the next byte and redo this step. if (!prescan_skip_whitespace_and_slashes(input, position)) return {}; // 2. If the byte at position is 0x3E (>), then abort the get an attribute algorithm. There isn't one. if (input[position] == '>') return {}; // 3. Otherwise, the byte at position is the start of the attribute name. Let attribute name and attribute value be the empty string. // 4. Process the byte at position as follows: StringBuilder attribute_name; while (true) { // -> If it is 0x3D (=), and the attribute name is longer than the empty string if (input[position] == '=' && !attribute_name.is_empty()) { // Advance position to the next byte and jump to the step below labeled value. ++position; goto value; } // -> If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP) if (is_whitespace(input[position])) { // Jump to the step below labeled spaces. goto spaces; } // -> If it is 0x2F (/) or 0x3E (>) if (input[position] == '/' || input[position] == '>') { // Abort the get an attribute algorithm. The attribute's name is the value of attribute name, its value is the empty string. return DOM::Attr::create(document, MUST(attribute_name.to_string()), String {}); } // -> If it is in the range 0x41 (A) to 0x5A (Z) if (input[position] >= 'A' && input[position] <= 'Z') { // Append the code point b+0x20 to attribute name (where b is the value of the byte at position). (This converts the input to lowercase.) attribute_name.append_code_point(input[position] + 0x20); } // -> Anything else else { // Append the code point with the same value as the byte at position to attribute name. // (It doesn't actually matter how bytes outside the ASCII range are handled here, // since only ASCII bytes can contribute to the detection of a character encoding.) attribute_name.append_code_point(input[position]); } // 5. Advance position to the next byte and return to the previous step. ++position; if (prescan_should_abort(input, position)) return {}; } spaces: // 6. Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP), // then advance position to the next byte, then, repeat this step. if (!prescan_skip_whitespace_and_slashes(input, position)) return {}; // 7. If the byte at position is not 0x3D (=), abort the get an attribute algorithm. // The attribute's name is the value of attribute name, its value is the empty string. if (input[position] != '=') return DOM::Attr::create(document, MUST(attribute_name.to_string()), String {}); // 8. Advance position past the 0x3D (=) byte. ++position; value: // 9. Value: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP), // then advance position to the next byte, then, repeat this step. if (!prescan_skip_whitespace_and_slashes(input, position)) return {}; StringBuilder attribute_value; // 10. Process the byte at position as follows: // -> If it is 0x22 (") or 0x27 (') if (input[position] == '"' || input[position] == '\'') { // 1. Let b be the value of the byte at position. u8 quote_character = input[position]; // 2. Quote loop: Advance position to the next byte. ++position; for (; !prescan_should_abort(input, position); ++position) { // 3. If the value of the byte at position is the value of b, then advance position to the next byte // and abort the "get an attribute" algorithm. // The attribute's name is the value of attribute name, and its value is the value of attribute value. if (input[position] == quote_character) { ++position; return DOM::Attr::create(document, MUST(attribute_name.to_string()), MUST(attribute_value.to_string())); } // 4. Otherwise, if the value of the byte at position is in the range 0x41 (A) to 0x5A (Z), // then append a code point to attribute value whose value is 0x20 more than the value of the byte at position. if (input[position] >= 'A' && input[position] <= 'Z') { attribute_value.append_code_point(input[position] + 0x20); } // 5. Otherwise, append a code point to attribute value whose value is the same as the value of the byte at position. else { attribute_value.append_code_point(input[position]); } // 6. Return to the step above labeled quote loop. } return {}; } // -> If it is 0x3E (>) if (input[position] == '>') { // Abort the get an attribute algorithm. The attribute's name is the value of attribute name, its value is the empty string. return DOM::Attr::create(document, MUST(attribute_name.to_string()), String {}); } // -> If it is in the range 0x41 (A) to 0x5A (Z) if (input[position] >= 'A' && input[position] <= 'Z') { // Append a code point b+0x20 to attribute value (where b is the value of the byte at position). attribute_value.append_code_point(input[position] + 0x20); // Advance position to the next byte. ++position; } // -> Anything else else { // Append a code point with the same value as the byte at position to attribute value. attribute_value.append_code_point(input[position]); // Advance position to the next byte. ++position; } if (prescan_should_abort(input, position)) return {}; // 11. Process the byte at position as follows: for (; !prescan_should_abort(input, position); ++position) { // -> If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) if (is_whitespace_or_end_chevron(input[position])) { // Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value. return DOM::Attr::create(document, MUST(attribute_name.to_string()), MUST(attribute_value.to_string())); } // -> If it is in the range 0x41 (A) to 0x5A (Z) if (input[position] >= 'A' && input[position] <= 'Z') { // Append a code point b+0x20 to attribute value (where b is the value of the byte at position). attribute_value.append_code_point(input[position] + 0x20); } // -> Anything else else { // Append a code point with the same value as the byte at position to attribute value. attribute_value.append_code_point(input[position]); } // 12. Advance position to the next byte and return to the previous step. } return {}; } // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding Optional run_prescan_byte_stream_algorithm(DOM::Document& document, ReadonlyBytes input) { // 1. Let position be a pointer to a byte in the input byte stream, initially pointing at the first byte. size_t position = 0; // 2. Prescan for UTF-16 XML declarations: If position points to: if (!prescan_should_abort(input, position + 5)) { // * A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0 (case-sensitive UTF-16 little-endian ' { 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0 }) return "utf-16le"; // * A sequence of bytes starting with: 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78 (case-sensitive UTF-16 big-endian ' { 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78 }) return "utf-16be"; } // NOTE: For historical reasons, the prefix is two bytes longer than in Appendix F of XML and the encoding name is // not checked. // 3. Loop: If position points to: while (!prescan_should_abort(input, position)) { // * A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`' sequence) and comes after the 0x3C byte that was found. (The two 0x2D // bytes can be the same as those in the '