LibWeb: Skip right amount of characters during encoding detection

When detecting an element's opening tag, the spec asks us to skip ahead to the first whitespace or end chevron (`>`) character before trying to read attributes. Instead, we always skipped 2 positions ahead and then ignored all whitespace characters and slashes, which was clearly wrong. Theoretically, this could have caused some weird behavior if part of the opening tag matched an expected attribute name, but that is very unlikely to be seen in the wild.
Author: https://github.com/gmta Commit: f52632d48a Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/6893
2025-12-07 21:59:54 +00:00 · 2025-11-21 11:30:57 +01:00 · 2025-11-21 11:30:57 +01:00 · f52632d48a · 2025-11-21 16:44:15 +00:00
commit f52632d48a
parent 4bcf988e46
2 changed files with 22 additions and 17 deletions
--- a/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
+++ b/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
+ * Copyright (c) 2025, Jelle Raaijmakers <jelle@ladybird.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
@ -13,23 +14,32 @@
 #include <LibWeb/DOM/Document.h>
 #include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
 #include <LibWeb/Infra/CharacterTypes.h>
-#include <ctype.h>

 namespace Web::HTML {

-bool prescan_should_abort(ByteBuffer const& input, size_t const& position)
+static bool prescan_should_abort(ByteBuffer const& input, size_t const& position)
 {
    return position >= input.size() || position >= 1024;
 }

-bool prescan_is_whitespace_or_slash(u8 const& byte)
+static constexpr bool is_whitespace(u8 byte)
 {
-    return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ' || byte == '/';
+    return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ';
 }

-bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position)
+static constexpr bool is_whitespace_or_slash(u8 byte)
 {
-    while (!prescan_should_abort(input, position) && (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '/'))
+    return is_whitespace(byte) || byte == '/';
+}
+
+static constexpr bool is_whitespace_or_end_chevron(u8 byte)
+{
+    return is_whitespace(byte) || byte == '>';
+}
+
+static bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position)
+{
+    while (!prescan_should_abort(input, position) && is_whitespace_or_slash(input[position]))
        ++position;
    return !prescan_should_abort(input, position);
 }
@ -117,7 +127,7 @@ GC::Ptr<DOM::Attr> prescan_get_attribute(DOM::Document& document, ByteBuffer con
            goto value;
        }
        // -> If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
-        if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ') {
+        if (is_whitespace(input[position])) {
            // Jump to the step below labeled spaces.
            goto spaces;
        }
@ -227,7 +237,7 @@ value:
    // 11. Process the byte at position as follows:
    for (; !prescan_should_abort(input, position); ++position) {
        // -> If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
-        if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>') {
+        if (is_whitespace_or_end_chevron(input[position])) {
            // Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value.
            return DOM::Attr::create(document, MUST(attribute_name.to_string()), MUST(attribute_value.to_string()));
        }
@ -251,8 +261,6 @@ value:
 // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
 Optional<ByteString> run_prescan_byte_stream_algorithm(DOM::Document& document, ByteBuffer const& input)
 {
-    // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
-
    // Detects '<?x'
    if (!prescan_should_abort(input, 5)) {
        // A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0
@ -279,7 +287,7 @@ Optional<ByteString> run_prescan_byte_stream_algorithm(DOM::Document& document,
            && (input[position + 2] == 'E' || input[position + 2] == 'e')
            && (input[position + 3] == 'T' || input[position + 3] == 't')
            && (input[position + 4] == 'A' || input[position + 4] == 'a')
-            && prescan_is_whitespace_or_slash(input[position + 5])) {
+            && is_whitespace_or_slash(input[position + 5])) {
            position += 6;
            Vector<FlyString> attribute_list {};
            bool got_pragma = false;
@ -322,9 +330,9 @@ Optional<ByteString> run_prescan_byte_stream_algorithm(DOM::Document& document,
            else
                return charset.value();
        } else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
-            && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
-            position += 2;
-            prescan_skip_whitespace_and_slashes(input, position);
+            && ((input[position + 1] == '/' && is_ascii_alpha(input[position + 2])) || is_ascii_alpha(input[position + 1]))) {
+            while (!prescan_should_abort(input, position) && !is_whitespace_or_end_chevron(input[position]))
+                ++position;
            while (prescan_get_attribute(document, input, position)) { };
        } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
            position += 1;
--- a/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
+++ b/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
@ -13,9 +13,6 @@

 namespace Web::HTML {

-bool prescan_should_abort(ByteBuffer const& input, size_t const& position);
-bool prescan_is_whitespace_or_slash(u8 const& byte);
-bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position);
 Optional<StringView> extract_character_encoding_from_meta_element(ByteString const&);
 GC::Ptr<DOM::Attr> prescan_get_attribute(DOM::Document&, ByteBuffer const& input, size_t& position);
 Optional<ByteString> run_prescan_byte_stream_algorithm(DOM::Document&, ByteBuffer const& input);