AK+LibJS+LibWeb: Recognize that our UTF-16 string is actually WTF-16

For the web, we allow a wobbly UTF-16 encoding (i.e. lonely surrogates
are permitted). Only in a few exceptional cases do we strictly require
valid UTF-16. As such, our `validate(AllowLonelySurrogates::Yes)` calls
will always succeed, so it is wasted effort to ever make such a check.

This patch eliminates such invocations. The validation methods now only
check for strict UTF-16, and are only invoked where that is required.
This commit is contained in:
Timothy Flynn 2025-08-08 16:32:26 -04:00 committed by Tim Flynn
parent 36c7302178
commit 8472e469f4
Notes: github-actions[bot] 2025-08-13 13:57:41 +00:00
20 changed files with 61 additions and 158 deletions

View file

@ -237,82 +237,52 @@ TEST_CASE(validate_invalid_utf16)
{
// Lonely high surrogate.
invalid = u"\xd800"sv;
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
EXPECT_EQ(valid_code_units, 1uz);
invalid = u"\xdbff"sv;
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
EXPECT_EQ(valid_code_units, 1uz);
}
{
// Lonely low surrogate.
invalid = u"\xdc00"sv;
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
EXPECT_EQ(valid_code_units, 1uz);
invalid = u"\xdfff"sv;
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
EXPECT_EQ(valid_code_units, 1uz);
}
{
// High surrogate followed by non-surrogate.
invalid = u"\xd800\x0000"sv;
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
EXPECT_EQ(valid_code_units, 2uz);
invalid = u"\xd800\xe000"sv;
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
EXPECT_EQ(valid_code_units, 2uz);
}
{
// High surrogate followed by high surrogate.
invalid = u"\xd800\xd800"sv;
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
EXPECT_EQ(valid_code_units, 2uz);
invalid = u"\xd800\xdbff"sv;
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
EXPECT_EQ(valid_code_units, 2uz);
}
{
// Valid UTF-16 followed by invalid code units.
invalid = u"\x0041\x0041\xd800"sv;
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 2uz);
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
EXPECT_EQ(valid_code_units, 3uz);
invalid = u"\x0041\x0041\xd800"sv;
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 2uz);
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
EXPECT_EQ(valid_code_units, 3uz);
}
}