LibWeb: Implement CSS decode bytes algo

This commit is contained in:
Lorenz A 2025-09-16 21:01:23 +02:00 committed by Jelle Raaijmakers
parent 4b989b8efd
commit e73e0b3c92
Notes: github-actions[bot] 2025-10-16 14:46:26 +00:00
9 changed files with 182 additions and 46 deletions

View file

@ -2,6 +2,7 @@
* Copyright (c) 2021, the SerenityOS developers. * Copyright (c) 2021, the SerenityOS developers.
* Copyright (c) 2021-2024, Sam Atkins <sam@ladybird.org> * Copyright (c) 2021-2024, Sam Atkins <sam@ladybird.org>
* Copyright (c) 2022-2024, Andreas Kling <andreas@ladybird.org> * Copyright (c) 2022-2024, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2025, Lorenz Ackermann <me@lorenzackermann.xyz>
* *
* SPDX-License-Identifier: BSD-2-Clause * SPDX-License-Identifier: BSD-2-Clause
*/ */
@ -146,18 +147,17 @@ void CSSImportRule::fetch()
// 4. Let importedStylesheet be the result of parsing byteStream given parsedUrl. // 4. Let importedStylesheet be the result of parsing byteStream given parsedUrl.
// FIXME: Tidy up our parsing API. For now, do the decoding here. // FIXME: Tidy up our parsing API. For now, do the decoding here.
// FIXME: Get the encoding from the response somehow. Optional<String> mime_type_charset;
auto encoding = "utf-8"sv; if (auto extracted_mime_type = response->header_list()->extract_mime_type(); extracted_mime_type.has_value()) {
auto maybe_decoder = TextCodec::decoder_for(encoding); if (auto charset = extracted_mime_type->parameters().get("charset"sv); charset.has_value())
if (!maybe_decoder.has_value()) { mime_type_charset = charset.value();
dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {} Unsupported encoding: {}", parsed_url, encoding);
return;
} }
auto& decoder = maybe_decoder.release_value(); // The environment encoding of an imported style sheet is the encoding of the style sheet that imported it. [css-syntax-3]
// FIXME: Save encoding on Stylesheet to get it here
auto decoded_or_error = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(decoder, *byte_stream); Optional<StringView> environment_encoding;
auto decoded_or_error = css_decode_bytes(environment_encoding, mime_type_charset, *byte_stream);
if (decoded_or_error.is_error()) { if (decoded_or_error.is_error()) {
dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {} Encoding was: {}", parsed_url, encoding); dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {}", parsed_url);
return; return;
} }
auto decoded = decoded_or_error.release_value(); auto decoded = decoded_or_error.release_value();

View file

@ -4,10 +4,12 @@
* Copyright (c) 2021-2024, Sam Atkins <atkinssj@serenityos.org> * Copyright (c) 2021-2024, Sam Atkins <atkinssj@serenityos.org>
* Copyright (c) 2021, Tobias Christiansen <tobyase@serenityos.org> * Copyright (c) 2021, Tobias Christiansen <tobyase@serenityos.org>
* Copyright (c) 2022, MacDue <macdue@dueutil.tech> * Copyright (c) 2022, MacDue <macdue@dueutil.tech>
* Copyright (c) 2025, Lorenz Ackermann <me@lorenzackermann.xyz>
* *
* SPDX-License-Identifier: BSD-2-Clause * SPDX-License-Identifier: BSD-2-Clause
*/ */
#include <LibTextCodec/Decoder.h>
#include <LibWeb/Bindings/MainThreadVM.h> #include <LibWeb/Bindings/MainThreadVM.h>
#include <LibWeb/Bindings/PrincipalHostDefined.h> #include <LibWeb/Bindings/PrincipalHostDefined.h>
#include <LibWeb/CSS/CSSMediaRule.h> #include <LibWeb/CSS/CSSMediaRule.h>
@ -138,4 +140,75 @@ Vector<CSS::Parser::ComponentValue> parse_component_values_list(CSS::Parser::Par
return CSS::Parser::Parser::create(parsing_params, string).parse_as_list_of_component_values(); return CSS::Parser::Parser::create(parsing_params, string).parse_as_list_of_component_values();
} }
// https://drafts.csswg.org/css-syntax/#css-decode-bytes
// Decodes the raw bytes of a style sheet into a UTF-8 String.
//
// - environment_encoding: encoding supplied by the referring document/style sheet, if any.
//   It is used as-is (it is not passed through TextCodec::get_standardized_encoding() here).
// - mime_type_charset: the encoding label taken from the Content-Type charset parameter, if any.
// - encoded_string: the undecoded bytes of the style sheet.
//
// Returns the decoded text, or an error if no decoder exists for the chosen fallback encoding
// or if the decoding itself fails.
ErrorOr<String> css_decode_bytes(Optional<StringView> const& environment_encoding, Optional<String> mime_type_charset, ByteBuffer const& encoded_string)
{
    // Implements the byte stream check from step 2 of "determine the fallback encoding":
    // returns the encoding named by a leading `@charset "...";` rule, or an empty Optional
    // if there is no such rule or its label is not a known encoding.
    auto encoding_from_charset_rule = [&encoded_string]() -> Optional<StringView> {
        constexpr size_t max_scan_length = 1024;
        auto const pattern_start = "@charset \""sv;
        auto const pattern_end = "\";"sv;

        // Only the first 1024 bytes are inspected, so the closing `";` has to be within them as well.
        auto const scan_length = min(encoded_string.size(), max_scan_length);
        if (scan_length < pattern_start.length())
            return {};

        StringView buffer_view = encoded_string.bytes().slice(0, scan_length);
        if (!buffer_view.starts_with(pattern_start))
            return {};

        auto const encoding_start = pattern_start.length();
        auto const end_index = buffer_view.find(pattern_end, encoding_start);
        if (!end_index.has_value())
            return {};

        auto const encoding_view = buffer_view.substring_view(encoding_start, end_index.value() - encoding_start);

        // Every XX byte must be in 0x00..0x21 or 0x23..0x7F, i.e. ASCII and not a quotation mark (0x22).
        for (char c : encoding_view) {
            auto const byte = static_cast<u8>(c);
            if (byte == 0x22 || byte > 0x7F)
                return {};
        }

        return TextCodec::get_standardized_encoding(encoding_view);
    };

    // https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
    auto determine_the_fallback_encoding = [&]() -> StringView {
        // 1. If HTTP or equivalent protocol provides an encoding label (e.g. via the charset parameter of the Content-Type header) for the stylesheet,
        //    get an encoding from encoding label. If that does not return failure, return it.
        if (mime_type_charset.has_value()) {
            if (auto encoding = TextCodec::get_standardized_encoding(mime_type_charset.value()); encoding.has_value())
                return encoding.value();
        }

        // 2. Otherwise, check stylesheet's byte stream. If the first 1024 bytes of the stream begin with the hex sequence
        //        40 63 68 61 72 73 65 74 20 22 XX* 22 3B
        //    where each XX byte is a value between 0x00 and 0x21 inclusive or a value between 0x23 and 0x7F inclusive,
        //    then get an encoding from a string formed out of the sequence of XX bytes, interpreted as ASCII.
        //    If the return value was utf-16be or utf-16le, return utf-8; if it was anything else except failure, return it.
        if (auto byte_stream_value = encoding_from_charset_rule(); byte_stream_value.has_value()) {
            // The `@charset "` prefix was just matched as single-byte ASCII, so the stream cannot really be UTF-16.
            if (byte_stream_value.value() == "UTF-16BE"sv || byte_stream_value.value() == "UTF-16LE"sv)
                return "utf-8"sv;
            return byte_stream_value.value();
        }

        // 3. Otherwise, if an environment encoding is provided by the referring document, return it.
        if (environment_encoding.has_value())
            return environment_encoding.value();

        // 4. Otherwise, return utf-8.
        return "utf-8"sv;
    };

    // 1. Determine the fallback encoding of stylesheet, and let fallback be the result.
    auto fallback = determine_the_fallback_encoding();

    auto decoder = TextCodec::decoder_for(fallback);
    if (!decoder.has_value()) {
        // If we don't support the encoding yet, let's error out instead of trying to decode it as something it's most likely not.
        dbgln("FIXME: Style sheet encoding '{}' is not supported yet", fallback);
        return Error::from_string_literal("No Decoder found");
    }

    // 2. Decode stylesheet's stream of bytes with fallback encoding fallback, and return the result.
    return TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, encoded_string);
}
} }

View file

@ -602,5 +602,6 @@ Vector<NonnullRefPtr<CSS::MediaQuery>> parse_media_query_list(CSS::Parser::Parsi
RefPtr<CSS::Supports> parse_css_supports(CSS::Parser::ParsingParams const&, StringView); RefPtr<CSS::Supports> parse_css_supports(CSS::Parser::ParsingParams const&, StringView);
Vector<CSS::Parser::ComponentValue> parse_component_values_list(CSS::Parser::ParsingParams const&, StringView); Vector<CSS::Parser::ComponentValue> parse_component_values_list(CSS::Parser::ParsingParams const&, StringView);
GC::Ref<JS::Realm> internal_css_realm(); GC::Ref<JS::Realm> internal_css_realm();
ErrorOr<String> css_decode_bytes(Optional<StringView> const& environment_encoding, Optional<String> mime_type_charset, ByteBuffer const& encoded_string);
} }

View file

@ -422,10 +422,13 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
{ {
// 1. If the resource's Content-Type metadata is not text/css, then set success to false. // 1. If the resource's Content-Type metadata is not text/css, then set success to false.
auto mime_type_string = m_mime_type; auto mime_type_string = m_mime_type;
if (!mime_type_string.has_value()) { Optional<String> mime_type_charset;
auto extracted_mime_type = response.header_list()->extract_mime_type(); auto extracted_mime_type = response.header_list()->extract_mime_type();
if (extracted_mime_type.has_value()) if (extracted_mime_type.has_value()) {
if (!mime_type_string.has_value())
mime_type_string = extracted_mime_type->essence(); mime_type_string = extracted_mime_type->essence();
if (auto charset = extracted_mime_type->parameters().get("charset"sv); charset.has_value())
mime_type_charset = charset.value();
} }
if (mime_type_string.has_value() && mime_type_string != "text/css"sv) { if (mime_type_string.has_value() && mime_type_string != "text/css"sv) {
@ -469,25 +472,17 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
// The CSS environment encoding is the result of running the following steps: [CSSSYNTAX] // The CSS environment encoding is the result of running the following steps: [CSSSYNTAX]
// 1. If the element has a charset attribute, get an encoding from that attribute's value. If that succeeds, return the resulting encoding. [ENCODING] // 1. If the element has a charset attribute, get an encoding from that attribute's value. If that succeeds, return the resulting encoding. [ENCODING]
// 2. Otherwise, return the document's character encoding. [DOM] // 2. Otherwise, return the document's character encoding. [DOM]
Optional<StringView> environment_encoding;
if (auto charset = attribute(HTML::AttributeNames::charset); charset.has_value()) {
    // NOTE: The lookup result needs its own name. Calling it `environment_encoding` would shadow the outer
    //       variable, turning the assignment below into a self-assignment and silently ignoring the attribute.
    if (auto charset_encoding = TextCodec::get_standardized_encoding(charset.release_value()); charset_encoding.has_value())
        environment_encoding = charset_encoding.value();
}
// Fall back to the document's character encoding when the charset attribute is missing or names no known encoding.
if (!environment_encoding.has_value() && document().encoding().has_value())
    environment_encoding = document().encoding().value();
Optional<String> encoding; auto maybe_decoded_string = css_decode_bytes(environment_encoding, mime_type_charset, body_bytes.get<ByteBuffer>());
if (auto charset = attribute(HTML::AttributeNames::charset); charset.has_value())
encoding = charset.release_value();
if (!encoding.has_value())
encoding = document().encoding_or_default();
auto decoder = TextCodec::decoder_for(*encoding);
if (!decoder.has_value()) {
// If we don't support the encoding yet, let's error out instead of trying to decode it as something it's most likely not.
dbgln("FIXME: Style sheet encoding '{}' is not supported yet", encoding);
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
} else {
auto const& encoded_string = body_bytes.get<ByteBuffer>();
auto maybe_decoded_string = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, encoded_string);
if (maybe_decoded_string.is_error()) { if (maybe_decoded_string.is_error()) {
dbgln("Style sheet {} claimed to be '{}' but decoding failed", response.url().value_or(URL::URL()), encoding); dbgln("Failed to decode CSS file: {}", response.url().value_or(URL::URL()));
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error)); dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
} else { } else {
VERIFY(!response.url_list().is_empty()); VERIFY(!response.url_list().is_empty());
@ -507,7 +502,6 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::load)); dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::load));
} }
} }
}
// 5. Otherwise, fire an event named error at el. // 5. Otherwise, fire an event named error at el.
else { else {
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error)); dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));

View file

@ -0,0 +1,19 @@
<?xml version="1.0" encoding="us-ascii"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head>
<title>CSS Test: Stylesheet encodings: KOI8-R</title>
<link rel="author" title="Ian Hickson" href="mailto:ian@hixie.ch"/>
<link rel="alternate" href="http://www.hixie.ch/tests/adhoc/css/parsing/encoding/007.html" type="text/html"/>
<link rel="help" href="http://www.w3.org/TR/CSS21/syndata.html#charset" />
<link rel="match" href="../../../../../expected/wpt-import/css/CSS2/syntax/../reference/ref-green-background.xht"/>
<meta name="flags" content="http" />
<style type="text/css">
p { background: red; color: yellow; }
</style>
<link rel="stylesheet" href="support/at-charset-077.css"/>
</head>
<body>
<p class="t&#x0418;st">This should have a green background.</p>
</body>
</html>

View file

@ -0,0 +1,2 @@
@charset "koi8-r";
.tést { color: white; background: green; }

View file

@ -0,0 +1,6 @@
Harness status: OK
Found 1 tests
1 Pass
Pass The character encoding of the page can be set by a meta element with charset attribute.

View file

@ -0,0 +1,4 @@
@charset "utf-8";
.test div.ÜÃÚ {
width: 100px;
}

View file

@ -0,0 +1,37 @@
<!DOCTYPE html>
<html lang="en" >
<head>
<meta charset="iso-8859-15"> <title>meta charset attribute</title>
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
<link rel='help' href='https://html.spec.whatwg.org/multipage/#the-input-byte-stream'>
<script src="../../../resources/testharness.js"></script>
<script src="../../../resources/testharnessreport.js"></script>
<meta name='flags' content='http'>
<style type='text/css'>
.test div { width: 50px; }</style>
<link rel="stylesheet" type="text/css" href="support/encodingtests-15.css">
</head>
<body>
<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
<!--Notes:
The only character encoding declaration for this HTML file is in the charset attribute of the meta element, which declares the encoding to be ISO 8859-15.
The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.
-->
<script>
test(function() {
assert_equals(document.getElementById('box').offsetWidth, 100);
}, "The character encoding of the page can be set by a meta element with charset attribute.");
</script>
<div id='log'></div>
</body>
</html>