mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-10-19 07:33:20 +00:00
LibWeb: Implement CSS decode bytes algo
This commit is contained in:
parent
4b989b8efd
commit
e73e0b3c92
Notes:
github-actions[bot]
2025-10-16 14:46:26 +00:00
Author: https://github.com/lpas
Commit: e73e0b3c92
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/6216
Reviewed-by: https://github.com/AtkinsSJ
Reviewed-by: https://github.com/gmta ✅
9 changed files with 182 additions and 46 deletions
|
@ -2,6 +2,7 @@
|
|||
* Copyright (c) 2021, the SerenityOS developers.
|
||||
* Copyright (c) 2021-2024, Sam Atkins <sam@ladybird.org>
|
||||
* Copyright (c) 2022-2024, Andreas Kling <andreas@ladybird.org>
|
||||
* Copyright (c) 2025, Lorenz Ackermann <me@lorenzackermann.xyz>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
@ -146,18 +147,17 @@ void CSSImportRule::fetch()
|
|||
|
||||
// 4. Let importedStylesheet be the result of parsing byteStream given parsedUrl.
|
||||
// FIXME: Tidy up our parsing API. For now, do the decoding here.
|
||||
// FIXME: Get the encoding from the response somehow.
|
||||
auto encoding = "utf-8"sv;
|
||||
auto maybe_decoder = TextCodec::decoder_for(encoding);
|
||||
if (!maybe_decoder.has_value()) {
|
||||
dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {} Unsupported encoding: {}", parsed_url, encoding);
|
||||
return;
|
||||
Optional<String> mime_type_charset;
|
||||
if (auto extracted_mime_type = response->header_list()->extract_mime_type(); extracted_mime_type.has_value()) {
|
||||
if (auto charset = extracted_mime_type->parameters().get("charset"sv); charset.has_value())
|
||||
mime_type_charset = charset.value();
|
||||
}
|
||||
auto& decoder = maybe_decoder.release_value();
|
||||
|
||||
auto decoded_or_error = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(decoder, *byte_stream);
|
||||
// The environment encoding of an imported style sheet is the encoding of the style sheet that imported it. [css-syntax-3]
|
||||
// FIXME: Save encoding on Stylesheet to get it here
|
||||
Optional<StringView> environment_encoding;
|
||||
auto decoded_or_error = css_decode_bytes(environment_encoding, mime_type_charset, *byte_stream);
|
||||
if (decoded_or_error.is_error()) {
|
||||
dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {} Encoding was: {}", parsed_url, encoding);
|
||||
dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {}", parsed_url);
|
||||
return;
|
||||
}
|
||||
auto decoded = decoded_or_error.release_value();
|
||||
|
|
|
@ -4,10 +4,12 @@
|
|||
* Copyright (c) 2021-2024, Sam Atkins <atkinssj@serenityos.org>
|
||||
* Copyright (c) 2021, Tobias Christiansen <tobyase@serenityos.org>
|
||||
* Copyright (c) 2022, MacDue <macdue@dueutil.tech>
|
||||
* Copyright (c) 2025, Lorenz Ackermann <me@lorenzackermann.xyz>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <LibTextCodec/Decoder.h>
|
||||
#include <LibWeb/Bindings/MainThreadVM.h>
|
||||
#include <LibWeb/Bindings/PrincipalHostDefined.h>
|
||||
#include <LibWeb/CSS/CSSMediaRule.h>
|
||||
|
@ -138,4 +140,75 @@ Vector<CSS::Parser::ComponentValue> parse_component_values_list(CSS::Parser::Par
|
|||
return CSS::Parser::Parser::create(parsing_params, string).parse_as_list_of_component_values();
|
||||
}
|
||||
|
||||
// https://drafts.csswg.org/css-syntax/#css-decode-bytes
|
||||
ErrorOr<String> css_decode_bytes(Optional<StringView> const& environment_encoding, Optional<String> mime_type_charset, ByteBuffer const& encoded_string)
|
||||
{
|
||||
// https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
|
||||
auto determine_the_fallback_encoding = [&mime_type_charset, &environment_encoding, &encoded_string]() -> StringView {
|
||||
// 1. If HTTP or equivalent protocol provides an encoding label (e.g. via the charset parameter of the Content-Type header) for the stylesheet,
|
||||
// get an encoding from encoding label. If that does not return failure, return it.
|
||||
if (mime_type_charset.has_value()) {
|
||||
if (auto encoding = TextCodec::get_standardized_encoding(mime_type_charset.value()); encoding.has_value())
|
||||
return encoding.value();
|
||||
}
|
||||
// 2. Otherwise, check stylesheet’s byte stream. If the first 1024 bytes of the stream begin with the hex sequence
|
||||
// 40 63 68 61 72 73 65 74 20 22 XX* 22 3B
|
||||
// where each XX byte is a value between 0x16 and 0x21 inclusive or a value between 0x23 and 0x7F inclusive,
|
||||
// then get an encoding from a string formed out of the sequence of XX bytes, interpreted as ASCII.
|
||||
auto check_stylesheets_byte_stream = [&encoded_string]() -> Optional<StringView> {
|
||||
size_t scan_length = min(encoded_string.size(), 1024);
|
||||
auto pattern_start = "@charset \""sv;
|
||||
auto pattern_end = "\";"sv;
|
||||
|
||||
if (scan_length < pattern_start.length())
|
||||
return {};
|
||||
|
||||
StringView buffer_view = encoded_string.bytes().slice(0, scan_length);
|
||||
if (!buffer_view.starts_with(pattern_start))
|
||||
return {};
|
||||
|
||||
auto encoding_start = pattern_start.length();
|
||||
auto end_index = buffer_view.find(pattern_end, encoding_start);
|
||||
if (!end_index.has_value())
|
||||
return {};
|
||||
|
||||
size_t encoding_length = end_index.value() - encoding_start;
|
||||
auto encoding_view = buffer_view.substring_view(encoding_start, encoding_length);
|
||||
|
||||
for (char c : encoding_view) {
|
||||
u8 byte = static_cast<u8>(c);
|
||||
if ((byte < 0x01 || byte > 0x21) && (byte < 0x23 || byte > 0x7F)) {
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
return TextCodec::get_standardized_encoding(encoding_view);
|
||||
};
|
||||
// If the return value was utf-16be or utf-16le, return utf-8; if it was anything else except failure, return it.
|
||||
auto byte_stream_value = check_stylesheets_byte_stream();
|
||||
if (byte_stream_value.has_value() && (byte_stream_value == "UTF-16BE"sv || byte_stream_value == "UTF-16LE"))
|
||||
return "utf-8"sv;
|
||||
if (byte_stream_value.has_value())
|
||||
return byte_stream_value.value();
|
||||
|
||||
// 3. Otherwise, if an environment encoding is provided by the referring document, return it.
|
||||
if (environment_encoding.has_value())
|
||||
return environment_encoding.value();
|
||||
|
||||
// 4. Otherwise, return utf-8.
|
||||
return "utf-8"sv;
|
||||
};
|
||||
|
||||
// 1. Determine the fallback encoding of stylesheet, and let fallback be the result.
|
||||
auto fallback = determine_the_fallback_encoding();
|
||||
auto decoder = TextCodec::decoder_for(fallback);
|
||||
if (!decoder.has_value()) {
|
||||
// If we don't support the encoding yet, let's error out instead of trying to decode it as something it's most likely not.
|
||||
dbgln("FIXME: Style sheet encoding '{}' is not supported yet", fallback);
|
||||
return Error::from_string_literal("No Decoder found");
|
||||
}
|
||||
// 2. Decode stylesheet’s stream of bytes with fallback encoding fallback, and return the result.
|
||||
return TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, encoded_string);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -602,5 +602,6 @@ Vector<NonnullRefPtr<CSS::MediaQuery>> parse_media_query_list(CSS::Parser::Parsi
|
|||
RefPtr<CSS::Supports> parse_css_supports(CSS::Parser::ParsingParams const&, StringView);
|
||||
Vector<CSS::Parser::ComponentValue> parse_component_values_list(CSS::Parser::ParsingParams const&, StringView);
|
||||
GC::Ref<JS::Realm> internal_css_realm();
|
||||
// Decodes style sheet bytes per https://drafts.csswg.org/css-syntax/#css-decode-bytes; returns an error if no decoder exists for the determined fallback encoding.
ErrorOr<String> css_decode_bytes(Optional<StringView> const& environment_encoding, Optional<String> mime_type_charset, ByteBuffer const& encoded_string);
|
||||
|
||||
}
|
||||
|
|
|
@ -422,10 +422,13 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
|
|||
{
|
||||
// 1. If the resource's Content-Type metadata is not text/css, then set success to false.
|
||||
auto mime_type_string = m_mime_type;
|
||||
if (!mime_type_string.has_value()) {
|
||||
Optional<String> mime_type_charset;
|
||||
auto extracted_mime_type = response.header_list()->extract_mime_type();
|
||||
if (extracted_mime_type.has_value())
|
||||
if (extracted_mime_type.has_value()) {
|
||||
if (!mime_type_string.has_value())
|
||||
mime_type_string = extracted_mime_type->essence();
|
||||
if (auto charset = extracted_mime_type->parameters().get("charset"sv); charset.has_value())
|
||||
mime_type_charset = charset.value();
|
||||
}
|
||||
|
||||
if (mime_type_string.has_value() && mime_type_string != "text/css"sv) {
|
||||
|
@ -469,25 +472,17 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
|
|||
// The CSS environment encoding is the result of running the following steps: [CSSSYNTAX]
|
||||
// 1. If the element has a charset attribute, get an encoding from that attribute's value. If that succeeds, return the resulting encoding. [ENCODING]
|
||||
// 2. Otherwise, return the document's character encoding. [DOM]
|
||||
Optional<StringView> environment_encoding;
|
||||
if (auto charset = attribute(HTML::AttributeNames::charset); charset.has_value()) {
|
||||
if (auto environment_encoding = TextCodec::get_standardized_encoding(charset.release_value()); environment_encoding.has_value())
|
||||
environment_encoding = environment_encoding.value();
|
||||
}
|
||||
if (!environment_encoding.has_value() && document().encoding().has_value())
|
||||
environment_encoding = document().encoding().value();
|
||||
|
||||
Optional<String> encoding;
|
||||
if (auto charset = attribute(HTML::AttributeNames::charset); charset.has_value())
|
||||
encoding = charset.release_value();
|
||||
|
||||
if (!encoding.has_value())
|
||||
encoding = document().encoding_or_default();
|
||||
|
||||
auto decoder = TextCodec::decoder_for(*encoding);
|
||||
|
||||
if (!decoder.has_value()) {
|
||||
// If we don't support the encoding yet, let's error out instead of trying to decode it as something it's most likely not.
|
||||
dbgln("FIXME: Style sheet encoding '{}' is not supported yet", encoding);
|
||||
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
|
||||
} else {
|
||||
auto const& encoded_string = body_bytes.get<ByteBuffer>();
|
||||
auto maybe_decoded_string = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, encoded_string);
|
||||
auto maybe_decoded_string = css_decode_bytes(environment_encoding, mime_type_charset, body_bytes.get<ByteBuffer>());
|
||||
if (maybe_decoded_string.is_error()) {
|
||||
dbgln("Style sheet {} claimed to be '{}' but decoding failed", response.url().value_or(URL::URL()), encoding);
|
||||
dbgln("Failed to decode CSS file: {}", response.url().value_or(URL::URL()));
|
||||
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
|
||||
} else {
|
||||
VERIFY(!response.url_list().is_empty());
|
||||
|
@ -507,7 +502,6 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
|
|||
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::load));
|
||||
}
|
||||
}
|
||||
}
|
||||
// 5. Otherwise, fire an event named error at el.
|
||||
else {
|
||||
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
<?xml version="1.0" encoding="us-ascii"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
|
||||
<head>
|
||||
<title>CSS Test: Stylesheet encodings: KOI8-R</title>
|
||||
<link rel="author" title="Ian Hickson" href="mailto:ian@hixie.ch"/>
|
||||
<link rel="alternate" href="http://www.hixie.ch/tests/adhoc/css/parsing/encoding/007.html" type="text/html"/>
|
||||
<link rel="help" href="http://www.w3.org/TR/CSS21/syndata.html#charset" />
|
||||
<link rel="match" href="../../../../../expected/wpt-import/css/CSS2/syntax/../reference/ref-green-background.xht"/>
|
||||
<meta name="flags" content="http" />
|
||||
<style type="text/css">
|
||||
p { background: red; color: yellow; }
|
||||
</style>
|
||||
<link rel="stylesheet" href="support/at-charset-077.css"/>
|
||||
</head>
|
||||
<body>
|
||||
<p class="tИst">This should have a green background.</p>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,2 @@
|
|||
@charset "koi8-r";
|
||||
.tést { color: white; background: green; }
|
|
@ -0,0 +1,6 @@
|
|||
Harness status: OK
|
||||
|
||||
Found 1 tests
|
||||
|
||||
1 Pass
|
||||
Pass The character encoding of the page can be set by a meta element with charset attribute.
|
|
@ -0,0 +1,4 @@
|
|||
@charset "utf-8";
|
||||
.test div.ÜÀÚ {
|
||||
width: 100px;
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en" >
|
||||
<head>
|
||||
<meta charset="iso-8859-15"> <title>meta charset attribute</title>
|
||||
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
|
||||
<link rel='help' href='https://html.spec.whatwg.org/multipage/#the-input-byte-stream'>
|
||||
<script src="../../../resources/testharness.js"></script>
|
||||
<script src="../../../resources/testharnessreport.js"></script>
|
||||
<meta name='flags' content='http'>
|
||||
<style type='text/css'>
|
||||
.test div { width: 50px; }</style>
|
||||
<link rel="stylesheet" type="text/css" href="support/encodingtests-15.css">
|
||||
</head>
|
||||
<body>
|
||||
|
||||
|
||||
|
||||
<div class='test'><div id='box' class='ýäè'> </div></div>
|
||||
|
||||
|
||||
<!--Notes:
|
||||
|
||||
The only character encoding declaration for this HTML file is in the charset attribute of the meta element, which declares the encoding to be ISO 8859-15.
|
||||
|
||||
The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.ÜÀÚ</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.
|
||||
|
||||
-->
|
||||
<script>
|
||||
test(function() {
|
||||
assert_equals(document.getElementById('box').offsetWidth, 100);
|
||||
}, "The character encoding of the page can be set by a meta element with charset attribute.");
|
||||
</script>
|
||||
|
||||
<div id='log'></div>
|
||||
|
||||
</body>
|
||||
</html>
|
Loading…
Add table
Add a link
Reference in a new issue