mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-10-19 07:33:20 +00:00
LibWeb: Implement CSS decode bytes algo
This commit is contained in:
parent
4b989b8efd
commit
e73e0b3c92
Notes:
github-actions[bot]
2025-10-16 14:46:26 +00:00
Author: https://github.com/lpas
Commit: e73e0b3c92
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/6216
Reviewed-by: https://github.com/AtkinsSJ
Reviewed-by: https://github.com/gmta ✅
9 changed files with 182 additions and 46 deletions
|
@ -2,6 +2,7 @@
|
||||||
* Copyright (c) 2021, the SerenityOS developers.
|
* Copyright (c) 2021, the SerenityOS developers.
|
||||||
* Copyright (c) 2021-2024, Sam Atkins <sam@ladybird.org>
|
* Copyright (c) 2021-2024, Sam Atkins <sam@ladybird.org>
|
||||||
* Copyright (c) 2022-2024, Andreas Kling <andreas@ladybird.org>
|
* Copyright (c) 2022-2024, Andreas Kling <andreas@ladybird.org>
|
||||||
|
* Copyright (c) 2025, Lorenz Ackermann <me@lorenzackermann.xyz>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
@ -146,18 +147,17 @@ void CSSImportRule::fetch()
|
||||||
|
|
||||||
// 4. Let importedStylesheet be the result of parsing byteStream given parsedUrl.
|
// 4. Let importedStylesheet be the result of parsing byteStream given parsedUrl.
|
||||||
// FIXME: Tidy up our parsing API. For now, do the decoding here.
|
// FIXME: Tidy up our parsing API. For now, do the decoding here.
|
||||||
// FIXME: Get the encoding from the response somehow.
|
Optional<String> mime_type_charset;
|
||||||
auto encoding = "utf-8"sv;
|
if (auto extracted_mime_type = response->header_list()->extract_mime_type(); extracted_mime_type.has_value()) {
|
||||||
auto maybe_decoder = TextCodec::decoder_for(encoding);
|
if (auto charset = extracted_mime_type->parameters().get("charset"sv); charset.has_value())
|
||||||
if (!maybe_decoder.has_value()) {
|
mime_type_charset = charset.value();
|
||||||
dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {} Unsupported encoding: {}", parsed_url, encoding);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
auto& decoder = maybe_decoder.release_value();
|
// The environment encoding of an imported style sheet is the encoding of the style sheet that imported it. [css-syntax-3]
|
||||||
|
// FIXME: Save encoding on Stylesheet to get it here
|
||||||
auto decoded_or_error = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(decoder, *byte_stream);
|
Optional<StringView> environment_encoding;
|
||||||
|
auto decoded_or_error = css_decode_bytes(environment_encoding, mime_type_charset, *byte_stream);
|
||||||
if (decoded_or_error.is_error()) {
|
if (decoded_or_error.is_error()) {
|
||||||
dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {} Encoding was: {}", parsed_url, encoding);
|
dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {}", parsed_url);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
auto decoded = decoded_or_error.release_value();
|
auto decoded = decoded_or_error.release_value();
|
||||||
|
|
|
@ -4,10 +4,12 @@
|
||||||
* Copyright (c) 2021-2024, Sam Atkins <atkinssj@serenityos.org>
|
* Copyright (c) 2021-2024, Sam Atkins <atkinssj@serenityos.org>
|
||||||
* Copyright (c) 2021, Tobias Christiansen <tobyase@serenityos.org>
|
* Copyright (c) 2021, Tobias Christiansen <tobyase@serenityos.org>
|
||||||
* Copyright (c) 2022, MacDue <macdue@dueutil.tech>
|
* Copyright (c) 2022, MacDue <macdue@dueutil.tech>
|
||||||
|
* Copyright (c) 2025, Lorenz Ackermann <me@lorenzackermann.xyz>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <LibTextCodec/Decoder.h>
|
||||||
#include <LibWeb/Bindings/MainThreadVM.h>
|
#include <LibWeb/Bindings/MainThreadVM.h>
|
||||||
#include <LibWeb/Bindings/PrincipalHostDefined.h>
|
#include <LibWeb/Bindings/PrincipalHostDefined.h>
|
||||||
#include <LibWeb/CSS/CSSMediaRule.h>
|
#include <LibWeb/CSS/CSSMediaRule.h>
|
||||||
|
@ -138,4 +140,75 @@ Vector<CSS::Parser::ComponentValue> parse_component_values_list(CSS::Parser::Par
|
||||||
return CSS::Parser::Parser::create(parsing_params, string).parse_as_list_of_component_values();
|
return CSS::Parser::Parser::create(parsing_params, string).parse_as_list_of_component_values();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// https://drafts.csswg.org/css-syntax/#css-decode-bytes
|
||||||
|
ErrorOr<String> css_decode_bytes(Optional<StringView> const& environment_encoding, Optional<String> mime_type_charset, ByteBuffer const& encoded_string)
|
||||||
|
{
|
||||||
|
// https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
|
||||||
|
auto determine_the_fallback_encoding = [&mime_type_charset, &environment_encoding, &encoded_string]() -> StringView {
|
||||||
|
// 1. If HTTP or equivalent protocol provides an encoding label (e.g. via the charset parameter of the Content-Type header) for the stylesheet,
|
||||||
|
// get an encoding from encoding label. If that does not return failure, return it.
|
||||||
|
if (mime_type_charset.has_value()) {
|
||||||
|
if (auto encoding = TextCodec::get_standardized_encoding(mime_type_charset.value()); encoding.has_value())
|
||||||
|
return encoding.value();
|
||||||
|
}
|
||||||
|
// 2. Otherwise, check stylesheet’s byte stream. If the first 1024 bytes of the stream begin with the hex sequence
|
||||||
|
// 40 63 68 61 72 73 65 74 20 22 XX* 22 3B
|
||||||
|
// where each XX byte is a value between 0x16 and 0x21 inclusive or a value between 0x23 and 0x7F inclusive,
|
||||||
|
// then get an encoding from a string formed out of the sequence of XX bytes, interpreted as ASCII.
|
||||||
|
auto check_stylesheets_byte_stream = [&encoded_string]() -> Optional<StringView> {
|
||||||
|
size_t scan_length = min(encoded_string.size(), 1024);
|
||||||
|
auto pattern_start = "@charset \""sv;
|
||||||
|
auto pattern_end = "\";"sv;
|
||||||
|
|
||||||
|
if (scan_length < pattern_start.length())
|
||||||
|
return {};
|
||||||
|
|
||||||
|
StringView buffer_view = encoded_string.bytes().slice(0, scan_length);
|
||||||
|
if (!buffer_view.starts_with(pattern_start))
|
||||||
|
return {};
|
||||||
|
|
||||||
|
auto encoding_start = pattern_start.length();
|
||||||
|
auto end_index = buffer_view.find(pattern_end, encoding_start);
|
||||||
|
if (!end_index.has_value())
|
||||||
|
return {};
|
||||||
|
|
||||||
|
size_t encoding_length = end_index.value() - encoding_start;
|
||||||
|
auto encoding_view = buffer_view.substring_view(encoding_start, encoding_length);
|
||||||
|
|
||||||
|
for (char c : encoding_view) {
|
||||||
|
u8 byte = static_cast<u8>(c);
|
||||||
|
if ((byte < 0x01 || byte > 0x21) && (byte < 0x23 || byte > 0x7F)) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return TextCodec::get_standardized_encoding(encoding_view);
|
||||||
|
};
|
||||||
|
// If the return value was utf-16be or utf-16le, return utf-8; if it was anything else except failure, return it.
|
||||||
|
auto byte_stream_value = check_stylesheets_byte_stream();
|
||||||
|
if (byte_stream_value.has_value() && (byte_stream_value == "UTF-16BE"sv || byte_stream_value == "UTF-16LE"))
|
||||||
|
return "utf-8"sv;
|
||||||
|
if (byte_stream_value.has_value())
|
||||||
|
return byte_stream_value.value();
|
||||||
|
|
||||||
|
// 3. Otherwise, if an environment encoding is provided by the referring document, return it.
|
||||||
|
if (environment_encoding.has_value())
|
||||||
|
return environment_encoding.value();
|
||||||
|
|
||||||
|
// 4. Otherwise, return utf-8.
|
||||||
|
return "utf-8"sv;
|
||||||
|
};
|
||||||
|
|
||||||
|
// 1. Determine the fallback encoding of stylesheet, and let fallback be the result.
|
||||||
|
auto fallback = determine_the_fallback_encoding();
|
||||||
|
auto decoder = TextCodec::decoder_for(fallback);
|
||||||
|
if (!decoder.has_value()) {
|
||||||
|
// If we don't support the encoding yet, let's error out instead of trying to decode it as something it's most likely not.
|
||||||
|
dbgln("FIXME: Style sheet encoding '{}' is not supported yet", fallback);
|
||||||
|
return Error::from_string_literal("No Decoder found");
|
||||||
|
}
|
||||||
|
// 2. Decode stylesheet’s stream of bytes with fallback encoding fallback, and return the result.
|
||||||
|
return TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, encoded_string);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -602,5 +602,6 @@ Vector<NonnullRefPtr<CSS::MediaQuery>> parse_media_query_list(CSS::Parser::Parsi
|
||||||
RefPtr<CSS::Supports> parse_css_supports(CSS::Parser::ParsingParams const&, StringView);
|
RefPtr<CSS::Supports> parse_css_supports(CSS::Parser::ParsingParams const&, StringView);
|
||||||
Vector<CSS::Parser::ComponentValue> parse_component_values_list(CSS::Parser::ParsingParams const&, StringView);
|
Vector<CSS::Parser::ComponentValue> parse_component_values_list(CSS::Parser::ParsingParams const&, StringView);
|
||||||
GC::Ref<JS::Realm> internal_css_realm();
|
GC::Ref<JS::Realm> internal_css_realm();
|
||||||
|
// https://drafts.csswg.org/css-syntax/#css-decode-bytes
// Decodes the bytes of a style sheet into a String. The encoding is chosen from, in order: the protocol-provided
// charset label, a leading `@charset "...";` rule in the bytes, the environment encoding, and finally utf-8.
// Returns an error if no decoder exists for the chosen encoding or if decoding fails.
ErrorOr<String> css_decode_bytes(Optional<StringView> const& environment_encoding, Optional<String> mime_type_charset, ByteBuffer const& encoded_string);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -422,10 +422,13 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
|
||||||
{
|
{
|
||||||
// 1. If the resource's Content-Type metadata is not text/css, then set success to false.
|
// 1. If the resource's Content-Type metadata is not text/css, then set success to false.
|
||||||
auto mime_type_string = m_mime_type;
|
auto mime_type_string = m_mime_type;
|
||||||
if (!mime_type_string.has_value()) {
|
Optional<String> mime_type_charset;
|
||||||
auto extracted_mime_type = response.header_list()->extract_mime_type();
|
auto extracted_mime_type = response.header_list()->extract_mime_type();
|
||||||
if (extracted_mime_type.has_value())
|
if (extracted_mime_type.has_value()) {
|
||||||
|
if (!mime_type_string.has_value())
|
||||||
mime_type_string = extracted_mime_type->essence();
|
mime_type_string = extracted_mime_type->essence();
|
||||||
|
if (auto charset = extracted_mime_type->parameters().get("charset"sv); charset.has_value())
|
||||||
|
mime_type_charset = charset.value();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mime_type_string.has_value() && mime_type_string != "text/css"sv) {
|
if (mime_type_string.has_value() && mime_type_string != "text/css"sv) {
|
||||||
|
@ -469,25 +472,17 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
|
||||||
// The CSS environment encoding is the result of running the following steps: [CSSSYNTAX]
|
// The CSS environment encoding is the result of running the following steps: [CSSSYNTAX]
|
||||||
// 1. If the element has a charset attribute, get an encoding from that attribute's value. If that succeeds, return the resulting encoding. [ENCODING]
|
// 1. If the element has a charset attribute, get an encoding from that attribute's value. If that succeeds, return the resulting encoding. [ENCODING]
|
||||||
// 2. Otherwise, return the document's character encoding. [DOM]
|
// 2. Otherwise, return the document's character encoding. [DOM]
|
||||||
|
Optional<StringView> environment_encoding;
|
||||||
|
if (auto charset = attribute(HTML::AttributeNames::charset); charset.has_value()) {
|
||||||
|
if (auto environment_encoding = TextCodec::get_standardized_encoding(charset.release_value()); environment_encoding.has_value())
|
||||||
|
environment_encoding = environment_encoding.value();
|
||||||
|
}
|
||||||
|
if (!environment_encoding.has_value() && document().encoding().has_value())
|
||||||
|
environment_encoding = document().encoding().value();
|
||||||
|
|
||||||
Optional<String> encoding;
|
auto maybe_decoded_string = css_decode_bytes(environment_encoding, mime_type_charset, body_bytes.get<ByteBuffer>());
|
||||||
if (auto charset = attribute(HTML::AttributeNames::charset); charset.has_value())
|
|
||||||
encoding = charset.release_value();
|
|
||||||
|
|
||||||
if (!encoding.has_value())
|
|
||||||
encoding = document().encoding_or_default();
|
|
||||||
|
|
||||||
auto decoder = TextCodec::decoder_for(*encoding);
|
|
||||||
|
|
||||||
if (!decoder.has_value()) {
|
|
||||||
// If we don't support the encoding yet, let's error out instead of trying to decode it as something it's most likely not.
|
|
||||||
dbgln("FIXME: Style sheet encoding '{}' is not supported yet", encoding);
|
|
||||||
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
|
|
||||||
} else {
|
|
||||||
auto const& encoded_string = body_bytes.get<ByteBuffer>();
|
|
||||||
auto maybe_decoded_string = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, encoded_string);
|
|
||||||
if (maybe_decoded_string.is_error()) {
|
if (maybe_decoded_string.is_error()) {
|
||||||
dbgln("Style sheet {} claimed to be '{}' but decoding failed", response.url().value_or(URL::URL()), encoding);
|
dbgln("Failed to decode CSS file: {}", response.url().value_or(URL::URL()));
|
||||||
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
|
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
|
||||||
} else {
|
} else {
|
||||||
VERIFY(!response.url_list().is_empty());
|
VERIFY(!response.url_list().is_empty());
|
||||||
|
@ -507,7 +502,6 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
|
||||||
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::load));
|
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::load));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
// 5. Otherwise, fire an event named error at el.
|
// 5. Otherwise, fire an event named error at el.
|
||||||
else {
|
else {
|
||||||
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
|
dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
<?xml version="1.0" encoding="us-ascii"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
|
||||||
|
<head>
|
||||||
|
<title>CSS Test: Stylesheet encodings: KOI8-R</title>
|
||||||
|
<link rel="author" title="Ian Hickson" href="mailto:ian@hixie.ch"/>
|
||||||
|
<link rel="alternate" href="http://www.hixie.ch/tests/adhoc/css/parsing/encoding/007.html" type="text/html"/>
|
||||||
|
<link rel="help" href="http://www.w3.org/TR/CSS21/syndata.html#charset" />
|
||||||
|
<link rel="match" href="../../../../../expected/wpt-import/css/CSS2/syntax/../reference/ref-green-background.xht"/>
|
||||||
|
<meta name="flags" content="http" />
|
||||||
|
<style type="text/css">
|
||||||
|
p { background: red; color: yellow; }
|
||||||
|
</style>
|
||||||
|
<link rel="stylesheet" href="support/at-charset-077.css"/>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p class="tИst">This should have a green background.</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,2 @@
|
||||||
|
@charset "koi8-r";
|
||||||
|
.tést { color: white; background: green; }
|
|
@ -0,0 +1,6 @@
|
||||||
|
Harness status: OK
|
||||||
|
|
||||||
|
Found 1 tests
|
||||||
|
|
||||||
|
1 Pass
|
||||||
|
Pass The character encoding of the page can be set by a meta element with charset attribute.
|
|
@ -0,0 +1,4 @@
|
||||||
|
@charset "utf-8";
|
||||||
|
.test div.ÜÀÚ {
|
||||||
|
width: 100px;
|
||||||
|
}
|
|
@ -0,0 +1,37 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en" >
|
||||||
|
<head>
|
||||||
|
<meta charset="iso-8859-15"> <title>meta charset attribute</title>
|
||||||
|
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
|
||||||
|
<link rel='help' href='https://html.spec.whatwg.org/multipage/#the-input-byte-stream'>
|
||||||
|
<script src="../../../resources/testharness.js"></script>
|
||||||
|
<script src="../../../resources/testharnessreport.js"></script>
|
||||||
|
<meta name='flags' content='http'>
|
||||||
|
<style type='text/css'>
|
||||||
|
.test div { width: 50px; }</style>
|
||||||
|
<link rel="stylesheet" type="text/css" href="support/encodingtests-15.css">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class='test'><div id='box' class='ýäè'> </div></div>
|
||||||
|
|
||||||
|
|
||||||
|
<!--Notes:
|
||||||
|
|
||||||
|
The only character encoding declaration for this HTML file is in the charset attribute of the meta element, which declares the encoding to be ISO 8859-15.
|
||||||
|
|
||||||
|
The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.ÜÀÚ</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.
|
||||||
|
|
||||||
|
-->
|
||||||
|
<script>
|
||||||
|
test(function() {
|
||||||
|
assert_equals(document.getElementById('box').offsetWidth, 100);
|
||||||
|
}, "The character encoding of the page can be set by a meta element with charset attribute.");
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<div id='log'></div>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
Loading…
Add table
Add a link
Reference in a new issue