diff --git a/Libraries/LibWeb/CSS/CSSImportRule.cpp b/Libraries/LibWeb/CSS/CSSImportRule.cpp
index 975c9215f56..5e7e2ce18bf 100644
--- a/Libraries/LibWeb/CSS/CSSImportRule.cpp
+++ b/Libraries/LibWeb/CSS/CSSImportRule.cpp
@@ -2,6 +2,7 @@
  * Copyright (c) 2021, the SerenityOS developers.
  * Copyright (c) 2021-2024, Sam Atkins
  * Copyright (c) 2022-2024, Andreas Kling
+ * Copyright (c) 2025, Lorenz Ackermann
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
@@ -146,18 +147,17 @@ void CSSImportRule::fetch()
 
             // 4. Let importedStylesheet be the result of parsing byteStream given parsedUrl.
             // FIXME: Tidy up our parsing API. For now, do the decoding here.
-            // FIXME: Get the encoding from the response somehow.
-            auto encoding = "utf-8"sv;
-            auto maybe_decoder = TextCodec::decoder_for(encoding);
-            if (!maybe_decoder.has_value()) {
-                dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {} Unsupported encoding: {}", parsed_url, encoding);
-                return;
+            Optional<String> mime_type_charset;
+            if (auto extracted_mime_type = response->header_list()->extract_mime_type(); extracted_mime_type.has_value()) {
+                if (auto charset = extracted_mime_type->parameters().get("charset"sv); charset.has_value())
+                    mime_type_charset = charset.value();
             }
-            auto& decoder = maybe_decoder.release_value();
-
-            auto decoded_or_error = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(decoder, *byte_stream);
+            // The environment encoding of an imported style sheet is the encoding of the style sheet that imported it. [css-syntax-3]
+            // FIXME: Save encoding on Stylesheet to get it here
+            Optional<StringView> environment_encoding;
+            auto decoded_or_error = css_decode_bytes(environment_encoding, mime_type_charset, *byte_stream);
             if (decoded_or_error.is_error()) {
-                dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {} Encoding was: {}", parsed_url, encoding);
+                dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {}", parsed_url);
                 return;
             }
             auto decoded = decoded_or_error.release_value();
diff --git a/Libraries/LibWeb/CSS/Parser/Helpers.cpp b/Libraries/LibWeb/CSS/Parser/Helpers.cpp
index cc34e08feba..1a8c31cd9fd 100644
--- a/Libraries/LibWeb/CSS/Parser/Helpers.cpp
+++ b/Libraries/LibWeb/CSS/Parser/Helpers.cpp
@@ -4,10 +4,12 @@
  * Copyright (c) 2021-2024, Sam Atkins
  * Copyright (c) 2021, Tobias Christiansen
  * Copyright (c) 2022, MacDue
+ * Copyright (c) 2025, Lorenz Ackermann
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include
 #include
 #include
 #include
@@ -138,4 +140,78 @@ Vector<CSS::Parser::ComponentValue> parse_component_values_list(CSS::Parser::Par
     return CSS::Parser::Parser::create(parsing_params, string).parse_as_list_of_component_values();
 }
 
+// https://drafts.csswg.org/css-syntax/#css-decode-bytes
+// Decodes the raw bytes of a style sheet (encoded_string) into UTF-8 text. The fallback encoding is chosen, in order, from
+// mime_type_charset, a leading `@charset "...";` rule in the bytes, environment_encoding, and finally utf-8.
+// Returns an error when no decoder is available for the chosen encoding; otherwise returns the result of the decode call.
+ErrorOr<String> css_decode_bytes(Optional<StringView> const& environment_encoding, Optional<String> mime_type_charset, ByteBuffer const& encoded_string)
+{
+    // https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
+    auto determine_the_fallback_encoding = [&mime_type_charset, &environment_encoding, &encoded_string]() -> StringView {
+        // 1. If HTTP or equivalent protocol provides an encoding label (e.g. via the charset parameter of the Content-Type header) for the stylesheet,
+        //    get an encoding from encoding label. If that does not return failure, return it.
+        if (mime_type_charset.has_value()) {
+            if (auto encoding = TextCodec::get_standardized_encoding(mime_type_charset.value()); encoding.has_value())
+                return encoding.value();
+        }
+        // 2. Otherwise, check stylesheet’s byte stream. If the first 1024 bytes of the stream begin with the hex sequence
+        //    40 63 68 61 72 73 65 74 20 22 XX* 22 3B
+        //    where each XX byte is a value between 0x00 and 0x21 inclusive or a value between 0x23 and 0x7F inclusive,
+        //    then get an encoding from a string formed out of the sequence of XX bytes, interpreted as ASCII.
+        auto check_stylesheets_byte_stream = [&encoded_string]() -> Optional<StringView> {
+            size_t scan_length = min(encoded_string.size(), 1024);
+            auto pattern_start = "@charset \""sv;
+            auto pattern_end = "\";"sv;
+
+            if (scan_length < pattern_start.length())
+                return {};
+
+            StringView buffer_view = encoded_string.bytes().slice(0, scan_length);
+            if (!buffer_view.starts_with(pattern_start))
+                return {};
+
+            auto encoding_start = pattern_start.length();
+            auto end_index = buffer_view.find(pattern_end, encoding_start);
+            if (!end_index.has_value())
+                return {};
+
+            size_t encoding_length = end_index.value() - encoding_start;
+            auto encoding_view = buffer_view.substring_view(encoding_start, encoding_length);
+
+            // Every XX byte must be ASCII (0x00-0x7F) and must not be a quotation mark (0x22).
+            for (char c : encoding_view) {
+                u8 byte = static_cast<u8>(c);
+                if (byte == 0x22 || byte > 0x7F)
+                    return {};
+            }
+
+            return TextCodec::get_standardized_encoding(encoding_view);
+        };
+        // If the return value was utf-16be or utf-16le, return utf-8; if it was anything else except failure, return it.
+        auto byte_stream_value = check_stylesheets_byte_stream();
+        if (byte_stream_value.has_value() && (byte_stream_value == "UTF-16BE"sv || byte_stream_value == "UTF-16LE"sv))
+            return "utf-8"sv;
+        if (byte_stream_value.has_value())
+            return byte_stream_value.value();
+
+        // 3. Otherwise, if an environment encoding is provided by the referring document, return it.
+        if (environment_encoding.has_value())
+            return environment_encoding.value();
+
+        // 4. Otherwise, return utf-8.
+        return "utf-8"sv;
+    };
+
+    // 1. Determine the fallback encoding of stylesheet, and let fallback be the result.
+    auto fallback = determine_the_fallback_encoding();
+    auto decoder = TextCodec::decoder_for(fallback);
+    if (!decoder.has_value()) {
+        // If we don't support the encoding yet, let's error out instead of trying to decode it as something it's most likely not.
+        dbgln("FIXME: Style sheet encoding '{}' is not supported yet", fallback);
+        return Error::from_string_literal("No Decoder found");
+    }
+    // 2. Decode stylesheet’s stream of bytes with fallback encoding fallback, and return the result.
+    return TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, encoded_string);
+}
+
 }
diff --git a/Libraries/LibWeb/CSS/Parser/Parser.h b/Libraries/LibWeb/CSS/Parser/Parser.h
index 89ee489a775..29c3ad96337 100644
--- a/Libraries/LibWeb/CSS/Parser/Parser.h
+++ b/Libraries/LibWeb/CSS/Parser/Parser.h
@@ -602,5 +602,6 @@ Vector<NonnullRefPtr<CSS::MediaQuery>> parse_media_query_list(CSS::Parser::Parsi
 RefPtr<Supports> parse_css_supports(CSS::Parser::ParsingParams const&, StringView);
 Vector<CSS::Parser::ComponentValue> parse_component_values_list(CSS::Parser::ParsingParams const&, StringView);
 GC::Ref<JS::Realm> internal_css_realm();
+ErrorOr<String> css_decode_bytes(Optional<StringView> const& environment_encoding, Optional<String> mime_type_charset, ByteBuffer const& encoded_string);
 
 }
diff --git a/Libraries/LibWeb/HTML/HTMLLinkElement.cpp b/Libraries/LibWeb/HTML/HTMLLinkElement.cpp
index f170a16dd4c..0905ef6cfdd 100644
--- a/Libraries/LibWeb/HTML/HTMLLinkElement.cpp
+++ b/Libraries/LibWeb/HTML/HTMLLinkElement.cpp
@@ -422,10 +422,13 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
 {
     // 1. If the resource's Content-Type metadata is not text/css, then set success to false.
     auto mime_type_string = m_mime_type;
-    if (!mime_type_string.has_value()) {
-        auto extracted_mime_type = response.header_list()->extract_mime_type();
-        if (extracted_mime_type.has_value())
+    Optional<String> mime_type_charset;
+    auto extracted_mime_type = response.header_list()->extract_mime_type();
+    if (extracted_mime_type.has_value()) {
+        if (!mime_type_string.has_value())
             mime_type_string = extracted_mime_type->essence();
+        if (auto charset = extracted_mime_type->parameters().get("charset"sv); charset.has_value())
+            mime_type_charset = charset.value();
     }
 
     if (mime_type_string.has_value() && mime_type_string != "text/css"sv) {
@@ -469,43 +472,35 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru
             // The CSS environment encoding is the result of running the following steps: [CSSSYNTAX]
             // 1. If the element has a charset attribute, get an encoding from that attribute's value. If that succeeds, return the resulting encoding. [ENCODING]
             // 2. Otherwise, return the document's character encoding. [DOM]
+            Optional<StringView> environment_encoding;
+            if (auto charset = attribute(HTML::AttributeNames::charset); charset.has_value()) {
+                // NOTE: The lookup result needs its own name; declaring it as `environment_encoding` would shadow the outer variable and drop the attribute's encoding.
+                if (auto charset_encoding = TextCodec::get_standardized_encoding(charset.release_value()); charset_encoding.has_value())
+                    environment_encoding = charset_encoding.value();
+            }
+            if (!environment_encoding.has_value() && document().encoding().has_value())
+                environment_encoding = document().encoding().value();
 
-            Optional<String> encoding;
-            if (auto charset = attribute(HTML::AttributeNames::charset); charset.has_value())
-                encoding = charset.release_value();
-
-            if (!encoding.has_value())
-                encoding = document().encoding_or_default();
-
-            auto decoder = TextCodec::decoder_for(*encoding);
-
-            if (!decoder.has_value()) {
-                // If we don't support the encoding yet, let's error out instead of trying to decode it as something it's most likely not.
-                dbgln("FIXME: Style sheet encoding '{}' is not supported yet", encoding);
+            auto maybe_decoded_string = css_decode_bytes(environment_encoding, mime_type_charset, body_bytes.get<ByteBuffer>());
+            if (maybe_decoded_string.is_error()) {
+                dbgln("Failed to decode CSS file: {}", response.url().value_or(URL::URL()));
                 dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
             } else {
-                auto const& encoded_string = body_bytes.get<ByteBuffer>();
-                auto maybe_decoded_string = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, encoded_string);
-                if (maybe_decoded_string.is_error()) {
-                    dbgln("Style sheet {} claimed to be '{}' but decoding failed", response.url().value_or(URL::URL()), encoding);
-                    dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error));
-                } else {
-                    VERIFY(!response.url_list().is_empty());
-                    m_loaded_style_sheet = document_or_shadow_root_style_sheets().create_a_css_style_sheet(
-                        maybe_decoded_string.release_value(),
-                        "text/css"_string,
-                        this,
-                        attribute(HTML::AttributeNames::media).value_or({}),
-                        in_a_document_tree() ? attribute(HTML::AttributeNames::title).value_or({}) : String {},
-                        (m_relationship & Relationship::Alternate && !m_explicitly_enabled) ? CSS::StyleSheetList::Alternate::Yes : CSS::StyleSheetList::Alternate::No,
-                        CSS::StyleSheetList::OriginClean::Yes,
-                        response.url_list().first(),
-                        nullptr,
-                        nullptr);
+                VERIFY(!response.url_list().is_empty());
+                m_loaded_style_sheet = document_or_shadow_root_style_sheets().create_a_css_style_sheet(
+                    maybe_decoded_string.release_value(),
+                    "text/css"_string,
+                    this,
+                    attribute(HTML::AttributeNames::media).value_or({}),
+                    in_a_document_tree() ? attribute(HTML::AttributeNames::title).value_or({}) : String {},
+                    (m_relationship & Relationship::Alternate && !m_explicitly_enabled) ? CSS::StyleSheetList::Alternate::Yes : CSS::StyleSheetList::Alternate::No,
+                    CSS::StyleSheetList::OriginClean::Yes,
+                    response.url_list().first(),
+                    nullptr,
+                    nullptr);
 
-                    // 2. Fire an event named load at el.
-                    dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::load));
-                }
+                // 2. Fire an event named load at el.
+                dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::load));
             }
         }
     // 5. Otherwise, fire an event named error at el.
diff --git a/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/at-charset-077.xht b/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/at-charset-077.xht
new file mode 100644
index 00000000000..141b7b567f1
--- /dev/null
+++ b/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/at-charset-077.xht
@@ -0,0 +1,19 @@
+
+
+
+
+ CSS Test: Stylesheet encodings: KOI8-R
+
+
+
+
+
+
+
+
+
+

This should have a green background.

+ + diff --git a/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/support/at-charset-077.css b/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/support/at-charset-077.css new file mode 100644 index 00000000000..9a978061caf --- /dev/null +++ b/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/support/at-charset-077.css @@ -0,0 +1,2 @@ +@charset "koi8-r"; +.tést { color: white; background: green; } \ No newline at end of file diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.txt b/Tests/LibWeb/Text/expected/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.txt new file mode 100644 index 00000000000..5c1440a139a --- /dev/null +++ b/Tests/LibWeb/Text/expected/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.txt @@ -0,0 +1,6 @@ +Harness status: OK + +Found 1 tests + +1 Pass +Pass The character encoding of the page can be set by a meta element with charset attribute. \ No newline at end of file diff --git a/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/support/encodingtests-15.css b/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/support/encodingtests-15.css new file mode 100644 index 00000000000..ec907a1a94e --- /dev/null +++ b/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/support/encodingtests-15.css @@ -0,0 +1,4 @@ +@charset "utf-8"; +.test div.ÜÀÚ { + width: 100px; +} diff --git a/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.html b/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.html new file mode 100644 index 00000000000..93e83b7cf62 --- /dev/null +++ b/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.html @@ -0,0 +1,37 @@ + + + + meta charset attribute + + + + + + + + + + + + +
 
+ + + + + +
+ + +