mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2026-06-18 07:43:37 +00:00
Keep decoded CSS text separate from tokenizer byte input. CSSOM and already-decoded stylesheet text preserve code point preprocessing, so a lone surrogate maps to one replacement character instead of being re-decoded as malformed UTF-8 bytes. Decode tokenizer byte input with the requested encoding unless that encoding is UTF-8 and the byte stream is strictly valid UTF-8. Keep the fast path by constructing the decoded string without validating twice after strict validation succeeds. Preserve UTF-8 decoder behavior on the byte fast path by stripping an initial UTF-8 BOM and rejecting encoded surrogate bytes. Invalid UTF-8 still goes through the decoder. Add tokenizer coverage for both the C++ and Rust backends across decoded text, UTF-8 aliases, BOM-prefixed input, invalid UTF-8, and non-UTF requested encodings.
31 lines
585 B
C++
31 lines
585 B
C++
/*
|
|
* Copyright (c) 2026, the Ladybird developers.
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <AK/StringView.h>
|
|
#include <AK/Vector.h>
|
|
#include <LibWeb/CSS/Parser/Token.h>
|
|
#include <LibWeb/CSS/Parser/Tokenizer.h>
|
|
#include <LibWeb/Export.h>
|
|
|
|
namespace Web::CSS::Parser::FFI {
|
|
|
|
// Forward declaration only: the full definition of CssToken is not provided by
// this header. It is named here so that RustTokenizer::token_from_ffi() can
// take it by const reference without this header pulling in its definition.
// NOTE(review): presumably this is the token layout shared with the Rust side
// of the tokenizer - confirm against the FFI bindings.
struct CssToken;
|
|
|
|
}
|
|
|
|
namespace Web::CSS::Parser {
|
|
|
|
// Static-only entry point for tokenizing CSS into Parser::Token values.
// The class declares no instance state; callers use RustTokenizer::tokenize().
// NOTE(review): judging by the class name and the FFI::CssToken helper below,
// tokenization is presumably delegated to a Rust implementation - confirm in
// the corresponding source file.
class WEB_API RustTokenizer {
|
|
public:
|
|
// Tokenizes `input` and returns the resulting tokens.
//
// `input`    - the CSS to tokenize.
// `encoding` - the encoding label requested by the caller.
// The third, unnamed parameter selects how `input` is interpreted and
// defaults to TokenizerInput::DecodedText.
// NOTE(review): presumably `encoding` only matters when `input` is raw bytes
// rather than already-decoded text - verify against the implementation.
static Vector<Token> tokenize(StringView input, StringView encoding, TokenizerInput = TokenizerInput::DecodedText);
|
|
|
|
private:
|
|
// Builds a Token from an FFI::CssToken. Private: used only by this class.
static Token token_from_ffi(FFI::CssToken const&);
|
|
};
|
|
|
|
}
|