ladybird/Libraries/LibURL/Pattern/Component.cpp
Andreas Kling 34d954e2d7 LibRegex: Add ECMAScriptRegex and migrate callers
Add `ECMAScriptRegex`, LibRegex's C++ facade for ECMAScript regexes.

The facade owns compilation, execution, captures, named groups, and
error translation for the Rust backend, which lets callers stop
depending on the legacy parser and matcher types directly. Use it in the
remaining non-LibJS callers: URLPattern, HTML input pattern handling,
and the places in LibHTTP that only needed token validation.

Where a full regex engine was unnecessary, replace those call sites with
direct character checks. Also update focused LibURL, LibHTTP, and WPT
coverage for the migrated callers and corrected surrogate handling.
2026-03-27 17:32:19 +01:00

340 lines
13 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2025-2026, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Utf16String.h>
#include <LibRegex/ECMAScriptRegex.h>
#include <LibURL/Pattern/Component.h>
#include <LibURL/Pattern/PatternParser.h>
#include <LibURL/Pattern/String.h>
#include <LibURL/URL.h>
namespace URL::Pattern {
// https://urlpattern.spec.whatwg.org/#protocol-component-matches-a-special-scheme
// Returns true as soon as the protocol component matches one of the entries yielded by special_schemes(),
// and false if none of them match.
bool protocol_component_matches_a_special_scheme(Component const& protocol_component)
{
    // 1. Let special scheme list be a list populated with all of the special schemes.
    // 2. For each scheme of special scheme list:
    for (StringView special_scheme : special_schemes()) {
        // 1. Let test result be RegExpBuiltinExec(protocol component's regular expression, scheme).
        // 2. If test result is not null, then return true.
        // NOTE: Component::matches() is used here, as only the yes/no outcome is needed.
        if (protocol_component.matches(special_scheme))
            return true;
    }

    // 3. Return false.
    return false;
}
// https://urlpattern.spec.whatwg.org/#generate-a-regular-expression-and-name-list
// Value returned by generate_a_regular_expression_and_name_list().
struct RegularExpressionAndNameList {
// The generated regular expression source; the generator starts it with '^' and ends it with '$'.
String regular_expression;
// Names of the non-fixed-text parts, in the order they appear in the part list.
Vector<String> name_list;
};
// Builds the anchored regular expression source for a parsed part list, together with the names of all
// non-fixed-text parts (in part-list order). Step numbers below refer to the spec algorithm
// "generate a regular expression and name list".
static RegularExpressionAndNameList generate_a_regular_expression_and_name_list(Vector<Part> const& part_list, Options const& options)
{
    // 1. Let result be "^".
    StringBuilder result;
    result.append('^');

    // Small helpers for the two operations the algorithm repeats over and over.
    auto append_escaped = [&result](auto const& text) {
        // "Append the result of running escape a regexp string given <text> to the end of result."
        result.append(escape_a_regexp_string(text));
    };
    auto append_modifier = [&result](auto modifier) {
        // "Append the result of running convert a modifier to a string given <modifier> to the end of result."
        result.append(Part::convert_modifier_to_string(modifier));
    };

    // 2. Let name list be a new list.
    Vector<String> name_list;

    // 3. For each part of part list:
    for (auto const& part : part_list) {
        // 3.1. If part's type is "fixed-text":
        if (part.type == Part::Type::FixedText) {
            if (part.modifier == Part::Modifier::None) {
                // 3.1.1. With modifier "none", the escaped value is appended as-is.
                append_escaped(part.value);
            } else {
                // 3.1.2. Otherwise the escaped value is wrapped in a non-capturing group, "(?:" ... ")",
                //        followed by the modifier string.
                result.append("(?:"sv);
                append_escaped(part.value);
                result.append(')');
                append_modifier(part.modifier);
            }

            // 3.1.3. Continue.
            continue;
        }

        // 3.2. Assert: part's name is not the empty string.
        VERIFY(!part.name.is_empty());

        // 3.3. Append part's name to name list.
        name_list.append(part.name);

        // 3.4. Let regexp value be part's value.
        auto regexp_value = part.value;

        // 3.5. If part's type is "segment-wildcard", then set regexp value to the result of running
        //      generate a segment wildcard regexp given options.
        if (part.type == Part::Type::SegmentWildcard)
            regexp_value = generate_a_segment_wildcard_regexp(options);
        // 3.6. Otherwise if part's type is "full-wildcard", then set regexp value to full wildcard regexp value.
        else if (part.type == Part::Type::FullWildcard)
            regexp_value = MUST(String::from_utf8(full_wildcard_regexp_value));

        // Steps 3.7.1 and 3.8 share the same 'modifier is "none" or "optional"' condition.
        bool const modifier_is_none_or_optional = part.modifier == Part::Modifier::None || part.modifier == Part::Modifier::Optional;

        // 3.7. If part's prefix is the empty string and part's suffix is the empty string:
        if (part.prefix.is_empty() && part.suffix.is_empty()) {
            // 3.7.1. "none"/"optional":  "(" regexp value ")" modifier
            // 3.7.2. otherwise:          "((?:" regexp value ")" modifier ")"
            // Both shapes share the middle section; only the opener and the trailing ")" differ.
            result.append(modifier_is_none_or_optional ? "("sv : "((?:"sv);
            result.append(regexp_value);
            result.append(')');
            append_modifier(part.modifier);
            if (!modifier_is_none_or_optional)
                result.append(')');

            // 3.7.3. Continue.
            continue;
        }

        // 3.8. If part's modifier is "none" or "optional":
        //      "(?:" prefix "(" regexp value ")" suffix ")" modifier
        if (modifier_is_none_or_optional) {
            result.append("(?:"sv);
            append_escaped(part.prefix);
            result.append('(');
            result.append(regexp_value);
            result.append(')');
            append_escaped(part.suffix);
            result.append(')');
            append_modifier(part.modifier);

            // 3.8.9. Continue.
            continue;
        }

        // 3.9. Assert: part's modifier is "zero-or-more" or "one-or-more".
        VERIFY(part.modifier == Part::Modifier::ZeroOrMore || part.modifier == Part::Modifier::OneOrMore);

        // 3.10. Assert: part's prefix is not the empty string or part's suffix is not the empty string.
        VERIFY(!part.prefix.is_empty() || !part.suffix.is_empty());

        // 3.11 - 3.22. Repeating group with a prefix and/or suffix:
        //      "(?:" prefix "((?:" regexp value ")(?:" suffix prefix "(?:" regexp value "))*)" suffix ")"
        result.append("(?:"sv);
        append_escaped(part.prefix);
        result.append("((?:"sv);
        result.append(regexp_value);
        result.append(")(?:"sv);
        append_escaped(part.suffix);
        append_escaped(part.prefix);
        result.append("(?:"sv);
        result.append(regexp_value);
        result.append("))*)"sv);
        append_escaped(part.suffix);
        result.append(')');

        // 3.23. If part's modifier is "zero-or-more" then append "?" to the end of result.
        if (part.modifier == Part::Modifier::ZeroOrMore)
            result.append('?');
    }

    // 4. Append "$" to the end of result.
    result.append('$');

    // 5. Return (result, name list).
    return { result.to_string_without_validation(), move(name_list) };
}
// https://urlpattern.spec.whatwg.org/#compile-a-component
// Parses `input` into a part list, generates and compiles the corresponding regular expression, and returns
// a Component holding the pattern string, the compiled regex, the group names and the "has regexp groups" flag.
// Parse failures are propagated via TRY; a regex compile failure is reported as an ErrorInfo.
PatternErrorOr<Component> Component::compile(Utf8View const& input, PatternParser::EncodingCallback encoding_callback, Options const& options)
{
    // 1. Let part list be the result of running parse a pattern string given input, options, and encoding callback.
    auto part_list = TRY(PatternParser::parse(input, options, move(encoding_callback)));

    // 2. Let (regular expression string, name list) be the result of running generate a regular expression and name
    //    list given part list and options.
    auto [regular_expression_string, name_list] = generate_a_regular_expression_and_name_list(part_list, options);

    // 3. Let flags be an empty string.
    // NOTE: These flags match the flags for the empty string of the LibJS RegExp implementation.
    regex::ECMAScriptCompileFlags flags {};

    // 4. If options's ignore case is true then set flags to "vi".
    // 5. Otherwise set flags to "v"
    // NOTE: "v" (unicode sets) is enabled in both cases, so only the "i" part is conditional.
    flags.unicode_sets = true;
    if (options.ignore_case)
        flags.ignore_case = true;

    // 6. Let regular expression be RegExpCreate(regular expression string, flags). If this throws an exception, catch
    //    it, and throw a TypeError.
    auto compiled_regex = regex::ECMAScriptRegex::compile(regular_expression_string.bytes_as_string_view(), flags);
    if (compiled_regex.is_error()) {
        auto message = MUST(String::formatted("RegExp compile error: {}", compiled_regex.release_error()));
        return ErrorInfo { move(message) };
    }

    // 7. Let pattern string be the result of running generate a pattern string given part list and options.
    auto pattern_string = generate_a_pattern_string(part_list, options);

    // 8. Let has regexp groups be false.
    bool has_regexp_groups = false;

    // 9. For each part of part list:
    for (auto const& part : part_list) {
        // 1. If part's type is "regexp", then set has regexp groups to true.
        if (part.type != Part::Type::Regexp)
            continue;

        // The flag cannot change again once set, so there is no need to look at the remaining parts.
        has_regexp_groups = true;
        break;
    }

    // 10. Return a new component whose pattern string is pattern string, regular expression is regular expression,
    //     group name list is name list, and has regexp groups is has regexp groups.
    auto owned_regex = adopt_own(*new regex::ECMAScriptRegex(compiled_regex.release_value()));
    return Component {
        .pattern_string = move(pattern_string),
        .regular_expression = move(owned_regex),
        .group_name_list = move(name_list),
        .has_regexp_groups = has_regexp_groups,
    };
}
// Runs this component's regular expression against `input` (converted to UTF-16 first).
// On anything other than a match, a default-constructed ExecutionResult is returned. On a match, `success`
// is set and `captures` receives exactly one entry per name in group_name_list, in order.
Component::ExecutionResult Component::execute(String const& input) const
{
    auto utf16_input = Utf16String::from_utf8(input);
    if (regular_expression->exec(utf16_input.utf16_view(), 0) != regex::MatchResult::Match)
        return {};

    auto const group_count = group_name_list.size();

    ExecutionResult result;
    result.success = true;
    result.captures.ensure_capacity(group_count);

    // Group N's bounds are read from capture slots 2N (start) and 2N + 1 (end); iteration starts at group 1,
    // so slots 0 and 1 are never read here.
    for (size_t group = 1; group <= group_count; ++group) {
        auto const start = regular_expression->capture_slot(group * 2);
        auto const end = regular_expression->capture_slot(group * 2 + 1);

        if (start >= 0 && end >= 0) {
            // The slots are used as offsets into the UTF-16 input; the slice is converted back to UTF-8.
            auto capture = utf16_input.substring_view(static_cast<size_t>(start), static_cast<size_t>(end - start));
            result.captures.append(MUST(capture.to_utf8()));
        } else {
            // A negative slot is recorded as an empty entry, keeping captures aligned with group_name_list.
            result.captures.append({});
        }
    }

    return result;
}
// Returns whether this component's regular expression reports a match for `input`, testing from offset 0.
// No capture information is collected; use execute() when the groups are needed.
bool Component::matches(StringView input) const
{
    // The regex is given a UTF-16 view, so the input is converted first and kept alive for the call.
    auto utf16_input = Utf16String::from_utf8(input);
    auto const outcome = regular_expression->test(utf16_input.utf16_view(), 0);
    return outcome == regex::MatchResult::Match;
}
// https://urlpattern.spec.whatwg.org/#create-a-component-match-result
// Pairs each entry of group_name_list with the capture at the same position in `exec_result` and returns
// them, together with `input`, as a Component::Result. `exec_result.captures` must have exactly one entry
// per group name (this is VERIFY'd).
Component::Result Component::create_match_result(String const& input, ExecutionResult const& exec_result) const
{
    // 1. Let result be a new URLPatternComponentResult.
    Component::Result result;

    // 2. Set result["input"] to input.
    result.input = input;

    // 3. Let groups be a record<USVString, (USVString or undefined)>.
    OrderedHashMap<String, Variant<String, Empty>> groups;

    // 4. Let index be 1.
    // 5. While index is less than or equal to component's group name list's size:
    // NOTE: The spec's index is 1-based because it indexes into the RegExp exec result, whose entry 0 is the
    //       whole match. The captures stored here start at the first group, so a 0-based index is used for
    //       both lists.
    VERIFY(exec_result.captures.size() == group_name_list.size());
    for (size_t i = 0; i < group_name_list.size(); ++i) {
        // 1. Let name be component's group name list[index - 1].
        auto const& name = group_name_list[i];

        // 2. Let value be Get(execResult, ToString(index)).
        auto const& value = exec_result.captures[i];

        // 3. Set groups[name] to value.
        // NOTE: A capture without a value is stored as Empty, standing in for `undefined`.
        if (value.has_value())
            groups.set(name, *value);
        else
            groups.set(name, Empty {});

        // 4. Increment index by 1.
    }

    // 6. Set result["groups"] to groups.
    result.groups = move(groups);

    // 7. Return result.
    return result;
}
}