mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2026-04-18 18:00:31 +00:00
Add `ECMAScriptRegex`, LibRegex's C++ facade for ECMAScript regexes. The facade owns compilation, execution, captures, named groups, and error translation for the Rust backend, which lets callers stop depending on the legacy parser and matcher types directly. Use it in the remaining non-LibJS callers: URLPattern, HTML input pattern handling, and the places in LibHTTP that only needed token validation. Where a full regex engine was unnecessary, replace those call sites with direct character checks. Also update focused LibURL, LibHTTP, and WPT coverage for the migrated callers and corrected surrogate handling.
340 lines
13 KiB
C++
340 lines
13 KiB
C++
/*
|
||
* Copyright (c) 2025-2026, Shannon Booth <shannon@serenityos.org>
|
||
*
|
||
* SPDX-License-Identifier: BSD-2-Clause
|
||
*/
|
||
|
||
#include <AK/Utf16String.h>
|
||
#include <LibRegex/ECMAScriptRegex.h>
|
||
#include <LibURL/Pattern/Component.h>
|
||
#include <LibURL/Pattern/PatternParser.h>
|
||
#include <LibURL/Pattern/String.h>
|
||
#include <LibURL/URL.h>
|
||
|
||
namespace URL::Pattern {
|
||
|
||
// https://urlpattern.spec.whatwg.org/#protocol-component-matches-a-special-scheme
|
||
bool protocol_component_matches_a_special_scheme(Component const& protocol_component)
|
||
{
|
||
// 1. Let special scheme list be a list populated with all of the special schemes.
|
||
// 2. For each scheme of special scheme list:
|
||
for (StringView scheme : special_schemes()) {
|
||
// 1. Let test result be RegExpBuiltinExec(protocol component’s regular expression, scheme).
|
||
auto test_result = protocol_component.matches(scheme);
|
||
|
||
// 2. If test result is not null, then return true.
|
||
if (test_result)
|
||
return true;
|
||
}
|
||
|
||
// 3. Return false.
|
||
return false;
|
||
}
|
||
|
||
// https://urlpattern.spec.whatwg.org/#generate-a-regular-expression-and-name-list
|
||
struct RegularExpressionAndNameList {
|
||
String regular_expression;
|
||
Vector<String> name_list;
|
||
};
|
||
|
||
// Builds the anchored regular expression source for a parsed part list, together with the list of
// part names that correspond (in order) to the capture groups of that expression.
static RegularExpressionAndNameList generate_a_regular_expression_and_name_list(Vector<Part> const& part_list, Options const& options)
{
    // 1. Let result be "^".
    StringBuilder builder;
    builder.append('^');

    // 2. Let name list be a new list.
    Vector<String> names;

    // 3. For each part of part list:
    for (auto const& part : part_list) {
        // Helpers for the two operations this loop repeats; both are evaluated lazily, only where
        // the algorithm actually asks for them.
        auto append_escaped = [&](auto const& text) {
            // "Append the result of running escape a regexp string given <text> to the end of result."
            builder.append(escape_a_regexp_string(text));
        };
        auto append_modifier = [&] {
            // "Append the result of running convert a modifier to a string given part’s modifier to the end of result."
            builder.append(Part::convert_modifier_to_string(part.modifier));
        };

        // 1. If part’s type is "fixed-text":
        if (part.type == Part::Type::FixedText) {
            // 1. If part’s modifier is "none", the escaped value is appended as-is.
            // 2. Otherwise it is wrapped as "(?:" <escaped value> ")" <modifier>.
            bool const needs_group = part.modifier != Part::Modifier::None;

            if (needs_group)
                builder.append("(?:"sv);

            append_escaped(part.value);

            if (needs_group) {
                builder.append(')');
                append_modifier();
            }

            // 3. Continue.
            continue;
        }

        // 2. Assert: part’s name is not the empty string.
        VERIFY(!part.name.is_empty());

        // 3. Append part’s name to name list.
        names.append(part.name);

        // 4. Let regexp value be part’s value.
        auto regexp_value = part.value;

        switch (part.type) {
        // 5. If part’s type is "segment-wildcard", then set regexp value to the result of running generate a segment wildcard regexp given options.
        case Part::Type::SegmentWildcard:
            regexp_value = generate_a_segment_wildcard_regexp(options);
            break;
        // 6. Otherwise if part’s type is "full-wildcard", then set regexp value to full wildcard regexp value.
        case Part::Type::FullWildcard:
            regexp_value = MUST(String::from_utf8(full_wildcard_regexp_value));
            break;
        default:
            break;
        }

        bool const none_or_optional = part.modifier == Part::Modifier::None || part.modifier == Part::Modifier::Optional;

        // 7. If part’s prefix is the empty string and part’s suffix is the empty string:
        if (part.prefix.is_empty() && part.suffix.is_empty()) {
            // 1. If part’s modifier is "none" or "optional": "(" <regexp value> ")" <modifier>
            // 2. Otherwise:                                  "((?:" <regexp value> ")" <modifier> ")"
            builder.append(none_or_optional ? "("sv : "((?:"sv);
            builder.append(regexp_value);
            builder.append(')');
            append_modifier();
            if (!none_or_optional)
                builder.append(')');

            // 3. Continue.
            continue;
        }

        // 8. If part’s modifier is "none" or "optional":
        if (none_or_optional) {
            // 1-2. "(?:" followed by the escaped prefix.
            builder.append("(?:"sv);
            append_escaped(part.prefix);

            // 3-5. The capturing group around regexp value.
            builder.append('(');
            builder.append(regexp_value);
            builder.append(')');

            // 6-7. The escaped suffix, then the closing ")".
            append_escaped(part.suffix);
            builder.append(')');

            // 8. The modifier applies to the whole non-capturing group.
            append_modifier();

            // 9. Continue.
            continue;
        }

        // 9. Assert: part’s modifier is "zero-or-more" or "one-or-more".
        VERIFY(part.modifier == Part::Modifier::ZeroOrMore || part.modifier == Part::Modifier::OneOrMore);

        // 10. Assert: part’s prefix is not the empty string or part’s suffix is not the empty string.
        VERIFY(!part.prefix.is_empty() || !part.suffix.is_empty());

        // 11-12. "(?:" followed by the escaped prefix.
        builder.append("(?:"sv);
        append_escaped(part.prefix);

        // 13-15. "((?:" <regexp value> ")(?:"
        builder.append("((?:"sv);
        builder.append(regexp_value);
        builder.append(")(?:"sv);

        // 16-17. The escaped suffix followed by the escaped prefix (the separator between repetitions).
        append_escaped(part.suffix);
        append_escaped(part.prefix);

        // 18-20. "(?:" <regexp value> "))*)"
        builder.append("(?:"sv);
        builder.append(regexp_value);
        builder.append("))*)"sv);

        // 21-22. The escaped suffix, then the closing ")".
        append_escaped(part.suffix);
        builder.append(')');

        // 23. If part’s modifier is "zero-or-more" then append "?" to the end of result.
        if (part.modifier == Part::Modifier::ZeroOrMore)
            builder.append('?');
    }

    // 4. Append "$" to the end of result.
    builder.append('$');

    // 5. Return (result, name list).
    return { builder.to_string_without_validation(), move(names) };
}
|
||
|
||
// https://urlpattern.spec.whatwg.org/#compile-a-component
|
||
PatternErrorOr<Component> Component::compile(Utf8View const& input, PatternParser::EncodingCallback encoding_callback, Options const& options)
|
||
{
|
||
// 1. Let part list be the result of running parse a pattern string given input, options, and encoding callback.
|
||
auto part_list = TRY(PatternParser::parse(input, options, move(encoding_callback)));
|
||
|
||
// 2. Let (regular expression string, name list) be the result of running generate a regular expression and name
|
||
// list given part list and options.
|
||
auto [regular_expression_string, name_list] = generate_a_regular_expression_and_name_list(part_list, options);
|
||
|
||
// 3. Let flags be an empty string.
|
||
// NOTE: These flags match the flags for the empty string of the LibJS RegExp implementation.
|
||
regex::ECMAScriptCompileFlags flags {};
|
||
|
||
// 4. If options’s ignore case is true then set flags to "vi".
|
||
if (options.ignore_case) {
|
||
flags.unicode_sets = true;
|
||
flags.ignore_case = true;
|
||
}
|
||
// 5. Otherwise set flags to "v"
|
||
else {
|
||
flags.unicode_sets = true;
|
||
}
|
||
|
||
// 6. Let regular expression be RegExpCreate(regular expression string, flags). If this throws an exception, catch
|
||
// it, and throw a TypeError.
|
||
auto regex = regex::ECMAScriptRegex::compile(regular_expression_string.bytes_as_string_view(), flags);
|
||
if (regex.is_error())
|
||
return ErrorInfo { MUST(String::formatted("RegExp compile error: {}", regex.release_error())) };
|
||
|
||
// 7. Let pattern string be the result of running generate a pattern string given part list and options.
|
||
auto pattern_string = generate_a_pattern_string(part_list, options);
|
||
|
||
// 8. Let has regexp groups be false.
|
||
bool has_regexp_groups = false;
|
||
|
||
// 9. For each part of part list:
|
||
for (auto const& part : part_list) {
|
||
// 1. If part’s type is "regexp", then set has regexp groups to true.
|
||
if (part.type == Part::Type::Regexp) {
|
||
has_regexp_groups = true;
|
||
break;
|
||
}
|
||
}
|
||
|
||
// 10. Return a new component whose pattern string is pattern string, regular expression is regular expression,
|
||
// group name list is name list, and has regexp groups is has regexp groups.
|
||
return Component {
|
||
.pattern_string = move(pattern_string),
|
||
.regular_expression = adopt_own(*new regex::ECMAScriptRegex(regex.release_value())),
|
||
.group_name_list = move(name_list),
|
||
.has_regexp_groups = has_regexp_groups,
|
||
};
|
||
}
|
||
|
||
// Runs this component's regular expression against `input`, starting at offset 0.
// On a match, the result has `success` set and holds one capture entry per name in the group name
// list; a group whose start or end slot is negative gets an empty entry instead of a string.
// On anything other than a match, a default-constructed result is returned.
Component::ExecutionResult Component::execute(String const& input) const
{
    // The regex is run over UTF-16, so the capture offsets index into this converted string.
    auto utf16_input = Utf16String::from_utf8(input);

    if (regular_expression->exec(utf16_input.utf16_view(), 0) != regex::MatchResult::Match)
        return {};

    auto const group_count = group_name_list.size();

    ExecutionResult result;
    result.success = true;
    result.captures.ensure_capacity(group_count);

    // Group 0 is skipped; group N reads its bounds from slots 2N (start) and 2N + 1 (end).
    for (size_t group = 1; group <= group_count; ++group) {
        auto start = regular_expression->capture_slot(group * 2);
        auto end = regular_expression->capture_slot(group * 2 + 1);

        if (start >= 0 && end >= 0) {
            auto capture = utf16_input.substring_view(static_cast<size_t>(start), static_cast<size_t>(end - start));
            result.captures.append(MUST(capture.to_utf8()));
        } else {
            // A negative slot is treated as "no capture recorded for this group".
            result.captures.append({});
        }
    }

    return result;
}
|
||
|
||
// Returns true when this component's regular expression reports a match for `input`, testing from
// offset 0. No capture information is collected.
bool Component::matches(StringView input) const
{
    // The regex is run over UTF-16, so convert the UTF-8 input first.
    auto utf16_input = Utf16String::from_utf8(input);

    auto const outcome = regular_expression->test(utf16_input.utf16_view(), 0);
    return outcome == regex::MatchResult::Match;
}
|
||
|
||
// https://urlpattern.spec.whatwg.org/#create-a-component-match-result
|
||
Component::Result Component::create_match_result(String const& input, ExecutionResult const& exec_result) const
|
||
{
|
||
// 1. Let result be a new URLPatternComponentResult.
|
||
Component::Result result;
|
||
|
||
// 2. Set result["input"] to input.
|
||
result.input = input;
|
||
|
||
// 3. Let groups be a record<USVString, (USVString or undefined)>.
|
||
OrderedHashMap<String, Variant<String, Empty>> groups;
|
||
|
||
// 4. Let index be 1.
|
||
// 5. While index is less than or equal to component’s group name list’s size:
|
||
VERIFY(exec_result.captures.size() == group_name_list.size());
|
||
for (size_t index = 1; index <= group_name_list.size(); ++index) {
|
||
// 1. Let name be component’s group name list[index − 1].
|
||
auto name = group_name_list[index - 1];
|
||
|
||
// 2. Let value be Get(execResult, ToString(index)).
|
||
// 3. Set groups[name] to value.
|
||
auto const& capture = exec_result.captures[index - 1];
|
||
if (!capture.has_value())
|
||
groups.set(name, Empty {});
|
||
else
|
||
groups.set(name, *capture);
|
||
|
||
// 4. Increment index by 1.
|
||
}
|
||
|
||
// 6. Set result["groups"] to groups.
|
||
result.groups = move(groups);
|
||
|
||
// 7. Return result.
|
||
return result;
|
||
}
|
||
|
||
}
|