ladybird/Libraries/LibJS/Runtime/RegExpObject.cpp
Andreas Kling 50b137f527 LibJS: Reject mixed surrogate forms in RegExp names
Reject surrogate pairs in named group names unless both halves come
from the same raw form. A literal surrogate half was being
normalized into \uXXXX before LibRegex parsed the pattern, which let
mixed literal and escaped forms sneak through.

Validate surrogate handling on the UTF-16 pattern before
normalization, but only treat \k<...> as a named backreference when
the parser would do that too. Legacy regexes without named groups
still use \k as an identity escape, so their literal text must not be
rejected by the pre-scan.

Add runtime and syntax tests for the mixed forms, the valid literal,
fixed-width, and braced escape cases, and the legacy \k literals.
2026-03-31 15:59:04 +02:00

588 lines
23 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2020, Matthew Olsson <mattco@serenityos.org>
* Copyright (c) 2024, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/CharacterTypes.h>
#include <AK/Function.h>
#include <AK/UnicodeUtils.h>
#include <LibJS/Runtime/AbstractOperations.h>
#include <LibJS/Runtime/GlobalObject.h>
#include <LibJS/Runtime/PrimitiveString.h>
#include <LibJS/Runtime/RegExpConstructor.h>
#include <LibJS/Runtime/RegExpObject.h>
#include <LibJS/Runtime/StringPrototype.h>
#include <LibJS/Runtime/Value.h>
#include <LibJS/Token.h>
namespace JS {
GC_DEFINE_ALLOCATOR(RegExpObject);
namespace {
// What a single parsed element of a group name turned out to be.
enum class RegExpNameElementKind {
// A non-surrogate value, or a literal high+low surrogate pair consumed together.
CodePoint,
// A lone high (leading) surrogate, literal or escaped.
HighSurrogate,
// A lone low (trailing) surrogate, literal or escaped.
LowSurrogate,
};
// The raw source form a group name element was written in.
enum class RegExpNameElementOrigin {
// Raw code unit(s) appearing directly in the pattern.
Literal,
// A fixed-width \uXXXX escape (exactly four hex digits).
FixedEscape,
// A braced \u{...} escape.
BracedEscape,
};
// Result of parsing one group name element out of the UTF-16 pattern.
struct RegExpNameElement {
// Whether the element is a full code point or a lone surrogate half.
RegExpNameElementKind kind;
// Which source form (literal / \uXXXX / \u{...}) produced the element.
RegExpNameElementOrigin origin;
// Code unit index immediately after the element.
size_t next_index { 0 };
};
// Builds the error value shared by every group name validation failure in this file.
static ParseRegexPatternError invalid_group_name_error()
{
return ParseRegexPatternError { "invalid group name"_string };
}
// Parses a single group name element starting at code unit `index` of `pattern`.
//
// Accepted forms:
//   - a literal code unit (a literal high surrogate immediately followed by a literal low
//     surrogate is consumed as one CodePoint element),
//   - a fixed-width escape \uXXXX (exactly four hex digits),
//   - a braced escape \u{H...} (one or more hex digits, value at most 0x10FFFF).
//
// Returns the element's kind, its source form, and the index just past it. Returns the
// "invalid group name" error if `index` is out of range, if a backslash is not followed by 'u',
// or if an escape is malformed.
static ErrorOr<RegExpNameElement, ParseRegexPatternError> parse_regexp_name_element(Utf16View const& pattern, size_t index)
{
    auto const length = pattern.length_in_code_units();
    if (index >= length)
        return invalid_group_name_error();

    // Classifies the numeric value of an escape. Surrogates only exist in the BMP, so anything
    // above 0xFFFF is a plain code point. The explicit range guard keeps the result correct even
    // if the surrogate predicates take a 16-bit code unit, in which case a value such as 0x2D800
    // would otherwise be narrowed to 0xD800 and misreported as a high surrogate.
    // NOTE(review): the predicate signatures are not visible from this file - confirm.
    auto make_escaped_element = [](u32 value, RegExpNameElementOrigin origin, size_t next_index) {
        auto kind = RegExpNameElementKind::CodePoint;
        if (value <= 0xFFFF) {
            auto const code_unit = static_cast<u16>(value);
            if (AK::UnicodeUtils::is_utf16_high_surrogate(code_unit))
                kind = RegExpNameElementKind::HighSurrogate;
            else if (AK::UnicodeUtils::is_utf16_low_surrogate(code_unit))
                kind = RegExpNameElementKind::LowSurrogate;
        }
        return RegExpNameElement { kind, origin, next_index };
    };

    auto code_unit = pattern.code_unit_at(index);
    if (code_unit != '\\') {
        if (AK::UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
            // A literal surrogate pair is a single code point and never needs pairing checks.
            if (index + 1 < length) {
                auto next_code_unit = pattern.code_unit_at(index + 1);
                if (AK::UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
                    return RegExpNameElement { RegExpNameElementKind::CodePoint, RegExpNameElementOrigin::Literal, index + 2 };
            }
            return RegExpNameElement { RegExpNameElementKind::HighSurrogate, RegExpNameElementOrigin::Literal, index + 1 };
        }
        if (AK::UnicodeUtils::is_utf16_low_surrogate(code_unit))
            return RegExpNameElement { RegExpNameElementKind::LowSurrogate, RegExpNameElementOrigin::Literal, index + 1 };
        return RegExpNameElement { RegExpNameElementKind::CodePoint, RegExpNameElementOrigin::Literal, index + 1 };
    }

    // Only \u escapes are permitted inside a group name.
    if (index + 1 >= length || pattern.code_unit_at(index + 1) != 'u')
        return invalid_group_name_error();

    auto escape_index = index + 2;
    if (escape_index < length && pattern.code_unit_at(escape_index) == '{') {
        // Braced form: \u{H...}
        ++escape_index;
        u32 value = 0;
        size_t digits = 0;
        while (escape_index < length && pattern.code_unit_at(escape_index) != '}') {
            auto digit = pattern.code_unit_at(escape_index);
            if (!is_ascii_hex_digit(digit))
                return invalid_group_name_error();
            value = value * 16 + parse_ascii_hex_digit(digit);
            // Checking after every digit also keeps `value` from overflowing u32.
            if (value > 0x10FFFF)
                return invalid_group_name_error();
            ++digits;
            ++escape_index;
        }
        if (digits == 0 || escape_index >= length || pattern.code_unit_at(escape_index) != '}')
            return invalid_group_name_error();
        ++escape_index;
        return make_escaped_element(value, RegExpNameElementOrigin::BracedEscape, escape_index);
    }

    // Fixed-width form: \uXXXX
    if (escape_index + 4 > length)
        return invalid_group_name_error();
    u32 value = 0;
    for (size_t offset = 0; offset < 4; ++offset) {
        auto digit = pattern.code_unit_at(escape_index + offset);
        if (!is_ascii_hex_digit(digit))
            return invalid_group_name_error();
        value = value * 16 + parse_ascii_hex_digit(digit);
    }
    return make_escaped_element(value, RegExpNameElementOrigin::FixedEscape, escape_index + 4);
}
// Walks a group name that begins at `name_start` (just past the opening '<') and checks how
// surrogates are written in it. On success returns the index just past the closing '>'.
//
// A lone low surrogate is rejected. A high surrogate must be followed by a low surrogate written
// in the same source form, and that form may not be the braced escape. Running off the end of the
// pattern without finding '>' is also an error.
static ErrorOr<size_t, ParseRegexPatternError> validate_regexp_name_surrogates(Utf16View const& pattern, size_t name_start)
{
    auto const code_unit_count = pattern.length_in_code_units();

    for (size_t cursor = name_start; cursor < code_unit_count;) {
        if (pattern.code_unit_at(cursor) == '>')
            return cursor + 1;

        auto const first = TRY(parse_regexp_name_element(pattern, cursor));
        switch (first.kind) {
        case RegExpNameElementKind::CodePoint:
            cursor = first.next_index;
            continue;
        case RegExpNameElementKind::LowSurrogate:
            return invalid_group_name_error();
        case RegExpNameElementKind::HighSurrogate:
            break;
        }

        // A high surrogate half: the very next element has to complete the pair.
        auto const second = TRY(parse_regexp_name_element(pattern, first.next_index));
        bool const completes_pair = second.kind == RegExpNameElementKind::LowSurrogate
            && second.origin == first.origin
            && first.origin != RegExpNameElementOrigin::BracedEscape;
        if (!completes_pair)
            return invalid_group_name_error();

        cursor = second.next_index;
    }

    return invalid_group_name_error();
}
// Lightweight scan that reports whether `pattern` contains a "(?<" that is not the start of a
// lookbehind ("(?<=" / "(?<!"). Escaped code units and the contents of character classes are
// skipped.
static bool pattern_has_named_capture_groups(Utf16View const& pattern)
{
    auto const code_unit_count = pattern.length_in_code_units();
    bool inside_class = false;
    size_t position = 0;

    while (position < code_unit_count) {
        auto const current = pattern.code_unit_at(position);

        if (current == '\\') {
            // Skip the backslash together with the code unit it escapes, if there is one.
            position += (position + 1 < code_unit_count) ? 2 : 1;
            continue;
        }

        if (inside_class) {
            if (current == ']')
                inside_class = false;
            ++position;
            continue;
        }

        if (current == '[') {
            inside_class = true;
            ++position;
            continue;
        }

        bool const opens_angle_group = current == '('
            && position + 2 < code_unit_count
            && pattern.code_unit_at(position + 1) == '?'
            && pattern.code_unit_at(position + 2) == '<';
        if (opens_angle_group) {
            // "(?<" at the very end of the pattern also counts as a named group attempt.
            if (position + 3 >= code_unit_count)
                return true;
            auto const discriminator = pattern.code_unit_at(position + 3);
            if (discriminator != '=' && discriminator != '!')
                return true;
        }

        ++position;
    }

    return false;
}
// Pre-scan of the raw UTF-16 pattern that validates surrogate usage inside every group name:
// both "(?<name>" definitions and "\k<name>" references.
//
// "\k<" is only treated as a named backreference when `unicode_aware` is set or the pattern
// contains a named capture group; otherwise the backslash is skipped like any other escape.
// Character class contents are never inspected.
static ErrorOr<void, ParseRegexPatternError> validate_named_group_name_surrogates(Utf16View const& pattern, bool unicode_aware)
{
    auto const code_unit_count = pattern.length_in_code_units();
    bool const k_is_backreference = unicode_aware || pattern_has_named_capture_groups(pattern);

    bool inside_class = false;
    size_t position = 0;

    while (position < code_unit_count) {
        auto const current = pattern.code_unit_at(position);

        if (current == '\\') {
            bool const is_named_backreference = k_is_backreference
                && !inside_class
                && position + 2 < code_unit_count
                && pattern.code_unit_at(position + 1) == 'k'
                && pattern.code_unit_at(position + 2) == '<';
            if (is_named_backreference) {
                // Resume scanning right after the closing '>'.
                position = TRY(validate_regexp_name_surrogates(pattern, position + 3));
                continue;
            }
            // Ordinary escape: skip the backslash and the escaped code unit, if any.
            position += (position + 1 < code_unit_count) ? 2 : 1;
            continue;
        }

        if (inside_class) {
            if (current == ']')
                inside_class = false;
            ++position;
            continue;
        }

        if (current == '[') {
            inside_class = true;
            ++position;
            continue;
        }

        bool const opens_angle_group = current == '('
            && position + 2 < code_unit_count
            && pattern.code_unit_at(position + 1) == '?'
            && pattern.code_unit_at(position + 2) == '<';
        if (opens_angle_group && position + 3 < code_unit_count) {
            auto const discriminator = pattern.code_unit_at(position + 3);
            // '=' and '!' introduce lookbehinds, which carry no name.
            if (discriminator != '=' && discriminator != '!') {
                position = TRY(validate_regexp_name_surrogates(pattern, position + 3));
                continue;
            }
        }

        ++position;
    }

    return {};
}
}
// Converts a flags string into a flag bit set, returning a formatted error message when a flag
// is unknown, appears more than once, or when both 'u' and 'v' are present.
static Result<RegExpObject::Flags, String> validate_flags(Utf16View const& flags)
{
    // Only indexed from the case labels below, i.e. with known flag characters.
    bool already_seen[128] {};
    auto collected = static_cast<RegExpObject::Flags>(0);
    auto const flag_count = flags.length_in_code_units();

    for (size_t position = 0; position < flag_count; ++position) {
        auto flag = flags.code_unit_at(position);
        switch (flag) {
#define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char)                                      \
    case #flag_char[0]:                                                                               \
        if (already_seen[flag])                                                                       \
            return MUST(String::formatted(ErrorType::RegExpObjectRepeatedFlag.format(), flag));       \
        already_seen[flag] = true;                                                                    \
        collected |= RegExpObject::Flags::FlagName;                                                   \
        break;
            JS_ENUMERATE_REGEXP_FLAGS
#undef __JS_ENUMERATE
        default:
            return MUST(String::formatted(ErrorType::RegExpObjectBadFlag.format(), flag));
        }
    }

    bool const has_unicode = has_flag(collected, RegExpObject::Flags::Unicode);
    bool const has_unicode_sets = has_flag(collected, RegExpObject::Flags::UnicodeSets);
    if (has_unicode && has_unicode_sets)
        return MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.format(), 'u', 'v'));

    return collected;
}
// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
//
// Validates group name surrogates on the raw UTF-16 pattern, then produces a string in which
// every code unit above 0x7f is rewritten as a \u escape. Fails if both `unicode` and
// `unicode_sets` are requested or if a group name is invalid.
ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(Utf16View const& pattern, bool unicode, bool unicode_sets)
{
    if (unicode && unicode_sets)
        return ParseRegexPatternError { MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.format(), 'u', 'v')) };

    bool const unicode_mode = unicode || unicode_sets;
    TRY(validate_named_group_name_surrogates(pattern, unicode_mode));

    auto const code_unit_count = pattern.length_in_code_units();
    StringBuilder builder;
    // True while the last emitted code unit is an unpaired backslash.
    bool pending_backslash = false;

    for (size_t i = 0; i < code_unit_count; ++i) {
        u16 const code_unit = pattern.code_unit_at(i);

        if (code_unit <= 0x7f) {
            builder.append_code_point(code_unit);
            pending_backslash = code_unit == '\\' && !pending_backslash;
            continue;
        }

        // Incorrectly escaping this code unit will result in a wildly different regex than intended
        // as we're converting <c> to <\uhhhh>, which would turn into <\\uhhhh> if (incorrectly) escaped again,
        // leading to a matcher for the literal string "\uhhhh" instead of the intended code unit <c>.
        // As such, we reuse an already-present (invalid) backslash instead of emitting a second one.
        if (!pending_backslash)
            builder.append('\\');
        pending_backslash = false;

        // In unicode mode, a literal surrogate pair is emitted as one braced code point escape.
        if (unicode_mode && AK::UnicodeUtils::is_utf16_high_surrogate(code_unit) && i + 1 < code_unit_count) {
            u16 const following = pattern.code_unit_at(i + 1);
            if (AK::UnicodeUtils::is_utf16_low_surrogate(following)) {
                u32 combined = AK::UnicodeUtils::decode_utf16_surrogate_pair(code_unit, following);
                builder.appendff("u{{{:x}}}", combined);
                ++i;
                continue;
            }
        }

        if (unicode_mode)
            builder.appendff("u{{{:04x}}}", code_unit);
        else
            builder.appendff("u{:04x}", code_unit);
    }

    return builder.to_string_without_validation();
}
// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
//
// VM-facing wrapper: forwards to the ErrorOr overload and turns a failure into a SyntaxError.
ThrowCompletionOr<String> parse_regex_pattern(VM& vm, Utf16View const& pattern, bool unicode, bool unicode_sets)
{
    auto pattern_or_error = parse_regex_pattern(pattern, unicode, unicode_sets);
    if (!pattern_or_error.is_error())
        return pattern_or_error.release_value();
    return vm.throw_completion<JS::SyntaxError>(pattern_or_error.release_error().error);
}
// Allocates a RegExpObject in `realm` that uses the realm's RegExp prototype intrinsic.
GC::Ref<RegExpObject> RegExpObject::create(Realm& realm)
{
    auto prototype = realm.intrinsics().regexp_prototype();
    return realm.create<RegExpObject>(prototype);
}
// Allocates a RegExpObject in `realm` from the given pattern and flags, using the realm's
// RegExp prototype intrinsic.
GC::Ref<RegExpObject> RegExpObject::create(Realm& realm, Utf16String pattern, Utf16String flags)
{
    auto prototype = realm.intrinsics().regexp_prototype();
    return realm.create<RegExpObject>(move(pattern), move(flags), prototype);
}
// Constructs a RegExpObject with the given prototype; pattern and flags are not set here.
RegExpObject::RegExpObject(Object& prototype)
: Object(ConstructWithPrototypeTag::Tag, prototype)
{
}
// Translates a flags string into its bit set. Unlike validate_flags(), unknown characters are
// ignored and repeated flags are not diagnosed.
static RegExpObject::Flags to_flag_bits(Utf16View const& flags)
{
    auto collected = static_cast<RegExpObject::Flags>(0);
    auto const flag_count = flags.length_in_code_units();

    for (size_t position = 0; position < flag_count; ++position) {
        auto flag = flags.code_unit_at(position);
        switch (flag) {
#define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char) \
    case #flag_char[0]:                                          \
        collected |= RegExpObject::Flags::FlagName;              \
        break;
            JS_ENUMERATE_REGEXP_FLAGS
#undef __JS_ENUMERATE
        default:
            break;
        }
    }

    return collected;
}
// Constructs a RegExpObject that takes ownership of `pattern` and `flags` and derives the flag
// bit set from the stored flags.
// NOTE(review): m_flag_bits is computed from m_flags (not the moved-from parameter), so this
// relies on m_flags being declared before m_flag_bits in the class - confirm in the header.
RegExpObject::RegExpObject(Utf16String pattern, Utf16String flags, Object& prototype)
: Object(ConstructWithPrototypeTag::Tag, prototype)
, m_pattern(move(pattern))
, m_flags(move(flags))
, m_flag_bits(to_flag_bits(m_flags))
{
}
// Runs base class initialization and installs a writable "lastIndex" property with value 0.
void RegExpObject::initialize(Realm& realm)
{
    Base::initialize(realm);
    define_direct_property(vm().names.lastIndex, Value(0), Attribute::Writable);
}
// 22.2.3.3 RegExpInitialize ( obj, pattern, flags ), https://tc39.es/ecma262/#sec-regexpinitialize
//
// (Re)initializes this object from `pattern_value` and `flags_value`.
// Throws a SyntaxError if the flags are invalid, if pattern normalization fails, or if the
// normalized pattern does not compile. m_pattern, m_flags and m_flag_bits are only updated after
// all of those checks succeed; the cached compiled regex, however, is dropped up front.
// Returns a reference to this object.
ThrowCompletionOr<GC::Ref<RegExpObject>> RegExpObject::regexp_initialize(VM& vm, Value pattern_value, Value flags_value)
{
// Invalidate the cached compiled regex since the pattern/flags may change.
m_cached_regex = nullptr;
// 1. If pattern is undefined, let P be the empty String.
// 2. Else, let P be ? ToString(pattern).
auto pattern = pattern_value.is_undefined()
? Utf16String {}
: TRY(pattern_value.to_utf16_string(vm));
// 3. If flags is undefined, let F be the empty String.
// 4. Else, let F be ? ToString(flags).
auto flags = flags_value.is_undefined()
? Utf16String {}
: TRY(flags_value.to_utf16_string(vm));
// 5. If F contains any code unit other than "d", "g", "i", "m", "s", "u", "v", or "y", or if F contains any code unit more than once, throw a SyntaxError exception.
// 6. If F contains "i", let i be true; else let i be false.
// 7. If F contains "m", let m be true; else let m be false.
// 8. If F contains "s", let s be true; else let s be false.
// 9. If F contains "u", let u be true; else let u be false.
// 10. If F contains "v", let v be true; else let v be false.
auto validated_flags_or_error = validate_flags(flags);
if (validated_flags_or_error.is_error())
return vm.throw_completion<SyntaxError>(validated_flags_or_error.release_error());
auto flag_bits = validated_flags_or_error.release_value();
bool unicode = has_flag(flag_bits, Flags::Unicode);
bool unicode_sets = has_flag(flag_bits, Flags::UnicodeSets);
auto parsed_pattern = String {};
// Convert UTF-16 pattern to UTF-8 (with escape normalization for non-ASCII).
// An empty pattern skips normalization and is compiled as the empty string.
if (!pattern.is_empty()) {
auto result = parse_regex_pattern(pattern, unicode, unicode_sets);
if (result.is_error())
return vm.throw_completion<SyntaxError>(ErrorType::RegExpCompileError, result.release_error().error);
parsed_pattern = result.release_value();
}
// 11. If u is true and v is true, throw a SyntaxError exception.
// NB: Already handled by validate_flags above.
// Validate by trial-compiling the pattern.
regex::ECMAScriptCompileFlags compile_flags {};
compile_flags.global = has_flag(flag_bits, Flags::Global);
compile_flags.ignore_case = has_flag(flag_bits, Flags::IgnoreCase);
compile_flags.multiline = has_flag(flag_bits, Flags::Multiline);
compile_flags.dot_all = has_flag(flag_bits, Flags::DotAll);
compile_flags.unicode = unicode;
compile_flags.unicode_sets = unicode_sets;
compile_flags.sticky = has_flag(flag_bits, Flags::Sticky);
// The compiled result is only inspected for errors here; it is not stored.
auto compiled = regex::ECMAScriptRegex::compile(parsed_pattern.bytes_as_string_view(), compile_flags);
if (compiled.is_error())
return vm.throw_completion<SyntaxError>(ErrorType::RegExpCompileError, compiled.release_error());
// 16. Set obj.[[OriginalSource]] to P.
m_pattern = move(pattern);
// 17. Set obj.[[OriginalFlags]] to F.
// The bits are derived before `flags` is moved from on the next line.
m_flag_bits = to_flag_bits(flags);
m_flags = move(flags);
// 18. Let capturingGroupsCount be CountLeftCapturingParensWithin(parseResult).
// 19. Let rer be the RegExp Record { [[IgnoreCase]]: i, [[Multiline]]: m, [[DotAll]]: s, [[Unicode]]: u, [[CapturingGroupsCount]]: capturingGroupsCount }.
// 20. Set obj.[[RegExpRecord]] to rer.
// 21. Set obj.[[RegExpMatcher]] to CompilePattern of parseResult with argument rer.
// 22. Perform ? Set(obj, "lastIndex", +0𝔽, true).
TRY(set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes));
// 23. Return obj.
return GC::Ref { *this };
}
// 22.2.6.13.1 EscapeRegExpPattern ( P, F ), https://tc39.es/ecma262/#sec-escaperegexppattern
String RegExpObject::escape_regexp_pattern() const
{
    // 1. Let S be a String in the form of a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") equivalent
    //    to P interpreted as UTF-16 encoded Unicode code points (6.1.4), in which certain code points are escaped as
    //    described below. S may or may not be identical to P; however, the Abstract Closure that would result from
    //    evaluating S as a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") must behave identically to
    //    the Abstract Closure given by the constructed object's [[RegExpMatcher]] internal slot. Multiple calls to
    //    this abstract operation using the same values for P and F must produce identical results.
    // 2. The code points / or any LineTerminator occurring in the pattern shall be escaped in S as necessary to ensure
    //    that the string-concatenation of "/", S, "/", and F can be parsed (in an appropriate lexical context) as a
    //    RegularExpressionLiteral that behaves identically to the constructed regular expression. For example, if P is
    //    "/", then S could be "\/" or "\u002F", among other possibilities, but not "/", because /// followed by F
    //    would be parsed as a SingleLineComment rather than a RegularExpressionLiteral. If P is the empty String, this
    //    specification can be met by letting S be "(?:)".
    // 3. Return S.
    if (m_pattern.is_empty())
        return "(?:)"_string;

    // Text that follows the backslash when a line terminator is written as an escape;
    // empty for every other code point.
    auto line_terminator_escape = [](u32 code_point) -> StringView {
        switch (code_point) {
        case '\n':
            return "n"sv;
        case '\r':
            return "r"sv;
        case LINE_SEPARATOR:
            return "u2028"sv;
        case PARAGRAPH_SEPARATOR:
            return "u2029"sv;
        default:
            return StringView {};
        }
    };

    // FIXME: Check the 'u' and 'v' flags and escape accordingly
    StringBuilder builder;
    bool after_backslash = false;
    bool inside_class = false;

    for (auto code_point : m_pattern) {
        auto const terminator_escape = line_terminator_escape(code_point);

        if (after_backslash) {
            // The pattern already escaped this code point: keep the backslash and only swap a raw
            // line terminator for its spelled-out form.
            after_backslash = false;
            builder.append('\\');
            if (terminator_escape.is_empty())
                builder.append_code_point(code_point);
            else
                builder.append(terminator_escape);
            continue;
        }

        if (code_point == '\\') {
            after_backslash = true;
            continue;
        }

        if (code_point == '[')
            inside_class = true;
        else if (code_point == ']')
            inside_class = false;

        if (code_point == '/') {
            // A slash inside a character class is left unescaped.
            if (inside_class)
                builder.append('/');
            else
                builder.append("\\/"sv);
        } else if (!terminator_escape.is_empty()) {
            builder.append('\\');
            builder.append(terminator_escape);
        } else {
            builder.append_code_point(code_point);
        }
    }

    return builder.to_string_without_validation();
}
// Reports this object's outgoing references to `visitor`: the base class edges plus m_realm.
void RegExpObject::visit_edges(JS::Cell::Visitor& visitor)
{
Base::visit_edges(visitor);
visitor.visit(m_realm);
}
// 22.2.3.1 RegExpCreate ( P, F ), https://tc39.es/ecma262/#sec-regexpcreate
ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_create(VM& vm, Value pattern, Value flags)
{
    auto& current_realm = *vm.current_realm();

    // 1. Let obj be ! RegExpAlloc(%RegExp%).
    auto allocated = MUST(regexp_alloc(vm, current_realm.intrinsics().regexp_constructor()));

    // 2. Return ? RegExpInitialize(obj, P, F).
    return allocated->regexp_initialize(vm, pattern, flags);
}
// 22.2.3.2 RegExpAlloc ( newTarget ), https://tc39.es/ecma262/#sec-regexpalloc
// 22.2.3.2 RegExpAlloc ( newTarget ), https://github.com/tc39/proposal-regexp-legacy-features#regexpalloc--newtarget-
ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_alloc(VM& vm, FunctionObject& new_target)
{
    // 1. Let obj be ? OrdinaryCreateFromConstructor(newTarget, "%RegExp.prototype%", « [[OriginalSource]], [[OriginalFlags]], [[RegExpRecord]], [[RegExpMatcher]] »).
    auto regexp_object = TRY(ordinary_create_from_constructor<RegExpObject>(vm, new_target, &Intrinsics::regexp_prototype));

    // 2. Let thisRealm be the current Realm Record.
    auto& this_realm = *vm.current_realm();

    // 3. Set the value of obj's [[Realm]] internal slot to thisRealm.
    regexp_object->set_realm(this_realm);

    // 4. If SameValue(newTarget, thisRealm.[[Intrinsics]].[[%RegExp%]]) is true, then
    //     i. Set the value of obj's [[LegacyFeaturesEnabled]] internal slot to true.
    // 5. Else,
    //     i. Set the value of obj's [[LegacyFeaturesEnabled]] internal slot to false.
    bool const new_target_is_intrinsic_regexp = same_value(&new_target, this_realm.intrinsics().regexp_constructor());
    regexp_object->set_legacy_features_enabled(new_target_is_intrinsic_regexp);

    // 6. Perform ! DefinePropertyOrThrow(obj, "lastIndex", PropertyDescriptor { [[Writable]]: true, [[Enumerable]]: false, [[Configurable]]: false }).
    PropertyDescriptor last_index_descriptor { .writable = true, .enumerable = false, .configurable = false };
    MUST(regexp_object->define_property_or_throw(vm.names.lastIndex, last_index_descriptor));

    // 7. Return obj.
    return regexp_object;
}
}