/*
 * Copyright (c) 2020, Matthew Olsson
 * Copyright (c) 2024, Andreas Kling
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

// NOTE(review): the include directives below carry no header names, and template argument lists
// (e.g. on ErrorOr, Result, static_cast, GC::Ref, throw_completion) appear to be absent throughout
// this text. Presumably they were lost before review; confirm against version control.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace JS {

GC_DEFINE_ALLOCATOR(RegExpObject);

namespace {

// What a single parsed element of a group name turned out to be: either a complete code point,
// or one half of a surrogate pair that still needs its partner.
enum class RegExpNameElementKind {
    CodePoint,
    HighSurrogate,
    LowSurrogate,
};

// How a group name element was spelled in the pattern: as raw code unit(s), as a four-digit
// `\uXXXX` escape, or as a braced `\u{...}` escape.
enum class RegExpNameElementOrigin {
    Literal,
    FixedEscape,
    BracedEscape,
};

// Result of parsing one group name element.
struct RegExpNameElement {
    RegExpNameElementKind kind;
    RegExpNameElementOrigin origin;
    // Index of the first code unit after this element.
    size_t next_index { 0 };
};

// Builds the error shared by every group name validation failure in this file.
static ParseRegexPatternError invalid_group_name_error()
{
    return ParseRegexPatternError { "invalid group name"_string };
}

// Parses the single group name element that starts at `index` in `pattern`.
//
// Accepted forms are a literal code unit (a literal high surrogate immediately followed by a literal
// low surrogate is consumed as one code point), `\u{H...}` and `\uHHHH`.
//
// Returns the element's kind, origin and the index just past it, or an "invalid group name" error
// when `index` is out of range, a backslash is not followed by 'u', or the escape is malformed.
static ErrorOr parse_regexp_name_element(Utf16View const& pattern, size_t index)
{
    auto const length = pattern.length_in_code_units();
    if (index >= length)
        return invalid_group_name_error();

    auto code_unit = pattern.code_unit_at(index);
    if (code_unit != '\\') {
        // Literal code unit(s).
        if (AK::UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
            if (index + 1 < length) {
                auto next_code_unit = pattern.code_unit_at(index + 1);
                // A well-formed literal pair counts as a single code point spanning two code units.
                if (AK::UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
                    return RegExpNameElement { RegExpNameElementKind::CodePoint, RegExpNameElementOrigin::Literal, index + 2 };
            }
            return RegExpNameElement { RegExpNameElementKind::HighSurrogate, RegExpNameElementOrigin::Literal, index + 1 };
        }
        if (AK::UnicodeUtils::is_utf16_low_surrogate(code_unit))
            return RegExpNameElement { RegExpNameElementKind::LowSurrogate, RegExpNameElementOrigin::Literal, index + 1 };
        return RegExpNameElement { RegExpNameElementKind::CodePoint, RegExpNameElementOrigin::Literal, index + 1 };
    }

    // The only escape accepted here is a `\u` escape.
    if (index + 1 >= length || pattern.code_unit_at(index + 1) != 'u')
        return invalid_group_name_error();

    auto escape_index = index + 2;
    if (escape_index < length && pattern.code_unit_at(escape_index) == '{') {
        // Braced form: `\u{` one or more hex digits `}`.
        ++escape_index;
        u32 value = 0;
        size_t digits = 0;
        while (escape_index < length && pattern.code_unit_at(escape_index) != '}') {
            auto digit = pattern.code_unit_at(escape_index);
            if (!is_ascii_hex_digit(digit))
                return invalid_group_name_error();
            value = value * 16 + parse_ascii_hex_digit(digit);
            // Checked after every digit, so `value` is rejected as soon as it exceeds 0x10FFFF.
            if (value > 0x10FFFF)
                return invalid_group_name_error();
            ++digits;
            ++escape_index;
        }
        // Reject `\u{}` and an escape that is never closed.
        if (digits == 0 || escape_index >= length || pattern.code_unit_at(escape_index) != '}')
            return invalid_group_name_error();
        // Step over the closing '}'.
        ++escape_index;

        if (AK::UnicodeUtils::is_utf16_high_surrogate(value))
            return RegExpNameElement { RegExpNameElementKind::HighSurrogate, RegExpNameElementOrigin::BracedEscape, escape_index };
        if (AK::UnicodeUtils::is_utf16_low_surrogate(value))
            return RegExpNameElement { RegExpNameElementKind::LowSurrogate, RegExpNameElementOrigin::BracedEscape, escape_index };
        return RegExpNameElement { RegExpNameElementKind::CodePoint, RegExpNameElementOrigin::BracedEscape, escape_index };
    }

    // Fixed form: `\u` followed by exactly four hex digits.
    if (escape_index + 4 > length)
        return invalid_group_name_error();

    u32 value = 0;
    for (size_t offset = 0; offset < 4; ++offset) {
        auto digit = pattern.code_unit_at(escape_index + offset);
        if (!is_ascii_hex_digit(digit))
            return invalid_group_name_error();
        value = value * 16 + parse_ascii_hex_digit(digit);
    }

    auto next_index = escape_index + 4;
    if (AK::UnicodeUtils::is_utf16_high_surrogate(value))
        return RegExpNameElement { RegExpNameElementKind::HighSurrogate, RegExpNameElementOrigin::FixedEscape, next_index };
    if (AK::UnicodeUtils::is_utf16_low_surrogate(value))
        return RegExpNameElement { RegExpNameElementKind::LowSurrogate, RegExpNameElementOrigin::FixedEscape, next_index };
    return RegExpNameElement { RegExpNameElementKind::CodePoint, RegExpNameElementOrigin::FixedEscape, next_index };
}

// Checks the surrogate usage of the group name that begins at `name_start` (the code unit right
// after the opening '<') and runs up to the closing '>'.
//
// Returns the index just past the '>' on success. Returns an "invalid group name" error when:
// - an element fails to parse,
// - a low surrogate appears without a preceding high surrogate,
// - a high surrogate is not followed by a low surrogate,
// - the two halves of a pair were written in different forms,
// - the pair was written with braced escapes, or
// - the pattern ends before a '>' is found.
static ErrorOr validate_regexp_name_surrogates(Utf16View const& pattern, size_t name_start)
{
    auto const length = pattern.length_in_code_units();
    auto index = name_start;
    while (index < length) {
        if (pattern.code_unit_at(index) == '>')
            return index + 1;

        auto element = TRY(parse_regexp_name_element(pattern, index));
        if (element.kind == RegExpNameElementKind::CodePoint) {
            index = element.next_index;
            continue;
        }

        if (element.kind == RegExpNameElementKind::LowSurrogate)
            return invalid_group_name_error();

        // `element` is a high surrogate from here on; the element right after it must complete the pair.
        auto next_element = TRY(parse_regexp_name_element(pattern, element.next_index));
        if (next_element.kind != RegExpNameElementKind::LowSurrogate)
            return invalid_group_name_error();
        if (element.origin != next_element.origin)
            return invalid_group_name_error();
        if (element.origin == RegExpNameElementOrigin::BracedEscape)
            return invalid_group_name_error();

        index = next_element.next_index;
    }

    return invalid_group_name_error();
}

// Returns true if `pattern` contains, outside of any character class and not preceded by an
// escaping backslash, a `(?<` that is not the start of a `(?<=` or `(?<!` lookbehind.
// A `(?<` at the very end of the pattern also counts.
static bool pattern_has_named_capture_groups(Utf16View const& pattern)
{
    auto const length = pattern.length_in_code_units();
    bool in_character_class = false;
    for (size_t index = 0; index < length; ++index) {
        auto code_unit = pattern.code_unit_at(index);
        if (code_unit == '\\') {
            // Skip the escaped code unit so it is never interpreted as syntax.
            if (index + 1 < length)
                ++index;
            continue;
        }
        if (code_unit == '[' && !in_character_class) {
            in_character_class = true;
            continue;
        }
        if (code_unit == ']' && in_character_class) {
            in_character_class = false;
            continue;
        }
        if (in_character_class)
            continue;
        if (code_unit == '(' && index + 2 < length && pattern.code_unit_at(index + 1) == '?' && pattern.code_unit_at(index + 2) == '<') {
            if (index + 3 >= length || (pattern.code_unit_at(index + 3) != '=' && pattern.code_unit_at(index + 3) != '!'))
                return true;
        }
    }
    return false;
}

// Walks `pattern` and validates the name of every named group `(?<name>` and, when applicable,
// every named backreference `\k<name>` with validate_regexp_name_surrogates().
//
// `\k<` is only treated as a named backreference when `unicode_aware` is true or the pattern
// contains a named capture group, and only outside character classes.
//
// Returns an empty success value, or the first "invalid group name" error encountered.
static ErrorOr validate_named_group_name_surrogates(Utf16View const& pattern, bool unicode_aware)
{
    auto const length = pattern.length_in_code_units();
    bool in_character_class = false;
    bool has_named_groups_or_unicode = unicode_aware || pattern_has_named_capture_groups(pattern);
    for (size_t index = 0; index < length; ++index) {
        auto code_unit = pattern.code_unit_at(index);
        if (code_unit == '\\') {
            if (has_named_groups_or_unicode && !in_character_class && index + 2 < length && pattern.code_unit_at(index + 1) == 'k' && pattern.code_unit_at(index + 2) == '<') {
                // The validator returns the index just past '>'; subtract one because the loop
                // increments `index` again before the next iteration.
                index = TRY(validate_regexp_name_surrogates(pattern, index + 3)) - 1;
                continue;
            }
            // Any other escape: skip the escaped code unit.
            if (index + 1 < length)
                ++index;
            continue;
        }
        if (code_unit == '[' && !in_character_class) {
            in_character_class = true;
            continue;
        }
        if (code_unit == ']' && in_character_class) {
            in_character_class = false;
            continue;
        }
        if (in_character_class)
            continue;
        if (code_unit == '(' && index + 2 < length && pattern.code_unit_at(index + 1) == '?' && pattern.code_unit_at(index + 2) == '<') {
            // `(?<=` and `(?<!` are lookbehinds, not named groups.
            if (index + 3 < length && pattern.code_unit_at(index + 3) != '=' && pattern.code_unit_at(index + 3) != '!') {
                // Same "- 1" compensation for the loop increment as above.
                index = TRY(validate_regexp_name_surrogates(pattern, index + 3)) - 1;
            }
        }
    }
    return {};
}

}

// Validates a RegExp flags string and converts it to flag bits.
//
// Returns the combined RegExpObject::Flags on success. Returns a formatted error message when a
// flag occurs more than once, when a code unit is not one of the flags enumerated by
// JS_ENUMERATE_REGEXP_FLAGS, or when both the Unicode and UnicodeSets flags are present.
static Result validate_flags(Utf16View const& flags)
{
    // Only ever indexed from inside a flag `case`, i.e. with one of the enumerated flag characters.
    bool seen[128] {};
    RegExpObject::Flags flag_bits = static_cast(0);

    for (size_t index = 0; index < flags.length_in_code_units(); ++index) {
        auto ch = flags.code_unit_at(index);
        switch (ch) {
#define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char)                              \
    case #flag_char[0]:                                                                       \
        if (seen[ch])                                                                         \
            return MUST(String::formatted(ErrorType::RegExpObjectRepeatedFlag.format(), ch)); \
        seen[ch] = true;                                                                      \
        flag_bits |= RegExpObject::Flags::FlagName;                                           \
        break;
            JS_ENUMERATE_REGEXP_FLAGS
#undef __JS_ENUMERATE
        default:
            return MUST(String::formatted(ErrorType::RegExpObjectBadFlag.format(), ch));
        }
    }

    if (has_flag(flag_bits, RegExpObject::Flags::Unicode) && has_flag(flag_bits, RegExpObject::Flags::UnicodeSets))
        return MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.format(), 'u', 'v'));

    return flag_bits;
}

// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
// Validates group names in `pattern` and converts the pattern to a String in which every code unit
// above 0x7f is written as a `\u` escape. Code units up to 0x7f are appended unchanged.
// Returns an error if both `unicode` and `unicode_sets` are set, or if a group name is invalid.
ErrorOr parse_regex_pattern(Utf16View const& pattern, bool unicode, bool unicode_sets)
{
    if (unicode && unicode_sets)
        return ParseRegexPatternError { MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.format(), 'u', 'v')) };

    TRY(validate_named_group_name_surrogates(pattern, unicode || unicode_sets));

    StringBuilder builder;
    // True when the previous code unit was a backslash that is not itself escaped by another backslash.
    auto previous_code_unit_was_backslash = false;
    for (size_t i = 0; i < pattern.length_in_code_units(); ++i) {
        u16 code_unit = pattern.code_unit_at(i);

        if (code_unit > 0x7f) {
            // Incorrectly escaping this code unit will result in a wildly different regex than intended
            // as we're converting it to <\uhhhh>, which would turn into <\\uhhhh> if (incorrectly) escaped again,
            // leading to a matcher for the literal string "\uhhhh" instead of the intended code unit.
            // As such, we're going to remove the (invalid) backslash and pretend it never existed.
            if (!previous_code_unit_was_backslash)
                builder.append('\\');

            // In unicode modes a well-formed surrogate pair is emitted as one braced escape of the
            // combined code point, and the low surrogate is consumed here as well.
            if ((unicode || unicode_sets) && AK::UnicodeUtils::is_utf16_high_surrogate(code_unit) && i + 1 < pattern.length_in_code_units()) {
                u16 next_code_unit = pattern.code_unit_at(i + 1);
                if (AK::UnicodeUtils::is_utf16_low_surrogate(next_code_unit)) {
                    u32 combined = AK::UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
                    builder.appendff("u{{{:x}}}", combined);
                    ++i;
                    previous_code_unit_was_backslash = false;
                    continue;
                }
            }

            if (unicode || unicode_sets)
                builder.appendff("u{{{:04x}}}", code_unit);
            else
                builder.appendff("u{:04x}", code_unit);
        } else {
            builder.append_code_point(code_unit);
        }

        // Toggle rather than set, so that the second backslash of `\\` does not count as an escape.
        if (code_unit == '\\')
            previous_code_unit_was_backslash = !previous_code_unit_was_backslash;
        else
            previous_code_unit_was_backslash = false;
    }

    return builder.to_string_without_validation();
}

// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
// Throwing wrapper around the overload above: a parse error is turned into a thrown completion
// carrying the error's message, otherwise the converted pattern is returned.
ThrowCompletionOr parse_regex_pattern(VM& vm, Utf16View const& pattern, bool unicode, bool unicode_sets)
{
    auto result = parse_regex_pattern(pattern, unicode, unicode_sets);
    if (result.is_error())
        return vm.throw_completion(result.release_error().error);

    return result.release_value();
}

// Creates a RegExpObject in `realm` with the realm's RegExp prototype and no pattern or flags.
GC::Ref RegExpObject::create(Realm& realm)
{
    return realm.create(realm.intrinsics().regexp_prototype());
}

// Creates a RegExpObject in `realm` with the realm's RegExp prototype and the given pattern and flags.
// The arguments are forwarded to the constructor without validation here.
GC::Ref RegExpObject::create(Realm& realm, Utf16String pattern, Utf16String flags)
{
    return realm.create(move(pattern), move(flags), realm.intrinsics().regexp_prototype());
}

// Constructs a RegExpObject with the given prototype and default-initialized pattern and flags.
RegExpObject::RegExpObject(Object& prototype)
    : Object(ConstructWithPrototypeTag::Tag, prototype)
{
}

// Converts a flags string to flag bits without validating it: code units that are not one of the
// flags enumerated by JS_ENUMERATE_REGEXP_FLAGS are ignored, and repeated flags are harmless.
static RegExpObject::Flags to_flag_bits(Utf16View const& flags)
{
    RegExpObject::Flags flag_bits = static_cast(0);
    for (size_t i = 0; i < flags.length_in_code_units(); ++i) {
        auto ch = flags.code_unit_at(i);
        switch (ch) {
#define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char) \
    case #flag_char[0]:                                          \
        flag_bits |= RegExpObject::Flags::FlagName;              \
        break;
            JS_ENUMERATE_REGEXP_FLAGS
#undef __JS_ENUMERATE
        default:
            break;
        }
    }
    return flag_bits;
}

// Constructs a RegExpObject that stores `pattern` and `flags` and derives its flag bits from the
// stored flags. m_flag_bits reads m_flags (not the moved-from parameter).
// NOTE(review): this relies on m_flags being declared before m_flag_bits — confirm in the header.
RegExpObject::RegExpObject(Utf16String pattern, Utf16String flags, Object& prototype)
    : Object(ConstructWithPrototypeTag::Tag, prototype)
    , m_pattern(move(pattern))
    , m_flags(move(flags))
    , m_flag_bits(to_flag_bits(m_flags))
{
}

// Runs the base class initialization and defines the writable "lastIndex" property with value 0.
void RegExpObject::initialize(Realm& realm)
{
    auto& vm = this->vm();
    Base::initialize(realm);

    define_direct_property(vm.names.lastIndex, Value(0), Attribute::Writable);
}

// 22.2.3.3 RegExpInitialize ( obj, pattern, flags ), https://tc39.es/ecma262/#sec-regexpinitialize
// Validates the flags, converts and trial-compiles the pattern, and only then stores the new
// pattern, flags and flag bits on this object and resets "lastIndex" to 0.
// Throws if a value cannot be converted to a string, the flags are invalid, the pattern fails to
// parse or compile, or setting "lastIndex" fails. Returns this object on success.
ThrowCompletionOr> RegExpObject::regexp_initialize(VM& vm, Value pattern_value, Value flags_value)
{
    // Invalidate the cached compiled regex since the pattern/flags may change.
    m_cached_regex = nullptr;

    // 1. If pattern is undefined, let P be the empty String.
    // 2. Else, let P be ? ToString(pattern).
    auto pattern = pattern_value.is_undefined()
        ? Utf16String {}
        : TRY(pattern_value.to_utf16_string(vm));

    // 3. If flags is undefined, let F be the empty String.
    // 4. Else, let F be ? ToString(flags).
    auto flags = flags_value.is_undefined()
        ? Utf16String {}
        : TRY(flags_value.to_utf16_string(vm));

    // 5. If F contains any code unit other than "d", "g", "i", "m", "s", "u", "v", or "y", or if F contains any code unit more than once, throw a SyntaxError exception.
    // 6. If F contains "i", let i be true; else let i be false.
    // 7. If F contains "m", let m be true; else let m be false.
    // 8. If F contains "s", let s be true; else let s be false.
    // 9. If F contains "u", let u be true; else let u be false.
    // 10. If F contains "v", let v be true; else let v be false.
    auto validated_flags_or_error = validate_flags(flags);
    if (validated_flags_or_error.is_error())
        return vm.throw_completion(validated_flags_or_error.release_error());
    auto flag_bits = validated_flags_or_error.release_value();

    bool unicode = has_flag(flag_bits, Flags::Unicode);
    bool unicode_sets = has_flag(flag_bits, Flags::UnicodeSets);

    auto parsed_pattern = String {};

    // Convert UTF-16 pattern to UTF-8 (with escape normalization for non-ASCII).
    // An empty pattern skips this step and is compiled as the empty string below.
    if (!pattern.is_empty()) {
        auto result = parse_regex_pattern(pattern, unicode, unicode_sets);
        if (result.is_error())
            return vm.throw_completion(ErrorType::RegExpCompileError, result.release_error().error);
        parsed_pattern = result.release_value();
    }

    // 11. If u is true and v is true, throw a SyntaxError exception.
    // NB: Already handled by validate_flags above.

    // Validate by trial-compiling the pattern.
    // The compiled result is only checked for errors here; it is not stored by this function.
    regex::ECMAScriptCompileFlags compile_flags {};
    compile_flags.global = has_flag(flag_bits, Flags::Global);
    compile_flags.ignore_case = has_flag(flag_bits, Flags::IgnoreCase);
    compile_flags.multiline = has_flag(flag_bits, Flags::Multiline);
    compile_flags.dot_all = has_flag(flag_bits, Flags::DotAll);
    compile_flags.unicode = unicode;
    compile_flags.unicode_sets = unicode_sets;
    compile_flags.sticky = has_flag(flag_bits, Flags::Sticky);
    auto compiled = regex::ECMAScriptRegex::compile(parsed_pattern.bytes_as_string_view(), compile_flags);
    if (compiled.is_error())
        return vm.throw_completion(ErrorType::RegExpCompileError, compiled.release_error());

    // 16. Set obj.[[OriginalSource]] to P.
    m_pattern = move(pattern);

    // 17. Set obj.[[OriginalFlags]] to F.
    // The flag bits are derived first, because `flags` is moved from on the following line.
    m_flag_bits = to_flag_bits(flags);
    m_flags = move(flags);

    // 18. Let capturingGroupsCount be CountLeftCapturingParensWithin(parseResult).
    // 19. Let rer be the RegExp Record { [[IgnoreCase]]: i, [[Multiline]]: m, [[DotAll]]: s, [[Unicode]]: u, [[CapturingGroupsCount]]: capturingGroupsCount }.
    // 20. Set obj.[[RegExpRecord]] to rer.
    // 21. Set obj.[[RegExpMatcher]] to CompilePattern of parseResult with argument rer.

    // 22. Perform ? Set(obj, "lastIndex", +0𝔽, true).
    TRY(set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes));

    // 23. Return obj.
    return GC::Ref { *this };
}

// 22.2.6.13.1 EscapeRegExpPattern ( P, F ), https://tc39.es/ecma262/#sec-escaperegexppattern
// Returns the stored pattern with '/' (outside character classes) and line terminators escaped,
// or "(?:)" when the stored pattern is empty.
String RegExpObject::escape_regexp_pattern() const
{
    // 1. Let S be a String in the form of a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") equivalent
    //    to P interpreted as UTF-16 encoded Unicode code points (6.1.4), in which certain code points are escaped as
    //    described below. S may or may not be identical to P; however, the Abstract Closure that would result from
    //    evaluating S as a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") must behave identically to
    //    the Abstract Closure given by the constructed object's [[RegExpMatcher]] internal slot. Multiple calls to
    //    this abstract operation using the same values for P and F must produce identical results.
    // 2. The code points / or any LineTerminator occurring in the pattern shall be escaped in S as necessary to ensure
    //    that the string-concatenation of "/", S, "/", and F can be parsed (in an appropriate lexical context) as a
    //    RegularExpressionLiteral that behaves identically to the constructed regular expression. For example, if P is
    //    "/", then S could be "\/" or "\u002F", among other possibilities, but not "/", because /// followed by F
    //    would be parsed as a SingleLineComment rather than a RegularExpressionLiteral. If P is the empty String, this
    //    specification can be met by letting S be "(?:)".
    // 3. Return S.
    if (m_pattern.is_empty())
        return "(?:)"_string;

    // FIXME: Check the 'u' and 'v' flags and escape accordingly
    StringBuilder builder;
    // Set when the previous code point was an unescaped backslash. The backslash itself is emitted
    // together with the code point that follows it, so a trailing lone backslash is never emitted.
    auto escaped = false;
    auto in_character_class = false;
    for (auto code_point : m_pattern) {
        if (escaped) {
            escaped = false;
            builder.append_code_point('\\');
            // An escaped line terminator is rewritten as its textual escape; the backslash was appended above.
            switch (code_point) {
            case '\n':
                builder.append_code_point('n');
                break;
            case '\r':
                builder.append_code_point('r');
                break;
            case LINE_SEPARATOR:
                builder.append("u2028"sv);
                break;
            case PARAGRAPH_SEPARATOR:
                builder.append("u2029"sv);
                break;
            default:
                builder.append_code_point(code_point);
                break;
            }
            continue;
        }

        if (code_point == '\\') {
            escaped = true;
            continue;
        }

        if (code_point == '[') {
            in_character_class = true;
        } else if (code_point == ']') {
            in_character_class = false;
        }

        switch (code_point) {
        case '/':
            // Inside a character class '/' is appended as-is; elsewhere it is escaped.
            if (in_character_class)
                builder.append_code_point('/');
            else
                builder.append("\\/"sv);
            break;
        case '\n':
            builder.append("\\n"sv);
            break;
        case '\r':
            builder.append("\\r"sv);
            break;
        case LINE_SEPARATOR:
            builder.append("\\u2028"sv);
            break;
        case PARAGRAPH_SEPARATOR:
            builder.append("\\u2029"sv);
            break;
        default:
            builder.append_code_point(code_point);
            break;
        }
    }

    return builder.to_string_without_validation();
}

// Visits the base class edges and then m_realm.
void RegExpObject::visit_edges(JS::Cell::Visitor& visitor)
{
    Base::visit_edges(visitor);
    visitor.visit(m_realm);
}

// 22.2.3.1 RegExpCreate ( P, F ), https://tc39.es/ecma262/#sec-regexpcreate
// Allocates a RegExp object using the current realm's RegExp constructor and initializes it with
// `pattern` and `flags`, propagating any exception thrown by the initialization.
ThrowCompletionOr> regexp_create(VM& vm, Value pattern, Value flags)
{
    auto& realm = *vm.current_realm();

    // 1. Let obj be ! RegExpAlloc(%RegExp%).
    auto regexp_object = MUST(regexp_alloc(vm, realm.intrinsics().regexp_constructor()));

    // 2. Return ? RegExpInitialize(obj, P, F).
    return TRY(regexp_object->regexp_initialize(vm, pattern, flags));
}

// 22.2.3.2 RegExpAlloc ( newTarget ), https://tc39.es/ecma262/#sec-regexpalloc
// 22.2.3.2 RegExpAlloc ( newTarget ), https://github.com/tc39/proposal-regexp-legacy-features#regexpalloc--newtarget-
// Creates a RegExp object from `new_target`, records the current realm on it, enables legacy
// features only when `new_target` is the current realm's RegExp constructor, and defines the
// writable, non-enumerable, non-configurable "lastIndex" property.
ThrowCompletionOr> regexp_alloc(VM& vm, FunctionObject& new_target)
{
    // 1. Let obj be ? OrdinaryCreateFromConstructor(newTarget, "%RegExp.prototype%", « [[OriginalSource]], [[OriginalFlags]], [[RegExpRecord]], [[RegExpMatcher]] »).
    auto regexp_object = TRY(ordinary_create_from_constructor(vm, new_target, &Intrinsics::regexp_prototype));

    // 2. Let thisRealm be the current Realm Record.
    auto& this_realm = *vm.current_realm();

    // 3. Set the value of obj’s [[Realm]] internal slot to thisRealm.
    regexp_object->set_realm(this_realm);

    // 4. If SameValue(newTarget, thisRealm.[[Intrinsics]].[[%RegExp%]]) is true, then
    if (same_value(&new_target, this_realm.intrinsics().regexp_constructor())) {
        // i. Set the value of obj’s [[LegacyFeaturesEnabled]] internal slot to true.
        regexp_object->set_legacy_features_enabled(true);
    }
    // 5. Else,
    else {
        // i. Set the value of obj’s [[LegacyFeaturesEnabled]] internal slot to false.
        regexp_object->set_legacy_features_enabled(false);
    }

    // 6. Perform ! DefinePropertyOrThrow(obj, "lastIndex", PropertyDescriptor { [[Writable]]: true, [[Enumerable]]: false, [[Configurable]]: false }).
    PropertyDescriptor descriptor { .writable = true, .enumerable = false, .configurable = false };
    MUST(regexp_object->define_property_or_throw(vm.names.lastIndex, descriptor));

    // 7. Return obj.
    return regexp_object;
}

}