ladybird/Libraries/LibJS/Runtime/RegExpObject.cpp

/*
 * Copyright (c) 2020, Matthew Olsson <mattco@serenityos.org>
 * Copyright (c) 2024, Andreas Kling <andreas@ladybird.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/CharacterTypes.h>
#include <AK/Function.h>
#include <AK/UnicodeUtils.h>
#include <LibJS/Runtime/AbstractOperations.h>
#include <LibJS/Runtime/GlobalObject.h>
#include <LibJS/Runtime/PrimitiveString.h>
#include <LibJS/Runtime/RegExpConstructor.h>
#include <LibJS/Runtime/RegExpObject.h>
#include <LibJS/Runtime/StringPrototype.h>
#include <LibJS/Runtime/Value.h>
#include <LibJS/Token.h>

namespace JS {

GC_DEFINE_ALLOCATOR(RegExpObject);

namespace {

// What a single parsed element of a RegExp group name amounts to.
enum class RegExpNameElementKind {
    // A non-surrogate value, or a literal high+low surrogate pair taken together.
    CodePoint,
    // A high (leading) surrogate that did not combine with a literal low surrogate.
    HighSurrogate,
    // A low (trailing) surrogate on its own.
    LowSurrogate,
};

// The raw source form a group name element was written in.
enum class RegExpNameElementOrigin {
    // Unescaped code unit(s) taken directly from the pattern.
    Literal,
    // A four-hex-digit escape: \uXXXX
    FixedEscape,
    // A braced escape: \u{X...}
    BracedEscape,
};

// Result of parsing one group name element with parse_regexp_name_element().
struct RegExpNameElement {
    // Whether the element is a full code point or a lone surrogate half.
    RegExpNameElementKind kind;
    // Whether the element was written literally, as \uXXXX, or as \u{...}.
    RegExpNameElementOrigin origin;
    // Code unit index in the pattern immediately after this element.
    size_t next_index { 0 };
};

// Produces the single error value used for every group name rejection in this file.
static ParseRegexPatternError invalid_group_name_error()
{
    auto message = "invalid group name"_string;
    return ParseRegexPatternError { move(message) };
}

// Parses one element of a RegExp group name starting at code unit `index` of `pattern`.
//
// An element is one of:
//   - a literal code unit, or a literal high+low surrogate pair (reported as one CodePoint),
//   - a fixed-width escape \uXXXX (exactly four hex digits),
//   - a braced escape \u{X...} (one or more hex digits, value at most 0x10FFFF).
//
// On success, returns the element's kind, the raw form it was written in, and the index of the
// first code unit after it. Fails with the "invalid group name" error when `index` is out of
// range, when a backslash is not followed by 'u', or when the escape is malformed.
static ErrorOr<RegExpNameElement, ParseRegexPatternError> parse_regexp_name_element(Utf16View const& pattern, size_t index)
{
    auto const length = pattern.length_in_code_units();
    if (index >= length)
        return invalid_group_name_error();

    // Classifies the numeric value of an escape. Only values up to 0xFFFF can be surrogate
    // halves, so the range is checked before the value is narrowed to one UTF-16 code unit.
    // Without the check, a supplementary value such as \u{2D800} could be truncated to 0xD800
    // and reported as a lone high surrogate.
    // NOTE(review): the truncation hazard assumes the AK::UnicodeUtils surrogate helpers take a
    // 16-bit code unit — confirm against AK/UnicodeUtils.h. The check is harmless either way.
    auto const make_escaped_element = [](u32 value, RegExpNameElementOrigin origin, size_t next_index) {
        auto kind = RegExpNameElementKind::CodePoint;
        if (value <= 0xFFFF) {
            auto const code_unit = static_cast<u16>(value);
            if (AK::UnicodeUtils::is_utf16_high_surrogate(code_unit))
                kind = RegExpNameElementKind::HighSurrogate;
            else if (AK::UnicodeUtils::is_utf16_low_surrogate(code_unit))
                kind = RegExpNameElementKind::LowSurrogate;
        }
        return RegExpNameElement { kind, origin, next_index };
    };

    auto code_unit = pattern.code_unit_at(index);
    if (code_unit != '\\') {
        if (AK::UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
            // A literal high surrogate immediately followed by a literal low surrogate is a
            // single supplementary code point.
            if (index + 1 < length) {
                auto next_code_unit = pattern.code_unit_at(index + 1);
                if (AK::UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
                    return RegExpNameElement { RegExpNameElementKind::CodePoint, RegExpNameElementOrigin::Literal, index + 2 };
            }
            return RegExpNameElement { RegExpNameElementKind::HighSurrogate, RegExpNameElementOrigin::Literal, index + 1 };
        }
        if (AK::UnicodeUtils::is_utf16_low_surrogate(code_unit))
            return RegExpNameElement { RegExpNameElementKind::LowSurrogate, RegExpNameElementOrigin::Literal, index + 1 };
        return RegExpNameElement { RegExpNameElementKind::CodePoint, RegExpNameElementOrigin::Literal, index + 1 };
    }

    // The only escape accepted inside a group name is \u.
    if (index + 1 >= length || pattern.code_unit_at(index + 1) != 'u')
        return invalid_group_name_error();

    auto escape_index = index + 2;
    if (escape_index < length && pattern.code_unit_at(escape_index) == '{') {
        ++escape_index;

        u32 value = 0;
        size_t digits = 0;
        while (escape_index < length && pattern.code_unit_at(escape_index) != '}') {
            auto digit = pattern.code_unit_at(escape_index);
            if (!is_ascii_hex_digit(digit))
                return invalid_group_name_error();
            value = value * 16 + parse_ascii_hex_digit(digit);
            // Rejecting after every digit also keeps `value` from overflowing on long inputs.
            if (value > 0x10FFFF)
                return invalid_group_name_error();
            ++digits;
            ++escape_index;
        }

        // Require at least one digit and a closing brace.
        if (digits == 0 || escape_index >= length || pattern.code_unit_at(escape_index) != '}')
            return invalid_group_name_error();

        ++escape_index;
        return make_escaped_element(value, RegExpNameElementOrigin::BracedEscape, escape_index);
    }

    // Fixed-width form: exactly four hex digits must follow "\u".
    if (escape_index + 4 > length)
        return invalid_group_name_error();

    u32 value = 0;
    for (size_t offset = 0; offset < 4; ++offset) {
        auto digit = pattern.code_unit_at(escape_index + offset);
        if (!is_ascii_hex_digit(digit))
            return invalid_group_name_error();
        value = value * 16 + parse_ascii_hex_digit(digit);
    }

    return make_escaped_element(value, RegExpNameElementOrigin::FixedEscape, escape_index + 4);
}

// Walks a group name that begins at `name_start` and checks how surrogates appear in it.
//
// A high surrogate element is accepted only when the next element is a low surrogate written in
// the same raw form, and that form is not the braced escape. A low surrogate that is not the
// second half of such a pair is rejected. Returns the index just past the closing '>', or the
// "invalid group name" error (also used when the pattern ends before a '>' is found).
static ErrorOr<size_t, ParseRegexPatternError> validate_regexp_name_surrogates(Utf16View const& pattern, size_t name_start)
{
    auto const code_unit_count = pattern.length_in_code_units();

    for (auto cursor = name_start; cursor < code_unit_count;) {
        if (pattern.code_unit_at(cursor) == '>')
            return cursor + 1;

        auto const leading = TRY(parse_regexp_name_element(pattern, cursor));
        switch (leading.kind) {
        case RegExpNameElementKind::CodePoint:
            cursor = leading.next_index;
            continue;
        case RegExpNameElementKind::LowSurrogate:
            return invalid_group_name_error();
        case RegExpNameElementKind::HighSurrogate:
            break;
        }

        // A high surrogate must be completed by a matching low surrogate element.
        auto const trailing = TRY(parse_regexp_name_element(pattern, leading.next_index));
        bool const forms_valid_pair = trailing.kind == RegExpNameElementKind::LowSurrogate
            && trailing.origin == leading.origin
            && leading.origin != RegExpNameElementOrigin::BracedEscape;
        if (!forms_valid_pair)
            return invalid_group_name_error();

        cursor = trailing.next_index;
    }

    return invalid_group_name_error();
}

// Scans `pattern` for "(?<" that is not the start of a lookbehind ("(?<=" or "(?<!").
// Escaped code units and everything inside a character class are ignored.
static bool pattern_has_named_capture_groups(Utf16View const& pattern)
{
    auto const code_unit_count = pattern.length_in_code_units();
    auto inside_class = false;

    size_t position = 0;
    while (position < code_unit_count) {
        auto const current = pattern.code_unit_at(position);

        // Skip the backslash together with the code unit it escapes.
        if (current == '\\') {
            position += 2;
            continue;
        }

        if (inside_class) {
            if (current == ']')
                inside_class = false;
            ++position;
            continue;
        }

        if (current == '[') {
            inside_class = true;
            ++position;
            continue;
        }

        bool const opens_angle_group = current == '('
            && position + 2 < code_unit_count
            && pattern.code_unit_at(position + 1) == '?'
            && pattern.code_unit_at(position + 2) == '<';
        if (opens_angle_group) {
            // "(?<" at the very end of the pattern still counts as a named group opener.
            if (position + 3 >= code_unit_count)
                return true;

            auto const discriminator = pattern.code_unit_at(position + 3);
            if (discriminator != '=' && discriminator != '!')
                return true;
        }

        ++position;
    }

    return false;
}

// Runs validate_regexp_name_surrogates() over every group name in `pattern`.
//
// Names are looked for after "(?<" (unless it starts a lookbehind) and after "\k<". The "\k<"
// form is only treated as a name when `unicode_aware` is set or the pattern contains a named
// capture group; otherwise the backslash just escapes the following code unit. Nothing inside a
// character class is inspected.
static ErrorOr<void, ParseRegexPatternError> validate_named_group_name_surrogates(Utf16View const& pattern, bool unicode_aware)
{
    auto const code_unit_count = pattern.length_in_code_units();
    bool const k_is_backreference = unicode_aware || pattern_has_named_capture_groups(pattern);
    auto inside_class = false;

    size_t position = 0;
    while (position < code_unit_count) {
        auto const current = pattern.code_unit_at(position);

        if (current == '\\') {
            bool const is_named_backreference = k_is_backreference
                && !inside_class
                && position + 2 < code_unit_count
                && pattern.code_unit_at(position + 1) == 'k'
                && pattern.code_unit_at(position + 2) == '<';

            if (is_named_backreference)
                position = TRY(validate_regexp_name_surrogates(pattern, position + 3));
            else
                position += 2; // Skip the backslash and the code unit it escapes.
            continue;
        }

        if (inside_class) {
            if (current == ']')
                inside_class = false;
            ++position;
            continue;
        }

        if (current == '[') {
            inside_class = true;
            ++position;
            continue;
        }

        // "(?<" followed by something other than '=' or '!' opens a named group.
        bool const starts_group_name = current == '('
            && position + 3 < code_unit_count
            && pattern.code_unit_at(position + 1) == '?'
            && pattern.code_unit_at(position + 2) == '<'
            && pattern.code_unit_at(position + 3) != '='
            && pattern.code_unit_at(position + 3) != '!';
        if (starts_group_name) {
            position = TRY(validate_regexp_name_surrogates(pattern, position + 3));
            continue;
        }

        ++position;
    }

    return {};
}

}

// Converts a flags string into its bitmask, rejecting anything that is not a valid flag set.
//
// Returns an error message (formatted from the matching ErrorType) when:
//   - a code unit is not one of the flag characters listed by JS_ENUMERATE_REGEXP_FLAGS,
//   - a flag character appears more than once,
//   - both the Unicode and UnicodeSets flags are present.
static Result<RegExpObject::Flags, String> validate_flags(Utf16View const& flags)
{
    // Tracks which flag characters were already encountered. It is only indexed from within the
    // generated case labels below.
    // NOTE(review): presumably every flag character is ASCII, so 128 slots suffice — confirm
    // against JS_ENUMERATE_REGEXP_FLAGS.
    bool seen[128] {};
    RegExpObject::Flags flag_bits = static_cast<RegExpObject::Flags>(0);

    for (size_t index = 0; index < flags.length_in_code_units(); ++index) {
        auto ch = flags.code_unit_at(index);

        switch (ch) {
// One case per known flag: reject a repeat, otherwise record the flag and set its bit.
#define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char)                              \
    case #flag_char[0]:                                                                       \
        if (seen[ch])                                                                         \
            return MUST(String::formatted(ErrorType::RegExpObjectRepeatedFlag.format(), ch)); \
        seen[ch] = true;                                                                      \
        flag_bits |= RegExpObject::Flags::FlagName;                                           \
        break;
            JS_ENUMERATE_REGEXP_FLAGS
#undef __JS_ENUMERATE
        default:
            // Any other code unit is not a flag.
            return MUST(String::formatted(ErrorType::RegExpObjectBadFlag.format(), ch));
        }
    }

    // The 'u' and 'v' flags cannot be combined.
    if (has_flag(flag_bits, RegExpObject::Flags::Unicode) && has_flag(flag_bits, RegExpObject::Flags::UnicodeSets))
        return MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.format(), 'u', 'v'));

    return flag_bits;
}

// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
// Validates group names in the UTF-16 `pattern` and re-encodes it as a String in which every
// code unit above 0x7f is written as an escape: \uhhhh normally, \u{...} when `unicode` or
// `unicode_sets` is set (with literal surrogate pairs merged into one escape in that mode).
// Fails if both flags are set or if a group name is rejected.
ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(Utf16View const& pattern, bool unicode, bool unicode_sets)
{
    if (unicode && unicode_sets)
        return ParseRegexPatternError { MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.format(), 'u', 'v')) };

    bool const unicode_mode = unicode || unicode_sets;
    TRY(validate_named_group_name_surrogates(pattern, unicode_mode));

    auto const code_unit_count = pattern.length_in_code_units();
    StringBuilder builder;

    // True while the most recently emitted code unit is a backslash that has not itself been
    // escaped by a preceding backslash.
    auto pending_backslash = false;

    for (size_t position = 0; position < code_unit_count; ++position) {
        u16 const current = pattern.code_unit_at(position);

        if (current <= 0x7f) {
            builder.append_code_point(current);
            pending_backslash = (current == '\\') && !pending_backslash;
            continue;
        }

        // The code unit becomes an escape sequence. If the author already put a backslash in
        // front of it, reuse that backslash instead of emitting a second one; escaping it again
        // would yield <\\uhhhh>, which matches the literal text "\uhhhh" rather than the
        // intended code unit.
        if (!pending_backslash)
            builder.append('\\');
        pending_backslash = false;

        if (!unicode_mode) {
            builder.appendff("u{:04x}", current);
            continue;
        }

        // In Unicode mode a literal surrogate pair is emitted as one braced code point.
        if (position + 1 < code_unit_count && AK::UnicodeUtils::is_utf16_high_surrogate(current)) {
            u16 const follower = pattern.code_unit_at(position + 1);
            if (AK::UnicodeUtils::is_utf16_low_surrogate(follower)) {
                u32 const combined = AK::UnicodeUtils::decode_utf16_surrogate_pair(current, follower);
                builder.appendff("u{{{:x}}}", combined);
                ++position;
                continue;
            }
        }

        builder.appendff("u{{{:04x}}}", current);
    }

    return builder.to_string_without_validation();
}

// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
// Same as the ErrorOr overload, but reports a failure by throwing a SyntaxError carrying the
// parse error's message.
ThrowCompletionOr<String> parse_regex_pattern(VM& vm, Utf16View const& pattern, bool unicode, bool unicode_sets)
{
    auto parse_result = parse_regex_pattern(pattern, unicode, unicode_sets);
    if (!parse_result.is_error())
        return parse_result.release_value();

    return vm.throw_completion<JS::SyntaxError>(parse_result.release_error().error);
}

// Allocates a RegExpObject in `realm` whose prototype is the realm's RegExp prototype.
GC::Ref<RegExpObject> RegExpObject::create(Realm& realm)
{
    auto&& prototype = realm.intrinsics().regexp_prototype();
    return realm.create<RegExpObject>(prototype);
}

// Allocates a RegExpObject in `realm` with the given source pattern and flags, using the realm's
// RegExp prototype. The strings are moved into the new object.
GC::Ref<RegExpObject> RegExpObject::create(Realm& realm, Utf16String pattern, Utf16String flags)
{
    auto&& prototype = realm.intrinsics().regexp_prototype();
    return realm.create<RegExpObject>(move(pattern), move(flags), prototype);
}

// Constructs a RegExpObject with the given prototype; pattern and flags are left at their
// default member values.
RegExpObject::RegExpObject(Object& prototype)
    : Object(ConstructWithPrototypeTag::Tag, prototype)
{
}

// Builds the flag bitmask for a flags string without validating it: each recognized flag
// character sets its bit, and any other code unit is silently skipped.
static RegExpObject::Flags to_flag_bits(Utf16View const& flags)
{
    auto bits = static_cast<RegExpObject::Flags>(0);
    auto const code_unit_count = flags.length_in_code_units();

    for (size_t position = 0; position < code_unit_count; ++position) {
        switch (flags.code_unit_at(position)) {
#define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char) \
    case #flag_char[0]:                                          \
        bits |= RegExpObject::Flags::FlagName;                   \
        break;
            JS_ENUMERATE_REGEXP_FLAGS
#undef __JS_ENUMERATE
        default:
            break;
        }
    }

    return bits;
}

// Constructs a RegExpObject that stores `pattern` and `flags` as given and derives the flag
// bitmask from the stored flags. No validation of either string happens here.
RegExpObject::RegExpObject(Utf16String pattern, Utf16String flags, Object& prototype)
    : Object(ConstructWithPrototypeTag::Tag, prototype)
    , m_pattern(move(pattern))
    , m_flags(move(flags))
    // Reads m_flags rather than the (moved-from) parameter.
    // NOTE(review): relies on m_flags being declared before m_flag_bits in the class — confirm.
    , m_flag_bits(to_flag_bits(m_flags))
{
}

// Runs the base class initialization, then defines the writable "lastIndex" property with an
// initial value of 0.
void RegExpObject::initialize(Realm& realm)
{
    Base::initialize(realm);

    define_direct_property(vm().names.lastIndex, Value(0), Attribute::Writable);
}

// 22.2.3.3 RegExpInitialize ( obj, pattern, flags ), https://tc39.es/ecma262/#sec-regexpinitialize
// (Re)initializes this object from a pattern value and a flags value.
//
// Both values are converted to strings (undefined becomes the empty string), the flags are
// validated, and the pattern is normalized and trial-compiled. Any failure throws a SyntaxError
// and leaves the stored pattern and flags untouched. On success the pattern, flags and flag
// bitmask are stored, "lastIndex" is reset to 0, and this object is returned.
ThrowCompletionOr<GC::Ref<RegExpObject>> RegExpObject::regexp_initialize(VM& vm, Value pattern_value, Value flags_value)
{
    // The cached compiled regex belongs to the previous pattern/flags, so drop it up front.
    m_cached_regex = nullptr;

    // 1. If pattern is undefined, let P be the empty String.
    // 2. Else, let P be ? ToString(pattern).
    Utf16String pattern;
    if (!pattern_value.is_undefined())
        pattern = TRY(pattern_value.to_utf16_string(vm));

    // 3. If flags is undefined, let F be the empty String.
    // 4. Else, let F be ? ToString(flags).
    Utf16String flags;
    if (!flags_value.is_undefined())
        flags = TRY(flags_value.to_utf16_string(vm));

    // 5. If F contains any code unit other than "d", "g", "i", "m", "s", "u", "v", or "y", or if F contains any code unit more than once, throw a SyntaxError exception.
    // 6. If F contains "i", let i be true; else let i be false.
    // 7. If F contains "m", let m be true; else let m be false.
    // 8. If F contains "s", let s be true; else let s be false.
    // 9. If F contains "u", let u be true; else let u be false.
    // 10. If F contains "v", let v be true; else let v be false.
    auto flags_result = validate_flags(flags);
    if (flags_result.is_error())
        return vm.throw_completion<SyntaxError>(flags_result.release_error());

    auto const flag_bits = flags_result.release_value();
    bool const unicode = has_flag(flag_bits, Flags::Unicode);
    bool const unicode_sets = has_flag(flag_bits, Flags::UnicodeSets);

    // Convert UTF-16 pattern to UTF-8 (with escape normalization for non-ASCII).
    // An empty pattern needs no conversion and is compiled as the empty string.
    String parsed_pattern;
    if (!pattern.is_empty()) {
        auto parse_result = parse_regex_pattern(pattern, unicode, unicode_sets);
        if (parse_result.is_error())
            return vm.throw_completion<SyntaxError>(ErrorType::RegExpCompileError, parse_result.release_error().error);
        parsed_pattern = parse_result.release_value();
    }

    // 11. If u is true and v is true, throw a SyntaxError exception.
    // NB: Already handled by validate_flags above.

    // Validate by trial-compiling the pattern.
    regex::ECMAScriptCompileFlags compile_flags {};
    compile_flags.global = has_flag(flag_bits, Flags::Global);
    compile_flags.ignore_case = has_flag(flag_bits, Flags::IgnoreCase);
    compile_flags.multiline = has_flag(flag_bits, Flags::Multiline);
    compile_flags.dot_all = has_flag(flag_bits, Flags::DotAll);
    compile_flags.unicode = unicode;
    compile_flags.unicode_sets = unicode_sets;
    compile_flags.sticky = has_flag(flag_bits, Flags::Sticky);

    auto compile_result = regex::ECMAScriptRegex::compile(parsed_pattern.bytes_as_string_view(), compile_flags);
    if (compile_result.is_error())
        return vm.throw_completion<SyntaxError>(ErrorType::RegExpCompileError, compile_result.release_error());

    // 16. Set obj.[[OriginalSource]] to P.
    m_pattern = move(pattern);

    // 17. Set obj.[[OriginalFlags]] to F.
    // The bitmask is computed before `flags` is moved from.
    m_flag_bits = to_flag_bits(flags);
    m_flags = move(flags);

    // 18. Let capturingGroupsCount be CountLeftCapturingParensWithin(parseResult).
    // 19. Let rer be the RegExp Record { [[IgnoreCase]]: i, [[Multiline]]: m, [[DotAll]]: s, [[Unicode]]: u, [[CapturingGroupsCount]]: capturingGroupsCount }.
    // 20. Set obj.[[RegExpRecord]] to rer.
    // 21. Set obj.[[RegExpMatcher]] to CompilePattern of parseResult with argument rer.

    // 22. Perform ? Set(obj, "lastIndex", +0𝔽, true).
    TRY(set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes));

    // 23. Return obj.
    return GC::Ref { *this };
}

// 22.2.6.13.1 EscapeRegExpPattern ( P, F ), https://tc39.es/ecma262/#sec-escaperegexppattern
// Produces the source text of this RegExp in a form that can sit between two '/' characters:
// an empty pattern becomes "(?:)", a '/' outside a character class is backslash-escaped, and
// line terminators are written as \n, \r, \u2028 or \u2029.
String RegExpObject::escape_regexp_pattern() const
{
    // 1. Let S be a String in the form of a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") equivalent
    //    to P interpreted as UTF-16 encoded Unicode code points (6.1.4), in which certain code points are escaped as
    //    described below. S may or may not be identical to P; however, the Abstract Closure that would result from
    //    evaluating S as a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") must behave identically to
    //    the Abstract Closure given by the constructed object's [[RegExpMatcher]] internal slot. Multiple calls to
    //    this abstract operation using the same values for P and F must produce identical results.
    // 2. The code points / or any LineTerminator occurring in the pattern shall be escaped in S as necessary to ensure
    //    that the string-concatenation of "/", S, "/", and F can be parsed (in an appropriate lexical context) as a
    //    RegularExpressionLiteral that behaves identically to the constructed regular expression. For example, if P is
    //    "/", then S could be "\/" or "\u002F", among other possibilities, but not "/", because /// followed by F
    //    would be parsed as a SingleLineComment rather than a RegularExpressionLiteral. If P is the empty String, this
    //    specification can be met by letting S be "(?:)".
    // 3. Return S.
    if (m_pattern.is_empty())
        return "(?:)"_string;

    // Gives the text that follows the backslash in the escaped form of a line terminator, or an
    // empty view for any other code point.
    auto const line_terminator_escape = [](auto code_point) -> StringView {
        switch (code_point) {
        case '\n':
            return "n"sv;
        case '\r':
            return "r"sv;
        case LINE_SEPARATOR:
            return "u2028"sv;
        case PARAGRAPH_SEPARATOR:
            return "u2029"sv;
        default:
            return {};
        }
    };

    // FIXME: Check the 'u' and 'v' flags and escape accordingly
    StringBuilder builder;
    auto after_backslash = false;
    auto inside_class = false;

    for (auto code_point : m_pattern) {
        auto const terminator_escape = line_terminator_escape(code_point);

        // The previous code point was an unescaped backslash: emit it now, together with the
        // code point it escapes. Character class tracking is not affected by escaped brackets.
        if (after_backslash) {
            after_backslash = false;
            builder.append('\\');
            if (terminator_escape.is_empty())
                builder.append_code_point(code_point);
            else
                builder.append(terminator_escape);
            continue;
        }

        // Hold the backslash back until we know what it escapes.
        if (code_point == '\\') {
            after_backslash = true;
            continue;
        }

        if (code_point == '[')
            inside_class = true;
        else if (code_point == ']')
            inside_class = false;

        if (!terminator_escape.is_empty()) {
            builder.append('\\');
            builder.append(terminator_escape);
        } else if (code_point == '/' && !inside_class) {
            // Inside a character class a '/' is left as-is.
            builder.append("\\/"sv);
        } else {
            builder.append_code_point(code_point);
        }
    }

    return builder.to_string_without_validation();
}

// Reports this object's outgoing references to the visitor: everything the base class visits,
// plus the stored realm.
void RegExpObject::visit_edges(JS::Cell::Visitor& visitor)
{
    Base::visit_edges(visitor);
    visitor.visit(m_realm);
}

// 22.2.3.1 RegExpCreate ( P, F ), https://tc39.es/ecma262/#sec-regexpcreate
// Allocates a RegExp object using the current realm's %RegExp% constructor and initializes it
// from `pattern` and `flags`. Initialization errors are propagated to the caller.
ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_create(VM& vm, Value pattern, Value flags)
{
    auto& current_realm = *vm.current_realm();

    // 1. Let obj be ! RegExpAlloc(%RegExp%).
    auto object = MUST(regexp_alloc(vm, current_realm.intrinsics().regexp_constructor()));

    // 2. Return ? RegExpInitialize(obj, P, F).
    return object->regexp_initialize(vm, pattern, flags);
}

// 22.2.3.2 RegExpAlloc ( newTarget ), https://tc39.es/ecma262/#sec-regexpalloc
// 22.2.3.2 RegExpAlloc ( newTarget ), https://github.com/tc39/proposal-regexp-legacy-features#regexpalloc--newtarget-
// Creates an uninitialized RegExp object for `new_target`, records the current realm on it,
// enables legacy features only when `new_target` is the current realm's %RegExp% constructor,
// and defines its writable, non-enumerable, non-configurable "lastIndex" property.
ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_alloc(VM& vm, FunctionObject& new_target)
{
    // 1. Let obj be ? OrdinaryCreateFromConstructor(newTarget, "%RegExp.prototype%", « [[OriginalSource]], [[OriginalFlags]], [[RegExpRecord]], [[RegExpMatcher]] »).
    auto object = TRY(ordinary_create_from_constructor<RegExpObject>(vm, new_target, &Intrinsics::regexp_prototype));

    // 2. Let thisRealm be the current Realm Record.
    auto& this_realm = *vm.current_realm();

    // 3. Set the value of obj’s [[Realm]] internal slot to thisRealm.
    object->set_realm(this_realm);

    // 4. If SameValue(newTarget, thisRealm.[[Intrinsics]].[[%RegExp%]]) is true, then
    //     i. Set the value of obj’s [[LegacyFeaturesEnabled]] internal slot to true.
    // 5. Else,
    //     i. Set the value of obj’s [[LegacyFeaturesEnabled]] internal slot to false.
    bool const new_target_is_intrinsic_regexp = same_value(&new_target, this_realm.intrinsics().regexp_constructor());
    object->set_legacy_features_enabled(new_target_is_intrinsic_regexp);

    // 6. Perform ! DefinePropertyOrThrow(obj, "lastIndex", PropertyDescriptor { [[Writable]]: true, [[Enumerable]]: false, [[Configurable]]: false }).
    PropertyDescriptor last_index_descriptor { .writable = true, .enumerable = false, .configurable = false };
    MUST(object->define_property_or_throw(vm.names.lastIndex, last_index_descriptor));

    // 7. Return obj.
    return object;
}

}
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
+								/*
-												Userland: Use mattco@serenityos.org for my copyright headers

											
										
										
											2021-04-22 16:53:07 -07:00
+								 * Copyright (c) 2020, Matthew Olsson <mattco@serenityos.org>
-												LibJS: Store RegExp flags as a bitmask

This avoids having to do O(n) contains() in the various flag accessors.

Yields a ~20% speed-up on the following microbenchmark:

    const re = /foo/dgimsvy;
    for (let i = 0; i < 1_000_000; ++i)
        re.flags;

											
										
										
											2024-10-25 17:29:03 +02:00
+								 * Copyright (c) 2024, Andreas Kling <andreas@ladybird.org>
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
+								 *
-												Everything: Move to SPDX license identifiers in all files.

SPDX License Identifiers are a more compact / standardized
way of representing file license information.

See: https://spdx.dev/resources/use/#identifiers

This was done with the `ambr` search and replace tool.

 ambr --no-parent-ignore --key-from-file --rep-from-file key.txt rep.txt *

											
										
										
											2021-04-22 01:24:48 -07:00
+								 * SPDX-License-Identifier: BSD-2-Clause
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
+								 */
-												LibJS: Reject mixed surrogate forms in RegExp names

Reject surrogate pairs in named group names unless both halves come
from the same raw form. A literal surrogate half was being
normalized into \uXXXX before LibRegex parsed the pattern, which let
mixed literal and escaped forms sneak through.

Validate surrogate handling on the UTF-16 pattern before
normalization, but only treat \k<...> as a named backreference when
the parser would do that too. Legacy regexes without named groups
still use \k as an identity escape, so their literal text must not be
rejected by the pre-scan.

Add runtime and syntax tests for the mixed forms, the valid literal,
fixed-width, and braced escape cases, and the legacy \k literals.

											
										
										
											2026-03-30 14:38:59 +02:00
+								#include <AK/CharacterTypes.h>
-												LibJS: Hook up Regex<ECMA262> to RegExpObject and implement `test()'

This makes RegExpObject compile and store a Regex<ECMA262>, adds
all flag-related properties, and implements `RegExpPrototype.test()`
(complete with 'lastIndex' support) :^)
It should be noted that this only implements `test()' using the builtin
`exec()'.

											
										
										
											2020-11-19 01:50:00 +03:30
+								#include <AK/Function.h>
-												LibJS: Prevent escaped surrogates from combining in Unicode regexes

Escaped surrogate sequences should not combine with adjacent literal
surrogates in Unicode mode.

We now use `\u{XXXX}` braces instead of `\uXXXX` when escaping code
units in Unicode mode, so LibRegex treats each as a standalone code
point. Also prevent GenericLexer from combining `\uXXXX` and `\u{XXXX}`.

											
										
										
											2026-02-10 14:26:53 +01:00
+								#include <AK/UnicodeUtils.h>
-												LibJS: Implement the RegExpAlloc AO

											
										
										
											2022-10-16 14:57:29 +02:00
+								#include <LibJS/Runtime/AbstractOperations.h>
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
+								#include <LibJS/Runtime/GlobalObject.h>
 								#include <LibJS/Runtime/PrimitiveString.h>
-												LibJS: Implement the RegExpAlloc AO

											
										
										
											2022-10-16 14:57:29 +02:00
+								#include <LibJS/Runtime/RegExpConstructor.h>
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
+								#include <LibJS/Runtime/RegExpObject.h>
-												LibJS: Implement RegExpCreate/RegExpInitialize closer to the spec

RegExpInitialize specifies how the pattern string should be created
before passing it to [[RegExpMatcher]]. Rather than passing it as-is,
the string should be converted to code points and back to a "List" (if
the Unicode flag is present), or as a "List" of UTF-16 code units.
Further, the spec requires that we keep both the original pattern string
and this parsed string in the RegExp object.

The caveat is that the LibRegex parser further requires any multi-byte
code units to be escaped (as "\unnnn"). Otherwise, the code unit is
recognized as individual UTF-8 bytes.

											
										
										
											2021-07-22 08:04:31 -04:00
+								#include <LibJS/Runtime/StringPrototype.h>
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
+								#include <LibJS/Runtime/Value.h>
-												LibJS: Make escape_regexp_pattern() a RegExpObject member function

Similarly to regexp_initialize() this can be a member function instead
of taking a RegExpObject argument.
Having it available outside RegExpPrototype is also useful for other
things that need RegExp.prototype.source behavior - e.g. the REPL for
pretty-printing.

											
										
										
											2021-10-05 18:33:28 +01:00
+								#include <LibJS/Token.h>
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
 								namespace JS {
-												LibGC+Everywhere: Factor out a LibGC from LibJS

Resulting in a massive rename across almost everywhere! Alongside the
namespace change, we now have the following names:

 * JS::NonnullGCPtr -> GC::Ref
 * JS::GCPtr -> GC::Ptr
 * JS::HeapFunction -> GC::Function
 * JS::CellImpl -> GC::Cell
 * JS::Handle -> GC::Root

											
										
										
											2024-11-15 04:01:23 +13:00
+								GC_DEFINE_ALLOCATOR(RegExpObject);
-												LibJS: Segregate GC-allocated objects by type

This patch adds two macros to declare per-type allocators:

- JS_DECLARE_ALLOCATOR(TypeName)
- JS_DEFINE_ALLOCATOR(TypeName)

When used, they add a type-specific CellAllocator that the Heap will
delegate allocation requests to.

The result of this is that GC objects of the same type always end up
within the same HeapBlock, drastically reducing the ability to perform
type confusion attacks.

It also improves HeapBlock utilization, since each block now has cells
sized exactly to the type used within that block. (Previously we only
had a handful of block sizes available, and most GC allocations ended
up with a large amount of slack in their tails.)

There is a small performance hit from this, but I'm sure we can make
up for it elsewhere.

Note that the old size-based allocators still exist, and we fall back
to them for any type that doesn't have its own CellAllocator.

											
										
										
											2023-11-19 09:45:05 +01:00
-												LibJS: Reject mixed surrogate forms in RegExp names

Reject surrogate pairs in named group names unless both halves come
from the same raw form. A literal surrogate half was being
normalized into \uXXXX before LibRegex parsed the pattern, which let
mixed literal and escaped forms sneak through.

Validate surrogate handling on the UTF-16 pattern before
normalization, but only treat \k<...> as a named backreference when
the parser would do that too. Legacy regexes without named groups
still use \k as an identity escape, so their literal text must not be
rejected by the pre-scan.

Add runtime and syntax tests for the mixed forms, the valid literal,
fixed-width, and braced escape cases, and the legacy \k literals.

											
										
										
											2026-03-30 14:38:59 +02:00
+								namespace {
 								// Classification of one group-name element as reported by parse_regexp_name_element():
 								// a whole code point (a literal high+low surrogate pair counts as one), or a lone
 								// high / low surrogate half.
 								enum class RegExpNameElementKind {
 								    CodePoint,
 								    HighSurrogate,
 								    LowSurrogate,
 								};
 								// How a group-name element was spelled in the pattern text: as raw code units (Literal),
 								// as a four-hex-digit `\uXXXX` escape (FixedEscape), or as a `\u{...}` escape (BracedEscape).
 								enum class RegExpNameElementOrigin {
 								    Literal,
 								    FixedEscape,
 								    BracedEscape,
 								};
 								// One parsed group-name element: what it is, how it was written, and where parsing resumes.
 								struct RegExpNameElement {
 								    RegExpNameElementKind kind;
 								    RegExpNameElementOrigin origin;
 								    // Index of the first code unit after this element.
 								    size_t next_index { 0 };
 								};
 								// Builds the error value shared by every group-name validation failure in this file.
 								static ParseRegexPatternError invalid_group_name_error()
 								{
 								    ParseRegexPatternError group_name_error { "invalid group name"_string };
 								    return group_name_error;
 								}
 								// Parses the single group-name element that starts at `index` in `pattern`.
 								//
 								// An element is a literal code unit (a literal high surrogate directly followed by a literal
 								// low surrogate is consumed together as one code point), a `\uXXXX` escape, or a `\u{...}`
 								// escape. The result tells whether the element is a whole code point or a lone surrogate
 								// half, which spelling produced it, and the index of the first code unit after it.
 								//
 								// Returns an "invalid group name" error when `index` is at or past the end of the pattern,
 								// when a backslash is not followed by `u`, or when the escape is malformed (non-hex digit,
 								// fewer than four digits in the fixed form, empty or unterminated braces, value > 0x10FFFF).
 								static ErrorOr<RegExpNameElement, ParseRegexPatternError> parse_regexp_name_element(Utf16View const& pattern, size_t index)
 								{
 								    // Classify on the full 32-bit value with explicit range checks, so a braced escape above
 								    // U+FFFF can never be narrowed into something that looks like a UTF-16 surrogate.
 								    auto const make_element = [](u32 value, RegExpNameElementOrigin origin, size_t next_index) {
 								        auto kind = RegExpNameElementKind::CodePoint;
 								        if (value >= 0xD800 && value <= 0xDBFF)
 								            kind = RegExpNameElementKind::HighSurrogate;
 								        else if (value >= 0xDC00 && value <= 0xDFFF)
 								            kind = RegExpNameElementKind::LowSurrogate;
 								        return RegExpNameElement { kind, origin, next_index };
 								    };
 								    auto const length = pattern.length_in_code_units();
 								    if (index >= length)
 								        return invalid_group_name_error();
 								    auto const code_unit = pattern.code_unit_at(index);
 								    if (code_unit != '\\') {
 								        auto element = make_element(code_unit, RegExpNameElementOrigin::Literal, index + 1);
 								        // A literal high surrogate immediately followed by a literal low surrogate is one code point.
 								        if (element.kind == RegExpNameElementKind::HighSurrogate && index + 1 < length) {
 								            auto const following = make_element(pattern.code_unit_at(index + 1), RegExpNameElementOrigin::Literal, index + 2);
 								            if (following.kind == RegExpNameElementKind::LowSurrogate)
 								                return RegExpNameElement { RegExpNameElementKind::CodePoint, RegExpNameElementOrigin::Literal, index + 2 };
 								        }
 								        return element;
 								    }
 								    // The only escape accepted inside a name is a Unicode escape.
 								    if (index + 1 >= length || pattern.code_unit_at(index + 1) != 'u')
 								        return invalid_group_name_error();
 								    auto escape_index = index + 2;
 								    if (escape_index < length && pattern.code_unit_at(escape_index) == '{') {
 								        // Braced form: one or more hex digits terminated by '}'.
 								        ++escape_index;
 								        u32 value = 0;
 								        size_t digits = 0;
 								        while (escape_index < length && pattern.code_unit_at(escape_index) != '}') {
 								            auto const digit = pattern.code_unit_at(escape_index);
 								            if (!is_ascii_hex_digit(digit))
 								                return invalid_group_name_error();
 								            value = value * 16 + parse_ascii_hex_digit(digit);
 								            // Checked per digit, so `value` stays far below the u32 limit.
 								            if (value > 0x10FFFF)
 								                return invalid_group_name_error();
 								            ++digits;
 								            ++escape_index;
 								        }
 								        // The loop stops either at '}' or at the end of the pattern (unterminated escape).
 								        if (digits == 0 || escape_index >= length)
 								            return invalid_group_name_error();
 								        return make_element(value, RegExpNameElementOrigin::BracedEscape, escape_index + 1);
 								    }
 								    // Fixed form: exactly four hex digits.
 								    if (escape_index + 4 > length)
 								        return invalid_group_name_error();
 								    u32 value = 0;
 								    for (size_t offset = 0; offset < 4; ++offset) {
 								        auto const digit = pattern.code_unit_at(escape_index + offset);
 								        if (!is_ascii_hex_digit(digit))
 								            return invalid_group_name_error();
 								        value = value * 16 + parse_ascii_hex_digit(digit);
 								    }
 								    return make_element(value, RegExpNameElementOrigin::FixedEscape, escape_index + 4);
 								}
 								// Walks a group name that begins at `name_start` and checks its surrogate usage.
 								//
 								// On success returns the index just past the closing '>'. Fails with "invalid group name"
 								// when the name is unterminated, when an element cannot be parsed, when a low surrogate
 								// appears without a preceding high surrogate, or when a high surrogate is not followed by
 								// a low surrogate written in the same form. Surrogate halves written as `\u{...}` are
 								// never accepted as a pair.
 								static ErrorOr<size_t, ParseRegexPatternError> validate_regexp_name_surrogates(Utf16View const& pattern, size_t name_start)
 								{
 								    auto const code_unit_count = pattern.length_in_code_units();
 								    for (auto cursor = name_start; cursor < code_unit_count;) {
 								        if (pattern.code_unit_at(cursor) == '>')
 								            return cursor + 1;
 								        auto const lead = TRY(parse_regexp_name_element(pattern, cursor));
 								        switch (lead.kind) {
 								        case RegExpNameElementKind::CodePoint:
 								            cursor = lead.next_index;
 								            continue;
 								        case RegExpNameElementKind::LowSurrogate:
 								            return invalid_group_name_error();
 								        case RegExpNameElementKind::HighSurrogate:
 								            break;
 								        }
 								        // A high surrogate must be completed by a low surrogate of the same spelling.
 								        auto const trail = TRY(parse_regexp_name_element(pattern, lead.next_index));
 								        bool const is_acceptable_pair = trail.kind == RegExpNameElementKind::LowSurrogate
 								            && trail.origin == lead.origin
 								            && lead.origin != RegExpNameElementOrigin::BracedEscape;
 								        if (!is_acceptable_pair)
 								            return invalid_group_name_error();
 								        cursor = trail.next_index;
 								    }
 								    // Ran off the end of the pattern without seeing '>'.
 								    return invalid_group_name_error();
 								}
 								// Reports whether `pattern` contains a named capture group opener, i.e. `(?<` that is not
 								// the start of a lookbehind (`(?<=` / `(?<!`). Escaped code units and everything inside a
 								// `[...]` character class are skipped.
 								static bool pattern_has_named_capture_groups(Utf16View const& pattern)
 								{
 								    auto const code_unit_count = pattern.length_in_code_units();
 								    bool inside_class = false;
 								    size_t cursor = 0;
 								    while (cursor < code_unit_count) {
 								        auto const unit = pattern.code_unit_at(cursor);
 								        if (unit == '\\') {
 								            // Step over the backslash and, when present, the code unit it escapes.
 								            cursor += (cursor + 1 < code_unit_count) ? 2 : 1;
 								            continue;
 								        }
 								        if (inside_class) {
 								            if (unit == ']')
 								                inside_class = false;
 								            ++cursor;
 								            continue;
 								        }
 								        if (unit == '[') {
 								            inside_class = true;
 								            ++cursor;
 								            continue;
 								        }
 								        bool const opens_angle_group = unit == '(' && cursor + 2 < code_unit_count
 								            && pattern.code_unit_at(cursor + 1) == '?' && pattern.code_unit_at(cursor + 2) == '<';
 								        if (opens_angle_group) {
 								            // `(?<` at the very end of the pattern also counts as a named group opener.
 								            if (cursor + 3 >= code_unit_count)
 								                return true;
 								            auto const discriminator = pattern.code_unit_at(cursor + 3);
 								            if (discriminator != '=' && discriminator != '!')
 								                return true;
 								        }
 								        ++cursor;
 								    }
 								    return false;
 								}
 								// Scans `pattern` and validates the surrogate usage of every group name it contains.
 								//
 								// Names are checked after `(?<` (unless it starts a lookbehind, `(?<=` / `(?<!`) and after
 								// `\k<`. The `\k<` form is only treated as a named backreference when `unicode_aware` is
 								// set or the pattern has at least one named capture group; otherwise `\k` is skipped like
 								// any other escaped code unit. Nothing inside a `[...]` character class is inspected.
 								// Propagates the error from validate_regexp_name_surrogates() for the first bad name.
 								static ErrorOr<void, ParseRegexPatternError> validate_named_group_name_surrogates(Utf16View const& pattern, bool unicode_aware)
 								{
 								    auto const code_unit_count = pattern.length_in_code_units();
 								    bool const k_is_named_backreference = unicode_aware || pattern_has_named_capture_groups(pattern);
 								    bool inside_class = false;
 								    size_t cursor = 0;
 								    while (cursor < code_unit_count) {
 								        auto const unit = pattern.code_unit_at(cursor);
 								        if (unit == '\\') {
 								            bool const starts_named_backreference = k_is_named_backreference && !inside_class
 								                && cursor + 2 < code_unit_count
 								                && pattern.code_unit_at(cursor + 1) == 'k' && pattern.code_unit_at(cursor + 2) == '<';
 								            if (starts_named_backreference) {
 								                // Resume right after the closing '>' of the referenced name.
 								                cursor = TRY(validate_regexp_name_surrogates(pattern, cursor + 3));
 								                continue;
 								            }
 								            // Step over the backslash and, when present, the code unit it escapes.
 								            cursor += (cursor + 1 < code_unit_count) ? 2 : 1;
 								            continue;
 								        }
 								        if (inside_class) {
 								            if (unit == ']')
 								                inside_class = false;
 								            ++cursor;
 								            continue;
 								        }
 								        if (unit == '[') {
 								            inside_class = true;
 								            ++cursor;
 								            continue;
 								        }
 								        bool const opens_angle_group = unit == '(' && cursor + 2 < code_unit_count
 								            && pattern.code_unit_at(cursor + 1) == '?' && pattern.code_unit_at(cursor + 2) == '<';
 								        if (opens_angle_group && cursor + 3 < code_unit_count) {
 								            auto const discriminator = pattern.code_unit_at(cursor + 3);
 								            if (discriminator != '=' && discriminator != '!') {
 								                // Resume right after the closing '>' of the group name.
 								                cursor = TRY(validate_regexp_name_surrogates(pattern, cursor + 3));
 								                continue;
 								            }
 								        }
 								        ++cursor;
 								    }
 								    return {};
 								}
 								}
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								static Result<RegExpObject::Flags, String> validate_flags(Utf16View const& flags)
-												LibJS: Hook up Regex<ECMA262> to RegExpObject and implement `test()'

This makes RegExpObject compile and store a Regex<ECMA262>, adds
all flag-related properties, and implements `RegExpPrototype.test()`
(complete with 'lastIndex' support) :^)
It should be noted that this only implements `test()' using the builtin
`exec()'.

											
										
										
											2020-11-19 01:50:00 +03:30
+								{
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								    bool seen[128] {};
 								    RegExpObject::Flags flag_bits = static_cast<RegExpObject::Flags>(0);
-												LibJS: Hook up Regex<ECMA262> to RegExpObject and implement `test()'

This makes RegExpObject compile and store a Regex<ECMA262>, adds
all flag-related properties, and implements `RegExpPrototype.test()`
(complete with 'lastIndex' support) :^)
It should be noted that this only implements `test()' using the builtin
`exec()'.

											
										
										
											2020-11-19 01:50:00 +03:30
-												LibJS: Port RegExp flags and patterns to UTF-16

											
										
										
											2025-08-06 11:28:18 -04:00
+								    for (size_t index = 0; index < flags.length_in_code_units(); ++index) {
 								        auto ch = flags.code_unit_at(index);
-												LibJS: Hook up Regex<ECMA262> to RegExpObject and implement `test()'

This makes RegExpObject compile and store a Regex<ECMA262>, adds
all flag-related properties, and implements `RegExpPrototype.test()`
(complete with 'lastIndex' support) :^)
It should be noted that this only implements `test()' using the builtin
`exec()'.

											
										
										
											2020-11-19 01:50:00 +03:30
+								        switch (ch) {
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								#define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char)                              \
 								    case #flag_char[0]:                                                                       \
 								        if (seen[ch])                                                                         \
 								            return MUST(String::formatted(ErrorType::RegExpObjectRepeatedFlag.format(), ch)); \
 								        seen[ch] = true;                                                                      \
 								        flag_bits |= RegExpObject::Flags::FlagName;                                           \
 								        break;
 								            JS_ENUMERATE_REGEXP_FLAGS
 								#undef __JS_ENUMERATE
-												LibJS: Hook up Regex<ECMA262> to RegExpObject and implement `test()'

This makes RegExpObject compile and store a Regex<ECMA262>, adds
all flag-related properties, and implements `RegExpPrototype.test()`
(complete with 'lastIndex' support) :^)
It should be noted that this only implements `test()' using the builtin
`exec()'.

											
										
										
											2020-11-19 01:50:00 +03:30
+								        default:
-												LibJS+LibWeb: Port interned bytecode strings to UTF-16

This was almost a no-op, except we intern JS exception messages. So the
bulk of this patch is porting exception messages to UTF-16.

											
										
										
											2025-08-07 19:31:52 -04:00
+								            return MUST(String::formatted(ErrorType::RegExpObjectBadFlag.format(), ch));
-												LibJS: Hook up Regex<ECMA262> to RegExpObject and implement `test()'

This makes RegExpObject compile and store a Regex<ECMA262>, adds
all flag-related properties, and implements `RegExpPrototype.test()`
(complete with 'lastIndex' support) :^)
It should be noted that this only implements `test()' using the builtin
`exec()'.

											
										
										
											2020-11-19 01:50:00 +03:30
+								        }
 								    }
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								    if (has_flag(flag_bits, RegExpObject::Flags::Unicode) && has_flag(flag_bits, RegExpObject::Flags::UnicodeSets))
 								        return MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.format(), 'u', 'v'));
 								    return flag_bits;
-												LibJS: Hook up Regex<ECMA262> to RegExpObject and implement `test()'

This makes RegExpObject compile and store a Regex<ECMA262>, adds
all flag-related properties, and implements `RegExpPrototype.test()`
(complete with 'lastIndex' support) :^)
It should be noted that this only implements `test()' using the builtin
`exec()'.

											
										
										
											2020-11-19 01:50:00 +03:30
+								}
-												LibJS: Update RegExp spec numbers to match re-ordering within the spec

This is an editorial change in the ECMA-262 spec. See:
https://github.com/tc39/ecma262/commit/abee2e6
https://github.com/tc39/ecma262/commit/77256bf

											
										
										
											2023-06-23 10:05:38 -04:00
+								// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
-												LibJS: Port RegExp flags and patterns to UTF-16

											
										
										
											2025-08-06 11:28:18 -04:00
+								ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(Utf16View const& pattern, bool unicode, bool unicode_sets)
-												LibJS: Parse RegExp literals at AST creation time, not execution time

The spec requires that invalid RegExp literals must cause a Syntax Error
before the JavaScript is executed. See:
https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors

This is explicitly tested in the RegExp/property-escapes test262 tests.
For example, see unsupported-property-Line_Break.js:

    $DONOTEVALUATE();
    /\p{Line_Break}/u;

That RegExp literal is invalid because Line_Break is not a supported
Unicode property. $DONOTEVALUATE() just throws an exception when it is
executed. The test expects that this file will fail to be parsed.

Note that RegExp patterns can still be parsed at execution time by way
of "new RegExp(...)".

											
										
										
											2021-07-29 10:34:37 -04:00
+								{
-												LibJS: Hook up the 'v' (unicodeSets) RegExp flag

											
										
										
											2022-07-16 10:14:03 +04:30
+								    if (unicode && unicode_sets)
-												LibJS+LibWeb: Port interned bytecode strings to UTF-16

This was almost a no-op, except we intern JS exception messages. So the
bulk of this patch is porting exception messages to UTF-16.

											
										
										
											2025-08-07 19:31:52 -04:00
+								        return ParseRegexPatternError { MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.format(), 'u', 'v')) };
-												LibJS: Hook up the 'v' (unicodeSets) RegExp flag

											
										
										
											2022-07-16 10:14:03 +04:30
-												LibJS: Reject mixed surrogate forms in RegExp names

Reject surrogate pairs in named group names unless both halves come
from the same raw form. A literal surrogate half was being
normalized into \uXXXX before LibRegex parsed the pattern, which let
mixed literal and escaped forms sneak through.

Validate surrogate handling on the UTF-16 pattern before
normalization, but only treat \k<...> as a named backreference when
the parser would do that too. Legacy regexes without named groups
still use \k as an identity escape, so their literal text must not be
rejected by the pre-scan.

Add runtime and syntax tests for the mixed forms, the valid literal,
fixed-width, and braced escape cases, and the legacy \k literals.

											
										
										
											2026-03-30 14:38:59 +02:00
+								    TRY(validate_named_group_name_surrogates(pattern, unicode || unicode_sets));
-												LibJS: Parse RegExp literals at AST creation time, not execution time

The spec requires that invalid RegExp literals must cause a Syntax Error
before the JavaScript is executed. See:
https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors

This is explicitly tested in the RegExp/property-escapes test262 tests.
For example, see unsupported-property-Line_Break.js:

    $DONOTEVALUATE();
    /\p{Line_Break}/u;

That RegExp literal is invalid because Line_Break is not a supported
Unicode property. $DONOTEVALUATE() just throws an exception when it is
executed. The test expects that this file will fail to be parsed.

Note that RegExp patterns can still be parsed at execution time by way
of "new RegExp(...)".

											
										
										
											2021-07-29 10:34:37 -04:00
+								    StringBuilder builder;
-												LibJS: Unescape incorrectly escaped code units in regex patterns

We were translating the pattern [\⪾-\⫀] to [\\u2abe-\\u2ac0], which
is a very different pattern; as a code unit converted to the \uhhh
format has no meaning when escaped, this commit makes us simply skip
escaping it when translating the pattern.

											
										
										
											2023-09-16 16:03:54 +03:30
+								    auto previous_code_unit_was_backslash = false;
-												LibJS: Port RegExp flags and patterns to UTF-16

											
										
										
											2025-08-06 11:28:18 -04:00
+								    for (size_t i = 0; i < pattern.length_in_code_units(); ++i) {
 								        u16 code_unit = pattern.code_unit_at(i);
-												LibJS: Parse RegExp literals at AST creation time, not execution time

The spec requires that invalid RegExp literals must cause a Syntax Error
before the JavaScript is executed. See:
https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors

This is explicitly tested in the RegExp/property-escapes test262 tests.
For example, see unsupported-property-Line_Break.js:

    $DONOTEVALUATE();
    /\p{Line_Break}/u;

That RegExp literal is invalid because Line_Break is not a supported
Unicode property. $DONOTEVALUATE() just throws an exception when it is
executed. The test expects that this file will fail to be parsed.

Note that RegExp patterns can still be parsed at execution time by way
of "new RegExp(...)".

											
										
										
											2021-07-29 10:34:37 -04:00
-												LibJS: Unescape incorrectly escaped code units in regex patterns

We were translating the pattern [\⪾-\⫀] to [\\u2abe-\\u2ac0], which
is a very different pattern; as a code unit converted to the \uhhh
format has no meaning when escaped, this commit makes us simply skip
escaping it when translating the pattern.

											
										
										
											2023-09-16 16:03:54 +03:30
+								        if (code_unit > 0x7f) {
 								            // Incorrectly escaping this code unit will result in a wildly different regex than intended
 								            // as we're converting <c> to <\uhhhh>, which would turn into <\\uhhhh> if (incorrectly) escaped again,
 								            // leading to a matcher for the literal string "\uhhhh" instead of the intended code unit <c>.
 								            // As such, we're going to remove the (invalid) backslash and pretend it never existed.
 								            if (!previous_code_unit_was_backslash)
 								                builder.append('\\');
-												LibJS: Prevent escaped surrogates from combining in Unicode regexes

Escaped surrogate sequences should not combine with adjacent literal
surrogates in Unicode mode.

We now use `\u{XXXX}` braces instead of `\uXXXX` when escaping code
units in Unicode mode, so LibRegex treats each as a standalone code
point. Also prevent GenericLexer from combining `\uXXXX` and `\u{XXXX}`.

											
										
										
											2026-02-10 14:26:53 +01:00
 								            if ((unicode || unicode_sets) && AK::UnicodeUtils::is_utf16_high_surrogate(code_unit) && i + 1 < pattern.length_in_code_units()) {
 								                u16 next_code_unit = pattern.code_unit_at(i + 1);
 								                if (AK::UnicodeUtils::is_utf16_low_surrogate(next_code_unit)) {
 								                    u32 combined = AK::UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
 								                    builder.appendff("u{{{:x}}}", combined);
 								                    ++i;
 								                    previous_code_unit_was_backslash = false;
 								                    continue;
 								                }
 								            }
 								            if (unicode || unicode_sets)
 								                builder.appendff("u{{{:04x}}}", code_unit);
 								            else
 								                builder.appendff("u{:04x}", code_unit);
-												LibJS: Unescape incorrectly escaped code units in regex patterns

We were translating the pattern [\⪾-\⫀] to [\\u2abe-\\u2ac0], which
is a very different pattern; as a code unit converted to the \uhhh
format has no meaning when escaped, this commit makes us simply skip
escaping it when translating the pattern.

											
										
										
											2023-09-16 16:03:54 +03:30
+								        } else {
-												LibJS: Parse RegExp literals at AST creation time, not execution time

The spec requires that invalid RegExp literals must cause a Syntax Error
before the JavaScript is executed. See:
https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors

This is explicitly tested in the RegExp/property-escapes test262 tests.
For example, see unsupported-property-Line_Break.js:

    $DONOTEVALUATE();
    /\p{Line_Break}/u;

That RegExp literal is invalid because Line_Break is not a supported
Unicode property. $DONOTEVALUATE() just throws an exception when it is
executed. The test expects that this file will fail to be parsed.

Note that RegExp patterns can still be parsed at execution time by way
of "new RegExp(...)".

											
										
										
											2021-07-29 10:34:37 -04:00
+								            builder.append_code_point(code_unit);
-												LibJS: Unescape incorrectly escaped code units in regex patterns

We were translating the pattern [\⪾-\⫀] to [\\u2abe-\\u2ac0], which
is a very different pattern; as a code unit converted to the \uhhh
format has no meaning when escaped, this commit makes us simply skip
escaping it when translating the pattern.

											
										
										
											2023-09-16 16:03:54 +03:30
+								        }
 								        if (code_unit == '\\')
 								            previous_code_unit_was_backslash = !previous_code_unit_was_backslash;
 								        else
 								            previous_code_unit_was_backslash = false;
-												LibJS: Parse RegExp literals at AST creation time, not execution time

The spec requires that invalid RegExp literals must cause a Syntax Error
before the JavaScript is executed. See:
https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors

This is explicitly tested in the RegExp/property-escapes test262 tests.
For example, see unsupported-property-Line_Break.js:

    $DONOTEVALUATE();
    /\p{Line_Break}/u;

That RegExp literal is invalid because Line_Break is not a supported
Unicode property. $DONOTEVALUATE() just throws an exception when it is
executed. The test expects that this file will fail to be parsed.

Note that RegExp patterns can still be parsed at execution time by way
of "new RegExp(...)".

											
										
										
											2021-07-29 10:34:37 -04:00
+								    }
-												LibJS: Use FlyString in PropertyKey instead of DeprecatedFlyString

This required dealing with *substantial* fallout.

											
										
										
											2025-03-18 18:08:02 -05:00
+								    return builder.to_string_without_validation();
-												LibJS: Parse RegExp literals at AST creation time, not execution time

The spec requires that invalid RegExp literals must cause a Syntax Error
before the JavaScript is executed. See:
https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors

This is explicitly tested in the RegExp/property-escapes test262 tests.
For example, see unsupported-property-Line_Break.js:

    $DONOTEVALUATE();
    /\p{Line_Break}/u;

That RegExp literal is invalid because Line_Break is not a supported
Unicode property. $DONOTEVALUATE() just throws an exception when it is
executed. The test expects that this file will fail to be parsed.

Note that RegExp patterns can still be parsed at execution time by way
of "new RegExp(...)".

											
										
										
											2021-07-29 10:34:37 -04:00
+								}
-												LibJS: Update RegExp spec numbers to match re-ordering within the spec

This is an editorial change in the ECMA-262 spec. See:
https://github.com/tc39/ecma262/commit/abee2e6
https://github.com/tc39/ecma262/commit/77256bf

											
										
										
											2023-06-23 10:05:38 -04:00
+								// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
-												LibJS: Port RegExp flags and patterns to UTF-16

											
										
										
											2025-08-06 11:28:18 -04:00
+								ThrowCompletionOr<String> parse_regex_pattern(VM& vm, Utf16View const& pattern, bool unicode, bool unicode_sets)
-												LibJS: Hook up the 'v' (unicodeSets) RegExp flag

											
										
										
											2022-07-16 10:14:03 +04:30
+								{
 								    auto result = parse_regex_pattern(pattern, unicode, unicode_sets);
 								    if (result.is_error())
-												LibJS: Remove GlobalObject from VM::throw_completion()

This is a continuation of the previous five commits.

A first big step into the direction of no longer having to pass a realm
(or currently, a global object) through layers upon layers of AOs!
Unlike the create() APIs we can safely assume that this is only ever
called when a running execution context and therefore current realm
exists. If not, you can always manually allocate the Error and put it in
a Completion :^)

In the spec, throw exceptions implicitly use the current realm's
intrinsics as well: https://tc39.es/ecma262/#sec-throw-an-exception

											
										
										
											2022-08-16 20:33:17 +01:00
+								        return vm.throw_completion<JS::SyntaxError>(result.release_error().error);
-												LibJS: Hook up the 'v' (unicodeSets) RegExp flag

											
										
										
											2022-07-16 10:14:03 +04:30
 								    return result.release_value();
 								}
-												LibGC+Everywhere: Factor out a LibGC from LibJS

Resulting in a massive rename across almost everywhere! Alongside the
namespace change, we now have the following names:

 * JS::NonnullGCPtr -> GC::Ref
 * JS::GCPtr -> GC::Ptr
 * JS::HeapFunction -> GC::Function
 * JS::CellImpl -> GC::Cell
 * JS::Handle -> GC::Root

											
										
										
											2024-11-15 04:01:23 +13:00
+								GC::Ref<RegExpObject> RegExpObject::create(Realm& realm)
-												LibJS: Separate RegExpCreate into RegExpAlloc and RegExpInitialize

RegExp.prototype.compile will require invoking RegExpInitialize on an
already-existing RegExpObject. Break up RegExpCreate into RegExpAlloc
and RegExpInitialize to support this.

											
										
										
											2021-08-20 09:14:27 -04:00
+								{
-												LibJS+LibWeb: Use realm.create<T> instead of heap.allocate<T>

The main motivation behind this is to remove JS specifics of the Realm
from the implementation of the Heap.

As a side effect of this change, this is a bit nicer to read than the
previous approach, and in my opinion, also makes it a little more clear
that this method is specific to a JavaScript Realm.

											
										
										
											2024-11-14 05:50:17 +13:00
+								    return realm.create<RegExpObject>(realm.intrinsics().regexp_prototype());
-												LibJS: Separate RegExpCreate into RegExpAlloc and RegExpInitialize

RegExp.prototype.compile will require invoking RegExpInitialize on an
already-existing RegExpObject. Break up RegExpCreate into RegExpAlloc
and RegExpInitialize to support this.

											
										
										
											2021-08-20 09:14:27 -04:00
+								}
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								GC::Ref<RegExpObject> RegExpObject::create(Realm& realm, Utf16String pattern, Utf16String flags)
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
+								{
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								    return realm.create<RegExpObject>(move(pattern), move(flags), realm.intrinsics().regexp_prototype());
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
+								}
-												LibJS: Separate RegExpCreate into RegExpAlloc and RegExpInitialize

RegExp.prototype.compile will require invoking RegExpInitialize on an
already-existing RegExpObject. Break up RegExpCreate into RegExpAlloc
and RegExpInitialize to support this.

											
										
										
											2021-08-20 09:14:27 -04:00
+								RegExpObject::RegExpObject(Object& prototype)
-												LibJS: Remove Object(Object& prototype) footgun

This constructor was easily confused with a copy constructor, and it was
possible to accidentally copy-construct Objects in at least one way that
we dicovered (via generic ThrowCompletionOr construction).

This patch adds a mandatory ConstructWithPrototypeTag parameter to the
constructor to disambiguate it.

											
										
										
											2022-12-14 12:17:58 +01:00
+								    : Object(ConstructWithPrototypeTag::Tag, prototype)
-												LibJS: Separate RegExpCreate into RegExpAlloc and RegExpInitialize

RegExp.prototype.compile will require invoking RegExpInitialize on an
already-existing RegExpObject. Break up RegExpCreate into RegExpAlloc
and RegExpInitialize to support this.

											
										
										
											2021-08-20 09:14:27 -04:00
+								{
 								}
-												LibJS: Port RegExp flags and patterns to UTF-16

											
										
										
											2025-08-06 11:28:18 -04:00
+								static RegExpObject::Flags to_flag_bits(Utf16View const& flags)
-												LibJS: Store RegExp flags as a bitmask

This avoids having to do O(n) contains() in the various flag accessors.

Yields a ~20% speed-up on the following microbenchmark:

    const re = /foo/dgimsvy;
    for (let i = 0; i < 1_000_000; ++i)
        re.flags;

											
										
										
											2024-10-25 17:29:03 +02:00
+								{
 								    RegExpObject::Flags flag_bits = static_cast<RegExpObject::Flags>(0);
-												LibJS: Port RegExp flags and patterns to UTF-16

											
										
										
											2025-08-06 11:28:18 -04:00
 								    for (size_t i = 0; i < flags.length_in_code_units(); ++i) {
 								        auto ch = flags.code_unit_at(i);
-												LibJS: Store RegExp flags as a bitmask

This avoids having to do O(n) contains() in the various flag accessors.

Yields a ~20% speed-up on the following microbenchmark:

    const re = /foo/dgimsvy;
    for (let i = 0; i < 1_000_000; ++i)
        re.flags;

											
										
										
											2024-10-25 17:29:03 +02:00
+								        switch (ch) {
 								#define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char) \
 								    case #flag_char[0]:                                          \
 								        flag_bits |= RegExpObject::Flags::FlagName;              \
 								        break;
 								            JS_ENUMERATE_REGEXP_FLAGS
 								#undef __JS_ENUMERATE
 								        default:
 								            break;
 								        }
 								    }
 								    return flag_bits;
 								}
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								RegExpObject::RegExpObject(Utf16String pattern, Utf16String flags, Object& prototype)
-												LibJS: Remove Object(Object& prototype) footgun

This constructor was easily confused with a copy constructor, and it was
possible to accidentally copy-construct Objects in at least one way that
we dicovered (via generic ThrowCompletionOr construction).

This patch adds a mandatory ConstructWithPrototypeTag parameter to the
constructor to disambiguate it.

											
										
										
											2022-12-14 12:17:58 +01:00
+								    : Object(ConstructWithPrototypeTag::Tag, prototype)
-												LibJS: Parse RegExp literals at AST creation time, not execution time

The spec requires that invalid RegExp literals must cause a Syntax Error
before the JavaScript is executed. See:
https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors

This is explicitly tested in the RegExp/property-escapes test262 tests.
For example, see unsupported-property-Line_Break.js:

    $DONOTEVALUATE();
    /\p{Line_Break}/u;

That RegExp literal is invalid because Line_Break is not a supported
Unicode property. $DONOTEVALUATE() just throws an exception when it is
executed. The test expects that this file will fail to be parsed.

Note that RegExp patterns can still be parsed at execution time by way
of "new RegExp(...)".

											
										
										
											2021-07-29 10:34:37 -04:00
+								    , m_pattern(move(pattern))
-												LibJS: Implement RegExpCreate/RegExpInitialize closer to the spec

RegExpInitialize specifies how the pattern string should be created
before passing it to [[RegExpMatcher]]. Rather than passing it as-is,
the string should be converted to code points and back to a "List" (if
the Unicode flag is present), or as a "List" of UTF-16 code units.
Further. the spec requires that we keep both the original pattern string
and this parsed string in the RegExp object.

The caveat is that the LibRegex parser further requires any multi-byte
code units to be escaped (as "\unnnn"). Otherwise, the code unit is
recognized as individual UTF-8 bytes.

											
										
										
											2021-07-22 08:04:31 -04:00
+								    , m_flags(move(flags))
-												LibJS: Store RegExp flags as a bitmask

This avoids having to do O(n) contains() in the various flag accessors.

Yields a ~20% speed-up on the following microbenchmark:

    const re = /foo/dgimsvy;
    for (let i = 0; i < 1_000_000; ++i)
        re.flags;

											
										
										
											2024-10-25 17:29:03 +02:00
+								    , m_flag_bits(to_flag_bits(m_flags))
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
+								{
-												LibJS: Hook up Regex<ECMA262> to RegExpObject and implement `test()'

This makes RegExpObject compile and store a Regex<ECMA262>, adds
all flag-related properties, and implements `RegExpPrototype.test()`
(complete with 'lastIndex' support) :^)
It should be noted that this only implements `test()' using the builtin
`exec()'.

											
										
										
											2020-11-19 01:50:00 +03:30
+								}
-												LibJS: Make Cell::initialize() return void

Stop worrying about tiny OOMs.

Work towards #20405

											
										
										
											2023-08-07 08:41:28 +02:00
+								void RegExpObject::initialize(Realm& realm)
-												LibJS: Hook up Regex<ECMA262> to RegExpObject and implement `test()'

This makes RegExpObject compile and store a Regex<ECMA262>, adds
all flag-related properties, and implements `RegExpPrototype.test()`
(complete with 'lastIndex' support) :^)
It should be noted that this only implements `test()' using the builtin
`exec()'.

											
										
										
											2020-11-19 01:50:00 +03:30
+								{
-												LibJS: Stop using a native property for RegExp's lastIndex property

This is not a functional change, the exposed (incorrect) behaviour is
the same as it was before, this simply removes the last user of
NativeProperties, allowing us to remove them completely from LibJS.

											
										
										
											2021-07-07 19:15:52 +03:00
+								    auto& vm = this->vm();
-												LibJS: Make Cell::initialize() return void

Stop worrying about tiny OOMs.

Work towards #20405

											
										
										
											2023-08-07 08:41:28 +02:00
+								    Base::initialize(realm);
-												LibJS: Implement RegExp legacy static properties

RegExp legacy static properties Spec url is https://github.com/tc39/proposal-regexp-legacy-features

											
										
										
											2022-10-17 08:59:27 +08:00
-												LibJS: Parse RegExp literals at AST creation time, not execution time

The spec requires that invalid RegExp literals must cause a Syntax Error
before the JavaScript is executed. See:
https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors

This is explicitly tested in the RegExp/property-escapes test262 tests.
For example, see unsupported-property-Line_Break.js:

    $DONOTEVALUATE();
    /\p{Line_Break}/u;

That RegExp literal is invalid because Line_Break is not a supported
Unicode property. $DONOTEVALUATE() just throws an exception when it is
executed. The test expects that this file will fail to be parsed.

Note that RegExp patterns can still be parsed at execution time by way
of "new RegExp(...)".

											
										
										
											2021-07-29 10:34:37 -04:00
+								    define_direct_property(vm.names.lastIndex, Value(0), Attribute::Writable);
-												LibJS: Hook up Regex<ECMA262> to RegExpObject and implement `test()'

This makes RegExpObject compile and store a Regex<ECMA262>, adds
all flag-related properties, and implements `RegExpPrototype.test()`
(complete with 'lastIndex' support) :^)
It should be noted that this only implements `test()' using the builtin
`exec()'.

											
										
										
											2020-11-19 01:50:00 +03:30
+								}
-												LibJS: Update RegExp spec numbers to match re-ordering within the spec

This is an editorial change in the ECMA-262 spec. See:
https://github.com/tc39/ecma262/commit/abee2e6
https://github.com/tc39/ecma262/commit/77256bf

											
										
										
											2023-06-23 10:05:38 -04:00
+								// 22.2.3.3 RegExpInitialize ( obj, pattern, flags ), https://tc39.es/ecma262/#sec-regexpinitialize
-												LibGC+Everywhere: Factor out a LibGC from LibJS

Resulting in a massive rename across almost everywhere! Alongside the
namespace change, we now have the following names:

 * JS::NonnullGCPtr -> GC::Ref
 * JS::GCPtr -> GC::Ptr
 * JS::HeapFunction -> GC::Function
 * JS::CellImpl -> GC::Cell
 * JS::Handle -> GC::Root

											
										
										
											2024-11-15 04:01:23 +13:00
+								ThrowCompletionOr<GC::Ref<RegExpObject>> RegExpObject::regexp_initialize(VM& vm, Value pattern_value, Value flags_value)
-												LibJS: Implement (mostly) String.prototype.match

JavaScript has a couple of different ways to run a regular expression
on a string. This adds support for one more. :^)

											
										
										
											2021-03-14 11:03:11 +01:00
+								{
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								    // Invalidate the cached compiled regex since the pattern/flags may change.
 								    m_cached_regex = nullptr;
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
+								    // 1. If pattern is undefined, let P be the empty String.
 								    // 2. Else, let P be ? ToString(pattern).
 								    auto pattern = pattern_value.is_undefined()
-												LibJS: Port RegExp flags and patterns to UTF-16

											
										
										
											2025-08-06 11:28:18 -04:00
+								        ? Utf16String {}
 								        : TRY(pattern_value.to_utf16_string(vm));
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
 								    // 3. If flags is undefined, let F be the empty String.
 								    // 4. Else, let F be ? ToString(flags).
 								    auto flags = flags_value.is_undefined()
-												LibJS: Port RegExp flags and patterns to UTF-16

											
										
										
											2025-08-06 11:28:18 -04:00
+								        ? Utf16String {}
 								        : TRY(flags_value.to_utf16_string(vm));
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
-												LibJS: Update spec numbers/steps for RegExp unicode sets flag proposal

This proposal has been merged into the main ECMA-262 spec. See:
https://github.com/tc39/ecma262/commit/26b2369

											
										
										
											2023-06-23 10:39:08 -04:00
+								    // 5. If F contains any code unit other than "d", "g", "i", "m", "s", "u", "v", or "y", or if F contains any code unit more than once, throw a SyntaxError exception.
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
+								    // 6. If F contains "i", let i be true; else let i be false.
 								    // 7. If F contains "m", let m be true; else let m be false.
 								    // 8. If F contains "s", let s be true; else let s be false.
 								    // 9. If F contains "u", let u be true; else let u be false.
 								    // 10. If F contains "v", let v be true; else let v be false.
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								    auto validated_flags_or_error = validate_flags(flags);
 								    if (validated_flags_or_error.is_error())
 								        return vm.throw_completion<SyntaxError>(validated_flags_or_error.release_error());
 								    auto flag_bits = validated_flags_or_error.release_value();
 								    bool unicode = has_flag(flag_bits, Flags::Unicode);
 								    bool unicode_sets = has_flag(flag_bits, Flags::UnicodeSets);
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
-												LibJS: Use FlyString in PropertyKey instead of DeprecatedFlyString

This required dealing with *substantial* fallout.

											
										
										
											2025-03-18 18:08:02 -05:00
+								    auto parsed_pattern = String {};
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
 								    // Convert UTF-16 pattern to UTF-8 (with escape normalization for non-ASCII).
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
+								    if (!pattern.is_empty()) {
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								        auto result = parse_regex_pattern(pattern, unicode, unicode_sets);
 								        if (result.is_error())
 								            return vm.throw_completion<SyntaxError>(ErrorType::RegExpCompileError, result.release_error().error);
 								        parsed_pattern = result.release_value();
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
+								    }
-												LibJS: Parse RegExp literals at AST creation time, not execution time

The spec requires that invalid RegExp literals must cause a Syntax Error
before the JavaScript is executed. See:
https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors

This is explicitly tested in the RegExp/property-escapes test262 tests.
For example, see unsupported-property-Line_Break.js:

    $DONOTEVALUATE();
    /\p{Line_Break}/u;

That RegExp literal is invalid because Line_Break is not a supported
Unicode property. $DONOTEVALUATE() just throws an exception when it is
executed. The test expects that this file will fail to be parsed.

Note that RegExp patterns can still be parsed at execution time by way
of "new RegExp(...)".

											
										
										
											2021-07-29 10:34:37 -04:00
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								    // 11. If u is true and v is true, throw a SyntaxError exception.
 								    // NB: Already handled by validate_flags above.
 								    // Validate by trial-compiling the pattern.
 								    regex::ECMAScriptCompileFlags compile_flags {};
 								    compile_flags.global = has_flag(flag_bits, Flags::Global);
 								    compile_flags.ignore_case = has_flag(flag_bits, Flags::IgnoreCase);
 								    compile_flags.multiline = has_flag(flag_bits, Flags::Multiline);
 								    compile_flags.dot_all = has_flag(flag_bits, Flags::DotAll);
 								    compile_flags.unicode = unicode;
 								    compile_flags.unicode_sets = unicode_sets;
 								    compile_flags.sticky = has_flag(flag_bits, Flags::Sticky);
-												LibJS: Implement RegExpCreate/RegExpInitialize closer to the spec

RegExpInitialize specifies how the pattern string should be created
before passing it to [[RegExpMatcher]]. Rather than passing it as-is,
the string should be converted to code points and back to a "List" (if
the Unicode flag is present), or as a "List" of UTF-16 code units.
Further. the spec requires that we keep both the original pattern string
and this parsed string in the RegExp object.

The caveat is that the LibRegex parser further requires any multi-byte
code units to be escaped (as "\unnnn"). Otherwise, the code unit is
recognized as individual UTF-8 bytes.

											
										
										
											2021-07-22 08:04:31 -04:00
-												LibJS+LibRegex: Switch RegExp over to the Rust engine

Switch LibJS `RegExp` over to the Rust-backed `ECMAScriptRegex` APIs.

Route `new RegExp()`, regex literals, and the RegExp builtins through
the new compile and exec APIs, and stop re-validating patterns with the
deleted C++ parser on the way in. Preserve the observable error
behavior by carrying structured compile errors and backtracking-limit
failures across the FFI boundary. Cache compiled regex state and named
capture metadata on `RegExpObject` in the new representation.

Use the new API surface to simplify and speed up the builtin paths too:
share `exec_internal`, cache compiled regex pointers, keep the legacy
RegExp statics lazy, run global replace through batch `find_all`, and
optimize replace, test, split, and String helper paths. Add regression
tests for those JavaScript-visible paths.

											
										
										
											2026-03-25 10:52:40 +01:00
+								    auto compiled = regex::ECMAScriptRegex::compile(parsed_pattern.bytes_as_string_view(), compile_flags);
 								    if (compiled.is_error())
 								        return vm.throw_completion<SyntaxError>(ErrorType::RegExpCompileError, compiled.release_error());
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
 								    // 16. Set obj.[[OriginalSource]] to P.
 								    m_pattern = move(pattern);
 								    // 17. Set obj.[[OriginalFlags]] to F.
-												LibJS: Store RegExp flags as a bitmask

This avoids having to do O(n) contains() in the various flag accessors.

Yields a ~20% speed-up on the following microbenchmark:

    const re = /foo/dgimsvy;
    for (let i = 0; i < 1_000_000; ++i)
        re.flags;

											
										
										
											2024-10-25 17:29:03 +02:00
+								    m_flag_bits = to_flag_bits(flags);
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
+								    m_flags = move(flags);
 								    // 18. Let capturingGroupsCount be CountLeftCapturingParensWithin(parseResult).
 								    // 19. Let rer be the RegExp Record { [[IgnoreCase]]: i, [[Multiline]]: m, [[DotAll]]: s, [[Unicode]]: u, [[CapturingGroupsCount]]: capturingGroupsCount }.
 								    // 20. Set obj.[[RegExpRecord]] to rer.
 								    // 21. Set obj.[[RegExpMatcher]] to CompilePattern of parseResult with argument rer.
-												LibJS: Separate RegExpCreate into RegExpAlloc and RegExpInitialize

RegExp.prototype.compile will require invoking RegExpInitialize on an
already-existing RegExpObject. Break up RegExpCreate into RegExpAlloc
and RegExpInitialize to support this.

											
										
										
											2021-08-20 09:14:27 -04:00
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
+								    // 22. Perform ? Set(obj, "lastIndex", +0𝔽, true).
-												LibJS: Convert the RegExpInitialize AO to ThrowCompletionOr

											
										
										
											2021-10-23 03:49:29 +03:00
+								    TRY(set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes));
-												LibJS: Separate RegExpCreate into RegExpAlloc and RegExpInitialize

RegExp.prototype.compile will require invoking RegExpInitialize on an
already-existing RegExpObject. Break up RegExpCreate into RegExpAlloc
and RegExpInitialize to support this.

											
										
										
											2021-08-20 09:14:27 -04:00
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
+								    // 23. Return obj.
-												LibGC+Everywhere: Factor out a LibGC from LibJS

Resulting in a massive rename across almost everywhere! Alongside the
namespace change, we now have the following names:

 * JS::NonnullGCPtr -> GC::Ref
 * JS::GCPtr -> GC::Ptr
 * JS::HeapFunction -> GC::Function
 * JS::CellImpl -> GC::Cell
 * JS::Handle -> GC::Root

											
										
										
											2024-11-15 04:01:23 +13:00
+								    return GC::Ref { *this };
-												LibJS: Separate RegExpCreate into RegExpAlloc and RegExpInitialize

RegExp.prototype.compile will require invoking RegExpInitialize on an
already-existing RegExpObject. Break up RegExpCreate into RegExpAlloc
and RegExpInitialize to support this.

											
										
										
											2021-08-20 09:14:27 -04:00
+								}
-												LibJS: Update RegExp spec numbers to match re-ordering within the spec

This is an editorial change in the ECMA-262 spec. See:
https://github.com/tc39/ecma262/commit/abee2e6
https://github.com/tc39/ecma262/commit/77256bf

											
										
										
											2023-06-23 10:05:38 -04:00
+								// 22.2.6.13.1 EscapeRegExpPattern ( P, F ), https://tc39.es/ecma262/#sec-escaperegexppattern
-												LibJS: Use FlyString in PropertyKey instead of DeprecatedFlyString

This required dealing with *substantial* fallout.

											
										
										
											2025-03-18 18:08:02 -05:00
+								String RegExpObject::escape_regexp_pattern() const
-												LibJS: Make escape_regexp_pattern() a RegExpObject member function

Similarly to regexp_initialize() this can be a member function instead
of taking a RegExpObject argument.
Having it available outside RegExpPrototype is also useful for other
things that need RegExp.prototype.source behavior - e.g. the REPL for
pretty-printing.

											
										
										
											2021-10-05 18:33:28 +01:00
+								{
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
+								    // 1. Let S be a String in the form of a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") equivalent
 								    //    to P interpreted as UTF-16 encoded Unicode code points (6.1.4), in which certain code points are escaped as
 								    //    described below. S may or may not be identical to P; however, the Abstract Closure that would result from
 								    //    evaluating S as a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") must behave identically to
 								    //    the Abstract Closure given by the constructed object's [[RegExpMatcher]] internal slot. Multiple calls to
 								    //    this abstract operation using the same values for P and F must produce identical results.
 								    // 2. The code points / or any LineTerminator occurring in the pattern shall be escaped in S as necessary to ensure
 								    //    that the string-concatenation of "/", S, "/", and F can be parsed (in an appropriate lexical context) as a
 								    //    RegularExpressionLiteral that behaves identically to the constructed regular expression. For example, if P is
 								    //    "/", then S could be "\/" or "\u002F", among other possibilities, but not "/", because /// followed by F
 								    //    would be parsed as a SingleLineComment rather than a RegularExpressionLiteral. If P is the empty String, this
 								    //    specification can be met by letting S be "(?:)".
 								    // 3. Return S.
-												LibJS: Make escape_regexp_pattern() a RegExpObject member function

Similarly to regexp_initialize() this can be a member function instead
of taking a RegExpObject argument.
Having it available outside RegExpPrototype is also useful for other
things that need RegExp.prototype.source behavior - e.g. the REPL for
pretty-printing.

											
										
										
											2021-10-05 18:33:28 +01:00
+								    if (m_pattern.is_empty())
-												LibJS: Use FlyString in PropertyKey instead of DeprecatedFlyString

This required dealing with *substantial* fallout.

											
										
										
											2025-03-18 18:08:02 -05:00
+								        return "(?:)"_string;
-												LibJS: Manually loop over escaped regex pattern instead of ::replace()

This makes it ever-so-slightly faster, but more importantly, it fixes
the bug where a `/\//` regex's `source` property would return `\\/`
("\\\\/") instead of `\/` due to the existing '/' -> '\/' replace()
call.

											
										
										
											2023-02-15 17:55:13 +03:30
-												LibJS: Hook up the 'v' (unicodeSets) RegExp flag

											
										
										
											2022-07-16 10:14:03 +04:30
+								    // FIXME: Check the 'u' and 'v' flags and escape accordingly
-												LibJS: Manually loop over escaped regex pattern instead of ::replace()

This makes it ever-so-slightly faster, but more importantly, it fixes
the bug where a `/\//` regex's `source` property would return `\\/`
("\\\\/") instead of `\/` due to the existing '/' -> '\/' replace()
call.

											
										
										
											2023-02-15 17:55:13 +03:30
+								    StringBuilder builder;
 								    auto escaped = false;
-												LibJS: Escape line terminators in regex source

											
										
										
											2025-10-17 14:57:07 +02:00
+								    auto in_character_class = false;
-												LibJS: Port RegExp flags and patterns to UTF-16

											
										
										
											2025-08-06 11:28:18 -04:00
 								    for (auto code_point : m_pattern) {
-												LibJS: Manually loop over escaped regex pattern instead of ::replace()

This makes it ever-so-slightly faster, but more importantly, it fixes
the bug where a `/\//` regex's `source` property would return `\\/`
("\\\\/") instead of `\/` due to the existing '/' -> '\/' replace()
call.

											
										
										
											2023-02-15 17:55:13 +03:30
+								        if (escaped) {
 								            escaped = false;
 								            builder.append_code_point('\\');
-												LibJS: Escape line terminators in regex source

											
										
										
											2025-10-17 14:57:07 +02:00
 								            switch (code_point) {
 								            case '\n':
 								                builder.append_code_point('n');
 								                break;
 								            case '\r':
 								                builder.append_code_point('r');
 								                break;
 								            case LINE_SEPARATOR:
 								                builder.append("u2028"sv);
 								                break;
 								            case PARAGRAPH_SEPARATOR:
 								                builder.append("u2029"sv);
 								                break;
 								            default:
 								                builder.append_code_point(code_point);
 								                break;
 								            }
-												LibJS: Manually loop over escaped regex pattern instead of ::replace()

This makes it ever-so-slightly faster, but more importantly, it fixes
the bug where a `/\//` regex's `source` property would return `\\/`
("\\\\/") instead of `\/` due to the existing '/' -> '\/' replace()
call.

											
										
										
											2023-02-15 17:55:13 +03:30
+								            continue;
 								        }
 								        if (code_point == '\\') {
 								            escaped = true;
 								            continue;
 								        }
-												LibJS: Escape line terminators in regex source

											
										
										
											2025-10-17 14:57:07 +02:00
+								        if (code_point == '[') {
 								            in_character_class = true;
 								        } else if (code_point == ']') {
 								            in_character_class = false;
 								        }
-												LibJS: Actually escape \n|\r|LS|PS when escaping RegExp.source

We were previously encoding them as `\<literal newline>`, which is just
all sorts of wrong :P

											
										
										
											2023-02-17 01:13:33 +03:30
+								        switch (code_point) {
 								        case '/':
-												LibJS: Escape line terminators in regex source

											
										
										
											2025-10-17 14:57:07 +02:00
+								            if (in_character_class)
 								                builder.append_code_point('/');
 								            else
 								                builder.append("\\/"sv);
-												LibJS: Actually escape \n|\r|LS|PS when escaping RegExp.source

We were previously encoding them as `\<literal newline>`, which is just
all sorts of wrong :P

											
										
										
											2023-02-17 01:13:33 +03:30
+								            break;
 								        case '\n':
 								            builder.append("\\n"sv);
 								            break;
 								        case '\r':
 								            builder.append("\\r"sv);
 								            break;
 								        case LINE_SEPARATOR:
 								            builder.append("\\u2028"sv);
 								            break;
 								        case PARAGRAPH_SEPARATOR:
 								            builder.append("\\u2029"sv);
 								            break;
 								        default:
 								            builder.append_code_point(code_point);
 								            break;
-												LibJS: Manually loop over escaped regex pattern instead of ::replace()

This makes it ever-so-slightly faster, but more importantly, it fixes
the bug where a `/\//` regex's `source` property would return `\\/`
("\\\\/") instead of `\/` due to the existing '/' -> '\/' replace()
call.

											
										
										
											2023-02-15 17:55:13 +03:30
+								        }
 								    }
-												LibJS: Use FlyString in PropertyKey instead of DeprecatedFlyString

This required dealing with *substantial* fallout.

											
										
										
											2025-03-18 18:08:02 -05:00
+								    return builder.to_string_without_validation();
-												LibJS: Make escape_regexp_pattern() a RegExpObject member function

Similarly to regexp_initialize() this can be a member function instead
of taking a RegExpObject argument.
Having it available outside RegExpPrototype is also useful for other
things that need RegExp.prototype.source behavior - e.g. the REPL for
pretty-printing.

											
										
										
											2021-10-05 18:33:28 +01:00
+								}
-												LibWeb: Add missing visit_edges implementation to RegExpObject

											
										
										
											2024-04-05 20:41:25 +03:00
+								void RegExpObject::visit_edges(JS::Cell::Visitor& visitor)
 								{
 								    Base::visit_edges(visitor);
 								    visitor.visit(m_realm);
 								}
-												LibJS: Update RegExp spec numbers to match re-ordering within the spec

This is an editorial change in the ECMA-262 spec. See:
https://github.com/tc39/ecma262/commit/abee2e6
https://github.com/tc39/ecma262/commit/77256bf

											
										
										
											2023-06-23 10:05:38 -04:00
+								// 22.2.3.1 RegExpCreate ( P, F ), https://tc39.es/ecma262/#sec-regexpcreate
-												LibGC+Everywhere: Factor out a LibGC from LibJS

Resulting in a massive rename across almost everywhere! Alongside the
namespace change, we now have the following names:

 * JS::NonnullGCPtr -> GC::Ref
 * JS::GCPtr -> GC::Ptr
 * JS::HeapFunction -> GC::Function
 * JS::CellImpl -> GC::Cell
 * JS::Handle -> GC::Root

											
										
										
											2024-11-15 04:01:23 +13:00
+								ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_create(VM& vm, Value pattern, Value flags)
-												LibJS: Separate RegExpCreate into RegExpAlloc and RegExpInitialize

RegExp.prototype.compile will require invoking RegExpInitialize on an
already-existing RegExpObject. Break up RegExpCreate into RegExpAlloc
and RegExpInitialize to support this.

											
										
										
											2021-08-20 09:14:27 -04:00
+								{
-												LibJS: Replace GlobalObject with VM in RegExp AOs [Part 9/19]

											
										
										
											2022-08-21 16:14:51 +01:00
+								    auto& realm = *vm.current_realm();
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
 								    // 1. Let obj be ! RegExpAlloc(%RegExp%).
-												LibJS: Make intrinsics getters return NonnullGCPtr

Some of these are allocated upon initialization of the intrinsics, and
some lazily, but in neither case the getters actually return a nullptr.

This saves us a whole bunch of pointer dereferences (as NonnullGCPtr has
an `operator T&()`), and also has the interesting side effect of forcing
us to explicitly use the FunctionObject& overload of call(), as passing
a NonnullGCPtr is ambigous - it could implicitly be turned into a Value
_or_ a FunctionObject& (so we have to dereference manually).

											
										
										
											2023-04-13 00:47:15 +02:00
+								    auto regexp_object = MUST(regexp_alloc(vm, realm.intrinsics().regexp_constructor()));
-												LibJS: Add spec comments to RegExpObject

Also fix the evaluation order of ToString for pattern and flags while
we're here, and improve some of the variable names.

											
										
										
											2022-10-16 15:17:01 +02:00
 								    // 2. Return ? RegExpInitialize(obj, P, F).
-												LibJS: Replace GlobalObject with VM in RegExp AOs [Part 9/19]

											
										
										
											2022-08-21 16:14:51 +01:00
+								    return TRY(regexp_object->regexp_initialize(vm, pattern, flags));
-												LibJS: Implement (mostly) String.prototype.match

JavaScript has a couple of different ways to run a regular expression
on a string. This adds support for one more. :^)

											
										
										
											2021-03-14 11:03:11 +01:00
+								}
-												LibJS: Implement the RegExpAlloc AO

											
										
										
											2022-10-16 14:57:29 +02:00
+								// 22.2.3.2 RegExpAlloc ( newTarget ), https://tc39.es/ecma262/#sec-regexpalloc
-												LibJS: Implement RegExp legacy static properties

RegExp legacy static properties Spec url is https://github.com/tc39/proposal-regexp-legacy-features

											
										
										
											2022-10-17 08:59:27 +08:00
+								// 22.2.3.2 RegExpAlloc ( newTarget ), https://github.com/tc39/proposal-regexp-legacy-features#regexpalloc--newtarget-
-												LibGC+Everywhere: Factor out a LibGC from LibJS

Resulting in a massive rename across almost everywhere! Alongside the
namespace change, we now have the following names:

 * JS::NonnullGCPtr -> GC::Ref
 * JS::GCPtr -> GC::Ptr
 * JS::HeapFunction -> GC::Function
 * JS::CellImpl -> GC::Cell
 * JS::Handle -> GC::Root

											
										
										
											2024-11-15 04:01:23 +13:00
+								ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_alloc(VM& vm, FunctionObject& new_target)
-												LibJS: Implement the RegExpAlloc AO

											
										
										
											2022-10-16 14:57:29 +02:00
+								{
 								    // 1. Let obj be ? OrdinaryCreateFromConstructor(newTarget, "%RegExp.prototype%", « [[OriginalSource]], [[OriginalFlags]], [[RegExpRecord]], [[RegExpMatcher]] »).
-												LibJS: Convert ordinary_create_from_constructor() to NonnullGCPtr

											
										
										
											2022-12-14 18:34:32 +00:00
+								    auto regexp_object = TRY(ordinary_create_from_constructor<RegExpObject>(vm, new_target, &Intrinsics::regexp_prototype));
-												LibJS: Implement the RegExpAlloc AO

											
										
										
											2022-10-16 14:57:29 +02:00
-												LibJS: Implement RegExp legacy static properties

RegExp legacy static properties Spec url is https://github.com/tc39/proposal-regexp-legacy-features

											
										
										
											2022-10-17 08:59:27 +08:00
+								    // 2. Let thisRealm be the current Realm Record.
 								    auto& this_realm = *vm.current_realm();
 								    // 3. Set the value of obj’s [[Realm]] internal slot to thisRealm.
 								    regexp_object->set_realm(this_realm);
 								    // 4. If SameValue(newTarget, thisRealm.[[Intrinsics]].[[%RegExp%]]) is true, then
-												LibJS: Make intrinsics getters return NonnullGCPtr

Some of these are allocated upon initialization of the intrinsics, and
some lazily, but in neither case the getters actually return a nullptr.

This saves us a whole bunch of pointer dereferences (as NonnullGCPtr has
an `operator T&()`), and also has the interesting side effect of forcing
us to explicitly use the FunctionObject& overload of call(), as passing
a NonnullGCPtr is ambigous - it could implicitly be turned into a Value
_or_ a FunctionObject& (so we have to dereference manually).

											
										
										
											2023-04-13 00:47:15 +02:00
+								    if (same_value(&new_target, this_realm.intrinsics().regexp_constructor())) {
-												LibJS: Implement RegExp legacy static properties

RegExp legacy static properties Spec url is https://github.com/tc39/proposal-regexp-legacy-features

											
										
										
											2022-10-17 08:59:27 +08:00
+								        // i. Set the value of obj’s [[LegacyFeaturesEnabled]] internal slot to true.
 								        regexp_object->set_legacy_features_enabled(true);
 								    }
 								    // 5. Else,
 								    else {
 								        // i. Set the value of obj’s [[LegacyFeaturesEnabled]] internal slot to false.
 								        regexp_object->set_legacy_features_enabled(false);
 								    }
 								    // 6. Perform ! DefinePropertyOrThrow(obj, "lastIndex", PropertyDescriptor { [[Writable]]: true, [[Enumerable]]: false, [[Configurable]]: false }).
-												LibJS: Make `internal_define_own_property()` save added property offset

...in `PropertyDescriptor`. This is required for the upcoming change
that needs to know offset of newly added properties to set up inline
caching.

											
										
										
											2025-09-15 16:43:27 +02:00
+								    PropertyDescriptor descriptor { .writable = true, .enumerable = false, .configurable = false };
 								    MUST(regexp_object->define_property_or_throw(vm.names.lastIndex, descriptor));
-												LibJS: Implement the RegExpAlloc AO

											
										
										
											2022-10-16 14:57:29 +02:00
-												LibJS: Implement RegExp legacy static properties

RegExp legacy static properties Spec url is https://github.com/tc39/proposal-regexp-legacy-features

											
										
										
											2022-10-17 08:59:27 +08:00
+								    // 7. Return obj.
-												LibJS: Convert ordinary_create_from_constructor() to NonnullGCPtr

											
										
										
											2022-12-14 18:34:32 +00:00
+								    return regexp_object;
-												LibJS: Implement the RegExpAlloc AO

											
										
										
											2022-10-16 14:57:29 +02:00
+								}
-												LibJS: Lex and parse regex literals, add RegExp objects

This adds regex parsing/lexing, as well as a relatively empty
RegExpObject. The purpose of this patch is to allow the engine to not
get hung up on parsing regexes. This will aid in finding new syntax
errors (say, from google or twitter) without having to replace all of
their regexes first!

											
										
										
											2020-06-03 16:05:49 -07:00
+								}