ladybird/Libraries/LibRegex/ECMAScriptRegex.cpp

/*
 * Copyright (c) 2026-present, the Ladybird developers.
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/OwnPtr.h>
#include <LibRegex/ECMAScriptRegex.h>
#include <LibRegex/RustRegex.h>

namespace regex {

// Private state held behind ECMAScriptRegex's m_impl pointer.
struct ECMAScriptRegex::Impl {
    // Compiled backend regex; the matching and query methods in this file forward to it.
    CompiledRustRegex rust_regex;
    // Facade-owned copy of the backend's named capture groups (name + index), filled in by
    // compile() and returned by named_groups().
    Vector<ECMAScriptNamedCaptureGroup> named_groups;
};

// Compiles `utf8_pattern` with the Rust backend and wraps the result in an ECMAScriptRegex.
//
// Each field of `flags` is copied unchanged onto the backend's flag struct. If the backend
// reports an error, that error is propagated to the caller as-is. On success, the backend's
// named capture groups are copied into a Vector owned by the returned object.
ErrorOr<ECMAScriptRegex, String> ECMAScriptRegex::compile(StringView utf8_pattern, ECMAScriptCompileFlags flags)
{
    RustRegexFlags backend_flags {};
    backend_flags.global = flags.global;
    backend_flags.ignore_case = flags.ignore_case;
    backend_flags.multiline = flags.multiline;
    backend_flags.dot_all = flags.dot_all;
    backend_flags.unicode = flags.unicode;
    backend_flags.unicode_sets = flags.unicode_sets;
    backend_flags.sticky = flags.sticky;
    backend_flags.has_indices = flags.has_indices;

    // TRY returns the backend's error from this function, or yields the compiled regex.
    auto backend = TRY(CompiledRustRegex::compile(utf8_pattern, backend_flags));

    auto const& backend_groups = backend.named_groups();
    Vector<ECMAScriptNamedCaptureGroup> groups;
    groups.ensure_capacity(backend_groups.size());
    // Capacity for every group was reserved above, so the unchecked append stays in bounds.
    for (auto const& group : backend_groups)
        groups.unchecked_append({ .name = group.name, .index = group.index });

    return ECMAScriptRegex(adopt_own(*new Impl {
        .rust_regex = move(backend),
        .named_groups = move(groups),
    }));
}

// The destructor and move operations are defaulted in this translation unit, where Impl is
// fully defined.
// NOTE(review): presumably the header only forward-declares Impl, so OwnPtr<Impl> can only be
// destroyed/moved where Impl is complete - confirm against ECMAScriptRegex.h.
ECMAScriptRegex::~ECMAScriptRegex() = default;

ECMAScriptRegex::ECMAScriptRegex(ECMAScriptRegex&& other) = default;
ECMAScriptRegex& ECMAScriptRegex::operator=(ECMAScriptRegex&& other) = default;

ECMAScriptRegex::ECMAScriptRegex(OwnPtr<Impl> impl)
    : m_impl(move(impl))
{
}

// Runs the backend's exec_internal over `input` starting at `start_pos` and translates its
// status code into a MatchResult: 1 becomes Match, -1 becomes LimitExceeded, and any other
// value becomes NoMatch.
MatchResult ECMAScriptRegex::exec(Utf16View input, size_t start_pos) const
{
    switch (m_impl->rust_regex.exec_internal(input, start_pos)) {
    case 1:
        return MatchResult::Match;
    case -1:
        return MatchResult::LimitExceeded;
    default:
        return MatchResult::NoMatch;
    }
}

// Forwards to the backend's capture_slot and returns its int result unchanged.
// NOTE(review): presumably reads a slot filled by the most recent exec() - confirm against
// RustRegex.h.
int ECMAScriptRegex::capture_slot(unsigned int slot) const
{
    auto& backend = m_impl->rust_regex;
    return backend.capture_slot(slot);
}

// Runs the backend's test over `input` starting at `start_pos` and translates its status
// code into a MatchResult: 1 becomes Match, -1 becomes LimitExceeded, and any other value
// becomes NoMatch.
MatchResult ECMAScriptRegex::test(Utf16View input, size_t start_pos) const
{
    switch (m_impl->rust_regex.test(input, start_pos)) {
    case 1:
        return MatchResult::Match;
    case -1:
        return MatchResult::LimitExceeded;
    default:
        return MatchResult::NoMatch;
    }
}

// Returns the capture count reported by the backend regex.
unsigned int ECMAScriptRegex::capture_count() const
{
    auto& backend = m_impl->rust_regex;
    return backend.capture_count();
}

// Returns the total group count reported by the backend regex.
unsigned int ECMAScriptRegex::total_groups() const
{
    auto& backend = m_impl->rust_regex;
    return backend.total_groups();
}

// Reports whether the backend classifies the compiled pattern as a single non-BMP literal.
bool ECMAScriptRegex::is_single_non_bmp_literal() const
{
    auto& backend = m_impl->rust_regex;
    return backend.is_single_non_bmp_literal();
}

// Returns a reference to the named capture group list stored in Impl (copied from the
// backend by compile()).
Vector<ECMAScriptNamedCaptureGroup> const& ECMAScriptRegex::named_groups() const
{
    auto const& groups = m_impl->named_groups;
    return groups;
}

// Forwards to the backend's find_all for `input` starting at `start_pos` and returns its int
// result unchanged.
// NOTE(review): presumably the number of matches found, retrievable through
// find_all_match() - confirm against RustRegex.h.
int ECMAScriptRegex::find_all(Utf16View input, size_t start_pos) const
{
    auto& backend = m_impl->rust_regex;
    return backend.find_all(input, start_pos);
}

// Fetches the backend's find_all_match entry for `index` and repackages its start/end values
// as the facade's MatchPair.
ECMAScriptRegex::MatchPair ECMAScriptRegex::find_all_match(int index) const
{
    auto backend_match = m_impl->rust_regex.find_all_match(index);
    return MatchPair { backend_match.start, backend_match.end };
}

}
LibRegex: Add ECMAScriptRegex and migrate callers Add `ECMAScriptRegex`, LibRegex's C++ facade for ECMAScript regexes. The facade owns compilation, execution, captures, named groups, and error translation for the Rust backend, which lets callers stop depending on the legacy parser and matcher types directly. Use it in the remaining non-LibJS callers: URLPattern, HTML input pattern handling, and the places in LibHTTP that only needed token validation. Where a full regex engine was unnecessary, replace those call sites with direct character checks. Also update focused LibURL, LibHTTP, and WPT coverage for the migrated callers and corrected surrogate handling. 2026-03-25 10:52:20 +01:00			`/*`
			`* Copyright (c) 2026-present, the Ladybird developers.`
			`*`
			`* SPDX-License-Identifier: BSD-2-Clause`
			`*/`

			`#include <AK/OwnPtr.h>`
			`#include <LibRegex/ECMAScriptRegex.h>`
			`#include <LibRegex/RustRegex.h>`

			`namespace regex {`

			`struct ECMAScriptRegex::Impl {`
			`CompiledRustRegex rust_regex;`
			`Vector<ECMAScriptNamedCaptureGroup> named_groups;`
			`};`

			`ErrorOr<ECMAScriptRegex, String> ECMAScriptRegex::compile(StringView utf8_pattern, ECMAScriptCompileFlags flags)`
			`{`
			`RustRegexFlags rust_flags {};`
			`rust_flags.global = flags.global;`
			`rust_flags.ignore_case = flags.ignore_case;`
			`rust_flags.multiline = flags.multiline;`
			`rust_flags.dot_all = flags.dot_all;`
			`rust_flags.unicode = flags.unicode;`
			`rust_flags.unicode_sets = flags.unicode_sets;`
			`rust_flags.sticky = flags.sticky;`
			`rust_flags.has_indices = flags.has_indices;`

			`auto compiled = CompiledRustRegex::compile(utf8_pattern, rust_flags);`
			`if (compiled.is_error())`
			`return compiled.release_error();`

			`auto rust_regex = compiled.release_value();`

			`Vector<ECMAScriptNamedCaptureGroup> named_groups;`
			`named_groups.ensure_capacity(rust_regex.named_groups().size());`
			`for (auto const& rg : rust_regex.named_groups())`
			`named_groups.unchecked_append({ .name = rg.name, .index = rg.index });`

			`auto impl = adopt_own(*new Impl {`
			`.rust_regex = move(rust_regex),`
			`.named_groups = move(named_groups),`
			`});`
			`return ECMAScriptRegex(move(impl));`
			`}`

			`ECMAScriptRegex::~ECMAScriptRegex() = default;`

			`ECMAScriptRegex::ECMAScriptRegex(ECMAScriptRegex&& other) = default;`
			`ECMAScriptRegex& ECMAScriptRegex::operator=(ECMAScriptRegex&& other) = default;`

			`ECMAScriptRegex::ECMAScriptRegex(OwnPtr<Impl> impl)`
			`: m_impl(move(impl))`
			`{`
			`}`

			`MatchResult ECMAScriptRegex::exec(Utf16View input, size_t start_pos) const`
			`{`
			`auto result = m_impl->rust_regex.exec_internal(input, start_pos);`
			`if (result == 1)`
			`return MatchResult::Match;`
			`if (result == -1)`
			`return MatchResult::LimitExceeded;`
			`return MatchResult::NoMatch;`
			`}`

			`int ECMAScriptRegex::capture_slot(unsigned int slot) const`
			`{`
			`return m_impl->rust_regex.capture_slot(slot);`
			`}`

			`MatchResult ECMAScriptRegex::test(Utf16View input, size_t start_pos) const`
			`{`
			`auto result = m_impl->rust_regex.test(input, start_pos);`
			`if (result == 1)`
			`return MatchResult::Match;`
			`if (result == -1)`
			`return MatchResult::LimitExceeded;`
			`return MatchResult::NoMatch;`
			`}`

			`unsigned int ECMAScriptRegex::capture_count() const`
			`{`
			`return m_impl->rust_regex.capture_count();`
			`}`

			`unsigned int ECMAScriptRegex::total_groups() const`
			`{`
			`return m_impl->rust_regex.total_groups();`
			`}`

LibRegex: Respect V8 astral literal lastIndex behavior Preserve V8's behavior for bare single-astral literals when a unicode global search starts in the middle of a surrogate pair. We were snapping that lastIndex back to the pair start unconditionally, which let /😀/gu and /\u{1F600}/gu match where V8 returns null. Expose that literal shape from LibRegex to LibJS and add runtime coverage for the bare literal case alongside a grouped control. 2026-03-25 22:51:10 +01:00			`bool ECMAScriptRegex::is_single_non_bmp_literal() const`
			`{`
			`return m_impl->rust_regex.is_single_non_bmp_literal();`
			`}`

LibRegex: Add ECMAScriptRegex and migrate callers Add `ECMAScriptRegex`, LibRegex's C++ facade for ECMAScript regexes. The facade owns compilation, execution, captures, named groups, and error translation for the Rust backend, which lets callers stop depending on the legacy parser and matcher types directly. Use it in the remaining non-LibJS callers: URLPattern, HTML input pattern handling, and the places in LibHTTP that only needed token validation. Where a full regex engine was unnecessary, replace those call sites with direct character checks. Also update focused LibURL, LibHTTP, and WPT coverage for the migrated callers and corrected surrogate handling. 2026-03-25 10:52:20 +01:00			`Vector<ECMAScriptNamedCaptureGroup> const& ECMAScriptRegex::named_groups() const`
			`{`
			`return m_impl->named_groups;`
			`}`

			`int ECMAScriptRegex::find_all(Utf16View input, size_t start_pos) const`
			`{`
			`return m_impl->rust_regex.find_all(input, start_pos);`
			`}`

			`ECMAScriptRegex::MatchPair ECMAScriptRegex::find_all_match(int index) const`
			`{`
			`auto pair = m_impl->rust_regex.find_all_match(index);`
			`return { pair.start, pair.end };`
			`}`

			`}`