LibRegex: Support matching Unicode multi-character sequences

This commit is contained in:
aplefull 2025-11-09 13:35:16 +01:00 committed by Ali Mohammad Pur
parent 5b7c9af340
commit a49c39de32
Notes: github-actions[bot] 2025-11-26 10:35:48 +00:00
7 changed files with 462 additions and 34 deletions

View file

@ -12,7 +12,9 @@
#include <LibUnicode/ICU.h>
#include <unicode/uchar.h>
#include <unicode/uniset.h>
#include <unicode/uscript.h>
#include <unicode/uset.h>
namespace Unicode {
@ -321,6 +323,39 @@ bool is_ecma262_string_property(Property property)
}
}
// Expands a string-valued ECMA-262 Unicode property into the list of strings it contains.
//
// Yields an empty vector when `property` is not accepted by is_ecma262_string_property(), when ICU
// reports an error or hands back no set for the property, or when that set cannot be viewed as an
// icu::UnicodeSet. Otherwise the result holds one String per code point covered by the set's
// ranges (in range order), followed by one String per entry of the set's strings().
Vector<String> get_property_strings(Property property)
{
    Vector<String> strings;

    if (!is_ecma262_string_property(property))
        return strings;

    UErrorCode error = U_ZERO_ERROR;
    auto const* property_set = u_getBinaryPropertySet(static_cast<UProperty>(property.value()), &error);
    if (!(icu_success(error) && property_set))
        return strings;

    auto const* set = icu::UnicodeSet::fromUSet(property_set);
    if (set == nullptr)
        return strings;

    // Each code point inside a range becomes its own single-code-point string.
    for (int32_t range = 0, total = set->getRangeCount(); range < total; ++range) {
        auto const first = set->getRangeStart(range);
        auto const last = set->getRangeEnd(range);
        for (auto current = first; current <= last; ++current)
            strings.append(String::from_code_point(current));
    }

    // The set's string entries are stored separately from its code point ranges.
    for (auto const& entry : set->strings())
        strings.append(icu_string_to_string(entry));

    return strings;
}
Optional<Script> script_from_string(StringView script)
{
static auto script_names = []() {

View file

@ -40,6 +40,7 @@ bool code_point_has_white_space_property(u32 code_point);
bool is_ecma262_property(Property);
bool is_ecma262_string_property(Property);
// Returns the strings belonging to a property for which is_ecma262_string_property() is true:
// one entry per code point in the property's set, followed by the set's string entries.
// Returns an empty vector for any other property or if the ICU lookup fails.
Vector<String> get_property_strings(Property);
Optional<Script> script_from_string(StringView);
bool code_point_has_script(u32 code_point, Script script);