LibRegex: Properly track code units in u-v modes

Previously, both string_position and view_index used code unit offsets
regardless of mode. Now, in Unicode mode, these variables track code
point positions, while string_position_in_code_units is properly
updated to reflect code unit offsets.
This commit is contained in:
aplefull 2025-10-22 13:40:15 +02:00 committed by Ali Mohammad Pur
parent fb258639d1
commit 5632a52531
Notes: github-actions[bot] 2025-10-24 19:24:41 +00:00
2 changed files with 51 additions and 3 deletions

View file

@ -156,3 +156,37 @@ test("Unicode properties of strings", () => {
expect(re.test(str)).toBeFalse();
}
});
test("Unicode matching with u and v flags", () => {
const text = "𠮷a𠮷b𠮷";
const complexText = "a\u{20BB7}b\u{10FFFF}c";
const cases = [
{ pattern: /𠮷/, match: text, expected: ["𠮷"] },
{ pattern: /𠮷/u, match: text, expected: ["𠮷"] },
{ pattern: /𠮷/v, match: text, expected: ["𠮷"] },
{ pattern: /\p{Script=Han}/u, match: text, expected: ["𠮷"] },
{ pattern: /\p{Script=Han}/v, match: text, expected: ["𠮷"] },
{ pattern: /./u, match: text, expected: ["𠮷"] },
{ pattern: /./v, match: text, expected: ["𠮷"] },
{ pattern: /\p{ASCII}/u, match: text, expected: ["a"] },
{ pattern: /\p{ASCII}/v, match: text, expected: ["a"] },
{ pattern: /x/u, match: text, expected: null },
{ pattern: /x/v, match: text, expected: null },
{ pattern: /\p{Script=Han}(.)/gu, match: text, expected: ["𠮷a", "𠮷b"] },
{ pattern: /\p{Script=Han}(.)/gv, match: text, expected: ["𠮷a", "𠮷b"] },
{ pattern: /\P{ASCII}/u, match: complexText, expected: ["\u{20BB7}"] },
{ pattern: /\P{ASCII}/v, match: complexText, expected: ["\u{20BB7}"] },
{ pattern: /\P{ASCII}/gu, match: complexText, expected: ["\u{20BB7}", "\u{10FFFF}"] },
{ pattern: /\P{ASCII}/gv, match: complexText, expected: ["\u{20BB7}", "\u{10FFFF}"] },
{ pattern: /./gu, match: text, expected: ["𠮷", "a", "𠮷", "b", "𠮷"] },
{ pattern: /./gv, match: text, expected: ["𠮷", "a", "𠮷", "b", "𠮷"] },
{ pattern: /(?:)/gu, match: text, expected: ["", "", "", "", "", ""] },
{ pattern: /(?:)/gv, match: text, expected: ["", "", "", "", "", ""] },
];
for (const test of cases) {
const result = test.match.match(test.pattern);
expect(result).toEqual(test.expected);
}
});

View file

@ -237,10 +237,17 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
input.view = view;
dbgln_if(REGEX_DEBUG, "[match] Starting match with view ({}): _{}_", view.length(), view);
auto view_length = view.length_in_code_units();
auto view_length = view.length();
size_t view_index = m_pattern->start_offset;
state.string_position = view_index;
state.string_position_in_code_units = view_index;
if (view.unicode()) {
if (view_index < view_length)
state.string_position_in_code_units = view.code_unit_offset_of(view_index);
else
state.string_position_in_code_units = view.length_in_code_units();
} else {
state.string_position_in_code_units = view_index;
}
bool succeeded = false;
if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) {
@ -303,7 +310,14 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
input.match_index = match_count;
state.string_position = view_index;
state.string_position_in_code_units = view_index;
if (input.view.unicode()) {
if (view_index < view_length)
state.string_position_in_code_units = input.view.code_unit_offset_of(view_index);
else
state.string_position_in_code_units = input.view.length_in_code_units();
} else {
state.string_position_in_code_units = view_index;
}
state.instruction_position = 0;
state.repetition_marks.clear();