mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-12-07 21:59:54 +00:00
LibRegex+LibUnicode: Add unicode string properties
This commit is contained in:
parent
8c8961171c
commit
7ce4abe330
Notes:
github-actions[bot]
2025-10-24 17:26:08 +00:00
Author: https://github.com/aplefull
Commit: 7ce4abe330
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/6545
Reviewed-by: https://github.com/alimpfard
Reviewed-by: https://github.com/gmta ✅
Reviewed-by: https://github.com/trflynn89
6 changed files with 139 additions and 40 deletions
|
|
@ -97,3 +97,62 @@ test("Unicode non-ASCII matching", () => {
|
|||
expect(result).toEqual(test.expected);
|
||||
}
|
||||
});
|
||||
|
||||
// Test from https://github.com/tc39/test262/blob/main/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-property-of-strings-escape.js
|
||||
test("Unicode properties of strings", () => {
|
||||
const regexes = [
|
||||
/\p{Basic_Emoji}/v,
|
||||
/\p{Emoji_Keycap_Sequence}/v,
|
||||
/\p{RGI_Emoji_Modifier_Sequence}/v,
|
||||
/\p{RGI_Emoji_Flag_Sequence}/v,
|
||||
/\p{RGI_Emoji_Tag_Sequence}/v,
|
||||
/\p{RGI_Emoji_ZWJ_Sequence}/v,
|
||||
/\p{RGI_Emoji}/v,
|
||||
];
|
||||
|
||||
for (const re of regexes) {
|
||||
expect(() => {
|
||||
re.test("test");
|
||||
}).not.toThrow();
|
||||
}
|
||||
|
||||
const matchStrings = [
|
||||
"0",
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"8",
|
||||
"A",
|
||||
"B",
|
||||
"D",
|
||||
"E",
|
||||
"F",
|
||||
"a",
|
||||
"b",
|
||||
"c",
|
||||
"d",
|
||||
"e",
|
||||
"f",
|
||||
];
|
||||
|
||||
const nonMatchStrings = [
|
||||
"6\uFE0F\u20E3",
|
||||
"7\uFE0F\u20E3",
|
||||
"9\uFE0F\u20E3",
|
||||
"\u2603",
|
||||
"\u{1D306}",
|
||||
"\u{1F1E7}\u{1F1EA}",
|
||||
];
|
||||
|
||||
const re = /^[\p{ASCII_Hex_Digit}--\p{Emoji_Keycap_Sequence}]+$/v;
|
||||
|
||||
for (const str of matchStrings) {
|
||||
expect(re.test(str)).toBeTrue();
|
||||
}
|
||||
|
||||
for (const str of nonMatchStrings) {
|
||||
expect(re.test(str)).toBeFalse();
|
||||
}
|
||||
});
|
||||
|
|
|
|||
|
|
@ -9,25 +9,26 @@
|
|||
|
||||
// Error codes for the regex library. Enumerators carry no explicit values, so they are
// numbered sequentially from __Regex_NoError (0) in declaration order; every enumerator
// must appear exactly once.
enum __Regex_Error {
    __Regex_NoError,
    __Regex_InvalidPattern,               // Invalid regular expression.
    __Regex_InvalidCollationElement,      // Invalid collating element referenced.
    __Regex_InvalidCharacterClass,        // Invalid character class type referenced.
    __Regex_InvalidTrailingEscape,        // Trailing \ in pattern.
    __Regex_InvalidNumber,                // Number in \digit invalid or in error.
    __Regex_MismatchingBracket,           // [ ] imbalance.
    __Regex_MismatchingParen,             // ( ) imbalance.
    __Regex_MismatchingBrace,             // { } imbalance.
    __Regex_InvalidBraceContent,          // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
    __Regex_InvalidBracketContent,        // Content of [] invalid.
    __Regex_InvalidRange,                 // Invalid endpoint in range expression.
    __Regex_InvalidRepetitionMarker,      // ?, * or + not preceded by valid regular expression.
    __Regex_ReachedMaxRecursion,          // MaximumRecursion has been reached.
    __Regex_EmptySubExpression,           // Sub expression has empty content.
    __Regex_InvalidCaptureGroup,          // Content of capture group is invalid.
    __Regex_InvalidNameForCaptureGroup,   // Name of capture group is invalid.
    __Regex_InvalidNameForProperty,       // Name of property is invalid.
    __Regex_DuplicateNamedCapture,        // Duplicate named capture group.
    __Regex_InvalidCharacterClassEscape,  // Invalid escaped entity in character class.
    __Regex_NegatedCharacterClassStrings, // Negated character class cannot contain strings.
};
|
||||
|
||||
enum __RegexAllFlags {
|
||||
|
|
|
|||
|
|
@ -14,25 +14,26 @@ namespace regex {
|
|||
|
||||
// Scoped counterpart of __Regex_Error: each enumerator takes the value of the matching
// __Regex_* constant. Every enumerator must appear exactly once.
enum class Error : u8 {
    NoError = __Regex_NoError,
    InvalidPattern = __Regex_InvalidPattern,                             // Invalid regular expression.
    InvalidCollationElement = __Regex_InvalidCollationElement,           // Invalid collating element referenced.
    InvalidCharacterClass = __Regex_InvalidCharacterClass,               // Invalid character class type referenced.
    InvalidTrailingEscape = __Regex_InvalidTrailingEscape,               // Trailing \ in pattern.
    InvalidNumber = __Regex_InvalidNumber,                               // Number in \digit invalid or in error.
    MismatchingBracket = __Regex_MismatchingBracket,                     // [ ] imbalance.
    MismatchingParen = __Regex_MismatchingParen,                         // ( ) imbalance.
    MismatchingBrace = __Regex_MismatchingBrace,                         // { } imbalance.
    InvalidBraceContent = __Regex_InvalidBraceContent,                   // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
    InvalidBracketContent = __Regex_InvalidBracketContent,               // Content of [] invalid.
    InvalidRange = __Regex_InvalidRange,                                 // Invalid endpoint in range expression.
    InvalidRepetitionMarker = __Regex_InvalidRepetitionMarker,           // ?, * or + not preceded by valid regular expression.
    ReachedMaxRecursion = __Regex_ReachedMaxRecursion,                   // MaximumRecursion has been reached.
    EmptySubExpression = __Regex_EmptySubExpression,                     // Sub expression has empty content.
    InvalidCaptureGroup = __Regex_InvalidCaptureGroup,                   // Content of capture group is invalid.
    InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup,     // Name of capture group is invalid.
    InvalidNameForProperty = __Regex_InvalidNameForProperty,             // Name of property is invalid.
    DuplicateNamedCapture = __Regex_DuplicateNamedCapture,               // Duplicate named capture group.
    InvalidCharacterClassEscape = __Regex_InvalidCharacterClassEscape,   // Invalid escaped entity in character class.
    NegatedCharacterClassStrings = __Regex_NegatedCharacterClassStrings, // Negated character class cannot contain strings.
};
|
||||
|
||||
inline StringView get_error_string(Error error)
|
||||
|
|
@ -78,6 +79,8 @@ inline StringView get_error_string(Error error)
|
|||
return "Duplicate capture group name"sv;
|
||||
case Error::InvalidCharacterClassEscape:
|
||||
return "Invalid escaped entity in character class."sv;
|
||||
case Error::NegatedCharacterClassStrings:
|
||||
return "Negated character class cannot contain strings"sv;
|
||||
}
|
||||
return "Undefined error."sv;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2463,6 +2463,19 @@ bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& c
|
|||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
|
||||
property.visit(
|
||||
[&](Unicode::Property property) {
|
||||
if (Unicode::is_ecma262_string_property(property)) {
|
||||
if (negated) {
|
||||
set_error(Error::InvalidNameForProperty);
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto const& compare : compares) {
|
||||
if (compare.type == CharacterCompareType::Inverse) {
|
||||
set_error(Error::NegatedCharacterClassStrings);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
|
||||
},
|
||||
[&](Unicode::GeneralCategory general_category) {
|
||||
|
|
@ -2506,8 +2519,13 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool
|
|||
property = move(*parsed_property);
|
||||
|
||||
return property.visit(
|
||||
[this](Unicode::Property property) {
|
||||
if (!Unicode::is_ecma262_property(property)) {
|
||||
[this, negated](Unicode::Property property) {
|
||||
if (Unicode::is_ecma262_string_property(property)) {
|
||||
if (!m_parser_state.regex_options.has_flag_set(AllFlags::UnicodeSets) || negated) {
|
||||
set_error(Error::InvalidNameForProperty);
|
||||
return false;
|
||||
}
|
||||
} else if (!Unicode::is_ecma262_property(property)) {
|
||||
set_error(Error::InvalidNameForProperty);
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -304,6 +304,23 @@ bool is_ecma262_property(Property property)
|
|||
}
|
||||
}
|
||||
|
||||
// https://tc39.es/ecma262/#table-binary-unicode-properties-of-strings
|
||||
bool is_ecma262_string_property(Property property)
|
||||
{
|
||||
switch (property.value()) {
|
||||
case UCHAR_BASIC_EMOJI:
|
||||
case UCHAR_EMOJI_KEYCAP_SEQUENCE:
|
||||
case UCHAR_RGI_EMOJI:
|
||||
case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
|
||||
case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
|
||||
case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
|
||||
case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
Optional<Script> script_from_string(StringView script)
|
||||
{
|
||||
static auto script_names = []() {
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ bool code_point_has_variation_selector_property(u32 code_point);
|
|||
bool code_point_has_white_space_property(u32 code_point);
|
||||
|
||||
bool is_ecma262_property(Property);
|
||||
bool is_ecma262_string_property(Property);
|
||||
|
||||
Optional<Script> script_from_string(StringView);
|
||||
bool code_point_has_script(u32 code_point, Script script);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue