2021-07-25 15:10:51 -04:00
/*
2024-06-19 16:39:30 -04:00
* Copyright ( c ) 2021 - 2024 , Tim Flynn < trflynn89 @ serenityos . org >
2021-07-25 15:10:51 -04:00
*
* SPDX - License - Identifier : BSD - 2 - Clause
*/
2024-06-21 10:39:40 -04:00
# include <AK/Array.h>
# include <AK/CharacterTypes.h>
# include <AK/Find.h>
2026-02-15 15:08:24 +01:00
# include <AK/HashMap.h>
# include <AK/NonnullOwnPtr.h>
2024-06-21 10:39:40 -04:00
# include <AK/Traits.h>
2021-07-25 15:10:51 -04:00
# include <LibUnicode/CharacterTypes.h>
2024-06-23 09:14:27 -04:00
# include <LibUnicode/ICU.h>
2021-07-25 15:10:51 -04:00
2024-06-21 10:39:40 -04:00
# include <unicode/uchar.h>
2025-11-09 13:35:16 +01:00
# include <unicode/uniset.h>
2024-06-21 12:39:11 -04:00
# include <unicode/uscript.h>
2025-11-09 13:35:16 +01:00
# include <unicode/uset.h>
2024-06-21 10:39:40 -04:00
namespace Unicode {
// The names under which one property (or property value) can be referred to. Every name is optional.
// PropertyType is not used by any member; it only distinguishes the instantiations from each other.
template < typename PropertyType >
struct PropertyName {
Optional < StringView > long_name ;
Optional < StringView > short_name ;
Optional < StringView > additional_name ;
} ;
// From uchar.h:
// Unicode allows for additional names, beyond the long and short name, which would be indicated by U_LONG_PROPERTY_NAME + i
// Only the first such additional name (i = 1) is requested by the lookups in this file.
static constexpr auto ADDITIONAL_NAME = static_cast < UPropertyNameChoice > ( U_LONG_PROPERTY_NAME + 1 ) ;
}
template < typename PropertyType >
struct AK : : Traits < Unicode : : PropertyName < PropertyType > > {
static constexpr bool equals ( Unicode : : PropertyName < PropertyType > const & candidate , StringView property )
{
return property = = candidate . long_name | | property = = candidate . short_name | | property = = candidate . additional_name ;
}
} ;
2021-07-25 15:10:51 -04:00
namespace Unicode {
2024-06-21 11:24:00 -04:00
// GeneralCategory values for the grouped categories. They are numbered after U_CHAR_CATEGORY_COUNT so they do
// not collide with the UCharCategory values, which are used as GeneralCategory values directly elsewhere in
// this file. GENERAL_CATEGORY_LIMIT is one past the last grouped value and sizes the name lookup table.
static constexpr GeneralCategory GENERAL_CATEGORY_CASED_LETTER = U_CHAR_CATEGORY_COUNT + 1 ;
static constexpr GeneralCategory GENERAL_CATEGORY_LETTER = U_CHAR_CATEGORY_COUNT + 2 ;
static constexpr GeneralCategory GENERAL_CATEGORY_MARK = U_CHAR_CATEGORY_COUNT + 3 ;
static constexpr GeneralCategory GENERAL_CATEGORY_NUMBER = U_CHAR_CATEGORY_COUNT + 4 ;
static constexpr GeneralCategory GENERAL_CATEGORY_PUNCTUATION = U_CHAR_CATEGORY_COUNT + 5 ;
static constexpr GeneralCategory GENERAL_CATEGORY_SYMBOL = U_CHAR_CATEGORY_COUNT + 6 ;
static constexpr GeneralCategory GENERAL_CATEGORY_SEPARATOR = U_CHAR_CATEGORY_COUNT + 7 ;
static constexpr GeneralCategory GENERAL_CATEGORY_OTHER = U_CHAR_CATEGORY_COUNT + 8 ;
static constexpr GeneralCategory GENERAL_CATEGORY_LIMIT = U_CHAR_CATEGORY_COUNT + 9 ;
2026-02-15 15:08:24 +01:00
// Lazily filled caches of frozen ICU sets, one per general category / binary property, each holding that
// category's or property's members closed over USET_CASE_INSENSITIVE. Entries are created on first
// case-insensitive query by code_point_has_general_category() and code_point_has_property() respectively.
static HashMap < GeneralCategory , NonnullOwnPtr < icu : : UnicodeSet > > s_category_sets_with_case_closure ;
static HashMap < Property , NonnullOwnPtr < icu : : UnicodeSet > > s_property_sets_with_case_closure ;
2024-06-21 11:24:00 -04:00
// Resolves a general category name to its GeneralCategory value. The long, short, and first additional name
// that ICU reports for a category are all accepted. Returns an empty Optional when no entry matches.
Optional<GeneralCategory> general_category_from_string(StringView general_category)
{
    // Built once on first call: a table indexed by GeneralCategory value holding the ICU names of that category.
    static auto name_table = []() {
        Array<PropertyName<GeneralCategory>, GENERAL_CATEGORY_LIMIT.value()> table;

        auto to_view = [](char const* name) {
            return StringView { name, strlen(name) };
        };

        // Stores whichever of the three names ICU knows for `icu_value` into the table slot for `slot`.
        auto record_names = [&](auto icu_property, auto slot, auto icu_value) {
            auto& entry = table[slot.value()];

            if (char const* long_name = u_getPropertyValueName(icu_property, icu_value, U_LONG_PROPERTY_NAME))
                entry.long_name = to_view(long_name);
            if (char const* short_name = u_getPropertyValueName(icu_property, icu_value, U_SHORT_PROPERTY_NAME))
                entry.short_name = to_view(short_name);
            if (char const* additional_name = u_getPropertyValueName(icu_property, icu_value, ADDITIONAL_NAME))
                entry.additional_name = to_view(additional_name);
        };

        // Individual categories share their numeric value with ICU's UCharCategory.
        for (GeneralCategory category = 0; category < U_CHAR_CATEGORY_COUNT; ++category)
            record_names(UCHAR_GENERAL_CATEGORY, category, static_cast<UCharCategory>(category.value()));

        // Grouped categories are named through ICU's general-category mask property.
        struct GroupedCategory {
            GeneralCategory category;
            uint32_t mask;
        };
        GroupedCategory const grouped_categories[] = {
            { GENERAL_CATEGORY_CASED_LETTER, U_GC_LC_MASK },
            { GENERAL_CATEGORY_LETTER, U_GC_L_MASK },
            { GENERAL_CATEGORY_MARK, U_GC_M_MASK },
            { GENERAL_CATEGORY_NUMBER, U_GC_N_MASK },
            { GENERAL_CATEGORY_PUNCTUATION, U_GC_P_MASK },
            { GENERAL_CATEGORY_SYMBOL, U_GC_S_MASK },
            { GENERAL_CATEGORY_SEPARATOR, U_GC_Z_MASK },
            { GENERAL_CATEGORY_OTHER, U_GC_C_MASK },
        };
        for (auto const& grouped : grouped_categories)
            record_names(UCHAR_GENERAL_CATEGORY_MASK, grouped.category, grouped.mask);

        return table;
    }();

    // find_index() yields the table size when nothing matched.
    auto match = find_index(name_table.begin(), name_table.end(), general_category);
    if (match == name_table.size())
        return {};

    return static_cast<GeneralCategory>(match);
}
2026-02-15 15:08:24 +01:00
// Translates a GeneralCategory into the ICU general-category bit mask used for membership tests. The grouped
// categories defined in this file map to ICU's combined masks; every other value is treated as a single
// UCharCategory and converted with U_MASK.
static uint32_t get_icu_mask(GeneralCategory general_category)
{
    switch (general_category.value()) {
    case GENERAL_CATEGORY_CASED_LETTER.value():
        return U_GC_LC_MASK;
    case GENERAL_CATEGORY_LETTER.value():
        return U_GC_L_MASK;
    case GENERAL_CATEGORY_MARK.value():
        return U_GC_M_MASK;
    case GENERAL_CATEGORY_NUMBER.value():
        return U_GC_N_MASK;
    case GENERAL_CATEGORY_PUNCTUATION.value():
        return U_GC_P_MASK;
    case GENERAL_CATEGORY_SYMBOL.value():
        return U_GC_S_MASK;
    case GENERAL_CATEGORY_SEPARATOR.value():
        return U_GC_Z_MASK;
    case GENERAL_CATEGORY_OTHER.value():
        return U_GC_C_MASK;
    default:
        return U_MASK(static_cast<UCharCategory>(general_category.value()));
    }
}
// Returns whether `code_point` belongs to `general_category`. When the query is not case sensitive, a code
// point also counts as a member if it is contained in the category's set after closing that set over
// USET_CASE_INSENSITIVE.
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category, CaseSensitivity case_sensitivity)
{
    auto const icu_code_point = static_cast<UChar32>(code_point);
    auto const category_mask = get_icu_mask(general_category);

    // A direct hit settles the answer for both modes; a miss is final only for case-sensitive queries.
    bool const matches_directly = (U_GET_GC_MASK(icu_code_point) & category_mask) != 0;
    if (matches_directly || case_sensitivity == CaseSensitivity::CaseSensitive)
        return matches_directly;

    // Build the case-closed set for this category on first use and keep it in the cache afterwards.
    auto& cached_set = s_category_sets_with_case_closure.ensure(general_category, [&] {
        UErrorCode status = U_ZERO_ERROR;

        auto closed_set = make<icu::UnicodeSet>();
        closed_set->applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, static_cast<int32_t>(category_mask), status);
        closed_set->closeOver(USET_CASE_INSENSITIVE);
        closed_set->freeze();

        return closed_set;
    });

    return cached_set->contains(icu_code_point);
}
2024-10-21 16:29:16 -04:00
// Returns ICU's u_isprint() verdict for code_point, converted to bool.
bool code_point_is_printable ( u32 code_point )
{
return static_cast < bool > ( u_isprint ( static_cast < UChar32 > ( code_point ) ) ) ;
}
2024-06-21 11:24:00 -04:00
// Returns whether code_point has the U_CONTROL_CHAR general category.
// Relies on the default case sensitivity of code_point_has_general_category(), which is declared outside this file.
bool code_point_has_control_general_category ( u32 code_point )
{
return code_point_has_general_category ( code_point , U_CONTROL_CHAR ) ;
}
2024-10-08 17:15:55 -04:00
// Returns whether code_point belongs to the grouped Letter category (GENERAL_CATEGORY_LETTER).
// Relies on the default case sensitivity of code_point_has_general_category(), which is declared outside this file.
bool code_point_has_letter_general_category ( u32 code_point )
{
return code_point_has_general_category ( code_point , GENERAL_CATEGORY_LETTER ) ;
}
2025-09-15 12:11:22 +02:00
// Returns whether code_point belongs to the grouped Mark category (GENERAL_CATEGORY_MARK).
// Relies on the default case sensitivity of code_point_has_general_category(), which is declared outside this file.
bool code_point_has_mark_general_category ( u32 code_point )
{
return code_point_has_general_category ( code_point , GENERAL_CATEGORY_MARK ) ;
}
2024-10-08 17:15:55 -04:00
// Returns whether code_point belongs to the grouped Number category (GENERAL_CATEGORY_NUMBER).
// Relies on the default case sensitivity of code_point_has_general_category(), which is declared outside this file.
bool code_point_has_number_general_category ( u32 code_point )
{
return code_point_has_general_category ( code_point , GENERAL_CATEGORY_NUMBER ) ;
}
2024-09-05 12:07:59 -04:00
// Returns whether code_point belongs to the grouped Punctuation category (GENERAL_CATEGORY_PUNCTUATION).
// Relies on the default case sensitivity of code_point_has_general_category(), which is declared outside this file.
bool code_point_has_punctuation_general_category ( u32 code_point )
{
return code_point_has_general_category ( code_point , GENERAL_CATEGORY_PUNCTUATION ) ;
}
// Returns whether code_point belongs to the grouped Separator category (GENERAL_CATEGORY_SEPARATOR).
// Relies on the default case sensitivity of code_point_has_general_category(), which is declared outside this file.
bool code_point_has_separator_general_category ( u32 code_point )
{
return code_point_has_general_category ( code_point , GENERAL_CATEGORY_SEPARATOR ) ;
}
2024-06-21 11:24:00 -04:00
// Returns whether code_point has the U_SPACE_SEPARATOR general category.
// Relies on the default case sensitivity of code_point_has_general_category(), which is declared outside this file.
bool code_point_has_space_separator_general_category ( u32 code_point )
{
return code_point_has_general_category ( code_point , U_SPACE_SEPARATOR ) ;
}
2021-07-28 21:45:09 -04:00
2024-10-08 17:15:55 -04:00
// Returns whether code_point belongs to the grouped Symbol category (GENERAL_CATEGORY_SYMBOL).
// Relies on the default case sensitivity of code_point_has_general_category(), which is declared outside this file.
bool code_point_has_symbol_general_category ( u32 code_point )
{
return code_point_has_general_category ( code_point , GENERAL_CATEGORY_SYMBOL ) ;
}
2024-06-21 10:39:40 -04:00
// Property values for the pseudo-properties handled by this file rather than by ICU's binary property API.
// They are numbered after UCHAR_BINARY_LIMIT so they do not collide with the UProperty values that are used
// as Property values directly. PROPERTY_LIMIT is one past the last of them and sizes the name lookup table.
static constexpr Property PROPERTY_ANY = UCHAR_BINARY_LIMIT + 1 ;
static constexpr Property PROPERTY_ASCII = UCHAR_BINARY_LIMIT + 2 ;
static constexpr Property PROPERTY_ASSIGNED = UCHAR_BINARY_LIMIT + 3 ;
static constexpr Property PROPERTY_LIMIT = UCHAR_BINARY_LIMIT + 4 ;
// Resolves a binary property name to its Property value. Candidates are the long, short, and first additional
// name ICU reports for every binary property below UCHAR_BINARY_LIMIT, plus the three pseudo-properties that
// are named explicitly at the end of the table. Returns an empty Optional when no entry matches.
Optional < Property > property_from_string ( StringView property )
2021-07-29 14:18:51 -04:00
{
2024-06-21 10:39:40 -04:00
// Name table indexed by Property value, filled once by the immediately-invoked lambda on first call.
static auto property_names = [ ] ( ) {
Array < PropertyName < Property > , PROPERTY_LIMIT . value ( ) > names ;
for ( Property property = 0 ; property < UCHAR_BINARY_LIMIT ; + + property ) {
auto icu_property = static_cast < UProperty > ( property . value ( ) ) ;
// ICU returns a null pointer for a name it does not have; such names are left unset.
if ( char const * name = u_getPropertyName ( icu_property , U_LONG_PROPERTY_NAME ) )
names [ property . value ( ) ] . long_name = StringView { name , strlen ( name ) } ;
if ( char const * name = u_getPropertyName ( icu_property , U_SHORT_PROPERTY_NAME ) )
names [ property . value ( ) ] . short_name = StringView { name , strlen ( name ) } ;
if ( char const * name = u_getPropertyName ( icu_property , ADDITIONAL_NAME ) )
names [ property . value ( ) ] . additional_name = StringView { name , strlen ( name ) } ;
}
// The pseudo-properties only get a long name.
names [ PROPERTY_ANY . value ( ) ] = { " Any " sv , { } , { } } ;
names [ PROPERTY_ASCII . value ( ) ] = { " ASCII " sv , { } , { } } ;
names [ PROPERTY_ASSIGNED . value ( ) ] = { " Assigned " sv , { } , { } } ;
return names ;
} ( ) ;
// find_index() yielding the table size means that no entry matched.
if ( auto index = find_index ( property_names . begin ( ) , property_names . end ( ) , property ) ; index ! = property_names . size ( ) )
return static_cast < Property > ( index ) ;
return { } ;
}
2026-02-15 15:08:24 +01:00
// Returns whether `code_point` has the binary property `property`. The pseudo-properties Any, ASCII, and
// Assigned are answered directly and ignore `case_sensitivity`. For every other property, a query that is not
// case sensitive also accepts code points contained in the property's set after closing it over
// USET_CASE_INSENSITIVE.
bool code_point_has_property(u32 code_point, Property property, CaseSensitivity case_sensitivity)
{
    auto const icu_code_point = static_cast<UChar32>(code_point);

    // Pseudo-properties that are not looked up through ICU's binary property API.
    if (property == PROPERTY_ANY)
        return is_unicode(code_point);
    if (property == PROPERTY_ASCII)
        return is_ascii(code_point);
    if (property == PROPERTY_ASSIGNED)
        return u_isdefined(icu_code_point) != 0;

    auto const icu_property = static_cast<UProperty>(property.value());

    // A direct hit settles the answer for both modes; a miss is final only for case-sensitive queries.
    if (u_hasBinaryProperty(icu_code_point, icu_property))
        return true;
    if (case_sensitivity == CaseSensitivity::CaseSensitive)
        return false;

    // Build the case-closed set for this property on first use and keep it in the cache afterwards.
    auto& cached_set = s_property_sets_with_case_closure.ensure(property, [&] {
        UErrorCode status = U_ZERO_ERROR;

        auto closed_set = make<icu::UnicodeSet>();
        closed_set->applyIntPropertyValue(icu_property, 1, status);
        closed_set->closeOver(USET_CASE_INSENSITIVE);
        closed_set->freeze();

        return closed_set;
    });

    return cached_set->contains(icu_code_point);
}
// Returns whether code_point has the UCHAR_EMOJI binary property.
// Relies on the default case sensitivity of code_point_has_property(), which is declared outside this file.
bool code_point_has_emoji_property ( u32 code_point )
{
return code_point_has_property ( code_point , UCHAR_EMOJI ) ;
}
// Returns whether code_point has the UCHAR_EMOJI_MODIFIER_BASE binary property.
// Relies on the default case sensitivity of code_point_has_property(), which is declared outside this file.
bool code_point_has_emoji_modifier_base_property ( u32 code_point )
{
return code_point_has_property ( code_point , UCHAR_EMOJI_MODIFIER_BASE ) ;
}
// Returns whether code_point has the UCHAR_EMOJI_PRESENTATION binary property.
// Relies on the default case sensitivity of code_point_has_property(), which is declared outside this file.
bool code_point_has_emoji_presentation_property ( u32 code_point )
{
return code_point_has_property ( code_point , UCHAR_EMOJI_PRESENTATION ) ;
}
// Returns ICU's u_isIDStart() verdict for code_point. Unlike most of the neighbouring helpers, this does not go
// through code_point_has_property().
bool code_point_has_identifier_start_property ( u32 code_point )
{
return u_isIDStart ( static_cast < UChar32 > ( code_point ) ) ;
}
// Returns ICU's u_isIDPart() verdict for code_point. Unlike most of the neighbouring helpers, this does not go
// through code_point_has_property().
bool code_point_has_identifier_continue_property ( u32 code_point )
{
return u_isIDPart ( static_cast < UChar32 > ( code_point ) ) ;
}
// Returns whether code_point has the UCHAR_REGIONAL_INDICATOR binary property.
// Relies on the default case sensitivity of code_point_has_property(), which is declared outside this file.
bool code_point_has_regional_indicator_property ( u32 code_point )
{
return code_point_has_property ( code_point , UCHAR_REGIONAL_INDICATOR ) ;
}
// Returns whether code_point has the UCHAR_VARIATION_SELECTOR binary property.
// Relies on the default case sensitivity of code_point_has_property(), which is declared outside this file.
bool code_point_has_variation_selector_property ( u32 code_point )
{
return code_point_has_property ( code_point , UCHAR_VARIATION_SELECTOR ) ;
}
2024-11-03 17:13:56 -05:00
// Returns whether code_point has the UCHAR_WHITE_SPACE binary property.
// Relies on the default case sensitivity of code_point_has_property(), which is declared outside this file.
bool code_point_has_white_space_property ( u32 code_point )
{
return code_point_has_property ( code_point , UCHAR_WHITE_SPACE ) ;
}
2024-06-21 10:39:40 -04:00
// https://tc39.es/ecma262/#table-binary-unicode-properties
// Returns whether `property` is one of the binary properties accepted here for ECMA-262: either one of the ICU
// properties listed below or one of the pseudo-properties Any, ASCII, and Assigned.
bool is_ecma262_property(Property property)
{
    switch (property.value()) {
    case UCHAR_ASCII_HEX_DIGIT:
    case UCHAR_ALPHABETIC:
    case UCHAR_BIDI_CONTROL:
    case UCHAR_BIDI_MIRRORED:
    case UCHAR_CASE_IGNORABLE:
    case UCHAR_CASED:
    case UCHAR_CHANGES_WHEN_CASEFOLDED:
    case UCHAR_CHANGES_WHEN_CASEMAPPED:
    case UCHAR_CHANGES_WHEN_LOWERCASED:
    case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
    case UCHAR_CHANGES_WHEN_TITLECASED:
    case UCHAR_CHANGES_WHEN_UPPERCASED:
    case UCHAR_DASH:
    case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
    case UCHAR_DEPRECATED:
    case UCHAR_DIACRITIC:
    case UCHAR_EMOJI:
    case UCHAR_EMOJI_COMPONENT:
    case UCHAR_EMOJI_MODIFIER:
    case UCHAR_EMOJI_MODIFIER_BASE:
    case UCHAR_EMOJI_PRESENTATION:
    case UCHAR_EXTENDED_PICTOGRAPHIC:
    case UCHAR_EXTENDER:
    case UCHAR_GRAPHEME_BASE:
    case UCHAR_GRAPHEME_EXTEND:
    case UCHAR_HEX_DIGIT:
    case UCHAR_IDS_BINARY_OPERATOR:
    case UCHAR_IDS_TRINARY_OPERATOR:
    case UCHAR_ID_CONTINUE:
    case UCHAR_ID_START:
    case UCHAR_IDEOGRAPHIC:
    case UCHAR_JOIN_CONTROL:
    case UCHAR_LOGICAL_ORDER_EXCEPTION:
    case UCHAR_LOWERCASE:
    case UCHAR_MATH:
    case UCHAR_NONCHARACTER_CODE_POINT:
    case UCHAR_PATTERN_SYNTAX:
    case UCHAR_PATTERN_WHITE_SPACE:
    case UCHAR_QUOTATION_MARK:
    case UCHAR_RADICAL:
    case UCHAR_REGIONAL_INDICATOR:
    case UCHAR_S_TERM:
    case UCHAR_SOFT_DOTTED:
    case UCHAR_TERMINAL_PUNCTUATION:
    case UCHAR_UNIFIED_IDEOGRAPH:
    case UCHAR_UPPERCASE:
    case UCHAR_VARIATION_SELECTOR:
    case UCHAR_WHITE_SPACE:
    case UCHAR_XID_CONTINUE:
    case UCHAR_XID_START:
        return true;
    default:
        break;
    }

    // Not an ICU property from the list above; only the pseudo-properties defined in this file remain.
    return property == PROPERTY_ANY || property == PROPERTY_ASCII || property == PROPERTY_ASSIGNED;
}
2025-07-24 00:16:08 +02:00
// https://tc39.es/ecma262/#table-binary-unicode-properties-of-strings
// Returns whether `property` is one of the emoji sequence properties listed below, i.e. a binary property of
// strings rather than of single code points.
bool is_ecma262_string_property(Property property)
{
    switch (property.value()) {
    case UCHAR_BASIC_EMOJI:
    case UCHAR_EMOJI_KEYCAP_SEQUENCE:
    case UCHAR_RGI_EMOJI:
    case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
    case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
    case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
    case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
        return true;
    default:
        break;
    }

    return false;
}
2025-11-09 13:35:16 +01:00
// Lists every member of a string property: first each single code point of the property's ICU set (as a
// one-code-point String, in range order), then each multi-code-point string of that set. The result is empty
// if `property` is not a string property or if ICU cannot provide the set.
Vector<String> get_property_strings(Property property)
{
    Vector<String> strings;

    if (!is_ecma262_string_property(property))
        return strings;

    UErrorCode status = U_ZERO_ERROR;
    auto const* raw_set = u_getBinaryPropertySet(static_cast<UProperty>(property.value()), &status);
    if (!icu_success(status) || !raw_set)
        return strings;

    auto const* property_set = icu::UnicodeSet::fromUSet(raw_set);
    if (!property_set)
        return strings;

    // Single code points are stored by the set as inclusive ranges.
    auto const range_count = property_set->getRangeCount();
    for (int32_t range = 0; range < range_count; ++range) {
        auto const first = property_set->getRangeStart(range);
        auto const last = property_set->getRangeEnd(range);

        for (auto code_point = first; code_point <= last; ++code_point)
            strings.append(String::from_code_point(code_point));
    }

    // Multi-code-point members are stored separately from the ranges.
    for (auto const& sequence : property_set->strings())
        strings.append(icu_string_to_string(sequence));

    return strings;
}
2024-06-21 12:39:11 -04:00
// Resolves a script name to its Script value. The long name, the short name, and the first additional name
// that ICU reports for a script are all accepted. Returns an empty Optional when no entry matches.
Optional<Script> script_from_string(StringView script)
{
    // Built once on first call: a table indexed by Script value (which equals ICU's UScriptCode value).
    static auto name_table = []() {
        Array<PropertyName<Script>, static_cast<size_t>(USCRIPT_CODE_LIMIT)> table;

        auto to_view = [](char const* name) {
            return StringView { name, strlen(name) };
        };

        for (Script candidate = 0; candidate < USCRIPT_CODE_LIMIT; ++candidate) {
            auto const icu_script = static_cast<UScriptCode>(candidate.value());
            auto& entry = table[candidate.value()];

            if (char const* long_name = uscript_getName(icu_script))
                entry.long_name = to_view(long_name);
            if (char const* short_name = uscript_getShortName(icu_script))
                entry.short_name = to_view(short_name);
            if (char const* additional_name = u_getPropertyValueName(UCHAR_SCRIPT, icu_script, ADDITIONAL_NAME))
                entry.additional_name = to_view(additional_name);
        }

        return table;
    }();

    // find_index() yields the table size when nothing matched.
    auto match = find_index(name_table.begin(), name_table.end(), script);
    if (match == name_table.size())
        return {};

    return static_cast<Script>(match);
}
// Returns whether ICU's uscript_getScript() reports exactly `script` for `code_point`. Returns false when the
// ICU lookup does not succeed.
bool code_point_has_script(u32 code_point, Script script)
{
    UErrorCode status = U_ZERO_ERROR;

    auto const expected_script = static_cast<UScriptCode>(script.value());
    auto const actual_script = uscript_getScript(static_cast<UChar32>(code_point), &status);

    if (!icu_success(status))
        return false;

    return actual_script == expected_script;
}
// Returns ICU's uscript_hasScript() verdict for `code_point` and `script`, converted to bool.
bool code_point_has_script_extension(u32 code_point, Script script)
{
    auto const has_script = uscript_hasScript(static_cast<UChar32>(code_point), static_cast<UScriptCode>(script.value()));
    return static_cast<bool>(has_script);
}
2021-08-04 07:05:30 -04:00
2024-06-21 15:28:54 -04:00
// Maps an ICU UCharDirection value onto the BidiClass enumerator of the same name.
// U_CHAR_DIRECTION_COUNT, and any value the switch does not list, ends up in VERIFY_NOT_REACHED().
static constexpr BidiClass char_direction_to_bidi_class ( UCharDirection direction )
{
switch ( direction ) {
case U_ARABIC_NUMBER :
return BidiClass : : ArabicNumber ;
case U_BLOCK_SEPARATOR :
return BidiClass : : BlockSeparator ;
case U_BOUNDARY_NEUTRAL :
return BidiClass : : BoundaryNeutral ;
case U_COMMON_NUMBER_SEPARATOR :
return BidiClass : : CommonNumberSeparator ;
case U_DIR_NON_SPACING_MARK :
return BidiClass : : DirNonSpacingMark ;
case U_EUROPEAN_NUMBER :
return BidiClass : : EuropeanNumber ;
case U_EUROPEAN_NUMBER_SEPARATOR :
return BidiClass : : EuropeanNumberSeparator ;
case U_EUROPEAN_NUMBER_TERMINATOR :
return BidiClass : : EuropeanNumberTerminator ;
case U_FIRST_STRONG_ISOLATE :
return BidiClass : : FirstStrongIsolate ;
case U_LEFT_TO_RIGHT :
return BidiClass : : LeftToRight ;
case U_LEFT_TO_RIGHT_EMBEDDING :
return BidiClass : : LeftToRightEmbedding ;
case U_LEFT_TO_RIGHT_ISOLATE :
return BidiClass : : LeftToRightIsolate ;
case U_LEFT_TO_RIGHT_OVERRIDE :
return BidiClass : : LeftToRightOverride ;
case U_OTHER_NEUTRAL :
return BidiClass : : OtherNeutral ;
case U_POP_DIRECTIONAL_FORMAT :
return BidiClass : : PopDirectionalFormat ;
case U_POP_DIRECTIONAL_ISOLATE :
return BidiClass : : PopDirectionalIsolate ;
case U_RIGHT_TO_LEFT :
return BidiClass : : RightToLeft ;
case U_RIGHT_TO_LEFT_ARABIC :
return BidiClass : : RightToLeftArabic ;
case U_RIGHT_TO_LEFT_EMBEDDING :
return BidiClass : : RightToLeftEmbedding ;
case U_RIGHT_TO_LEFT_ISOLATE :
return BidiClass : : RightToLeftIsolate ;
case U_RIGHT_TO_LEFT_OVERRIDE :
return BidiClass : : RightToLeftOverride ;
case U_SEGMENT_SEPARATOR :
return BidiClass : : SegmentSeparator ;
case U_WHITE_SPACE_NEUTRAL :
return BidiClass : : WhiteSpaceNeutral ;
// A count sentinel rather than a real direction; leave the switch and trip the verification below.
case U_CHAR_DIRECTION_COUNT :
break ;
}
VERIFY_NOT_REACHED ( ) ;
}
// Returns the BidiClass corresponding to the direction ICU's u_charDirection() reports for `code_point`.
BidiClass bidirectional_class(u32 code_point)
{
    return char_direction_to_bidi_class(u_charDirection(static_cast<UChar32>(code_point)));
}
2023-08-12 21:00:58 +01:00
2026-01-21 11:03:59 +00:00
// Reduces ICU's UCHAR_LINE_BREAK value for `code_point` to a LineBreakClass. Hebrew letters are reported as
// Alphabetic and the Hangul H2/H3 values as Ideographic; every value without an explicit case is Other.
LineBreakClass line_break_class(u32 code_point)
{
    auto const raw_value = u_getIntPropertyValue(static_cast<UChar32>(code_point), UCHAR_LINE_BREAK);

    switch (static_cast<ULineBreak>(raw_value)) {
    case U_LB_ALPHABETIC:
    case U_LB_HEBREW_LETTER:
        return LineBreakClass::Alphabetic;

    case U_LB_NUMERIC:
        return LineBreakClass::Numeric;

    case U_LB_IDEOGRAPHIC:
    case U_LB_H2:
    case U_LB_H3:
        return LineBreakClass::Ideographic;

    case U_LB_AMBIGUOUS:
        return LineBreakClass::Ambiguous;

    case U_LB_COMPLEX_CONTEXT:
        return LineBreakClass::ComplexContext;

    case U_LB_COMBINING_MARK:
        return LineBreakClass::CombiningMark;

    default:
        return LineBreakClass::Other;
    }
}
2026-02-15 15:08:24 +01:00
// 22.2.2.7.3 Canonicalize ( rer, ch ), https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
// Returns the canonical form of `code_point` for case-insensitive matching. With `unicode_mode` set this is
// ICU's default case folding; otherwise it is the single-code-point uppercase mapping subject to the
// restrictions of steps 7 and 9 below. Callers are expected to invoke this only when ignoring case (step 2).
u32 canonicalize(u32 code_point, bool unicode_mode)
{
    // 1. If HasEitherUnicodeFlag(rer) is true and rer.[[IgnoreCase]] is true, then
    //     a. If the file CaseFolding.txt of the Unicode Character Database provides a simple or common case folding mapping for ch, return the result of applying that mapping to ch.
    //     b. Return ch.
    if (unicode_mode)
        return u_foldCase(static_cast<UChar32>(code_point), U_FOLD_CASE_DEFAULT);

    // 2. If rer.[[IgnoreCase]] is false, return ch.
    // NOTE: This is handled by the caller.

    // 3. Assert: ch is a UTF-16 code unit.
    // 4. Let cp be the code point whose numeric value is the numeric value of ch.
    // NOTE: We already have a code point.

    // NOTE: ch may be a lone surrogate code unit. Surrogates have no case mapping, so steps 5-10 yield ch itself.
    //       Return early instead of round-tripping it through a UTF-8 String, which cannot represent a surrogate
    //       in well-formed form and is therefore not guaranteed to hand the same value back.
    constexpr u32 first_surrogate = 0xD800;
    constexpr u32 last_surrogate = 0xDFFF;
    if (code_point >= first_surrogate && code_point <= last_surrogate)
        return code_point;

    // 5. Let u be toUppercase(« cp »), according to the Unicode Default Case Conversion algorithm.
    // 6. Let uStr be CodePointsToString(u).
    auto code_point_string = String::from_code_point(code_point);
    auto uppercased = code_point_string.to_uppercase();
    if (uppercased.is_error())
        return code_point;

    auto code_points = uppercased.value().code_points();

    // 7. If the length of uStr ≠ 1, return ch.
    if (code_points.length() != 1)
        return code_point;

    // 8. Let cu be uStr's single code unit element.
    auto it = code_points.begin();
    auto uppercased_code_point = *it;

    // 9. If the numeric value of ch ≥ 128 and the numeric value of cu < 128, return ch.
    if (code_point >= 128 && uppercased_code_point < 128)
        return code_point;

    // 10. Return cu.
    return uppercased_code_point;
}
// Returns the inclusive code point ranges that make up [from, to] after closing that range over
// USET_CASE_INSENSITIVE, in the order ICU's UnicodeSet reports them.
Vector<CodePointRange> expand_range_case_insensitive(u32 from, u32 to)
{
    icu::UnicodeSet closure(static_cast<UChar32>(from), static_cast<UChar32>(to));
    closure.closeOver(USET_CASE_INSENSITIVE);

    auto const range_count = closure.getRangeCount();

    Vector<CodePointRange> ranges;
    ranges.ensure_capacity(range_count);

    // Capacity was reserved above, so the unchecked append cannot outgrow the buffer.
    for (int32_t index = 0; index < range_count; ++index) {
        auto const first = static_cast<u32>(closure.getRangeStart(index));
        auto const last = static_cast<u32>(closure.getRangeEnd(index));
        ranges.unchecked_append({ first, last });
    }

    return ranges;
}
// Invokes `callback` for every code point in the case-insensitive closure of the Unicode-mode canonical form of
// `code_point`. Iteration stops as soon as the callback returns IterationDecision::Break.
void for_each_case_folded_code_point(u32 code_point, Function<IterationDecision(u32)> callback)
{
    auto const folded = static_cast<UChar32>(canonicalize(code_point, true));

    icu::UnicodeSet equivalents(folded, folded);
    equivalents.closeOver(USET_CASE_INSENSITIVE);

    auto const range_count = equivalents.getRangeCount();
    for (int32_t range = 0; range < range_count; ++range) {
        auto const first = equivalents.getRangeStart(range);
        auto const last = equivalents.getRangeEnd(range);

        for (auto member = first; member <= last; ++member) {
            auto const decision = callback(static_cast<u32>(member));
            if (decision == IterationDecision::Break)
                return;
        }
    }
}
// Returns whether `code_point` matches the inclusive range [from, to] when case is ignored: either it lies in
// the range itself, or some case variant of it lies in the range and canonicalizes (under `unicode_mode`) to
// the same value as `code_point` does.
bool code_point_matches_range_ignoring_case(u32 code_point, u32 from, u32 to, bool unicode_mode)
{
    bool const lies_in_range = code_point >= from && code_point <= to;
    if (lies_in_range)
        return true;

    // Collect the case variants of the code point and keep only those that fall inside the range.
    auto const icu_code_point = static_cast<UChar32>(code_point);
    icu::UnicodeSet variants_in_range(icu_code_point, icu_code_point);
    variants_in_range.closeOver(USET_CASE_INSENSITIVE);
    variants_in_range.retain(static_cast<UChar32>(from), static_cast<UChar32>(to));

    if (variants_in_range.isEmpty())
        return false;

    // ICU's closure is not mode-aware, so each surviving variant must also agree under canonicalize().
    auto const expected = canonicalize(code_point, unicode_mode);

    auto const range_count = variants_in_range.getRangeCount();
    for (auto range = 0; range < range_count; ++range) {
        auto const first = variants_in_range.getRangeStart(range);
        auto const last = variants_in_range.getRangeEnd(range);

        for (auto variant = first; variant <= last; ++variant) {
            if (canonicalize(variant, unicode_mode) == expected)
                return true;
        }
    }

    return false;
}
2021-07-25 15:10:51 -04:00
}