| 
									
										
										
										
											2021-07-25 15:10:51 -04:00
										 |  |  | /*
 | 
					
						
							| 
									
										
										
										
											2024-06-19 16:39:30 -04:00
										 |  |  |  * Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org> | 
					
						
							| 
									
										
										
										
											2021-07-25 15:10:51 -04:00
										 |  |  |  * | 
					
						
							|  |  |  |  * SPDX-License-Identifier: BSD-2-Clause | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-21 10:39:40 -04:00
										 |  |  | #include <AK/Array.h>
 | 
					
						
							|  |  |  | #include <AK/CharacterTypes.h>
 | 
					
						
							|  |  |  | #include <AK/Find.h>
 | 
					
						
							|  |  |  | #include <AK/Traits.h>
 | 
					
						
							| 
									
										
										
										
											2021-07-25 15:10:51 -04:00
										 |  |  | #include <LibUnicode/CharacterTypes.h>
 | 
					
						
							| 
									
										
										
										
											2024-06-23 09:14:27 -04:00
										 |  |  | #include <LibUnicode/ICU.h>
 | 
					
						
							| 
									
										
										
										
											2021-07-25 15:10:51 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-21 10:39:40 -04:00
										 |  |  | #include <unicode/uchar.h>
 | 
					
						
							| 
									
										
										
										
											2024-06-21 12:39:11 -04:00
										 |  |  | #include <unicode/uscript.h>
 | 
					
						
							| 
									
										
										
										
											2024-06-21 10:39:40 -04:00
										 |  |  | 
 | 
					
						
namespace Unicode {

// The set of names recorded for one property or property value. Each name is optional because
// the lookups that populate these fields may not yield a name for every slot.
// PropertyType is not referenced by any member; it only distinguishes the name tables kept for
// different kinds of properties at the type level.
template<typename PropertyType>
struct PropertyName {
    Optional<StringView> long_name;
    Optional<StringView> short_name;
    Optional<StringView> additional_name;
};

// From uchar.h:
// Unicode allows for additional names, beyond the long and short name, which would be indicated by U_LONG_PROPERTY_NAME + i
// Only the first such additional name (i = 1) is requested through this constant.
static constexpr auto ADDITIONAL_NAME = static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + 1);

}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | template<typename PropertyType> | 
					
						
							|  |  |  | struct AK::Traits<Unicode::PropertyName<PropertyType>> { | 
					
						
							|  |  |  |     static constexpr bool equals(Unicode::PropertyName<PropertyType> const& candidate, StringView property) | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         return property == candidate.long_name || property == candidate.short_name || property == candidate.additional_name; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | }; | 
					
						
							| 
									
										
										
										
											2021-07-25 15:10:51 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | namespace Unicode { | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-21 11:24:00 -04:00
// Synthetic GeneralCategory values for the combined categories (those matched through a U_GC_*_MASK
// rather than a single UCharCategory). They are numbered starting just past U_CHAR_CATEGORY_COUNT so
// they cannot collide with the per-category values; the value U_CHAR_CATEGORY_COUNT itself is unused.
static constexpr GeneralCategory GENERAL_CATEGORY_CASED_LETTER = U_CHAR_CATEGORY_COUNT + 1;
static constexpr GeneralCategory GENERAL_CATEGORY_LETTER = U_CHAR_CATEGORY_COUNT + 2;
static constexpr GeneralCategory GENERAL_CATEGORY_MARK = U_CHAR_CATEGORY_COUNT + 3;
static constexpr GeneralCategory GENERAL_CATEGORY_NUMBER = U_CHAR_CATEGORY_COUNT + 4;
static constexpr GeneralCategory GENERAL_CATEGORY_PUNCTUATION = U_CHAR_CATEGORY_COUNT + 5;
static constexpr GeneralCategory GENERAL_CATEGORY_SYMBOL = U_CHAR_CATEGORY_COUNT + 6;
static constexpr GeneralCategory GENERAL_CATEGORY_SEPARATOR = U_CHAR_CATEGORY_COUNT + 7;
static constexpr GeneralCategory GENERAL_CATEGORY_OTHER = U_CHAR_CATEGORY_COUNT + 8;
// One past the last synthetic value; used as the size of the general category name table.
static constexpr GeneralCategory GENERAL_CATEGORY_LIMIT = U_CHAR_CATEGORY_COUNT + 9;
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Optional<GeneralCategory> general_category_from_string(StringView general_category) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     static auto general_category_names = []() { | 
					
						
							|  |  |  |         Array<PropertyName<GeneralCategory>, GENERAL_CATEGORY_LIMIT.value()> names; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         auto set_names = [&](auto property, auto index, auto general_category) { | 
					
						
							|  |  |  |             if (char const* name = u_getPropertyValueName(property, general_category, U_LONG_PROPERTY_NAME)) | 
					
						
							|  |  |  |                 names[index.value()].long_name = StringView { name, strlen(name) }; | 
					
						
							|  |  |  |             if (char const* name = u_getPropertyValueName(property, general_category, U_SHORT_PROPERTY_NAME)) | 
					
						
							|  |  |  |                 names[index.value()].short_name = StringView { name, strlen(name) }; | 
					
						
							|  |  |  |             if (char const* name = u_getPropertyValueName(property, general_category, ADDITIONAL_NAME)) | 
					
						
							|  |  |  |                 names[index.value()].additional_name = StringView { name, strlen(name) }; | 
					
						
							|  |  |  |         }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for (GeneralCategory general_category = 0; general_category < U_CHAR_CATEGORY_COUNT; ++general_category) | 
					
						
							|  |  |  |             set_names(UCHAR_GENERAL_CATEGORY, general_category, static_cast<UCharCategory>(general_category.value())); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_CASED_LETTER, U_GC_LC_MASK); | 
					
						
							|  |  |  |         set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_LETTER, U_GC_L_MASK); | 
					
						
							|  |  |  |         set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_MARK, U_GC_M_MASK); | 
					
						
							|  |  |  |         set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_NUMBER, U_GC_N_MASK); | 
					
						
							|  |  |  |         set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_PUNCTUATION, U_GC_P_MASK); | 
					
						
							|  |  |  |         set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_SYMBOL, U_GC_S_MASK); | 
					
						
							|  |  |  |         set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_SEPARATOR, U_GC_Z_MASK); | 
					
						
							|  |  |  |         set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_OTHER, U_GC_C_MASK); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return names; | 
					
						
							|  |  |  |     }(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (auto index = find_index(general_category_names.begin(), general_category_names.end(), general_category); index != general_category_names.size()) | 
					
						
							|  |  |  |         return static_cast<GeneralCategory>(index); | 
					
						
							|  |  |  |     return {}; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_general_category(u32 code_point, GeneralCategory general_category) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     auto icu_code_point = static_cast<UChar32>(code_point); | 
					
						
							|  |  |  |     auto icu_general_category = static_cast<UCharCategory>(general_category.value()); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (general_category == GENERAL_CATEGORY_CASED_LETTER) | 
					
						
							|  |  |  |         return (U_GET_GC_MASK(icu_code_point) & U_GC_LC_MASK) != 0; | 
					
						
							|  |  |  |     if (general_category == GENERAL_CATEGORY_LETTER) | 
					
						
							|  |  |  |         return (U_GET_GC_MASK(icu_code_point) & U_GC_L_MASK) != 0; | 
					
						
							|  |  |  |     if (general_category == GENERAL_CATEGORY_MARK) | 
					
						
							|  |  |  |         return (U_GET_GC_MASK(icu_code_point) & U_GC_M_MASK) != 0; | 
					
						
							|  |  |  |     if (general_category == GENERAL_CATEGORY_NUMBER) | 
					
						
							|  |  |  |         return (U_GET_GC_MASK(icu_code_point) & U_GC_N_MASK) != 0; | 
					
						
							|  |  |  |     if (general_category == GENERAL_CATEGORY_PUNCTUATION) | 
					
						
							|  |  |  |         return (U_GET_GC_MASK(icu_code_point) & U_GC_P_MASK) != 0; | 
					
						
							|  |  |  |     if (general_category == GENERAL_CATEGORY_SYMBOL) | 
					
						
							|  |  |  |         return (U_GET_GC_MASK(icu_code_point) & U_GC_S_MASK) != 0; | 
					
						
							|  |  |  |     if (general_category == GENERAL_CATEGORY_SEPARATOR) | 
					
						
							|  |  |  |         return (U_GET_GC_MASK(icu_code_point) & U_GC_Z_MASK) != 0; | 
					
						
							|  |  |  |     if (general_category == GENERAL_CATEGORY_OTHER) | 
					
						
							|  |  |  |         return (U_GET_GC_MASK(icu_code_point) & U_GC_C_MASK) != 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return u_charType(icu_code_point) == icu_general_category; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-21 16:29:16 -04:00
										 |  |  | bool code_point_is_printable(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return static_cast<bool>(u_isprint(static_cast<UChar32>(code_point))); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-21 11:24:00 -04:00
										 |  |  | bool code_point_has_control_general_category(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_general_category(code_point, U_CONTROL_CHAR); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-08 17:15:55 -04:00
										 |  |  | bool code_point_has_letter_general_category(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_general_category(code_point, GENERAL_CATEGORY_LETTER); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_number_general_category(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_general_category(code_point, GENERAL_CATEGORY_NUMBER); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-05 12:07:59 -04:00
										 |  |  | bool code_point_has_punctuation_general_category(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_general_category(code_point, GENERAL_CATEGORY_PUNCTUATION); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_separator_general_category(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_general_category(code_point, GENERAL_CATEGORY_SEPARATOR); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-21 11:24:00 -04:00
										 |  |  | bool code_point_has_space_separator_general_category(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_general_category(code_point, U_SPACE_SEPARATOR); | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2021-07-28 21:45:09 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-08 17:15:55 -04:00
										 |  |  | bool code_point_has_symbol_general_category(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_general_category(code_point, GENERAL_CATEGORY_SYMBOL); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-21 10:39:40 -04:00
// Synthetic Property values for the properties this file evaluates itself instead of passing to
// u_hasBinaryProperty(). They are numbered starting just past UCHAR_BINARY_LIMIT so they cannot
// collide with a binary UProperty value; the value UCHAR_BINARY_LIMIT itself is unused.
static constexpr Property PROPERTY_ANY = UCHAR_BINARY_LIMIT + 1;
static constexpr Property PROPERTY_ASCII = UCHAR_BINARY_LIMIT + 2;
static constexpr Property PROPERTY_ASSIGNED = UCHAR_BINARY_LIMIT + 3;
// One past the last synthetic value; used as the size of the property name table.
static constexpr Property PROPERTY_LIMIT = UCHAR_BINARY_LIMIT + 4;
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Optional<Property> property_from_string(StringView property) | 
					
						
							| 
									
										
										
										
											2021-07-29 14:18:51 -04:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2024-06-21 10:39:40 -04:00
										 |  |  |     static auto property_names = []() { | 
					
						
							|  |  |  |         Array<PropertyName<Property>, PROPERTY_LIMIT.value()> names; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for (Property property = 0; property < UCHAR_BINARY_LIMIT; ++property) { | 
					
						
							|  |  |  |             auto icu_property = static_cast<UProperty>(property.value()); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if (char const* name = u_getPropertyName(icu_property, U_LONG_PROPERTY_NAME)) | 
					
						
							|  |  |  |                 names[property.value()].long_name = StringView { name, strlen(name) }; | 
					
						
							|  |  |  |             if (char const* name = u_getPropertyName(icu_property, U_SHORT_PROPERTY_NAME)) | 
					
						
							|  |  |  |                 names[property.value()].short_name = StringView { name, strlen(name) }; | 
					
						
							|  |  |  |             if (char const* name = u_getPropertyName(icu_property, ADDITIONAL_NAME)) | 
					
						
							|  |  |  |                 names[property.value()].additional_name = StringView { name, strlen(name) }; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         names[PROPERTY_ANY.value()] = { "Any"sv, {}, {} }; | 
					
						
							|  |  |  |         names[PROPERTY_ASCII.value()] = { "ASCII"sv, {}, {} }; | 
					
						
							|  |  |  |         names[PROPERTY_ASSIGNED.value()] = { "Assigned"sv, {}, {} }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return names; | 
					
						
							|  |  |  |     }(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (auto index = find_index(property_names.begin(), property_names.end(), property); index != property_names.size()) | 
					
						
							|  |  |  |         return static_cast<Property>(index); | 
					
						
							|  |  |  |     return {}; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_property(u32 code_point, Property property) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     auto icu_code_point = static_cast<UChar32>(code_point); | 
					
						
							|  |  |  |     auto icu_property = static_cast<UProperty>(property.value()); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (property == PROPERTY_ANY) | 
					
						
							|  |  |  |         return is_unicode(code_point); | 
					
						
							|  |  |  |     if (property == PROPERTY_ASCII) | 
					
						
							|  |  |  |         return is_ascii(code_point); | 
					
						
							|  |  |  |     if (property == PROPERTY_ASSIGNED) | 
					
						
							|  |  |  |         return u_isdefined(icu_code_point); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return static_cast<bool>(u_hasBinaryProperty(icu_code_point, icu_property)); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_emoji_property(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_property(code_point, UCHAR_EMOJI); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_emoji_modifier_base_property(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_property(code_point, UCHAR_EMOJI_MODIFIER_BASE); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_emoji_presentation_property(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_property(code_point, UCHAR_EMOJI_PRESENTATION); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_identifier_start_property(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return u_isIDStart(static_cast<UChar32>(code_point)); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_identifier_continue_property(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return u_isIDPart(static_cast<UChar32>(code_point)); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_regional_indicator_property(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_property(code_point, UCHAR_REGIONAL_INDICATOR); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_variation_selector_property(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_property(code_point, UCHAR_VARIATION_SELECTOR); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-03 17:13:56 -05:00
										 |  |  | bool code_point_has_white_space_property(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     return code_point_has_property(code_point, UCHAR_WHITE_SPACE); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-21 10:39:40 -04:00
										 |  |  | // https://tc39.es/ecma262/#table-binary-unicode-properties
 | 
					
						
							|  |  |  | bool is_ecma262_property(Property property) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     if (property == PROPERTY_ANY || property == PROPERTY_ASCII || property == PROPERTY_ASSIGNED) | 
					
						
							|  |  |  |         return true; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     switch (property.value()) { | 
					
						
							|  |  |  |     case UCHAR_ASCII_HEX_DIGIT: | 
					
						
							|  |  |  |     case UCHAR_ALPHABETIC: | 
					
						
							|  |  |  |     case UCHAR_BIDI_CONTROL: | 
					
						
							|  |  |  |     case UCHAR_BIDI_MIRRORED: | 
					
						
							|  |  |  |     case UCHAR_CASE_IGNORABLE: | 
					
						
							|  |  |  |     case UCHAR_CASED: | 
					
						
							|  |  |  |     case UCHAR_CHANGES_WHEN_CASEFOLDED: | 
					
						
							|  |  |  |     case UCHAR_CHANGES_WHEN_CASEMAPPED: | 
					
						
							|  |  |  |     case UCHAR_CHANGES_WHEN_LOWERCASED: | 
					
						
							|  |  |  |     case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: | 
					
						
							|  |  |  |     case UCHAR_CHANGES_WHEN_TITLECASED: | 
					
						
							|  |  |  |     case UCHAR_CHANGES_WHEN_UPPERCASED: | 
					
						
							|  |  |  |     case UCHAR_DASH: | 
					
						
							|  |  |  |     case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: | 
					
						
							|  |  |  |     case UCHAR_DEPRECATED: | 
					
						
							|  |  |  |     case UCHAR_DIACRITIC: | 
					
						
							|  |  |  |     case UCHAR_EMOJI: | 
					
						
							|  |  |  |     case UCHAR_EMOJI_COMPONENT: | 
					
						
							|  |  |  |     case UCHAR_EMOJI_MODIFIER: | 
					
						
							|  |  |  |     case UCHAR_EMOJI_MODIFIER_BASE: | 
					
						
							|  |  |  |     case UCHAR_EMOJI_PRESENTATION: | 
					
						
							|  |  |  |     case UCHAR_EXTENDED_PICTOGRAPHIC: | 
					
						
							|  |  |  |     case UCHAR_EXTENDER: | 
					
						
							|  |  |  |     case UCHAR_GRAPHEME_BASE: | 
					
						
							|  |  |  |     case UCHAR_GRAPHEME_EXTEND: | 
					
						
							|  |  |  |     case UCHAR_HEX_DIGIT: | 
					
						
							|  |  |  |     case UCHAR_IDS_BINARY_OPERATOR: | 
					
						
							|  |  |  |     case UCHAR_IDS_TRINARY_OPERATOR: | 
					
						
							|  |  |  |     case UCHAR_ID_CONTINUE: | 
					
						
							|  |  |  |     case UCHAR_ID_START: | 
					
						
							|  |  |  |     case UCHAR_IDEOGRAPHIC: | 
					
						
							|  |  |  |     case UCHAR_JOIN_CONTROL: | 
					
						
							|  |  |  |     case UCHAR_LOGICAL_ORDER_EXCEPTION: | 
					
						
							|  |  |  |     case UCHAR_LOWERCASE: | 
					
						
							|  |  |  |     case UCHAR_MATH: | 
					
						
							|  |  |  |     case UCHAR_NONCHARACTER_CODE_POINT: | 
					
						
							|  |  |  |     case UCHAR_PATTERN_SYNTAX: | 
					
						
							|  |  |  |     case UCHAR_PATTERN_WHITE_SPACE: | 
					
						
							|  |  |  |     case UCHAR_QUOTATION_MARK: | 
					
						
							|  |  |  |     case UCHAR_RADICAL: | 
					
						
							|  |  |  |     case UCHAR_REGIONAL_INDICATOR: | 
					
						
							|  |  |  |     case UCHAR_S_TERM: | 
					
						
							|  |  |  |     case UCHAR_SOFT_DOTTED: | 
					
						
							|  |  |  |     case UCHAR_TERMINAL_PUNCTUATION: | 
					
						
							|  |  |  |     case UCHAR_UNIFIED_IDEOGRAPH: | 
					
						
							|  |  |  |     case UCHAR_UPPERCASE: | 
					
						
							|  |  |  |     case UCHAR_VARIATION_SELECTOR: | 
					
						
							|  |  |  |     case UCHAR_WHITE_SPACE: | 
					
						
							|  |  |  |     case UCHAR_XID_CONTINUE: | 
					
						
							|  |  |  |     case UCHAR_XID_START: | 
					
						
							| 
									
										
										
										
											2021-07-29 14:18:51 -04:00
										 |  |  |         return true; | 
					
						
							|  |  |  |     default: | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-21 12:39:11 -04:00
										 |  |  | Optional<Script> script_from_string(StringView script) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     static auto script_names = []() { | 
					
						
							|  |  |  |         Array<PropertyName<Script>, static_cast<size_t>(USCRIPT_CODE_LIMIT)> names; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for (Script script = 0; script < USCRIPT_CODE_LIMIT; ++script) { | 
					
						
							|  |  |  |             auto icu_script = static_cast<UScriptCode>(script.value()); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if (char const* name = uscript_getName(icu_script)) | 
					
						
							|  |  |  |                 names[script.value()].long_name = StringView { name, strlen(name) }; | 
					
						
							|  |  |  |             if (char const* name = uscript_getShortName(icu_script)) | 
					
						
							|  |  |  |                 names[script.value()].short_name = StringView { name, strlen(name) }; | 
					
						
							|  |  |  |             if (char const* name = u_getPropertyValueName(UCHAR_SCRIPT, icu_script, ADDITIONAL_NAME)) | 
					
						
							|  |  |  |                 names[script.value()].additional_name = StringView { name, strlen(name) }; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return names; | 
					
						
							|  |  |  |     }(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (auto index = find_index(script_names.begin(), script_names.end(), script); index != script_names.size()) | 
					
						
							|  |  |  |         return static_cast<Script>(index); | 
					
						
							|  |  |  |     return {}; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_script(u32 code_point, Script script) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     UErrorCode status = U_ZERO_ERROR; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     auto icu_code_point = static_cast<UChar32>(code_point); | 
					
						
							|  |  |  |     auto icu_script = static_cast<UScriptCode>(script.value()); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-23 09:14:27 -04:00
										 |  |  |     if (auto result = uscript_getScript(icu_code_point, &status); icu_success(status)) | 
					
						
							| 
									
										
										
										
											2024-06-21 12:39:11 -04:00
										 |  |  |         return result == icu_script; | 
					
						
							|  |  |  |     return false; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool code_point_has_script_extension(u32 code_point, Script script) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     auto icu_code_point = static_cast<UChar32>(code_point); | 
					
						
							|  |  |  |     auto icu_script = static_cast<UScriptCode>(script.value()); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return static_cast<bool>(uscript_hasScript(icu_code_point, icu_script)); | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2021-08-04 07:05:30 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-21 15:28:54 -04:00
										 |  |  | static constexpr BidiClass char_direction_to_bidi_class(UCharDirection direction) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     switch (direction) { | 
					
						
							|  |  |  |     case U_ARABIC_NUMBER: | 
					
						
							|  |  |  |         return BidiClass::ArabicNumber; | 
					
						
							|  |  |  |     case U_BLOCK_SEPARATOR: | 
					
						
							|  |  |  |         return BidiClass::BlockSeparator; | 
					
						
							|  |  |  |     case U_BOUNDARY_NEUTRAL: | 
					
						
							|  |  |  |         return BidiClass::BoundaryNeutral; | 
					
						
							|  |  |  |     case U_COMMON_NUMBER_SEPARATOR: | 
					
						
							|  |  |  |         return BidiClass::CommonNumberSeparator; | 
					
						
							|  |  |  |     case U_DIR_NON_SPACING_MARK: | 
					
						
							|  |  |  |         return BidiClass::DirNonSpacingMark; | 
					
						
							|  |  |  |     case U_EUROPEAN_NUMBER: | 
					
						
							|  |  |  |         return BidiClass::EuropeanNumber; | 
					
						
							|  |  |  |     case U_EUROPEAN_NUMBER_SEPARATOR: | 
					
						
							|  |  |  |         return BidiClass::EuropeanNumberSeparator; | 
					
						
							|  |  |  |     case U_EUROPEAN_NUMBER_TERMINATOR: | 
					
						
							|  |  |  |         return BidiClass::EuropeanNumberTerminator; | 
					
						
							|  |  |  |     case U_FIRST_STRONG_ISOLATE: | 
					
						
							|  |  |  |         return BidiClass::FirstStrongIsolate; | 
					
						
							|  |  |  |     case U_LEFT_TO_RIGHT: | 
					
						
							|  |  |  |         return BidiClass::LeftToRight; | 
					
						
							|  |  |  |     case U_LEFT_TO_RIGHT_EMBEDDING: | 
					
						
							|  |  |  |         return BidiClass::LeftToRightEmbedding; | 
					
						
							|  |  |  |     case U_LEFT_TO_RIGHT_ISOLATE: | 
					
						
							|  |  |  |         return BidiClass::LeftToRightIsolate; | 
					
						
							|  |  |  |     case U_LEFT_TO_RIGHT_OVERRIDE: | 
					
						
							|  |  |  |         return BidiClass::LeftToRightOverride; | 
					
						
							|  |  |  |     case U_OTHER_NEUTRAL: | 
					
						
							|  |  |  |         return BidiClass::OtherNeutral; | 
					
						
							|  |  |  |     case U_POP_DIRECTIONAL_FORMAT: | 
					
						
							|  |  |  |         return BidiClass::PopDirectionalFormat; | 
					
						
							|  |  |  |     case U_POP_DIRECTIONAL_ISOLATE: | 
					
						
							|  |  |  |         return BidiClass::PopDirectionalIsolate; | 
					
						
							|  |  |  |     case U_RIGHT_TO_LEFT: | 
					
						
							|  |  |  |         return BidiClass::RightToLeft; | 
					
						
							|  |  |  |     case U_RIGHT_TO_LEFT_ARABIC: | 
					
						
							|  |  |  |         return BidiClass::RightToLeftArabic; | 
					
						
							|  |  |  |     case U_RIGHT_TO_LEFT_EMBEDDING: | 
					
						
							|  |  |  |         return BidiClass::RightToLeftEmbedding; | 
					
						
							|  |  |  |     case U_RIGHT_TO_LEFT_ISOLATE: | 
					
						
							|  |  |  |         return BidiClass::RightToLeftIsolate; | 
					
						
							|  |  |  |     case U_RIGHT_TO_LEFT_OVERRIDE: | 
					
						
							|  |  |  |         return BidiClass::RightToLeftOverride; | 
					
						
							|  |  |  |     case U_SEGMENT_SEPARATOR: | 
					
						
							|  |  |  |         return BidiClass::SegmentSeparator; | 
					
						
							|  |  |  |     case U_WHITE_SPACE_NEUTRAL: | 
					
						
							|  |  |  |         return BidiClass::WhiteSpaceNeutral; | 
					
						
							|  |  |  |     case U_CHAR_DIRECTION_COUNT: | 
					
						
							|  |  |  |         break; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     VERIFY_NOT_REACHED(); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | BidiClass bidirectional_class(u32 code_point) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     auto icu_code_point = static_cast<UChar32>(code_point); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     auto direction = u_charDirection(icu_code_point); | 
					
						
							|  |  |  |     return char_direction_to_bidi_class(direction); | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2023-08-12 21:00:58 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-25 15:10:51 -04:00
										 |  |  | } |