mirror of
				https://github.com/LadybirdBrowser/ladybird.git
				synced 2025-11-04 07:10:57 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			312 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			312 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
/*
 | 
						|
 * Copyright (c) 2022, mat
 | 
						|
 *
 | 
						|
 * SPDX-License-Identifier: BSD-2-Clause
 | 
						|
 */
 | 
						|
 | 
						|
#include <AK/Find.h>
 | 
						|
#include <AK/QuickSort.h>
 | 
						|
#include <AK/Utf8View.h>
 | 
						|
#include <AK/Vector.h>
 | 
						|
#include <LibUnicode/CharacterTypes.h>
 | 
						|
#include <LibUnicode/Normalize.h>
 | 
						|
 | 
						|
#if ENABLE_UNICODE_DATA
 | 
						|
#    include <LibUnicode/UnicodeData.h>
 | 
						|
#else
 | 
						|
// Empty placeholder definition, only compiled when ENABLE_UNICODE_DATA is off and
// <LibUnicode/UnicodeData.h> is therefore not included.
// NOTE(review): presumably this completes a forward declaration from one of the headers above so that
// Optional<CodePointDecomposition const> can be instantiated below - confirm.
struct Unicode::CodePointDecomposition { };
 | 
						|
#endif
 | 
						|
 | 
						|
namespace Unicode {
 | 
						|
 | 
						|
// Weak fallback: reports "no decomposition" for every code point.
// Being a weak symbol, a non-weak definition elsewhere takes precedence at link time
// (presumably the generated Unicode data provides one - verify).
Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32)
{
    return {};
}
 | 
						|
// Weak fallback: reports "no decomposition" for every table index, so iteration by index ends immediately.
// Being a weak symbol, a non-weak definition elsewhere takes precedence at link time
// (presumably the generated Unicode data provides one - verify).
Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition_by_index(size_t)
{
    return {};
}
 | 
						|
 | 
						|
// Converts the textual name of a normalization form ("NFC", "NFD", "NFKC" or "NFKD") into the
// matching NormalizationForm enumerator.
// Any other input is a programming error and trips VERIFY_NOT_REACHED(); callers must validate first.
NormalizationForm normalization_form_from_string(StringView form)
{
    // Canonical forms.
    if (form == "NFC"sv)
        return NormalizationForm::NFC;
    if (form == "NFD"sv)
        return NormalizationForm::NFD;

    // Compatibility forms.
    if (form == "NFKC"sv)
        return NormalizationForm::NFKC;
    if (form == "NFKD"sv)
        return NormalizationForm::NFKD;

    VERIFY_NOT_REACHED();
}
 | 
						|
 | 
						|
// Returns the canonical textual name of a normalization form.
// This is the inverse of normalization_form_from_string(). A value outside the four
// enumerators trips VERIFY_NOT_REACHED().
StringView normalization_form_to_string(NormalizationForm form)
{
    switch (form) {
    // Canonical forms.
    case NormalizationForm::NFC:
        return "NFC"sv;
    case NormalizationForm::NFD:
        return "NFD"sv;
    // Compatibility forms.
    case NormalizationForm::NFKC:
        return "NFKC"sv;
    case NormalizationForm::NFKD:
        return "NFKD"sv;
    }

    VERIFY_NOT_REACHED();
}
 | 
						|
 | 
						|
// A "starter" is a code point whose canonical combining class is zero.
ALWAYS_INLINE static bool is_starter(u32 code_point)
{
    auto const combining_class = Unicode::canonical_combining_class(code_point);
    return combining_class == 0;
}
 | 
						|
 | 
						|
// From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
 | 
						|
static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00;
 | 
						|
static constexpr u32 HANGUL_LEADING_BASE = 0x1100;
 | 
						|
static constexpr u32 HANGUL_VOWEL_BASE = 0x1161;
 | 
						|
static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7;
 | 
						|
static constexpr u32 HANGUL_LEADING_COUNT = 19;
 | 
						|
static constexpr u32 HANGUL_VOWEL_COUNT = 21;
 | 
						|
static constexpr u32 HANGUL_TRAILING_COUNT = 28;
 | 
						|
// NCount in the standard.
 | 
						|
static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT;
 | 
						|
static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT;
 | 
						|
 | 
						|
// Returns true if the code point lies in the half-open range of precomposed Hangul syllables,
// [HANGUL_SYLLABLE_BASE, HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT).
ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point)
{
    constexpr u32 range_begin = HANGUL_SYLLABLE_BASE;
    constexpr u32 range_end = HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT;
    return range_begin <= code_point && code_point < range_end;
}
 | 
						|
 | 
						|
// Returns true if the code point lies in the half-open range of leading consonant jamo,
// [HANGUL_LEADING_BASE, HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT).
ALWAYS_INLINE static bool is_hangul_leading(u32 code_point)
{
    constexpr u32 range_begin = HANGUL_LEADING_BASE;
    constexpr u32 range_end = HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT;
    return range_begin <= code_point && code_point < range_end;
}
 | 
						|
 | 
						|
// Returns true if the code point lies in the half-open range of vowel jamo,
// [HANGUL_VOWEL_BASE, HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT).
ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point)
{
    constexpr u32 range_begin = HANGUL_VOWEL_BASE;
    constexpr u32 range_end = HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT;
    return range_begin <= code_point && code_point < range_end;
}
 | 
						|
 | 
						|
// Returns true if the code point is a trailing consonant jamo usable in Hangul composition.
//
// HANGUL_TRAILING_BASE (TBase in the standard) is one less than the first trailing consonant, so that
// a trailing index of 0 can stand for "no trailing consonant". TBase itself (U+11A7) is therefore not
// a trailing consonant and has to be excluded: the standard's composition requires 0 < TIndex < TCount.
// Accepting it would make <LV, U+11A7> "compose" to the unchanged LV syllable and drop U+11A7.
ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point)
{
    return code_point > HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT;
}
 | 
						|
 | 
						|
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
 | 
						|
static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
 | 
						|
{
 | 
						|
    auto const index = code_point - HANGUL_SYLLABLE_BASE;
 | 
						|
 | 
						|
    auto const leading_index = index / HANGUL_BLOCK_COUNT;
 | 
						|
    auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT;
 | 
						|
    auto const trailing_index = index % HANGUL_TRAILING_COUNT;
 | 
						|
 | 
						|
    auto const leading_part = HANGUL_LEADING_BASE + leading_index;
 | 
						|
    auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
 | 
						|
    auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;
 | 
						|
 | 
						|
    code_points_output.append(leading_part);
 | 
						|
    code_points_output.append(vowel_part);
 | 
						|
    if (trailing_index != 0)
 | 
						|
        code_points_output.append(trailing_part);
 | 
						|
}
 | 
						|
 | 
						|
// L, V and LV, T Hangul Syllable Composition
 | 
						|
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688
 | 
						|
static u32 combine_hangul_code_points(u32 a, u32 b)
 | 
						|
{
 | 
						|
    if (is_hangul_leading(a) && is_hangul_vowel(b)) {
 | 
						|
        auto const leading_index = a - HANGUL_LEADING_BASE;
 | 
						|
        auto const vowel_index = b - HANGUL_VOWEL_BASE;
 | 
						|
        auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT;
 | 
						|
        return HANGUL_SYLLABLE_BASE + leading_vowel_index;
 | 
						|
    }
 | 
						|
    // LV characters are the first in each "T block", so use this check to avoid combining LVT with T.
 | 
						|
    if (is_hangul_code_point(a) && (a - HANGUL_SYLLABLE_BASE) % HANGUL_TRAILING_COUNT == 0 && is_hangul_trailing(b)) {
 | 
						|
        return a + b - HANGUL_TRAILING_BASE;
 | 
						|
    }
 | 
						|
    return 0;
 | 
						|
}
 | 
						|
 | 
						|
// Looks up the primary composite for the pair <a, b>.
//
// Returns the code point whose canonical decomposition is exactly <a, b> and which does not have the
// Full_Composition_Exclusion property, or 0 if there is none. Without ENABLE_UNICODE_DATA this always
// returns 0.
static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b)
{
#if ENABLE_UNICODE_DATA
    Array<u32, 2> const wanted_decomposition { a, b };

    // FIXME: Do something better than linear search to find reverse mappings.
    size_t index = 0;
    while (true) {
        auto candidate = Unicode::code_point_decomposition_by_index(index);
        ++index;

        // An empty result marks the end of the decomposition table.
        if (!candidate.has_value())
            break;

        auto& mapping = candidate.value();
        auto const is_canonical = mapping.tag == CompatibilityFormattingTag::Canonical;
        if (!is_canonical || !(mapping.decomposition == wanted_decomposition))
            continue;

        // Excluded code points must never be produced by composition; keep looking.
        if (code_point_has_property(mapping.code_point, Property::Full_Composition_Exclusion))
            continue;

        return mapping.code_point;
    }
#endif

    return 0;
}
 | 
						|
 | 
						|
// Selects which decomposition mappings decompose_code_point() applies.
enum class UseCompatibility {
    // Apply every mapping regardless of its tag (used for NFKD / NFKC).
    Yes,
    // Apply only mappings tagged as canonical (used for NFD / NFC).
    No
};
 | 
						|
 | 
						|
// Appends the full decomposition of `code_point` to `code_points_output`.
//
// - Precomposed Hangul syllables are decomposed arithmetically.
// - Otherwise, if a decomposition mapping exists and is either canonical or `use_compatibility` is Yes,
//   the mapping is applied recursively to each of its code points.
// - Otherwise the code point is appended unchanged.
//
// Without ENABLE_UNICODE_DATA there are no mappings to consult, so every non-Hangul code point is
// appended unchanged (instead of being dropped from the output).
static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility)
{
    if (is_hangul_code_point(code_point))
        return decompose_hangul_code_point(code_point, code_points_output);

#if ENABLE_UNICODE_DATA
    auto const mapping = Unicode::code_point_decomposition(code_point);
    auto const should_apply_mapping = mapping.has_value()
        && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes);

    if (!should_apply_mapping) {
        code_points_output.append(code_point);
        return;
    }

    // The mapped code points may themselves have decompositions, so recurse until none applies.
    for (auto mapped_code_point : mapping->decomposition)
        decompose_code_point(mapped_code_point, code_points_output, use_compatibility);
#else
    code_points_output.append(code_point);
#endif
}
 | 
						|
 | 
						|
// This can be any sorting algorithm that maintains order (like std::stable_sort),
 | 
						|
// however bubble sort is easier to implement, so go with it (for now).
 | 
						|
template<typename T, typename LessThan>
 | 
						|
void bubble_sort(Span<T> span, LessThan less_than)
 | 
						|
{
 | 
						|
    for (size_t i = 0; i < span.size() - 1; ++i) {
 | 
						|
        for (size_t j = 0; j < span.size() - 1 - i; ++j) {
 | 
						|
            if (!less_than(span[j], span[j + 1]))
 | 
						|
                swap(span[j], span[j + 1]);
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
// The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
 | 
						|
// See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15
 | 
						|
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591
 | 
						|
static void canonical_ordering_algorithm(Span<u32> code_points)
 | 
						|
{
 | 
						|
    for (size_t i = 0; i < code_points.size(); ++i) {
 | 
						|
        if (!is_starter(code_points[i])) {
 | 
						|
            auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter);
 | 
						|
            auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i));
 | 
						|
            // Nothing to reorder, so continue.
 | 
						|
            if (span_size <= 1)
 | 
						|
                continue;
 | 
						|
            Span<u32> const span { code_points.data() + i, span_size };
 | 
						|
 | 
						|
            bubble_sort(span, [](u32 a, u32 b) {
 | 
						|
                // Use <= to keep ordering.
 | 
						|
                return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b);
 | 
						|
            });
 | 
						|
 | 
						|
            // Skip over span we just sorted.
 | 
						|
            i += span_size - 1;
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
// See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard.
 | 
						|
static bool is_blocked(Span<u32> code_points, size_t a, size_t c)
 | 
						|
{
 | 
						|
    if (!is_starter(code_points[a]) || a == c - 1)
 | 
						|
        return false;
 | 
						|
    auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]);
 | 
						|
    auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]);
 | 
						|
    return b_combining_class == 0 || b_combining_class >= c_combining_class;
 | 
						|
}
 | 
						|
 | 
						|
// The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
 | 
						|
// See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15
 | 
						|
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628
 | 
						|
static void canonical_composition_algorithm(Vector<u32>& code_points)
 | 
						|
{
 | 
						|
    for (size_t i = 1; i < code_points.size(); ++i) {
 | 
						|
        auto const current_character = code_points[i];
 | 
						|
        // R1. Seek back (left) to find the last Starter L preceding C in the character sequence
 | 
						|
        for (ssize_t j = i - 1; j >= 0; --j) {
 | 
						|
            if (!is_starter(code_points[j]))
 | 
						|
                continue;
 | 
						|
            // R2. If there is such an L, and C is not blocked from L,
 | 
						|
            //     and there exists a Primary Composite P which is canonically equivalent to <L, C>,
 | 
						|
            //     then replace L by P in the sequence and delete C from the sequence.
 | 
						|
            if (is_blocked(code_points.span(), j, i))
 | 
						|
                continue;
 | 
						|
 | 
						|
            auto composite = combine_hangul_code_points(code_points[j], current_character);
 | 
						|
 | 
						|
            if (composite == 0)
 | 
						|
                composite = combine_code_points(code_points[j], current_character);
 | 
						|
 | 
						|
            if (composite != 0) {
 | 
						|
                code_points[j] = composite;
 | 
						|
                code_points.remove(i);
 | 
						|
                --i;
 | 
						|
                break;
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
// Normalization Form D: canonical decomposition followed by canonical ordering.
static Vector<u32> normalize_nfd(Utf8View string)
{
    Vector<u32> decomposed;
    for (auto const input_code_point : string)
        decompose_code_point(input_code_point, decomposed, UseCompatibility::No);

    canonical_ordering_algorithm(decomposed.span());
    return decomposed;
}
 | 
						|
 | 
						|
// Normalization Form C: NFD followed by canonical composition.
static Vector<u32> normalize_nfc(Utf8View string)
{
    auto code_points = normalize_nfd(string);
    canonical_composition_algorithm(code_points);
    return code_points;
}
 | 
						|
 | 
						|
// Normalization Form KD: compatibility decomposition followed by canonical ordering.
static Vector<u32> normalize_nfkd(Utf8View string)
{
    Vector<u32> decomposed;
    for (auto const input_code_point : string)
        decompose_code_point(input_code_point, decomposed, UseCompatibility::Yes);

    canonical_ordering_algorithm(decomposed.span());
    return decomposed;
}
 | 
						|
 | 
						|
// Normalization Form KC: NFKD followed by canonical composition.
static Vector<u32> normalize_nfkc(Utf8View string)
{
    auto code_points = normalize_nfkd(string);
    canonical_composition_algorithm(code_points);
    return code_points;
}
 | 
						|
 | 
						|
// Dispatches to the normalization routine for `form` and returns the resulting code points.
// A value outside the four enumerators trips VERIFY_NOT_REACHED().
static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form)
{
    switch (form) {
    // Canonical forms.
    case NormalizationForm::NFC:
        return normalize_nfc(string);
    case NormalizationForm::NFD:
        return normalize_nfd(string);
    // Compatibility forms.
    case NormalizationForm::NFKC:
        return normalize_nfkc(string);
    case NormalizationForm::NFKD:
        return normalize_nfkd(string);
    }

    VERIFY_NOT_REACHED();
}
 | 
						|
 | 
						|
// Normalizes `string` (interpreted as UTF-8) to the requested normalization form and returns the
// result as a new String. Failure to build the resulting String is treated as fatal (MUST).
String normalize(StringView string, NormalizationForm form)
{
    auto const normalized_code_points = normalize_implementation(Utf8View { string }, form);

    StringBuilder output;
    for (auto const normalized_code_point : normalized_code_points)
        output.append_code_point(normalized_code_point);

    return MUST(output.to_string());
}
 | 
						|
 | 
						|
}
 |