mirror of
				https://github.com/godotengine/godot.git
				synced 2025-11-03 23:21:15 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			137 lines
		
	
	
	
		
			4 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			137 lines
		
	
	
	
		
			4 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
#!/usr/bin/env python3
 | 
						|
 | 
						|
# Script used to dump char ranges for specific properties from
 | 
						|
# the Unicode Character Database to the `char_range.inc` file.
 | 
						|
# NOTE: This script is deliberately not integrated into the build system;
 | 
						|
# you should run it manually whenever you want to update the data.
 | 
						|
 | 
						|
import os
 | 
						|
import sys
 | 
						|
from typing import Final, List, Tuple
 | 
						|
from urllib.request import urlopen
 | 
						|
 | 
						|
if __name__ == "__main__":
 | 
						|
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
 | 
						|
 | 
						|
from methods import generate_copyright_header
 | 
						|
 | 
						|
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt"
 | 
						|
 | 
						|
 | 
						|
xid_start: List[Tuple[int, int]] = []
 | 
						|
xid_continue: List[Tuple[int, int]] = []
 | 
						|
uppercase_letter: List[Tuple[int, int]] = []
 | 
						|
lowercase_letter: List[Tuple[int, int]] = []
 | 
						|
unicode_letter: List[Tuple[int, int]] = []
 | 
						|
 | 
						|
 | 
						|
def merge_ranges(ranges: List[Tuple[int, int]]) -> None:
 | 
						|
    if len(ranges) < 2:
 | 
						|
        return
 | 
						|
 | 
						|
    last_start: int = ranges[0][0]
 | 
						|
    last_end: int = ranges[0][1]
 | 
						|
    original_ranges: List[Tuple[int, int]] = ranges[1:]
 | 
						|
 | 
						|
    ranges.clear()
 | 
						|
 | 
						|
    for curr_range in original_ranges:
 | 
						|
        curr_start: int = curr_range[0]
 | 
						|
        curr_end: int = curr_range[1]
 | 
						|
        if last_end + 1 != curr_start:
 | 
						|
            ranges.append((last_start, last_end))
 | 
						|
            last_start = curr_start
 | 
						|
        last_end = curr_end
 | 
						|
 | 
						|
    ranges.append((last_start, last_end))
 | 
						|
 | 
						|
 | 
						|
def parse_unicode_data() -> None:
 | 
						|
    lines: List[str] = [line.decode("utf-8") for line in urlopen(URL)]
 | 
						|
 | 
						|
    for line in lines:
 | 
						|
        if line.startswith("#") or not line.strip():
 | 
						|
            continue
 | 
						|
 | 
						|
        split_line: List[str] = line.split(";")
 | 
						|
 | 
						|
        char_range: str = split_line[0].strip()
 | 
						|
        char_property: str = split_line[1].strip().split("#")[0].strip()
 | 
						|
 | 
						|
        range_start: str = char_range
 | 
						|
        range_end: str = char_range
 | 
						|
        if ".." in char_range:
 | 
						|
            range_start, range_end = char_range.split("..")
 | 
						|
 | 
						|
        range_tuple: Tuple[int, int] = (int(range_start, 16), int(range_end, 16))
 | 
						|
 | 
						|
        if char_property == "XID_Start":
 | 
						|
            xid_start.append(range_tuple)
 | 
						|
        elif char_property == "XID_Continue":
 | 
						|
            xid_continue.append(range_tuple)
 | 
						|
        elif char_property == "Uppercase":
 | 
						|
            uppercase_letter.append(range_tuple)
 | 
						|
        elif char_property == "Lowercase":
 | 
						|
            lowercase_letter.append(range_tuple)
 | 
						|
        elif char_property == "Alphabetic":
 | 
						|
            unicode_letter.append(range_tuple)
 | 
						|
 | 
						|
    # Underscore technically isn't in XID_Start, but for our purposes it's included.
 | 
						|
    xid_start.append((0x005F, 0x005F))
 | 
						|
    xid_start.sort(key=lambda x: x[0])
 | 
						|
 | 
						|
    merge_ranges(xid_start)
 | 
						|
    merge_ranges(xid_continue)
 | 
						|
    merge_ranges(uppercase_letter)
 | 
						|
    merge_ranges(lowercase_letter)
 | 
						|
    merge_ranges(unicode_letter)
 | 
						|
 | 
						|
 | 
						|
def make_array(array_name: str, range_list: List[Tuple[int, int]]) -> str:
 | 
						|
    result: str = f"\n\nconstexpr inline CharRange {array_name}[] = {{\n"
 | 
						|
 | 
						|
    for start, end in range_list:
 | 
						|
        result += f"\t{{ 0x{start:x}, 0x{end:x} }},\n"
 | 
						|
 | 
						|
    result += "};"
 | 
						|
 | 
						|
    return result
 | 
						|
 | 
						|
 | 
						|
def generate_char_range_inc() -> None:
 | 
						|
    parse_unicode_data()
 | 
						|
 | 
						|
    source: str = generate_copyright_header("char_range.inc")
 | 
						|
 | 
						|
    source += f"""
 | 
						|
// This file was generated using the `misc/scripts/char_range_fetch.py` script.
 | 
						|
 | 
						|
#pragma once
 | 
						|
 | 
						|
#include "core/typedefs.h"
 | 
						|
 | 
						|
// Unicode Derived Core Properties
 | 
						|
// Source: {URL}
 | 
						|
 | 
						|
struct CharRange {{
 | 
						|
\tchar32_t start;
 | 
						|
\tchar32_t end;
 | 
						|
}};"""
 | 
						|
 | 
						|
    source += make_array("xid_start", xid_start)
 | 
						|
    source += make_array("xid_continue", xid_continue)
 | 
						|
    source += make_array("uppercase_letter", uppercase_letter)
 | 
						|
    source += make_array("lowercase_letter", lowercase_letter)
 | 
						|
    source += make_array("unicode_letter", unicode_letter)
 | 
						|
 | 
						|
    source += "\n"
 | 
						|
 | 
						|
    char_range_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/char_range.inc")
 | 
						|
    with open(char_range_path, "w", newline="\n") as f:
 | 
						|
        f.write(source)
 | 
						|
 | 
						|
    print("`char_range.inc` generated successfully.")
 | 
						|
 | 
						|
 | 
						|
if __name__ == "__main__":
 | 
						|
    generate_char_range_inc()
 |