mirror of
https://github.com/python/cpython.git
synced 2026-02-21 22:50:55 +00:00
parent
6940c1dc0c
commit
f1f61bf872
8 changed files with 922 additions and 2 deletions
|
|
@ -130,6 +130,18 @@ following functions:
|
|||
`Unicode Standard Annex #11 <https://www.unicode.org/reports/tr11/>`_.
|
||||
|
||||
|
||||
.. function:: block(chr, /)
|
||||
|
||||
Returns the `block
|
||||
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
|
||||
assigned to the character *chr*, or ``'No_Block'`` if *chr* does not belong to any block. For example::
|
||||
|
||||
>>> unicodedata.block('S')
|
||||
'Basic Latin'
|
||||
|
||||
.. versionadded:: next
|
||||
|
||||
|
||||
.. function:: mirrored(chr, /)
|
||||
|
||||
Returns the mirrored property assigned to the character *chr* as
|
||||
|
|
|
|||
|
|
@ -1134,6 +1134,11 @@ unicodedata
|
|||
of the character which are related to the above algorithm.
|
||||
(Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)
|
||||
|
||||
* Add :func:`~unicodedata.block` function to return the `Unicode block
|
||||
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
|
||||
assigned to a character.
|
||||
(Contributed by Stan Ulbrych in :gh:`66802`.)
|
||||
|
||||
|
||||
unittest
|
||||
--------
|
||||
|
|
|
|||
|
|
@ -973,6 +973,97 @@ def graphemes(*args):
|
|||
'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
|
||||
['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
|
||||
|
||||
def test_block(self):
    """Check block() against a table of known code points.

    The table covers block boundaries, code points added in successive
    Unicode versions, a code point that lies outside every block
    ('No_Block'), and both ends of the last block.  Each entry runs in
    its own subTest so that a single mismatch does not hide the others
    and the failing code point is named in the report.
    """
    expected_blocks = [
        ('\u0000', 'Basic Latin'),
        ('\u0041', 'Basic Latin'),
        ('\u007F', 'Basic Latin'),
        ('\u0080', 'Latin-1 Supplement'),
        ('\u00FF', 'Latin-1 Supplement'),
        ('\u1159', 'Hangul Jamo'),
        ('\u11F9', 'Hangul Jamo'),
        ('\uD788', 'Hangul Syllables'),
        ('\uD7A3', 'Hangul Syllables'),
        # New in 5.0.0
        ('\u05BA', 'Hebrew'),
        ('\u20EF', 'Combining Diacritical Marks for Symbols'),
        # New in 5.1.0
        ('\u2064', 'General Punctuation'),
        ('\uAA4D', 'Cham'),
        # New in 5.2.0
        ('\u0816', 'Samaritan'),
        ('\uA97C', 'Hangul Jamo Extended-A'),
        ('\uD7C6', 'Hangul Jamo Extended-B'),
        ('\uD7FB', 'Hangul Jamo Extended-B'),
        # New in 6.0.0
        ('\u093A', 'Devanagari'),
        ('\U00011002', 'Brahmi'),
        # New in 6.1.0
        ('\U000E0FFF', 'No_Block'),
        ('\U00016F7E', 'Miao'),
        # New in 6.2.0
        ('\U0001F1E6', 'Enclosed Alphanumeric Supplement'),
        ('\U0001F1FF', 'Enclosed Alphanumeric Supplement'),
        # New in 6.3.0
        ('\u180E', 'Mongolian'),
        ('\u1A1B', 'Buginese'),
        # New in 7.0.0
        ('\u0E33', 'Thai'),
        ('\u0EB3', 'Lao'),
        ('\U0001BCA3', 'Shorthand Format Controls'),
        ('\U0001E8D6', 'Mende Kikakui'),
        ('\U0001163E', 'Modi'),
        # New in 8.0.0
        ('\u08E3', 'Arabic Extended-A'),
        ('\U00011726', 'Ahom'),
        # New in 9.0.0
        ('\u0600', 'Arabic'),
        ('\U000E007F', 'Tags'),
        ('\U00011CB4', 'Marchen'),
        ('\u200D', 'General Punctuation'),
        # New in 10.0.0
        ('\U00011D46', 'Masaram Gondi'),
        ('\U00011D47', 'Masaram Gondi'),
        ('\U00011A97', 'Soyombo'),
        # New in 11.0.0
        ('\U000110CD', 'Kaithi'),
        ('\u07FD', 'NKo'),
        ('\U00011EF6', 'Makasar'),
        # New in 12.0.0
        ('\U00011A84', 'Soyombo'),
        ('\U00013438', 'Egyptian Hieroglyph Format Controls'),
        ('\U0001E2EF', 'Wancho'),
        ('\U00016F87', 'Miao'),
        # New in 13.0.0
        ('\U00011941', 'Dives Akuru'),
        ('\U00016FE4', 'Ideographic Symbols and Punctuation'),
        ('\U00011942', 'Dives Akuru'),
        # New in 14.0.0
        ('\u0891', 'Arabic Extended-B'),
        ('\U0001E2AE', 'Toto'),
        # New in 15.0.0
        ('\U00011F02', 'Kawi'),
        ('\U0001343F', 'Egyptian Hieroglyph Format Controls'),
        ('\U0001E4EF', 'Nag Mundari'),
        ('\U00011F3F', 'Kawi'),
        # New in 16.0.0
        ('\U000113D1', 'Tulu-Tigalari'),
        ('\U0001E5EF', 'Ol Onal'),
        ('\U0001612C', 'Gurung Khema'),
        ('\U00016D63', 'Kirat Rai'),
        # New in 17.0.0
        ('\u1AEB', 'Combining Diacritical Marks Extended'),
        ('\U00011B67', 'Sharada Supplement'),
        # First and last code points of the final block (these are not
        # unassigned: they belong to Supplementary Private Use Area-B).
        ('\U00100000', 'Supplementary Private Use Area-B'),
        ('\U0010FFFF', 'Supplementary Private Use Area-B'),
    ]
    block = self.db.block
    for char, expected in expected_blocks:
        # Label the subtest with U+XXXX rather than the raw character so
        # the report stays readable for combining/format characters.
        with self.subTest(char=f'U+{ord(char):04X}'):
            self.assertEqual(block(char), expected)
|
||||
|
||||
def test_block_invalid_input(self):
    """block() raises TypeError unless given exactly one 1-character str."""
    block = self.db.block

    # Missing argument.
    with self.assertRaises(TypeError):
        block()

    # Wrong type (bytes, int) and wrong length (empty, two characters).
    for bad_arg in (b'x', 120, '', 'xx'):
        with self.assertRaises(TypeError):
            block(bad_arg)
|
||||
|
||||
|
||||
class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
|
||||
db = unicodedata.ucd_3_2_0
|
||||
|
|
|
|||
|
|
@ -0,0 +1,3 @@
|
|||
Add :func:`unicodedata.block` function to return the `Unicode block
|
||||
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_ of a
|
||||
character.
|
||||
38
Modules/clinic/unicodedata.c.h
generated
38
Modules/clinic/unicodedata.c.h
generated
|
|
@ -691,6 +691,42 @@ exit:
|
|||
return return_value;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(unicodedata_block__doc__,
|
||||
"block($module, chr, /)\n"
|
||||
"--\n"
|
||||
"\n"
|
||||
"Return block assigned to the character chr.");
|
||||
|
||||
#define UNICODEDATA_BLOCK_METHODDEF \
|
||||
{"block", (PyCFunction)unicodedata_block, METH_O, unicodedata_block__doc__},
|
||||
|
||||
static PyObject *
|
||||
unicodedata_block_impl(PyObject *module, int chr);
|
||||
|
||||
static PyObject *
|
||||
unicodedata_block(PyObject *module, PyObject *arg)
|
||||
{
|
||||
PyObject *return_value = NULL;
|
||||
int chr;
|
||||
|
||||
if (!PyUnicode_Check(arg)) {
|
||||
_PyArg_BadArgument("block", "argument", "a unicode character", arg);
|
||||
goto exit;
|
||||
}
|
||||
if (PyUnicode_GET_LENGTH(arg) != 1) {
|
||||
PyErr_Format(PyExc_TypeError,
|
||||
"block(): argument must be a unicode character, "
|
||||
"not a string of length %zd",
|
||||
PyUnicode_GET_LENGTH(arg));
|
||||
goto exit;
|
||||
}
|
||||
chr = PyUnicode_READ_CHAR(arg, 0);
|
||||
return_value = unicodedata_block_impl(module, chr);
|
||||
|
||||
exit:
|
||||
return return_value;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(unicodedata_grapheme_cluster_break__doc__,
|
||||
"grapheme_cluster_break($module, chr, /)\n"
|
||||
"--\n"
|
||||
|
|
@ -798,4 +834,4 @@ unicodedata_extended_pictographic(PyObject *module, PyObject *arg)
|
|||
exit:
|
||||
return return_value;
|
||||
}
|
||||
/*[clinic end generated code: output=0f09cc90f06ace76 input=a9049054013a1b77]*/
|
||||
/*[clinic end generated code: output=482a87df218f07c1 input=a9049054013a1b77]*/
|
||||
|
|
|
|||
|
|
@ -2066,6 +2066,39 @@ unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
|
|||
return (PyObject*)gbi;
|
||||
}
|
||||
|
||||
/*[clinic input]
|
||||
unicodedata.block
|
||||
|
||||
chr: int(accept={str})
|
||||
/
|
||||
|
||||
Return block assigned to the character chr.
|
||||
[clinic start generated code]*/
|
||||
|
||||
static PyObject *
unicodedata_block_impl(PyObject *module, int chr)
/*[clinic end generated code: output=5f8b40c49eaec75a input=0834cf2642d6eaae]*/
{
    /* Binary search over _PyUnicode_Blocks.  Each entry is an inclusive
       [start, end] code point range plus an index into
       _PyUnicode_BlockNames; the search relies on the entries being in
       ascending, non-overlapping order. */
    const Py_UCS4 code = (Py_UCS4)chr;
    int low = 0;
    int high = BLOCK_COUNT - 1;

    while (low <= high) {
        const int probe = (low + high) / 2;
        const _PyUnicode_Block *entry = &_PyUnicode_Blocks[probe];

        if (code < entry->start) {
            /* Target lies before this block: keep the lower half. */
            high = probe - 1;
            continue;
        }
        if (code > entry->end) {
            /* Target lies after this block: keep the upper half. */
            low = probe + 1;
            continue;
        }
        /* start <= code <= end: this is the containing block. */
        return PyUnicode_FromString(_PyUnicode_BlockNames[entry->name]);
    }

    // No range matched (the code point falls in a gap between blocks).
    // Return the default value per
    // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189
    return PyUnicode_FromString("No_Block");
}
|
||||
|
||||
/*[clinic input]
|
||||
unicodedata.grapheme_cluster_break
|
||||
|
||||
|
|
@ -2128,6 +2161,7 @@ unicodedata_extended_pictographic_impl(PyObject *module, int chr)
|
|||
// an UCD instance.
|
||||
static PyMethodDef unicodedata_functions[] = {
|
||||
// Module only functions.
|
||||
UNICODEDATA_BLOCK_METHODDEF
|
||||
UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF
|
||||
UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF
|
||||
UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF
|
||||
|
|
@ -2137,7 +2171,7 @@ static PyMethodDef unicodedata_functions[] = {
|
|||
|
||||
// The following definitions are shared between the module
|
||||
// and the UCD class.
|
||||
#define DB_methods (unicodedata_functions + 6)
|
||||
#define DB_methods (unicodedata_functions + 7)
|
||||
|
||||
UNICODEDATA_UCD_DECIMAL_METHODDEF
|
||||
UNICODEDATA_UCD_DIGIT_METHODDEF
|
||||
|
|
|
|||
703
Modules/unicodedata_db.h
generated
703
Modules/unicodedata_db.h
generated
|
|
@ -796,6 +796,709 @@ const char * const _PyUnicode_IndicConjunctBreakNames[] = {
|
|||
"Extend",
|
||||
NULL
|
||||
};
|
||||
static const char * const _PyUnicode_BlockNames[] = {
|
||||
"Basic Latin",
|
||||
"Latin-1 Supplement",
|
||||
"Latin Extended-A",
|
||||
"Latin Extended-B",
|
||||
"IPA Extensions",
|
||||
"Spacing Modifier Letters",
|
||||
"Combining Diacritical Marks",
|
||||
"Greek and Coptic",
|
||||
"Cyrillic",
|
||||
"Cyrillic Supplement",
|
||||
"Armenian",
|
||||
"Hebrew",
|
||||
"Arabic",
|
||||
"Syriac",
|
||||
"Arabic Supplement",
|
||||
"Thaana",
|
||||
"NKo",
|
||||
"Samaritan",
|
||||
"Mandaic",
|
||||
"Syriac Supplement",
|
||||
"Arabic Extended-B",
|
||||
"Arabic Extended-A",
|
||||
"Devanagari",
|
||||
"Bengali",
|
||||
"Gurmukhi",
|
||||
"Gujarati",
|
||||
"Oriya",
|
||||
"Tamil",
|
||||
"Telugu",
|
||||
"Kannada",
|
||||
"Malayalam",
|
||||
"Sinhala",
|
||||
"Thai",
|
||||
"Lao",
|
||||
"Tibetan",
|
||||
"Myanmar",
|
||||
"Georgian",
|
||||
"Hangul Jamo",
|
||||
"Ethiopic",
|
||||
"Ethiopic Supplement",
|
||||
"Cherokee",
|
||||
"Unified Canadian Aboriginal Syllabics",
|
||||
"Ogham",
|
||||
"Runic",
|
||||
"Tagalog",
|
||||
"Hanunoo",
|
||||
"Buhid",
|
||||
"Tagbanwa",
|
||||
"Khmer",
|
||||
"Mongolian",
|
||||
"Unified Canadian Aboriginal Syllabics Extended",
|
||||
"Limbu",
|
||||
"Tai Le",
|
||||
"New Tai Lue",
|
||||
"Khmer Symbols",
|
||||
"Buginese",
|
||||
"Tai Tham",
|
||||
"Combining Diacritical Marks Extended",
|
||||
"Balinese",
|
||||
"Sundanese",
|
||||
"Batak",
|
||||
"Lepcha",
|
||||
"Ol Chiki",
|
||||
"Cyrillic Extended-C",
|
||||
"Georgian Extended",
|
||||
"Sundanese Supplement",
|
||||
"Vedic Extensions",
|
||||
"Phonetic Extensions",
|
||||
"Phonetic Extensions Supplement",
|
||||
"Combining Diacritical Marks Supplement",
|
||||
"Latin Extended Additional",
|
||||
"Greek Extended",
|
||||
"General Punctuation",
|
||||
"Superscripts and Subscripts",
|
||||
"Currency Symbols",
|
||||
"Combining Diacritical Marks for Symbols",
|
||||
"Letterlike Symbols",
|
||||
"Number Forms",
|
||||
"Arrows",
|
||||
"Mathematical Operators",
|
||||
"Miscellaneous Technical",
|
||||
"Control Pictures",
|
||||
"Optical Character Recognition",
|
||||
"Enclosed Alphanumerics",
|
||||
"Box Drawing",
|
||||
"Block Elements",
|
||||
"Geometric Shapes",
|
||||
"Miscellaneous Symbols",
|
||||
"Dingbats",
|
||||
"Miscellaneous Mathematical Symbols-A",
|
||||
"Supplemental Arrows-A",
|
||||
"Braille Patterns",
|
||||
"Supplemental Arrows-B",
|
||||
"Miscellaneous Mathematical Symbols-B",
|
||||
"Supplemental Mathematical Operators",
|
||||
"Miscellaneous Symbols and Arrows",
|
||||
"Glagolitic",
|
||||
"Latin Extended-C",
|
||||
"Coptic",
|
||||
"Georgian Supplement",
|
||||
"Tifinagh",
|
||||
"Ethiopic Extended",
|
||||
"Cyrillic Extended-A",
|
||||
"Supplemental Punctuation",
|
||||
"CJK Radicals Supplement",
|
||||
"Kangxi Radicals",
|
||||
"Ideographic Description Characters",
|
||||
"CJK Symbols and Punctuation",
|
||||
"Hiragana",
|
||||
"Katakana",
|
||||
"Bopomofo",
|
||||
"Hangul Compatibility Jamo",
|
||||
"Kanbun",
|
||||
"Bopomofo Extended",
|
||||
"CJK Strokes",
|
||||
"Katakana Phonetic Extensions",
|
||||
"Enclosed CJK Letters and Months",
|
||||
"CJK Compatibility",
|
||||
"CJK Unified Ideographs Extension A",
|
||||
"Yijing Hexagram Symbols",
|
||||
"CJK Unified Ideographs",
|
||||
"Yi Syllables",
|
||||
"Yi Radicals",
|
||||
"Lisu",
|
||||
"Vai",
|
||||
"Cyrillic Extended-B",
|
||||
"Bamum",
|
||||
"Modifier Tone Letters",
|
||||
"Latin Extended-D",
|
||||
"Syloti Nagri",
|
||||
"Common Indic Number Forms",
|
||||
"Phags-pa",
|
||||
"Saurashtra",
|
||||
"Devanagari Extended",
|
||||
"Kayah Li",
|
||||
"Rejang",
|
||||
"Hangul Jamo Extended-A",
|
||||
"Javanese",
|
||||
"Myanmar Extended-B",
|
||||
"Cham",
|
||||
"Myanmar Extended-A",
|
||||
"Tai Viet",
|
||||
"Meetei Mayek Extensions",
|
||||
"Ethiopic Extended-A",
|
||||
"Latin Extended-E",
|
||||
"Cherokee Supplement",
|
||||
"Meetei Mayek",
|
||||
"Hangul Syllables",
|
||||
"Hangul Jamo Extended-B",
|
||||
"High Surrogates",
|
||||
"High Private Use Surrogates",
|
||||
"Low Surrogates",
|
||||
"Private Use Area",
|
||||
"CJK Compatibility Ideographs",
|
||||
"Alphabetic Presentation Forms",
|
||||
"Arabic Presentation Forms-A",
|
||||
"Variation Selectors",
|
||||
"Vertical Forms",
|
||||
"Combining Half Marks",
|
||||
"CJK Compatibility Forms",
|
||||
"Small Form Variants",
|
||||
"Arabic Presentation Forms-B",
|
||||
"Halfwidth and Fullwidth Forms",
|
||||
"Specials",
|
||||
"Linear B Syllabary",
|
||||
"Linear B Ideograms",
|
||||
"Aegean Numbers",
|
||||
"Ancient Greek Numbers",
|
||||
"Ancient Symbols",
|
||||
"Phaistos Disc",
|
||||
"Lycian",
|
||||
"Carian",
|
||||
"Coptic Epact Numbers",
|
||||
"Old Italic",
|
||||
"Gothic",
|
||||
"Old Permic",
|
||||
"Ugaritic",
|
||||
"Old Persian",
|
||||
"Deseret",
|
||||
"Shavian",
|
||||
"Osmanya",
|
||||
"Osage",
|
||||
"Elbasan",
|
||||
"Caucasian Albanian",
|
||||
"Vithkuqi",
|
||||
"Todhri",
|
||||
"Linear A",
|
||||
"Latin Extended-F",
|
||||
"Cypriot Syllabary",
|
||||
"Imperial Aramaic",
|
||||
"Palmyrene",
|
||||
"Nabataean",
|
||||
"Hatran",
|
||||
"Phoenician",
|
||||
"Lydian",
|
||||
"Sidetic",
|
||||
"Meroitic Hieroglyphs",
|
||||
"Meroitic Cursive",
|
||||
"Kharoshthi",
|
||||
"Old South Arabian",
|
||||
"Old North Arabian",
|
||||
"Manichaean",
|
||||
"Avestan",
|
||||
"Inscriptional Parthian",
|
||||
"Inscriptional Pahlavi",
|
||||
"Psalter Pahlavi",
|
||||
"Old Turkic",
|
||||
"Old Hungarian",
|
||||
"Hanifi Rohingya",
|
||||
"Garay",
|
||||
"Rumi Numeral Symbols",
|
||||
"Yezidi",
|
||||
"Arabic Extended-C",
|
||||
"Old Sogdian",
|
||||
"Sogdian",
|
||||
"Old Uyghur",
|
||||
"Chorasmian",
|
||||
"Elymaic",
|
||||
"Brahmi",
|
||||
"Kaithi",
|
||||
"Sora Sompeng",
|
||||
"Chakma",
|
||||
"Mahajani",
|
||||
"Sharada",
|
||||
"Sinhala Archaic Numbers",
|
||||
"Khojki",
|
||||
"Multani",
|
||||
"Khudawadi",
|
||||
"Grantha",
|
||||
"Tulu-Tigalari",
|
||||
"Newa",
|
||||
"Tirhuta",
|
||||
"Siddham",
|
||||
"Modi",
|
||||
"Mongolian Supplement",
|
||||
"Takri",
|
||||
"Myanmar Extended-C",
|
||||
"Ahom",
|
||||
"Dogra",
|
||||
"Warang Citi",
|
||||
"Dives Akuru",
|
||||
"Nandinagari",
|
||||
"Zanabazar Square",
|
||||
"Soyombo",
|
||||
"Unified Canadian Aboriginal Syllabics Extended-A",
|
||||
"Pau Cin Hau",
|
||||
"Devanagari Extended-A",
|
||||
"Sharada Supplement",
|
||||
"Sunuwar",
|
||||
"Bhaiksuki",
|
||||
"Marchen",
|
||||
"Masaram Gondi",
|
||||
"Gunjala Gondi",
|
||||
"Tolong Siki",
|
||||
"Makasar",
|
||||
"Kawi",
|
||||
"Lisu Supplement",
|
||||
"Tamil Supplement",
|
||||
"Cuneiform",
|
||||
"Cuneiform Numbers and Punctuation",
|
||||
"Early Dynastic Cuneiform",
|
||||
"Cypro-Minoan",
|
||||
"Egyptian Hieroglyphs",
|
||||
"Egyptian Hieroglyph Format Controls",
|
||||
"Egyptian Hieroglyphs Extended-A",
|
||||
"Anatolian Hieroglyphs",
|
||||
"Gurung Khema",
|
||||
"Bamum Supplement",
|
||||
"Mro",
|
||||
"Tangsa",
|
||||
"Bassa Vah",
|
||||
"Pahawh Hmong",
|
||||
"Kirat Rai",
|
||||
"Medefaidrin",
|
||||
"Beria Erfe",
|
||||
"Miao",
|
||||
"Ideographic Symbols and Punctuation",
|
||||
"Tangut",
|
||||
"Tangut Components",
|
||||
"Khitan Small Script",
|
||||
"Tangut Supplement",
|
||||
"Tangut Components Supplement",
|
||||
"Kana Extended-B",
|
||||
"Kana Supplement",
|
||||
"Kana Extended-A",
|
||||
"Small Kana Extension",
|
||||
"Nushu",
|
||||
"Duployan",
|
||||
"Shorthand Format Controls",
|
||||
"Symbols for Legacy Computing Supplement",
|
||||
"Miscellaneous Symbols Supplement",
|
||||
"Znamenny Musical Notation",
|
||||
"Byzantine Musical Symbols",
|
||||
"Musical Symbols",
|
||||
"Ancient Greek Musical Notation",
|
||||
"Kaktovik Numerals",
|
||||
"Mayan Numerals",
|
||||
"Tai Xuan Jing Symbols",
|
||||
"Counting Rod Numerals",
|
||||
"Mathematical Alphanumeric Symbols",
|
||||
"Sutton SignWriting",
|
||||
"Latin Extended-G",
|
||||
"Glagolitic Supplement",
|
||||
"Cyrillic Extended-D",
|
||||
"Nyiakeng Puachue Hmong",
|
||||
"Toto",
|
||||
"Wancho",
|
||||
"Nag Mundari",
|
||||
"Ol Onal",
|
||||
"Tai Yo",
|
||||
"Ethiopic Extended-B",
|
||||
"Mende Kikakui",
|
||||
"Adlam",
|
||||
"Indic Siyaq Numbers",
|
||||
"Ottoman Siyaq Numbers",
|
||||
"Arabic Mathematical Alphabetic Symbols",
|
||||
"Mahjong Tiles",
|
||||
"Domino Tiles",
|
||||
"Playing Cards",
|
||||
"Enclosed Alphanumeric Supplement",
|
||||
"Enclosed Ideographic Supplement",
|
||||
"Miscellaneous Symbols and Pictographs",
|
||||
"Emoticons",
|
||||
"Ornamental Dingbats",
|
||||
"Transport and Map Symbols",
|
||||
"Alchemical Symbols",
|
||||
"Geometric Shapes Extended",
|
||||
"Supplemental Arrows-C",
|
||||
"Supplemental Symbols and Pictographs",
|
||||
"Chess Symbols",
|
||||
"Symbols and Pictographs Extended-A",
|
||||
"Symbols for Legacy Computing",
|
||||
"CJK Unified Ideographs Extension B",
|
||||
"CJK Unified Ideographs Extension C",
|
||||
"CJK Unified Ideographs Extension D",
|
||||
"CJK Unified Ideographs Extension E",
|
||||
"CJK Unified Ideographs Extension F",
|
||||
"CJK Unified Ideographs Extension I",
|
||||
"CJK Compatibility Ideographs Supplement",
|
||||
"CJK Unified Ideographs Extension G",
|
||||
"CJK Unified Ideographs Extension H",
|
||||
"CJK Unified Ideographs Extension J",
|
||||
"Tags",
|
||||
"Variation Selectors Supplement",
|
||||
"Supplementary Private Use Area-A",
|
||||
"Supplementary Private Use Area-B",
|
||||
};
|
||||
typedef struct {
|
||||
Py_UCS4 start;
|
||||
Py_UCS4 end;
|
||||
unsigned short name;
|
||||
} _PyUnicode_Block;
|
||||
static const _PyUnicode_Block _PyUnicode_Blocks[] = {
|
||||
{0x0000, 0x007F, 0},
|
||||
{0x0080, 0x00FF, 1},
|
||||
{0x0100, 0x017F, 2},
|
||||
{0x0180, 0x024F, 3},
|
||||
{0x0250, 0x02AF, 4},
|
||||
{0x02B0, 0x02FF, 5},
|
||||
{0x0300, 0x036F, 6},
|
||||
{0x0370, 0x03FF, 7},
|
||||
{0x0400, 0x04FF, 8},
|
||||
{0x0500, 0x052F, 9},
|
||||
{0x0530, 0x058F, 10},
|
||||
{0x0590, 0x05FF, 11},
|
||||
{0x0600, 0x06FF, 12},
|
||||
{0x0700, 0x074F, 13},
|
||||
{0x0750, 0x077F, 14},
|
||||
{0x0780, 0x07BF, 15},
|
||||
{0x07C0, 0x07FF, 16},
|
||||
{0x0800, 0x083F, 17},
|
||||
{0x0840, 0x085F, 18},
|
||||
{0x0860, 0x086F, 19},
|
||||
{0x0870, 0x089F, 20},
|
||||
{0x08A0, 0x08FF, 21},
|
||||
{0x0900, 0x097F, 22},
|
||||
{0x0980, 0x09FF, 23},
|
||||
{0x0A00, 0x0A7F, 24},
|
||||
{0x0A80, 0x0AFF, 25},
|
||||
{0x0B00, 0x0B7F, 26},
|
||||
{0x0B80, 0x0BFF, 27},
|
||||
{0x0C00, 0x0C7F, 28},
|
||||
{0x0C80, 0x0CFF, 29},
|
||||
{0x0D00, 0x0D7F, 30},
|
||||
{0x0D80, 0x0DFF, 31},
|
||||
{0x0E00, 0x0E7F, 32},
|
||||
{0x0E80, 0x0EFF, 33},
|
||||
{0x0F00, 0x0FFF, 34},
|
||||
{0x1000, 0x109F, 35},
|
||||
{0x10A0, 0x10FF, 36},
|
||||
{0x1100, 0x11FF, 37},
|
||||
{0x1200, 0x137F, 38},
|
||||
{0x1380, 0x139F, 39},
|
||||
{0x13A0, 0x13FF, 40},
|
||||
{0x1400, 0x167F, 41},
|
||||
{0x1680, 0x169F, 42},
|
||||
{0x16A0, 0x16FF, 43},
|
||||
{0x1700, 0x171F, 44},
|
||||
{0x1720, 0x173F, 45},
|
||||
{0x1740, 0x175F, 46},
|
||||
{0x1760, 0x177F, 47},
|
||||
{0x1780, 0x17FF, 48},
|
||||
{0x1800, 0x18AF, 49},
|
||||
{0x18B0, 0x18FF, 50},
|
||||
{0x1900, 0x194F, 51},
|
||||
{0x1950, 0x197F, 52},
|
||||
{0x1980, 0x19DF, 53},
|
||||
{0x19E0, 0x19FF, 54},
|
||||
{0x1A00, 0x1A1F, 55},
|
||||
{0x1A20, 0x1AAF, 56},
|
||||
{0x1AB0, 0x1AFF, 57},
|
||||
{0x1B00, 0x1B7F, 58},
|
||||
{0x1B80, 0x1BBF, 59},
|
||||
{0x1BC0, 0x1BFF, 60},
|
||||
{0x1C00, 0x1C4F, 61},
|
||||
{0x1C50, 0x1C7F, 62},
|
||||
{0x1C80, 0x1C8F, 63},
|
||||
{0x1C90, 0x1CBF, 64},
|
||||
{0x1CC0, 0x1CCF, 65},
|
||||
{0x1CD0, 0x1CFF, 66},
|
||||
{0x1D00, 0x1D7F, 67},
|
||||
{0x1D80, 0x1DBF, 68},
|
||||
{0x1DC0, 0x1DFF, 69},
|
||||
{0x1E00, 0x1EFF, 70},
|
||||
{0x1F00, 0x1FFF, 71},
|
||||
{0x2000, 0x206F, 72},
|
||||
{0x2070, 0x209F, 73},
|
||||
{0x20A0, 0x20CF, 74},
|
||||
{0x20D0, 0x20FF, 75},
|
||||
{0x2100, 0x214F, 76},
|
||||
{0x2150, 0x218F, 77},
|
||||
{0x2190, 0x21FF, 78},
|
||||
{0x2200, 0x22FF, 79},
|
||||
{0x2300, 0x23FF, 80},
|
||||
{0x2400, 0x243F, 81},
|
||||
{0x2440, 0x245F, 82},
|
||||
{0x2460, 0x24FF, 83},
|
||||
{0x2500, 0x257F, 84},
|
||||
{0x2580, 0x259F, 85},
|
||||
{0x25A0, 0x25FF, 86},
|
||||
{0x2600, 0x26FF, 87},
|
||||
{0x2700, 0x27BF, 88},
|
||||
{0x27C0, 0x27EF, 89},
|
||||
{0x27F0, 0x27FF, 90},
|
||||
{0x2800, 0x28FF, 91},
|
||||
{0x2900, 0x297F, 92},
|
||||
{0x2980, 0x29FF, 93},
|
||||
{0x2A00, 0x2AFF, 94},
|
||||
{0x2B00, 0x2BFF, 95},
|
||||
{0x2C00, 0x2C5F, 96},
|
||||
{0x2C60, 0x2C7F, 97},
|
||||
{0x2C80, 0x2CFF, 98},
|
||||
{0x2D00, 0x2D2F, 99},
|
||||
{0x2D30, 0x2D7F, 100},
|
||||
{0x2D80, 0x2DDF, 101},
|
||||
{0x2DE0, 0x2DFF, 102},
|
||||
{0x2E00, 0x2E7F, 103},
|
||||
{0x2E80, 0x2EFF, 104},
|
||||
{0x2F00, 0x2FDF, 105},
|
||||
{0x2FF0, 0x2FFF, 106},
|
||||
{0x3000, 0x303F, 107},
|
||||
{0x3040, 0x309F, 108},
|
||||
{0x30A0, 0x30FF, 109},
|
||||
{0x3100, 0x312F, 110},
|
||||
{0x3130, 0x318F, 111},
|
||||
{0x3190, 0x319F, 112},
|
||||
{0x31A0, 0x31BF, 113},
|
||||
{0x31C0, 0x31EF, 114},
|
||||
{0x31F0, 0x31FF, 115},
|
||||
{0x3200, 0x32FF, 116},
|
||||
{0x3300, 0x33FF, 117},
|
||||
{0x3400, 0x4DBF, 118},
|
||||
{0x4DC0, 0x4DFF, 119},
|
||||
{0x4E00, 0x9FFF, 120},
|
||||
{0xA000, 0xA48F, 121},
|
||||
{0xA490, 0xA4CF, 122},
|
||||
{0xA4D0, 0xA4FF, 123},
|
||||
{0xA500, 0xA63F, 124},
|
||||
{0xA640, 0xA69F, 125},
|
||||
{0xA6A0, 0xA6FF, 126},
|
||||
{0xA700, 0xA71F, 127},
|
||||
{0xA720, 0xA7FF, 128},
|
||||
{0xA800, 0xA82F, 129},
|
||||
{0xA830, 0xA83F, 130},
|
||||
{0xA840, 0xA87F, 131},
|
||||
{0xA880, 0xA8DF, 132},
|
||||
{0xA8E0, 0xA8FF, 133},
|
||||
{0xA900, 0xA92F, 134},
|
||||
{0xA930, 0xA95F, 135},
|
||||
{0xA960, 0xA97F, 136},
|
||||
{0xA980, 0xA9DF, 137},
|
||||
{0xA9E0, 0xA9FF, 138},
|
||||
{0xAA00, 0xAA5F, 139},
|
||||
{0xAA60, 0xAA7F, 140},
|
||||
{0xAA80, 0xAADF, 141},
|
||||
{0xAAE0, 0xAAFF, 142},
|
||||
{0xAB00, 0xAB2F, 143},
|
||||
{0xAB30, 0xAB6F, 144},
|
||||
{0xAB70, 0xABBF, 145},
|
||||
{0xABC0, 0xABFF, 146},
|
||||
{0xAC00, 0xD7AF, 147},
|
||||
{0xD7B0, 0xD7FF, 148},
|
||||
{0xD800, 0xDB7F, 149},
|
||||
{0xDB80, 0xDBFF, 150},
|
||||
{0xDC00, 0xDFFF, 151},
|
||||
{0xE000, 0xF8FF, 152},
|
||||
{0xF900, 0xFAFF, 153},
|
||||
{0xFB00, 0xFB4F, 154},
|
||||
{0xFB50, 0xFDFF, 155},
|
||||
{0xFE00, 0xFE0F, 156},
|
||||
{0xFE10, 0xFE1F, 157},
|
||||
{0xFE20, 0xFE2F, 158},
|
||||
{0xFE30, 0xFE4F, 159},
|
||||
{0xFE50, 0xFE6F, 160},
|
||||
{0xFE70, 0xFEFF, 161},
|
||||
{0xFF00, 0xFFEF, 162},
|
||||
{0xFFF0, 0xFFFF, 163},
|
||||
{0x10000, 0x1007F, 164},
|
||||
{0x10080, 0x100FF, 165},
|
||||
{0x10100, 0x1013F, 166},
|
||||
{0x10140, 0x1018F, 167},
|
||||
{0x10190, 0x101CF, 168},
|
||||
{0x101D0, 0x101FF, 169},
|
||||
{0x10280, 0x1029F, 170},
|
||||
{0x102A0, 0x102DF, 171},
|
||||
{0x102E0, 0x102FF, 172},
|
||||
{0x10300, 0x1032F, 173},
|
||||
{0x10330, 0x1034F, 174},
|
||||
{0x10350, 0x1037F, 175},
|
||||
{0x10380, 0x1039F, 176},
|
||||
{0x103A0, 0x103DF, 177},
|
||||
{0x10400, 0x1044F, 178},
|
||||
{0x10450, 0x1047F, 179},
|
||||
{0x10480, 0x104AF, 180},
|
||||
{0x104B0, 0x104FF, 181},
|
||||
{0x10500, 0x1052F, 182},
|
||||
{0x10530, 0x1056F, 183},
|
||||
{0x10570, 0x105BF, 184},
|
||||
{0x105C0, 0x105FF, 185},
|
||||
{0x10600, 0x1077F, 186},
|
||||
{0x10780, 0x107BF, 187},
|
||||
{0x10800, 0x1083F, 188},
|
||||
{0x10840, 0x1085F, 189},
|
||||
{0x10860, 0x1087F, 190},
|
||||
{0x10880, 0x108AF, 191},
|
||||
{0x108E0, 0x108FF, 192},
|
||||
{0x10900, 0x1091F, 193},
|
||||
{0x10920, 0x1093F, 194},
|
||||
{0x10940, 0x1095F, 195},
|
||||
{0x10980, 0x1099F, 196},
|
||||
{0x109A0, 0x109FF, 197},
|
||||
{0x10A00, 0x10A5F, 198},
|
||||
{0x10A60, 0x10A7F, 199},
|
||||
{0x10A80, 0x10A9F, 200},
|
||||
{0x10AC0, 0x10AFF, 201},
|
||||
{0x10B00, 0x10B3F, 202},
|
||||
{0x10B40, 0x10B5F, 203},
|
||||
{0x10B60, 0x10B7F, 204},
|
||||
{0x10B80, 0x10BAF, 205},
|
||||
{0x10C00, 0x10C4F, 206},
|
||||
{0x10C80, 0x10CFF, 207},
|
||||
{0x10D00, 0x10D3F, 208},
|
||||
{0x10D40, 0x10D8F, 209},
|
||||
{0x10E60, 0x10E7F, 210},
|
||||
{0x10E80, 0x10EBF, 211},
|
||||
{0x10EC0, 0x10EFF, 212},
|
||||
{0x10F00, 0x10F2F, 213},
|
||||
{0x10F30, 0x10F6F, 214},
|
||||
{0x10F70, 0x10FAF, 215},
|
||||
{0x10FB0, 0x10FDF, 216},
|
||||
{0x10FE0, 0x10FFF, 217},
|
||||
{0x11000, 0x1107F, 218},
|
||||
{0x11080, 0x110CF, 219},
|
||||
{0x110D0, 0x110FF, 220},
|
||||
{0x11100, 0x1114F, 221},
|
||||
{0x11150, 0x1117F, 222},
|
||||
{0x11180, 0x111DF, 223},
|
||||
{0x111E0, 0x111FF, 224},
|
||||
{0x11200, 0x1124F, 225},
|
||||
{0x11280, 0x112AF, 226},
|
||||
{0x112B0, 0x112FF, 227},
|
||||
{0x11300, 0x1137F, 228},
|
||||
{0x11380, 0x113FF, 229},
|
||||
{0x11400, 0x1147F, 230},
|
||||
{0x11480, 0x114DF, 231},
|
||||
{0x11580, 0x115FF, 232},
|
||||
{0x11600, 0x1165F, 233},
|
||||
{0x11660, 0x1167F, 234},
|
||||
{0x11680, 0x116CF, 235},
|
||||
{0x116D0, 0x116FF, 236},
|
||||
{0x11700, 0x1174F, 237},
|
||||
{0x11800, 0x1184F, 238},
|
||||
{0x118A0, 0x118FF, 239},
|
||||
{0x11900, 0x1195F, 240},
|
||||
{0x119A0, 0x119FF, 241},
|
||||
{0x11A00, 0x11A4F, 242},
|
||||
{0x11A50, 0x11AAF, 243},
|
||||
{0x11AB0, 0x11ABF, 244},
|
||||
{0x11AC0, 0x11AFF, 245},
|
||||
{0x11B00, 0x11B5F, 246},
|
||||
{0x11B60, 0x11B7F, 247},
|
||||
{0x11BC0, 0x11BFF, 248},
|
||||
{0x11C00, 0x11C6F, 249},
|
||||
{0x11C70, 0x11CBF, 250},
|
||||
{0x11D00, 0x11D5F, 251},
|
||||
{0x11D60, 0x11DAF, 252},
|
||||
{0x11DB0, 0x11DEF, 253},
|
||||
{0x11EE0, 0x11EFF, 254},
|
||||
{0x11F00, 0x11F5F, 255},
|
||||
{0x11FB0, 0x11FBF, 256},
|
||||
{0x11FC0, 0x11FFF, 257},
|
||||
{0x12000, 0x123FF, 258},
|
||||
{0x12400, 0x1247F, 259},
|
||||
{0x12480, 0x1254F, 260},
|
||||
{0x12F90, 0x12FFF, 261},
|
||||
{0x13000, 0x1342F, 262},
|
||||
{0x13430, 0x1345F, 263},
|
||||
{0x13460, 0x143FF, 264},
|
||||
{0x14400, 0x1467F, 265},
|
||||
{0x16100, 0x1613F, 266},
|
||||
{0x16800, 0x16A3F, 267},
|
||||
{0x16A40, 0x16A6F, 268},
|
||||
{0x16A70, 0x16ACF, 269},
|
||||
{0x16AD0, 0x16AFF, 270},
|
||||
{0x16B00, 0x16B8F, 271},
|
||||
{0x16D40, 0x16D7F, 272},
|
||||
{0x16E40, 0x16E9F, 273},
|
||||
{0x16EA0, 0x16EDF, 274},
|
||||
{0x16F00, 0x16F9F, 275},
|
||||
{0x16FE0, 0x16FFF, 276},
|
||||
{0x17000, 0x187FF, 277},
|
||||
{0x18800, 0x18AFF, 278},
|
||||
{0x18B00, 0x18CFF, 279},
|
||||
{0x18D00, 0x18D7F, 280},
|
||||
{0x18D80, 0x18DFF, 281},
|
||||
{0x1AFF0, 0x1AFFF, 282},
|
||||
{0x1B000, 0x1B0FF, 283},
|
||||
{0x1B100, 0x1B12F, 284},
|
||||
{0x1B130, 0x1B16F, 285},
|
||||
{0x1B170, 0x1B2FF, 286},
|
||||
{0x1BC00, 0x1BC9F, 287},
|
||||
{0x1BCA0, 0x1BCAF, 288},
|
||||
{0x1CC00, 0x1CEBF, 289},
|
||||
{0x1CEC0, 0x1CEFF, 290},
|
||||
{0x1CF00, 0x1CFCF, 291},
|
||||
{0x1D000, 0x1D0FF, 292},
|
||||
{0x1D100, 0x1D1FF, 293},
|
||||
{0x1D200, 0x1D24F, 294},
|
||||
{0x1D2C0, 0x1D2DF, 295},
|
||||
{0x1D2E0, 0x1D2FF, 296},
|
||||
{0x1D300, 0x1D35F, 297},
|
||||
{0x1D360, 0x1D37F, 298},
|
||||
{0x1D400, 0x1D7FF, 299},
|
||||
{0x1D800, 0x1DAAF, 300},
|
||||
{0x1DF00, 0x1DFFF, 301},
|
||||
{0x1E000, 0x1E02F, 302},
|
||||
{0x1E030, 0x1E08F, 303},
|
||||
{0x1E100, 0x1E14F, 304},
|
||||
{0x1E290, 0x1E2BF, 305},
|
||||
{0x1E2C0, 0x1E2FF, 306},
|
||||
{0x1E4D0, 0x1E4FF, 307},
|
||||
{0x1E5D0, 0x1E5FF, 308},
|
||||
{0x1E6C0, 0x1E6FF, 309},
|
||||
{0x1E7E0, 0x1E7FF, 310},
|
||||
{0x1E800, 0x1E8DF, 311},
|
||||
{0x1E900, 0x1E95F, 312},
|
||||
{0x1EC70, 0x1ECBF, 313},
|
||||
{0x1ED00, 0x1ED4F, 314},
|
||||
{0x1EE00, 0x1EEFF, 315},
|
||||
{0x1F000, 0x1F02F, 316},
|
||||
{0x1F030, 0x1F09F, 317},
|
||||
{0x1F0A0, 0x1F0FF, 318},
|
||||
{0x1F100, 0x1F1FF, 319},
|
||||
{0x1F200, 0x1F2FF, 320},
|
||||
{0x1F300, 0x1F5FF, 321},
|
||||
{0x1F600, 0x1F64F, 322},
|
||||
{0x1F650, 0x1F67F, 323},
|
||||
{0x1F680, 0x1F6FF, 324},
|
||||
{0x1F700, 0x1F77F, 325},
|
||||
{0x1F780, 0x1F7FF, 326},
|
||||
{0x1F800, 0x1F8FF, 327},
|
||||
{0x1F900, 0x1F9FF, 328},
|
||||
{0x1FA00, 0x1FA6F, 329},
|
||||
{0x1FA70, 0x1FAFF, 330},
|
||||
{0x1FB00, 0x1FBFF, 331},
|
||||
{0x20000, 0x2A6DF, 332},
|
||||
{0x2A700, 0x2B73F, 333},
|
||||
{0x2B740, 0x2B81F, 334},
|
||||
{0x2B820, 0x2CEAF, 335},
|
||||
{0x2CEB0, 0x2EBEF, 336},
|
||||
{0x2EBF0, 0x2EE5F, 337},
|
||||
{0x2F800, 0x2FA1F, 338},
|
||||
{0x30000, 0x3134F, 339},
|
||||
{0x31350, 0x323AF, 340},
|
||||
{0x323B0, 0x3347F, 341},
|
||||
{0xE0000, 0xE007F, 342},
|
||||
{0xE0100, 0xE01EF, 343},
|
||||
{0xF0000, 0xFFFFF, 344},
|
||||
{0x100000, 0x10FFFF, 345},
|
||||
};
|
||||
#define BLOCK_COUNT 346
|
||||
|
||||
static const char *decomp_prefix[] = {
|
||||
"",
|
||||
"<noBreak>",
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@
|
|||
CASE_FOLDING = "CaseFolding%s.txt"
|
||||
GRAPHEME_CLUSTER_BREAK = "auxiliary/GraphemeBreakProperty%s.txt"
|
||||
EMOJI_DATA = "emoji/emoji-data%s.txt"
|
||||
BLOCKS = "Blocks%s.txt"
|
||||
|
||||
# Private Use Areas -- in planes 1, 15, 16
|
||||
PUA_1 = range(0xE000, 0xF900)
|
||||
|
|
@ -392,6 +393,34 @@ def makeunicodedata(unicode, trace):
|
|||
fprint(" NULL")
|
||||
fprint("};")
|
||||
|
||||
# Generate block tables
|
||||
names = []
|
||||
name_to_index = {}
|
||||
blocks = []
|
||||
for start, end, name in unicode.blocks:
|
||||
if name not in name_to_index:
|
||||
name_to_index[name] = len(names)
|
||||
names.append(name)
|
||||
blocks.append((start, end, name_to_index[name]))
|
||||
|
||||
fprint("static const char * const _PyUnicode_BlockNames[] = {")
|
||||
for name in names:
|
||||
fprint(' "%s",' % name)
|
||||
fprint("};")
|
||||
|
||||
fprint("typedef struct {")
|
||||
fprint(" Py_UCS4 start;")
|
||||
fprint(" Py_UCS4 end;")
|
||||
fprint(" unsigned short name;")
|
||||
fprint("} _PyUnicode_Block;")
|
||||
|
||||
fprint("static const _PyUnicode_Block _PyUnicode_Blocks[] = {")
|
||||
for start, end, name in blocks:
|
||||
fprint(" {0x%04X, 0x%04X, %d}," % (start, end, name))
|
||||
fprint("};")
|
||||
fprint(f"#define BLOCK_COUNT {len(blocks)}")
|
||||
fprint()
|
||||
|
||||
fprint("static const char *decomp_prefix[] = {")
|
||||
for name in decomp_prefix:
|
||||
fprint(" \"%s\"," % name)
|
||||
|
|
@ -1205,6 +1234,13 @@ def __init__(self, version, ideograph_check=True):
|
|||
ext_picts[char] = True
|
||||
self.ext_picts = ext_picts
|
||||
|
||||
# See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189
|
||||
self.blocks = []
|
||||
for record in UcdFile(BLOCKS, version).records():
|
||||
start_end, name = record
|
||||
start, end = [int(c, 16) for c in start_end.split('..')]
|
||||
self.blocks.append((start, end, name))
|
||||
self.blocks.sort()
|
||||
|
||||
def uselatin1(self):
|
||||
# restrict character range to ISO Latin 1
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue