gh-66802: Add unicodedata.block() function (#145042)

Closes #66802
This commit is contained in:
Stan Ulbrych 2026-02-21 17:27:55 +00:00 committed by GitHub
parent 6940c1dc0c
commit f1f61bf872
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 922 additions and 2 deletions

View file

@ -130,6 +130,18 @@ following functions:
`Unicode Standard Annex #11 <https://www.unicode.org/reports/tr11/>`_.
.. function:: block(chr, /)
Returns the `block
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
assigned to the character *chr*. If *chr* does not belong to any block,
``'No_Block'`` is returned. For example::
>>> unicodedata.block('S')
'Basic Latin'
.. versionadded:: next
.. function:: mirrored(chr, /)
Returns the mirrored property assigned to the character *chr* as

View file

@ -1134,6 +1134,11 @@ unicodedata
of the character which are related to the above algorithm.
(Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)
* Add :func:`~unicodedata.block` function to return the `Unicode block
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
assigned to a character.
(Contributed by Stan Ulbrych in :gh:`66802`.)
unittest
--------

View file

@ -973,6 +973,97 @@ def graphemes(*args):
'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
def test_block(self):
    """Check block() against known (character, block name) pairs.

    Each pair is verified inside its own subTest, so a single wrong
    table entry is reported without hiding mismatches for the code
    points that follow it.
    """
    expected_blocks = [
        ('\u0000', 'Basic Latin'),
        ('\u0041', 'Basic Latin'),
        ('\u007F', 'Basic Latin'),
        ('\u0080', 'Latin-1 Supplement'),
        ('\u00FF', 'Latin-1 Supplement'),
        ('\u1159', 'Hangul Jamo'),
        ('\u11F9', 'Hangul Jamo'),
        ('\uD788', 'Hangul Syllables'),
        ('\uD7A3', 'Hangul Syllables'),
        # New in 5.0.0
        ('\u05BA', 'Hebrew'),
        ('\u20EF', 'Combining Diacritical Marks for Symbols'),
        # New in 5.1.0
        ('\u2064', 'General Punctuation'),
        ('\uAA4D', 'Cham'),
        # New in 5.2.0
        ('\u0816', 'Samaritan'),
        ('\uA97C', 'Hangul Jamo Extended-A'),
        ('\uD7C6', 'Hangul Jamo Extended-B'),
        ('\uD7FB', 'Hangul Jamo Extended-B'),
        # New in 6.0.0
        ('\u093A', 'Devanagari'),
        ('\U00011002', 'Brahmi'),
        # New in 6.1.0
        # U+E0FFF lies outside every block, so the default name is expected.
        ('\U000E0FFF', 'No_Block'),
        ('\U00016F7E', 'Miao'),
        # New in 6.2.0
        ('\U0001F1E6', 'Enclosed Alphanumeric Supplement'),
        ('\U0001F1FF', 'Enclosed Alphanumeric Supplement'),
        # New in 6.3.0
        ('\u180E', 'Mongolian'),
        ('\u1A1B', 'Buginese'),
        # New in 7.0.0
        ('\u0E33', 'Thai'),
        ('\u0EB3', 'Lao'),
        ('\U0001BCA3', 'Shorthand Format Controls'),
        ('\U0001E8D6', 'Mende Kikakui'),
        ('\U0001163E', 'Modi'),
        # New in 8.0.0
        ('\u08E3', 'Arabic Extended-A'),
        ('\U00011726', 'Ahom'),
        # New in 9.0.0
        ('\u0600', 'Arabic'),
        ('\U000E007F', 'Tags'),
        ('\U00011CB4', 'Marchen'),
        ('\u200D', 'General Punctuation'),
        # New in 10.0.0
        ('\U00011D46', 'Masaram Gondi'),
        ('\U00011D47', 'Masaram Gondi'),
        ('\U00011A97', 'Soyombo'),
        # New in 11.0.0
        ('\U000110CD', 'Kaithi'),
        ('\u07FD', 'NKo'),
        ('\U00011EF6', 'Makasar'),
        # New in 12.0.0
        ('\U00011A84', 'Soyombo'),
        ('\U00013438', 'Egyptian Hieroglyph Format Controls'),
        ('\U0001E2EF', 'Wancho'),
        ('\U00016F87', 'Miao'),
        # New in 13.0.0
        ('\U00011941', 'Dives Akuru'),
        ('\U00016FE4', 'Ideographic Symbols and Punctuation'),
        ('\U00011942', 'Dives Akuru'),
        # New in 14.0.0
        ('\u0891', 'Arabic Extended-B'),
        ('\U0001E2AE', 'Toto'),
        # New in 15.0.0
        ('\U00011F02', 'Kawi'),
        ('\U0001343F', 'Egyptian Hieroglyph Format Controls'),
        ('\U0001E4EF', 'Nag Mundari'),
        ('\U00011F3F', 'Kawi'),
        # New in 16.0.0
        ('\U000113D1', 'Tulu-Tigalari'),
        ('\U0001E5EF', 'Ol Onal'),
        ('\U0001612C', 'Gurung Khema'),
        ('\U00016D63', 'Kirat Rai'),
        # New in 17.0.0
        ('\u1AEB', 'Combining Diacritical Marks Extended'),
        ('\U00011B67', 'Sharada Supplement'),
        # First and last code points of plane 16; both belong to the
        # Supplementary Private Use Area-B block.
        ('\U00100000', 'Supplementary Private Use Area-B'),
        ('\U0010FFFF', 'Supplementary Private Use Area-B'),
    ]
    for char, expected in expected_blocks:
        # Label the subtest with the code point so a failure is easy to
        # locate in the table above.
        with self.subTest(char=f'U+{ord(char):04X}'):
            self.assertEqual(self.db.block(char), expected)
def test_block_invalid_input(self):
    """block() must raise TypeError unless given a one-character str."""
    # Calling with no argument at all.
    with self.assertRaises(TypeError):
        self.db.block()
    # Non-str arguments first, then strings whose length is not 1.
    for bad_argument in (b'x', 120, '', 'xx'):
        with self.assertRaises(TypeError):
            self.db.block(bad_argument)
class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
db = unicodedata.ucd_3_2_0

View file

@ -0,0 +1,3 @@
Add :func:`unicodedata.block` function to return the `Unicode block
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_ of a
character.

View file

@ -691,6 +691,42 @@ exit:
return return_value;
}
PyDoc_STRVAR(unicodedata_block__doc__,
"block($module, chr, /)\n"
"--\n"
"\n"
"Return block assigned to the character chr.");
#define UNICODEDATA_BLOCK_METHODDEF \
{"block", (PyCFunction)unicodedata_block, METH_O, unicodedata_block__doc__},
static PyObject *
unicodedata_block_impl(PyObject *module, int chr);
static PyObject *
unicodedata_block(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int chr;
if (!PyUnicode_Check(arg)) {
_PyArg_BadArgument("block", "argument", "a unicode character", arg);
goto exit;
}
if (PyUnicode_GET_LENGTH(arg) != 1) {
PyErr_Format(PyExc_TypeError,
"block(): argument must be a unicode character, "
"not a string of length %zd",
PyUnicode_GET_LENGTH(arg));
goto exit;
}
chr = PyUnicode_READ_CHAR(arg, 0);
return_value = unicodedata_block_impl(module, chr);
exit:
return return_value;
}
PyDoc_STRVAR(unicodedata_grapheme_cluster_break__doc__,
"grapheme_cluster_break($module, chr, /)\n"
"--\n"
@ -798,4 +834,4 @@ unicodedata_extended_pictographic(PyObject *module, PyObject *arg)
exit:
return return_value;
}
/*[clinic end generated code: output=0f09cc90f06ace76 input=a9049054013a1b77]*/
/*[clinic end generated code: output=482a87df218f07c1 input=a9049054013a1b77]*/

View file

@ -2066,6 +2066,39 @@ unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
return (PyObject*)gbi;
}
/*[clinic input]
unicodedata.block
chr: int(accept={str})
/
Return block assigned to the character chr.
[clinic start generated code]*/
static PyObject *
unicodedata_block_impl(PyObject *module, int chr)
/*[clinic end generated code: output=5f8b40c49eaec75a input=0834cf2642d6eaae]*/
{
    /* Binary search over _PyUnicode_Blocks, whose entries are inclusive
       [start, end] ranges listed in ascending order. */
    const Py_UCS4 code = (Py_UCS4)chr;
    int low = 0;
    int high = BLOCK_COUNT - 1;

    while (low <= high) {
        const int probe = (low + high) / 2;
        const _PyUnicode_Block *entry = &_PyUnicode_Blocks[probe];

        if (code > entry->end) {
            /* Target lies above this range: discard the lower half. */
            low = probe + 1;
            continue;
        }
        if (code < entry->start) {
            /* Target lies below this range: discard the upper half. */
            high = probe - 1;
            continue;
        }
        /* start <= code <= end: entry->name indexes the name table. */
        return PyUnicode_FromString(_PyUnicode_BlockNames[entry->name]);
    }

    // No range contains the code point; return the default value per
    // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189
    return PyUnicode_FromString("No_Block");
}
/*[clinic input]
unicodedata.grapheme_cluster_break
@ -2128,6 +2161,7 @@ unicodedata_extended_pictographic_impl(PyObject *module, int chr)
// an UCD instance.
static PyMethodDef unicodedata_functions[] = {
// Module only functions.
UNICODEDATA_BLOCK_METHODDEF
UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF
UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF
UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF
@ -2137,7 +2171,7 @@ static PyMethodDef unicodedata_functions[] = {
// The following definitions are shared between the module
// and the UCD class.
#define DB_methods (unicodedata_functions + 6)
#define DB_methods (unicodedata_functions + 7)
UNICODEDATA_UCD_DECIMAL_METHODDEF
UNICODEDATA_UCD_DIGIT_METHODDEF

703
Modules/unicodedata_db.h generated
View file

@ -796,6 +796,709 @@ const char * const _PyUnicode_IndicConjunctBreakNames[] = {
"Extend",
NULL
};
static const char * const _PyUnicode_BlockNames[] = {
"Basic Latin",
"Latin-1 Supplement",
"Latin Extended-A",
"Latin Extended-B",
"IPA Extensions",
"Spacing Modifier Letters",
"Combining Diacritical Marks",
"Greek and Coptic",
"Cyrillic",
"Cyrillic Supplement",
"Armenian",
"Hebrew",
"Arabic",
"Syriac",
"Arabic Supplement",
"Thaana",
"NKo",
"Samaritan",
"Mandaic",
"Syriac Supplement",
"Arabic Extended-B",
"Arabic Extended-A",
"Devanagari",
"Bengali",
"Gurmukhi",
"Gujarati",
"Oriya",
"Tamil",
"Telugu",
"Kannada",
"Malayalam",
"Sinhala",
"Thai",
"Lao",
"Tibetan",
"Myanmar",
"Georgian",
"Hangul Jamo",
"Ethiopic",
"Ethiopic Supplement",
"Cherokee",
"Unified Canadian Aboriginal Syllabics",
"Ogham",
"Runic",
"Tagalog",
"Hanunoo",
"Buhid",
"Tagbanwa",
"Khmer",
"Mongolian",
"Unified Canadian Aboriginal Syllabics Extended",
"Limbu",
"Tai Le",
"New Tai Lue",
"Khmer Symbols",
"Buginese",
"Tai Tham",
"Combining Diacritical Marks Extended",
"Balinese",
"Sundanese",
"Batak",
"Lepcha",
"Ol Chiki",
"Cyrillic Extended-C",
"Georgian Extended",
"Sundanese Supplement",
"Vedic Extensions",
"Phonetic Extensions",
"Phonetic Extensions Supplement",
"Combining Diacritical Marks Supplement",
"Latin Extended Additional",
"Greek Extended",
"General Punctuation",
"Superscripts and Subscripts",
"Currency Symbols",
"Combining Diacritical Marks for Symbols",
"Letterlike Symbols",
"Number Forms",
"Arrows",
"Mathematical Operators",
"Miscellaneous Technical",
"Control Pictures",
"Optical Character Recognition",
"Enclosed Alphanumerics",
"Box Drawing",
"Block Elements",
"Geometric Shapes",
"Miscellaneous Symbols",
"Dingbats",
"Miscellaneous Mathematical Symbols-A",
"Supplemental Arrows-A",
"Braille Patterns",
"Supplemental Arrows-B",
"Miscellaneous Mathematical Symbols-B",
"Supplemental Mathematical Operators",
"Miscellaneous Symbols and Arrows",
"Glagolitic",
"Latin Extended-C",
"Coptic",
"Georgian Supplement",
"Tifinagh",
"Ethiopic Extended",
"Cyrillic Extended-A",
"Supplemental Punctuation",
"CJK Radicals Supplement",
"Kangxi Radicals",
"Ideographic Description Characters",
"CJK Symbols and Punctuation",
"Hiragana",
"Katakana",
"Bopomofo",
"Hangul Compatibility Jamo",
"Kanbun",
"Bopomofo Extended",
"CJK Strokes",
"Katakana Phonetic Extensions",
"Enclosed CJK Letters and Months",
"CJK Compatibility",
"CJK Unified Ideographs Extension A",
"Yijing Hexagram Symbols",
"CJK Unified Ideographs",
"Yi Syllables",
"Yi Radicals",
"Lisu",
"Vai",
"Cyrillic Extended-B",
"Bamum",
"Modifier Tone Letters",
"Latin Extended-D",
"Syloti Nagri",
"Common Indic Number Forms",
"Phags-pa",
"Saurashtra",
"Devanagari Extended",
"Kayah Li",
"Rejang",
"Hangul Jamo Extended-A",
"Javanese",
"Myanmar Extended-B",
"Cham",
"Myanmar Extended-A",
"Tai Viet",
"Meetei Mayek Extensions",
"Ethiopic Extended-A",
"Latin Extended-E",
"Cherokee Supplement",
"Meetei Mayek",
"Hangul Syllables",
"Hangul Jamo Extended-B",
"High Surrogates",
"High Private Use Surrogates",
"Low Surrogates",
"Private Use Area",
"CJK Compatibility Ideographs",
"Alphabetic Presentation Forms",
"Arabic Presentation Forms-A",
"Variation Selectors",
"Vertical Forms",
"Combining Half Marks",
"CJK Compatibility Forms",
"Small Form Variants",
"Arabic Presentation Forms-B",
"Halfwidth and Fullwidth Forms",
"Specials",
"Linear B Syllabary",
"Linear B Ideograms",
"Aegean Numbers",
"Ancient Greek Numbers",
"Ancient Symbols",
"Phaistos Disc",
"Lycian",
"Carian",
"Coptic Epact Numbers",
"Old Italic",
"Gothic",
"Old Permic",
"Ugaritic",
"Old Persian",
"Deseret",
"Shavian",
"Osmanya",
"Osage",
"Elbasan",
"Caucasian Albanian",
"Vithkuqi",
"Todhri",
"Linear A",
"Latin Extended-F",
"Cypriot Syllabary",
"Imperial Aramaic",
"Palmyrene",
"Nabataean",
"Hatran",
"Phoenician",
"Lydian",
"Sidetic",
"Meroitic Hieroglyphs",
"Meroitic Cursive",
"Kharoshthi",
"Old South Arabian",
"Old North Arabian",
"Manichaean",
"Avestan",
"Inscriptional Parthian",
"Inscriptional Pahlavi",
"Psalter Pahlavi",
"Old Turkic",
"Old Hungarian",
"Hanifi Rohingya",
"Garay",
"Rumi Numeral Symbols",
"Yezidi",
"Arabic Extended-C",
"Old Sogdian",
"Sogdian",
"Old Uyghur",
"Chorasmian",
"Elymaic",
"Brahmi",
"Kaithi",
"Sora Sompeng",
"Chakma",
"Mahajani",
"Sharada",
"Sinhala Archaic Numbers",
"Khojki",
"Multani",
"Khudawadi",
"Grantha",
"Tulu-Tigalari",
"Newa",
"Tirhuta",
"Siddham",
"Modi",
"Mongolian Supplement",
"Takri",
"Myanmar Extended-C",
"Ahom",
"Dogra",
"Warang Citi",
"Dives Akuru",
"Nandinagari",
"Zanabazar Square",
"Soyombo",
"Unified Canadian Aboriginal Syllabics Extended-A",
"Pau Cin Hau",
"Devanagari Extended-A",
"Sharada Supplement",
"Sunuwar",
"Bhaiksuki",
"Marchen",
"Masaram Gondi",
"Gunjala Gondi",
"Tolong Siki",
"Makasar",
"Kawi",
"Lisu Supplement",
"Tamil Supplement",
"Cuneiform",
"Cuneiform Numbers and Punctuation",
"Early Dynastic Cuneiform",
"Cypro-Minoan",
"Egyptian Hieroglyphs",
"Egyptian Hieroglyph Format Controls",
"Egyptian Hieroglyphs Extended-A",
"Anatolian Hieroglyphs",
"Gurung Khema",
"Bamum Supplement",
"Mro",
"Tangsa",
"Bassa Vah",
"Pahawh Hmong",
"Kirat Rai",
"Medefaidrin",
"Beria Erfe",
"Miao",
"Ideographic Symbols and Punctuation",
"Tangut",
"Tangut Components",
"Khitan Small Script",
"Tangut Supplement",
"Tangut Components Supplement",
"Kana Extended-B",
"Kana Supplement",
"Kana Extended-A",
"Small Kana Extension",
"Nushu",
"Duployan",
"Shorthand Format Controls",
"Symbols for Legacy Computing Supplement",
"Miscellaneous Symbols Supplement",
"Znamenny Musical Notation",
"Byzantine Musical Symbols",
"Musical Symbols",
"Ancient Greek Musical Notation",
"Kaktovik Numerals",
"Mayan Numerals",
"Tai Xuan Jing Symbols",
"Counting Rod Numerals",
"Mathematical Alphanumeric Symbols",
"Sutton SignWriting",
"Latin Extended-G",
"Glagolitic Supplement",
"Cyrillic Extended-D",
"Nyiakeng Puachue Hmong",
"Toto",
"Wancho",
"Nag Mundari",
"Ol Onal",
"Tai Yo",
"Ethiopic Extended-B",
"Mende Kikakui",
"Adlam",
"Indic Siyaq Numbers",
"Ottoman Siyaq Numbers",
"Arabic Mathematical Alphabetic Symbols",
"Mahjong Tiles",
"Domino Tiles",
"Playing Cards",
"Enclosed Alphanumeric Supplement",
"Enclosed Ideographic Supplement",
"Miscellaneous Symbols and Pictographs",
"Emoticons",
"Ornamental Dingbats",
"Transport and Map Symbols",
"Alchemical Symbols",
"Geometric Shapes Extended",
"Supplemental Arrows-C",
"Supplemental Symbols and Pictographs",
"Chess Symbols",
"Symbols and Pictographs Extended-A",
"Symbols for Legacy Computing",
"CJK Unified Ideographs Extension B",
"CJK Unified Ideographs Extension C",
"CJK Unified Ideographs Extension D",
"CJK Unified Ideographs Extension E",
"CJK Unified Ideographs Extension F",
"CJK Unified Ideographs Extension I",
"CJK Compatibility Ideographs Supplement",
"CJK Unified Ideographs Extension G",
"CJK Unified Ideographs Extension H",
"CJK Unified Ideographs Extension J",
"Tags",
"Variation Selectors Supplement",
"Supplementary Private Use Area-A",
"Supplementary Private Use Area-B",
};
typedef struct {
Py_UCS4 start;
Py_UCS4 end;
unsigned short name;
} _PyUnicode_Block;
static const _PyUnicode_Block _PyUnicode_Blocks[] = {
{0x0000, 0x007F, 0},
{0x0080, 0x00FF, 1},
{0x0100, 0x017F, 2},
{0x0180, 0x024F, 3},
{0x0250, 0x02AF, 4},
{0x02B0, 0x02FF, 5},
{0x0300, 0x036F, 6},
{0x0370, 0x03FF, 7},
{0x0400, 0x04FF, 8},
{0x0500, 0x052F, 9},
{0x0530, 0x058F, 10},
{0x0590, 0x05FF, 11},
{0x0600, 0x06FF, 12},
{0x0700, 0x074F, 13},
{0x0750, 0x077F, 14},
{0x0780, 0x07BF, 15},
{0x07C0, 0x07FF, 16},
{0x0800, 0x083F, 17},
{0x0840, 0x085F, 18},
{0x0860, 0x086F, 19},
{0x0870, 0x089F, 20},
{0x08A0, 0x08FF, 21},
{0x0900, 0x097F, 22},
{0x0980, 0x09FF, 23},
{0x0A00, 0x0A7F, 24},
{0x0A80, 0x0AFF, 25},
{0x0B00, 0x0B7F, 26},
{0x0B80, 0x0BFF, 27},
{0x0C00, 0x0C7F, 28},
{0x0C80, 0x0CFF, 29},
{0x0D00, 0x0D7F, 30},
{0x0D80, 0x0DFF, 31},
{0x0E00, 0x0E7F, 32},
{0x0E80, 0x0EFF, 33},
{0x0F00, 0x0FFF, 34},
{0x1000, 0x109F, 35},
{0x10A0, 0x10FF, 36},
{0x1100, 0x11FF, 37},
{0x1200, 0x137F, 38},
{0x1380, 0x139F, 39},
{0x13A0, 0x13FF, 40},
{0x1400, 0x167F, 41},
{0x1680, 0x169F, 42},
{0x16A0, 0x16FF, 43},
{0x1700, 0x171F, 44},
{0x1720, 0x173F, 45},
{0x1740, 0x175F, 46},
{0x1760, 0x177F, 47},
{0x1780, 0x17FF, 48},
{0x1800, 0x18AF, 49},
{0x18B0, 0x18FF, 50},
{0x1900, 0x194F, 51},
{0x1950, 0x197F, 52},
{0x1980, 0x19DF, 53},
{0x19E0, 0x19FF, 54},
{0x1A00, 0x1A1F, 55},
{0x1A20, 0x1AAF, 56},
{0x1AB0, 0x1AFF, 57},
{0x1B00, 0x1B7F, 58},
{0x1B80, 0x1BBF, 59},
{0x1BC0, 0x1BFF, 60},
{0x1C00, 0x1C4F, 61},
{0x1C50, 0x1C7F, 62},
{0x1C80, 0x1C8F, 63},
{0x1C90, 0x1CBF, 64},
{0x1CC0, 0x1CCF, 65},
{0x1CD0, 0x1CFF, 66},
{0x1D00, 0x1D7F, 67},
{0x1D80, 0x1DBF, 68},
{0x1DC0, 0x1DFF, 69},
{0x1E00, 0x1EFF, 70},
{0x1F00, 0x1FFF, 71},
{0x2000, 0x206F, 72},
{0x2070, 0x209F, 73},
{0x20A0, 0x20CF, 74},
{0x20D0, 0x20FF, 75},
{0x2100, 0x214F, 76},
{0x2150, 0x218F, 77},
{0x2190, 0x21FF, 78},
{0x2200, 0x22FF, 79},
{0x2300, 0x23FF, 80},
{0x2400, 0x243F, 81},
{0x2440, 0x245F, 82},
{0x2460, 0x24FF, 83},
{0x2500, 0x257F, 84},
{0x2580, 0x259F, 85},
{0x25A0, 0x25FF, 86},
{0x2600, 0x26FF, 87},
{0x2700, 0x27BF, 88},
{0x27C0, 0x27EF, 89},
{0x27F0, 0x27FF, 90},
{0x2800, 0x28FF, 91},
{0x2900, 0x297F, 92},
{0x2980, 0x29FF, 93},
{0x2A00, 0x2AFF, 94},
{0x2B00, 0x2BFF, 95},
{0x2C00, 0x2C5F, 96},
{0x2C60, 0x2C7F, 97},
{0x2C80, 0x2CFF, 98},
{0x2D00, 0x2D2F, 99},
{0x2D30, 0x2D7F, 100},
{0x2D80, 0x2DDF, 101},
{0x2DE0, 0x2DFF, 102},
{0x2E00, 0x2E7F, 103},
{0x2E80, 0x2EFF, 104},
{0x2F00, 0x2FDF, 105},
{0x2FF0, 0x2FFF, 106},
{0x3000, 0x303F, 107},
{0x3040, 0x309F, 108},
{0x30A0, 0x30FF, 109},
{0x3100, 0x312F, 110},
{0x3130, 0x318F, 111},
{0x3190, 0x319F, 112},
{0x31A0, 0x31BF, 113},
{0x31C0, 0x31EF, 114},
{0x31F0, 0x31FF, 115},
{0x3200, 0x32FF, 116},
{0x3300, 0x33FF, 117},
{0x3400, 0x4DBF, 118},
{0x4DC0, 0x4DFF, 119},
{0x4E00, 0x9FFF, 120},
{0xA000, 0xA48F, 121},
{0xA490, 0xA4CF, 122},
{0xA4D0, 0xA4FF, 123},
{0xA500, 0xA63F, 124},
{0xA640, 0xA69F, 125},
{0xA6A0, 0xA6FF, 126},
{0xA700, 0xA71F, 127},
{0xA720, 0xA7FF, 128},
{0xA800, 0xA82F, 129},
{0xA830, 0xA83F, 130},
{0xA840, 0xA87F, 131},
{0xA880, 0xA8DF, 132},
{0xA8E0, 0xA8FF, 133},
{0xA900, 0xA92F, 134},
{0xA930, 0xA95F, 135},
{0xA960, 0xA97F, 136},
{0xA980, 0xA9DF, 137},
{0xA9E0, 0xA9FF, 138},
{0xAA00, 0xAA5F, 139},
{0xAA60, 0xAA7F, 140},
{0xAA80, 0xAADF, 141},
{0xAAE0, 0xAAFF, 142},
{0xAB00, 0xAB2F, 143},
{0xAB30, 0xAB6F, 144},
{0xAB70, 0xABBF, 145},
{0xABC0, 0xABFF, 146},
{0xAC00, 0xD7AF, 147},
{0xD7B0, 0xD7FF, 148},
{0xD800, 0xDB7F, 149},
{0xDB80, 0xDBFF, 150},
{0xDC00, 0xDFFF, 151},
{0xE000, 0xF8FF, 152},
{0xF900, 0xFAFF, 153},
{0xFB00, 0xFB4F, 154},
{0xFB50, 0xFDFF, 155},
{0xFE00, 0xFE0F, 156},
{0xFE10, 0xFE1F, 157},
{0xFE20, 0xFE2F, 158},
{0xFE30, 0xFE4F, 159},
{0xFE50, 0xFE6F, 160},
{0xFE70, 0xFEFF, 161},
{0xFF00, 0xFFEF, 162},
{0xFFF0, 0xFFFF, 163},
{0x10000, 0x1007F, 164},
{0x10080, 0x100FF, 165},
{0x10100, 0x1013F, 166},
{0x10140, 0x1018F, 167},
{0x10190, 0x101CF, 168},
{0x101D0, 0x101FF, 169},
{0x10280, 0x1029F, 170},
{0x102A0, 0x102DF, 171},
{0x102E0, 0x102FF, 172},
{0x10300, 0x1032F, 173},
{0x10330, 0x1034F, 174},
{0x10350, 0x1037F, 175},
{0x10380, 0x1039F, 176},
{0x103A0, 0x103DF, 177},
{0x10400, 0x1044F, 178},
{0x10450, 0x1047F, 179},
{0x10480, 0x104AF, 180},
{0x104B0, 0x104FF, 181},
{0x10500, 0x1052F, 182},
{0x10530, 0x1056F, 183},
{0x10570, 0x105BF, 184},
{0x105C0, 0x105FF, 185},
{0x10600, 0x1077F, 186},
{0x10780, 0x107BF, 187},
{0x10800, 0x1083F, 188},
{0x10840, 0x1085F, 189},
{0x10860, 0x1087F, 190},
{0x10880, 0x108AF, 191},
{0x108E0, 0x108FF, 192},
{0x10900, 0x1091F, 193},
{0x10920, 0x1093F, 194},
{0x10940, 0x1095F, 195},
{0x10980, 0x1099F, 196},
{0x109A0, 0x109FF, 197},
{0x10A00, 0x10A5F, 198},
{0x10A60, 0x10A7F, 199},
{0x10A80, 0x10A9F, 200},
{0x10AC0, 0x10AFF, 201},
{0x10B00, 0x10B3F, 202},
{0x10B40, 0x10B5F, 203},
{0x10B60, 0x10B7F, 204},
{0x10B80, 0x10BAF, 205},
{0x10C00, 0x10C4F, 206},
{0x10C80, 0x10CFF, 207},
{0x10D00, 0x10D3F, 208},
{0x10D40, 0x10D8F, 209},
{0x10E60, 0x10E7F, 210},
{0x10E80, 0x10EBF, 211},
{0x10EC0, 0x10EFF, 212},
{0x10F00, 0x10F2F, 213},
{0x10F30, 0x10F6F, 214},
{0x10F70, 0x10FAF, 215},
{0x10FB0, 0x10FDF, 216},
{0x10FE0, 0x10FFF, 217},
{0x11000, 0x1107F, 218},
{0x11080, 0x110CF, 219},
{0x110D0, 0x110FF, 220},
{0x11100, 0x1114F, 221},
{0x11150, 0x1117F, 222},
{0x11180, 0x111DF, 223},
{0x111E0, 0x111FF, 224},
{0x11200, 0x1124F, 225},
{0x11280, 0x112AF, 226},
{0x112B0, 0x112FF, 227},
{0x11300, 0x1137F, 228},
{0x11380, 0x113FF, 229},
{0x11400, 0x1147F, 230},
{0x11480, 0x114DF, 231},
{0x11580, 0x115FF, 232},
{0x11600, 0x1165F, 233},
{0x11660, 0x1167F, 234},
{0x11680, 0x116CF, 235},
{0x116D0, 0x116FF, 236},
{0x11700, 0x1174F, 237},
{0x11800, 0x1184F, 238},
{0x118A0, 0x118FF, 239},
{0x11900, 0x1195F, 240},
{0x119A0, 0x119FF, 241},
{0x11A00, 0x11A4F, 242},
{0x11A50, 0x11AAF, 243},
{0x11AB0, 0x11ABF, 244},
{0x11AC0, 0x11AFF, 245},
{0x11B00, 0x11B5F, 246},
{0x11B60, 0x11B7F, 247},
{0x11BC0, 0x11BFF, 248},
{0x11C00, 0x11C6F, 249},
{0x11C70, 0x11CBF, 250},
{0x11D00, 0x11D5F, 251},
{0x11D60, 0x11DAF, 252},
{0x11DB0, 0x11DEF, 253},
{0x11EE0, 0x11EFF, 254},
{0x11F00, 0x11F5F, 255},
{0x11FB0, 0x11FBF, 256},
{0x11FC0, 0x11FFF, 257},
{0x12000, 0x123FF, 258},
{0x12400, 0x1247F, 259},
{0x12480, 0x1254F, 260},
{0x12F90, 0x12FFF, 261},
{0x13000, 0x1342F, 262},
{0x13430, 0x1345F, 263},
{0x13460, 0x143FF, 264},
{0x14400, 0x1467F, 265},
{0x16100, 0x1613F, 266},
{0x16800, 0x16A3F, 267},
{0x16A40, 0x16A6F, 268},
{0x16A70, 0x16ACF, 269},
{0x16AD0, 0x16AFF, 270},
{0x16B00, 0x16B8F, 271},
{0x16D40, 0x16D7F, 272},
{0x16E40, 0x16E9F, 273},
{0x16EA0, 0x16EDF, 274},
{0x16F00, 0x16F9F, 275},
{0x16FE0, 0x16FFF, 276},
{0x17000, 0x187FF, 277},
{0x18800, 0x18AFF, 278},
{0x18B00, 0x18CFF, 279},
{0x18D00, 0x18D7F, 280},
{0x18D80, 0x18DFF, 281},
{0x1AFF0, 0x1AFFF, 282},
{0x1B000, 0x1B0FF, 283},
{0x1B100, 0x1B12F, 284},
{0x1B130, 0x1B16F, 285},
{0x1B170, 0x1B2FF, 286},
{0x1BC00, 0x1BC9F, 287},
{0x1BCA0, 0x1BCAF, 288},
{0x1CC00, 0x1CEBF, 289},
{0x1CEC0, 0x1CEFF, 290},
{0x1CF00, 0x1CFCF, 291},
{0x1D000, 0x1D0FF, 292},
{0x1D100, 0x1D1FF, 293},
{0x1D200, 0x1D24F, 294},
{0x1D2C0, 0x1D2DF, 295},
{0x1D2E0, 0x1D2FF, 296},
{0x1D300, 0x1D35F, 297},
{0x1D360, 0x1D37F, 298},
{0x1D400, 0x1D7FF, 299},
{0x1D800, 0x1DAAF, 300},
{0x1DF00, 0x1DFFF, 301},
{0x1E000, 0x1E02F, 302},
{0x1E030, 0x1E08F, 303},
{0x1E100, 0x1E14F, 304},
{0x1E290, 0x1E2BF, 305},
{0x1E2C0, 0x1E2FF, 306},
{0x1E4D0, 0x1E4FF, 307},
{0x1E5D0, 0x1E5FF, 308},
{0x1E6C0, 0x1E6FF, 309},
{0x1E7E0, 0x1E7FF, 310},
{0x1E800, 0x1E8DF, 311},
{0x1E900, 0x1E95F, 312},
{0x1EC70, 0x1ECBF, 313},
{0x1ED00, 0x1ED4F, 314},
{0x1EE00, 0x1EEFF, 315},
{0x1F000, 0x1F02F, 316},
{0x1F030, 0x1F09F, 317},
{0x1F0A0, 0x1F0FF, 318},
{0x1F100, 0x1F1FF, 319},
{0x1F200, 0x1F2FF, 320},
{0x1F300, 0x1F5FF, 321},
{0x1F600, 0x1F64F, 322},
{0x1F650, 0x1F67F, 323},
{0x1F680, 0x1F6FF, 324},
{0x1F700, 0x1F77F, 325},
{0x1F780, 0x1F7FF, 326},
{0x1F800, 0x1F8FF, 327},
{0x1F900, 0x1F9FF, 328},
{0x1FA00, 0x1FA6F, 329},
{0x1FA70, 0x1FAFF, 330},
{0x1FB00, 0x1FBFF, 331},
{0x20000, 0x2A6DF, 332},
{0x2A700, 0x2B73F, 333},
{0x2B740, 0x2B81F, 334},
{0x2B820, 0x2CEAF, 335},
{0x2CEB0, 0x2EBEF, 336},
{0x2EBF0, 0x2EE5F, 337},
{0x2F800, 0x2FA1F, 338},
{0x30000, 0x3134F, 339},
{0x31350, 0x323AF, 340},
{0x323B0, 0x3347F, 341},
{0xE0000, 0xE007F, 342},
{0xE0100, 0xE01EF, 343},
{0xF0000, 0xFFFFF, 344},
{0x100000, 0x10FFFF, 345},
};
#define BLOCK_COUNT 346
static const char *decomp_prefix[] = {
"",
"<noBreak>",

View file

@ -60,6 +60,7 @@
CASE_FOLDING = "CaseFolding%s.txt"
GRAPHEME_CLUSTER_BREAK = "auxiliary/GraphemeBreakProperty%s.txt"
EMOJI_DATA = "emoji/emoji-data%s.txt"
BLOCKS = "Blocks%s.txt"
# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
@ -392,6 +393,34 @@ def makeunicodedata(unicode, trace):
fprint(" NULL")
fprint("};")
# Generate block tables
names = []
name_to_index = {}
blocks = []
for start, end, name in unicode.blocks:
if name not in name_to_index:
name_to_index[name] = len(names)
names.append(name)
blocks.append((start, end, name_to_index[name]))
fprint("static const char * const _PyUnicode_BlockNames[] = {")
for name in names:
fprint(' "%s",' % name)
fprint("};")
fprint("typedef struct {")
fprint(" Py_UCS4 start;")
fprint(" Py_UCS4 end;")
fprint(" unsigned short name;")
fprint("} _PyUnicode_Block;")
fprint("static const _PyUnicode_Block _PyUnicode_Blocks[] = {")
for start, end, name in blocks:
fprint(" {0x%04X, 0x%04X, %d}," % (start, end, name))
fprint("};")
fprint(f"#define BLOCK_COUNT {len(blocks)}")
fprint()
fprint("static const char *decomp_prefix[] = {")
for name in decomp_prefix:
fprint(" \"%s\"," % name)
@ -1205,6 +1234,13 @@ def __init__(self, version, ideograph_check=True):
ext_picts[char] = True
self.ext_picts = ext_picts
# See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189
self.blocks = []
for record in UcdFile(BLOCKS, version).records():
start_end, name = record
start, end = [int(c, 16) for c in start_end.split('..')]
self.blocks.append((start, end, name))
self.blocks.sort()
def uselatin1(self):
# restrict character range to ISO Latin 1