diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst
index 2fc8b1d8b52..d5f0405efbe 100644
--- a/Doc/library/unicodedata.rst
+++ b/Doc/library/unicodedata.rst
@@ -130,6 +130,18 @@ following functions:
`Unicode Standard Annex #11 <https://www.unicode.org/reports/tr11/>`_.
+.. function:: block(chr, /)
+
+ Returns the `block
+ <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
+ assigned to the character *chr*. For example::
+
+ >>> unicodedata.block('S')
+ 'Basic Latin'
+
+ .. versionadded:: next
+
+
.. function:: mirrored(chr, /)
Returns the mirrored property assigned to the character *chr* as
diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index fa3ba25a954..cd1ec0e5c45 100644
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -1134,6 +1134,11 @@ unicodedata
of the character which are related to the above algorithm.
(Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)
+* Add :func:`~unicodedata.block` function to return the `Unicode block
+ <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
+ assigned to a character.
+ (Contributed by Stan Ulbrych in :gh:`66802`.)
+
unittest
--------
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index 1d03e7d9fec..8d4ba677faa 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -973,6 +973,97 @@ def graphemes(*args):
'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
+ def test_block(self):  # spot-check the block name returned for sample code points
+ self.assertEqual(self.db.block('\u0000'), 'Basic Latin')
+ self.assertEqual(self.db.block('\u0041'), 'Basic Latin')
+ self.assertEqual(self.db.block('\u007F'), 'Basic Latin')
+ self.assertEqual(self.db.block('\u0080'), 'Latin-1 Supplement')
+ self.assertEqual(self.db.block('\u00FF'), 'Latin-1 Supplement')
+ self.assertEqual(self.db.block('\u1159'), 'Hangul Jamo')
+ self.assertEqual(self.db.block('\u11F9'), 'Hangul Jamo')
+ self.assertEqual(self.db.block('\uD788'), 'Hangul Syllables')
+ self.assertEqual(self.db.block('\uD7A3'), 'Hangul Syllables')
+ # New in 5.0.0
+ self.assertEqual(self.db.block('\u05BA'), 'Hebrew')
+ self.assertEqual(self.db.block('\u20EF'), 'Combining Diacritical Marks for Symbols')
+ # New in 5.1.0
+ self.assertEqual(self.db.block('\u2064'), 'General Punctuation')
+ self.assertEqual(self.db.block('\uAA4D'), 'Cham')
+ # New in 5.2.0
+ self.assertEqual(self.db.block('\u0816'), 'Samaritan')
+ self.assertEqual(self.db.block('\uA97C'), 'Hangul Jamo Extended-A')
+ self.assertEqual(self.db.block('\uD7C6'), 'Hangul Jamo Extended-B')
+ self.assertEqual(self.db.block('\uD7FB'), 'Hangul Jamo Extended-B')
+ # New in 6.0.0
+ self.assertEqual(self.db.block('\u093A'), 'Devanagari')
+ self.assertEqual(self.db.block('\U00011002'), 'Brahmi')
+ # New in 6.1.0
+ self.assertEqual(self.db.block('\U000E0FFF'), 'No_Block')  # falls in a gap between listed blocks
+ self.assertEqual(self.db.block('\U00016F7E'), 'Miao')
+ # New in 6.2.0
+ self.assertEqual(self.db.block('\U0001F1E6'), 'Enclosed Alphanumeric Supplement')
+ self.assertEqual(self.db.block('\U0001F1FF'), 'Enclosed Alphanumeric Supplement')
+ # New in 6.3.0
+ self.assertEqual(self.db.block('\u180E'), 'Mongolian')
+ self.assertEqual(self.db.block('\u1A1B'), 'Buginese')
+ # New in 7.0.0
+ self.assertEqual(self.db.block('\u0E33'), 'Thai')
+ self.assertEqual(self.db.block('\u0EB3'), 'Lao')
+ self.assertEqual(self.db.block('\U0001BCA3'), 'Shorthand Format Controls')
+ self.assertEqual(self.db.block('\U0001E8D6'), 'Mende Kikakui')
+ self.assertEqual(self.db.block('\U0001163E'), 'Modi')
+ # New in 8.0.0
+ self.assertEqual(self.db.block('\u08E3'), 'Arabic Extended-A')
+ self.assertEqual(self.db.block('\U00011726'), 'Ahom')
+ # New in 9.0.0
+ self.assertEqual(self.db.block('\u0600'), 'Arabic')
+ self.assertEqual(self.db.block('\U000E007F'), 'Tags')
+ self.assertEqual(self.db.block('\U00011CB4'), 'Marchen')
+ self.assertEqual(self.db.block('\u200D'), 'General Punctuation')
+ # New in 10.0.0
+ self.assertEqual(self.db.block('\U00011D46'), 'Masaram Gondi')
+ self.assertEqual(self.db.block('\U00011D47'), 'Masaram Gondi')
+ self.assertEqual(self.db.block('\U00011A97'), 'Soyombo')
+ # New in 11.0.0
+ self.assertEqual(self.db.block('\U000110CD'), 'Kaithi')
+ self.assertEqual(self.db.block('\u07FD'), 'NKo')
+ self.assertEqual(self.db.block('\U00011EF6'), 'Makasar')
+ # New in 12.0.0
+ self.assertEqual(self.db.block('\U00011A84'), 'Soyombo')
+ self.assertEqual(self.db.block('\U00013438'), 'Egyptian Hieroglyph Format Controls')
+ self.assertEqual(self.db.block('\U0001E2EF'), 'Wancho')
+ self.assertEqual(self.db.block('\U00016F87'), 'Miao')
+ # New in 13.0.0
+ self.assertEqual(self.db.block('\U00011941'), 'Dives Akuru')
+ self.assertEqual(self.db.block('\U00016FE4'), 'Ideographic Symbols and Punctuation')
+ self.assertEqual(self.db.block('\U00011942'), 'Dives Akuru')
+ # New in 14.0.0
+ self.assertEqual(self.db.block('\u0891'), 'Arabic Extended-B')
+ self.assertEqual(self.db.block('\U0001E2AE'), 'Toto')
+ # New in 15.0.0
+ self.assertEqual(self.db.block('\U00011F02'), 'Kawi')
+ self.assertEqual(self.db.block('\U0001343F'), 'Egyptian Hieroglyph Format Controls')
+ self.assertEqual(self.db.block('\U0001E4EF'), 'Nag Mundari')
+ self.assertEqual(self.db.block('\U00011F3F'), 'Kawi')
+ # New in 16.0.0
+ self.assertEqual(self.db.block('\U000113D1'), 'Tulu-Tigalari')
+ self.assertEqual(self.db.block('\U0001E5EF'), 'Ol Onal')
+ self.assertEqual(self.db.block('\U0001612C'), 'Gurung Khema')
+ self.assertEqual(self.db.block('\U00016D63'), 'Kirat Rai')
+ # New in 17.0.0
+ self.assertEqual(self.db.block('\u1AEB'), 'Combining Diacritical Marks Extended')
+ self.assertEqual(self.db.block('\U00011B67'), 'Sharada Supplement')
+ # First and last code points of the last listed block (ends at U+10FFFF)
+ self.assertEqual(self.db.block('\U00100000'), 'Supplementary Private Use Area-B')
+ self.assertEqual(self.db.block('\U0010FFFF'), 'Supplementary Private Use Area-B')
+
+
+ def test_block_invalid_input(self):  # block() accepts exactly one str of length 1
+ self.assertRaises(TypeError, self.db.block)  # missing argument
+ self.assertRaises(TypeError, self.db.block, b'x')  # bytes instead of str
+ self.assertRaises(TypeError, self.db.block, 120)  # int instead of str
+ self.assertRaises(TypeError, self.db.block, '')  # empty string
+ self.assertRaises(TypeError, self.db.block, 'xx')  # more than one character
+
class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
db = unicodedata.ucd_3_2_0
diff --git a/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst b/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst
new file mode 100644
index 00000000000..68a25262c7d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst
@@ -0,0 +1,3 @@
+Add :func:`unicodedata.block` function to return the `Unicode block
+<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_ of a
+character.
diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h
index 8e2dd7a0ce5..5443893079b 100644
--- a/Modules/clinic/unicodedata.c.h
+++ b/Modules/clinic/unicodedata.c.h
@@ -691,6 +691,42 @@ exit:
return return_value;
}
+PyDoc_STRVAR(unicodedata_block__doc__,
+"block($module, chr, /)\n"
+"--\n"
+"\n"
+"Return block assigned to the character chr.");
+
+#define UNICODEDATA_BLOCK_METHODDEF \
+ {"block", (PyCFunction)unicodedata_block, METH_O, unicodedata_block__doc__},
+
+static PyObject *
+unicodedata_block_impl(PyObject *module, int chr);
+
+static PyObject *
+unicodedata_block(PyObject *module, PyObject *arg)
+{
+ PyObject *return_value = NULL;
+ int chr;
+
+ if (!PyUnicode_Check(arg)) {
+ _PyArg_BadArgument("block", "argument", "a unicode character", arg);
+ goto exit;
+ }
+ if (PyUnicode_GET_LENGTH(arg) != 1) {
+ PyErr_Format(PyExc_TypeError,
+ "block(): argument must be a unicode character, "
+ "not a string of length %zd",
+ PyUnicode_GET_LENGTH(arg));
+ goto exit;
+ }
+ chr = PyUnicode_READ_CHAR(arg, 0);
+ return_value = unicodedata_block_impl(module, chr);
+
+exit:
+ return return_value;
+}
+
PyDoc_STRVAR(unicodedata_grapheme_cluster_break__doc__,
"grapheme_cluster_break($module, chr, /)\n"
"--\n"
@@ -798,4 +834,4 @@ unicodedata_extended_pictographic(PyObject *module, PyObject *arg)
exit:
return return_value;
}
-/*[clinic end generated code: output=0f09cc90f06ace76 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=482a87df218f07c1 input=a9049054013a1b77]*/
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 1ed9760874b..f20726a937c 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -2066,6 +2066,39 @@ unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
return (PyObject*)gbi;
}
+/*[clinic input]
+unicodedata.block
+
+ chr: int(accept={str})
+ /
+
+Return block assigned to the character chr.
+[clinic start generated code]*/
+
+static PyObject *
+unicodedata_block_impl(PyObject *module, int chr)
+/*[clinic end generated code: output=5f8b40c49eaec75a input=0834cf2642d6eaae]*/
+{
+    const Py_UCS4 code = (Py_UCS4)chr;
+    int first = 0, past = BLOCK_COUNT;  // binary search over [first, past)
+    while (first < past) {
+        const int probe = first + (past - first) / 2;
+        const _PyUnicode_Block *entry = &_PyUnicode_Blocks[probe];
+        if (code > entry->end) {
+            first = probe + 1;
+        }
+        else if (code < entry->start) {
+            past = probe;
+        }
+        else {
+            return PyUnicode_FromString(_PyUnicode_BlockNames[entry->name]);
+        }
+    }
+    // No range contains the code point: return the default value per
+    // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189
+    return PyUnicode_FromString("No_Block");
+}
+
/*[clinic input]
unicodedata.grapheme_cluster_break
@@ -2128,6 +2161,7 @@ unicodedata_extended_pictographic_impl(PyObject *module, int chr)
// an UCD instance.
static PyMethodDef unicodedata_functions[] = {
// Module only functions.
+ UNICODEDATA_BLOCK_METHODDEF
UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF
UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF
UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF
@@ -2137,7 +2171,7 @@ static PyMethodDef unicodedata_functions[] = {
// The following definitions are shared between the module
// and the UCD class.
-#define DB_methods (unicodedata_functions + 6)
+#define DB_methods (unicodedata_functions + 7)
UNICODEDATA_UCD_DECIMAL_METHODDEF
UNICODEDATA_UCD_DIGIT_METHODDEF
diff --git a/Modules/unicodedata_db.h b/Modules/unicodedata_db.h
index 3cc5776a1f2..9e88f5cca71 100644
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
@@ -796,6 +796,709 @@ const char * const _PyUnicode_IndicConjunctBreakNames[] = {
"Extend",
NULL
};
+static const char * const _PyUnicode_BlockNames[] = {
+ "Basic Latin",
+ "Latin-1 Supplement",
+ "Latin Extended-A",
+ "Latin Extended-B",
+ "IPA Extensions",
+ "Spacing Modifier Letters",
+ "Combining Diacritical Marks",
+ "Greek and Coptic",
+ "Cyrillic",
+ "Cyrillic Supplement",
+ "Armenian",
+ "Hebrew",
+ "Arabic",
+ "Syriac",
+ "Arabic Supplement",
+ "Thaana",
+ "NKo",
+ "Samaritan",
+ "Mandaic",
+ "Syriac Supplement",
+ "Arabic Extended-B",
+ "Arabic Extended-A",
+ "Devanagari",
+ "Bengali",
+ "Gurmukhi",
+ "Gujarati",
+ "Oriya",
+ "Tamil",
+ "Telugu",
+ "Kannada",
+ "Malayalam",
+ "Sinhala",
+ "Thai",
+ "Lao",
+ "Tibetan",
+ "Myanmar",
+ "Georgian",
+ "Hangul Jamo",
+ "Ethiopic",
+ "Ethiopic Supplement",
+ "Cherokee",
+ "Unified Canadian Aboriginal Syllabics",
+ "Ogham",
+ "Runic",
+ "Tagalog",
+ "Hanunoo",
+ "Buhid",
+ "Tagbanwa",
+ "Khmer",
+ "Mongolian",
+ "Unified Canadian Aboriginal Syllabics Extended",
+ "Limbu",
+ "Tai Le",
+ "New Tai Lue",
+ "Khmer Symbols",
+ "Buginese",
+ "Tai Tham",
+ "Combining Diacritical Marks Extended",
+ "Balinese",
+ "Sundanese",
+ "Batak",
+ "Lepcha",
+ "Ol Chiki",
+ "Cyrillic Extended-C",
+ "Georgian Extended",
+ "Sundanese Supplement",
+ "Vedic Extensions",
+ "Phonetic Extensions",
+ "Phonetic Extensions Supplement",
+ "Combining Diacritical Marks Supplement",
+ "Latin Extended Additional",
+ "Greek Extended",
+ "General Punctuation",
+ "Superscripts and Subscripts",
+ "Currency Symbols",
+ "Combining Diacritical Marks for Symbols",
+ "Letterlike Symbols",
+ "Number Forms",
+ "Arrows",
+ "Mathematical Operators",
+ "Miscellaneous Technical",
+ "Control Pictures",
+ "Optical Character Recognition",
+ "Enclosed Alphanumerics",
+ "Box Drawing",
+ "Block Elements",
+ "Geometric Shapes",
+ "Miscellaneous Symbols",
+ "Dingbats",
+ "Miscellaneous Mathematical Symbols-A",
+ "Supplemental Arrows-A",
+ "Braille Patterns",
+ "Supplemental Arrows-B",
+ "Miscellaneous Mathematical Symbols-B",
+ "Supplemental Mathematical Operators",
+ "Miscellaneous Symbols and Arrows",
+ "Glagolitic",
+ "Latin Extended-C",
+ "Coptic",
+ "Georgian Supplement",
+ "Tifinagh",
+ "Ethiopic Extended",
+ "Cyrillic Extended-A",
+ "Supplemental Punctuation",
+ "CJK Radicals Supplement",
+ "Kangxi Radicals",
+ "Ideographic Description Characters",
+ "CJK Symbols and Punctuation",
+ "Hiragana",
+ "Katakana",
+ "Bopomofo",
+ "Hangul Compatibility Jamo",
+ "Kanbun",
+ "Bopomofo Extended",
+ "CJK Strokes",
+ "Katakana Phonetic Extensions",
+ "Enclosed CJK Letters and Months",
+ "CJK Compatibility",
+ "CJK Unified Ideographs Extension A",
+ "Yijing Hexagram Symbols",
+ "CJK Unified Ideographs",
+ "Yi Syllables",
+ "Yi Radicals",
+ "Lisu",
+ "Vai",
+ "Cyrillic Extended-B",
+ "Bamum",
+ "Modifier Tone Letters",
+ "Latin Extended-D",
+ "Syloti Nagri",
+ "Common Indic Number Forms",
+ "Phags-pa",
+ "Saurashtra",
+ "Devanagari Extended",
+ "Kayah Li",
+ "Rejang",
+ "Hangul Jamo Extended-A",
+ "Javanese",
+ "Myanmar Extended-B",
+ "Cham",
+ "Myanmar Extended-A",
+ "Tai Viet",
+ "Meetei Mayek Extensions",
+ "Ethiopic Extended-A",
+ "Latin Extended-E",
+ "Cherokee Supplement",
+ "Meetei Mayek",
+ "Hangul Syllables",
+ "Hangul Jamo Extended-B",
+ "High Surrogates",
+ "High Private Use Surrogates",
+ "Low Surrogates",
+ "Private Use Area",
+ "CJK Compatibility Ideographs",
+ "Alphabetic Presentation Forms",
+ "Arabic Presentation Forms-A",
+ "Variation Selectors",
+ "Vertical Forms",
+ "Combining Half Marks",
+ "CJK Compatibility Forms",
+ "Small Form Variants",
+ "Arabic Presentation Forms-B",
+ "Halfwidth and Fullwidth Forms",
+ "Specials",
+ "Linear B Syllabary",
+ "Linear B Ideograms",
+ "Aegean Numbers",
+ "Ancient Greek Numbers",
+ "Ancient Symbols",
+ "Phaistos Disc",
+ "Lycian",
+ "Carian",
+ "Coptic Epact Numbers",
+ "Old Italic",
+ "Gothic",
+ "Old Permic",
+ "Ugaritic",
+ "Old Persian",
+ "Deseret",
+ "Shavian",
+ "Osmanya",
+ "Osage",
+ "Elbasan",
+ "Caucasian Albanian",
+ "Vithkuqi",
+ "Todhri",
+ "Linear A",
+ "Latin Extended-F",
+ "Cypriot Syllabary",
+ "Imperial Aramaic",
+ "Palmyrene",
+ "Nabataean",
+ "Hatran",
+ "Phoenician",
+ "Lydian",
+ "Sidetic",
+ "Meroitic Hieroglyphs",
+ "Meroitic Cursive",
+ "Kharoshthi",
+ "Old South Arabian",
+ "Old North Arabian",
+ "Manichaean",
+ "Avestan",
+ "Inscriptional Parthian",
+ "Inscriptional Pahlavi",
+ "Psalter Pahlavi",
+ "Old Turkic",
+ "Old Hungarian",
+ "Hanifi Rohingya",
+ "Garay",
+ "Rumi Numeral Symbols",
+ "Yezidi",
+ "Arabic Extended-C",
+ "Old Sogdian",
+ "Sogdian",
+ "Old Uyghur",
+ "Chorasmian",
+ "Elymaic",
+ "Brahmi",
+ "Kaithi",
+ "Sora Sompeng",
+ "Chakma",
+ "Mahajani",
+ "Sharada",
+ "Sinhala Archaic Numbers",
+ "Khojki",
+ "Multani",
+ "Khudawadi",
+ "Grantha",
+ "Tulu-Tigalari",
+ "Newa",
+ "Tirhuta",
+ "Siddham",
+ "Modi",
+ "Mongolian Supplement",
+ "Takri",
+ "Myanmar Extended-C",
+ "Ahom",
+ "Dogra",
+ "Warang Citi",
+ "Dives Akuru",
+ "Nandinagari",
+ "Zanabazar Square",
+ "Soyombo",
+ "Unified Canadian Aboriginal Syllabics Extended-A",
+ "Pau Cin Hau",
+ "Devanagari Extended-A",
+ "Sharada Supplement",
+ "Sunuwar",
+ "Bhaiksuki",
+ "Marchen",
+ "Masaram Gondi",
+ "Gunjala Gondi",
+ "Tolong Siki",
+ "Makasar",
+ "Kawi",
+ "Lisu Supplement",
+ "Tamil Supplement",
+ "Cuneiform",
+ "Cuneiform Numbers and Punctuation",
+ "Early Dynastic Cuneiform",
+ "Cypro-Minoan",
+ "Egyptian Hieroglyphs",
+ "Egyptian Hieroglyph Format Controls",
+ "Egyptian Hieroglyphs Extended-A",
+ "Anatolian Hieroglyphs",
+ "Gurung Khema",
+ "Bamum Supplement",
+ "Mro",
+ "Tangsa",
+ "Bassa Vah",
+ "Pahawh Hmong",
+ "Kirat Rai",
+ "Medefaidrin",
+ "Beria Erfe",
+ "Miao",
+ "Ideographic Symbols and Punctuation",
+ "Tangut",
+ "Tangut Components",
+ "Khitan Small Script",
+ "Tangut Supplement",
+ "Tangut Components Supplement",
+ "Kana Extended-B",
+ "Kana Supplement",
+ "Kana Extended-A",
+ "Small Kana Extension",
+ "Nushu",
+ "Duployan",
+ "Shorthand Format Controls",
+ "Symbols for Legacy Computing Supplement",
+ "Miscellaneous Symbols Supplement",
+ "Znamenny Musical Notation",
+ "Byzantine Musical Symbols",
+ "Musical Symbols",
+ "Ancient Greek Musical Notation",
+ "Kaktovik Numerals",
+ "Mayan Numerals",
+ "Tai Xuan Jing Symbols",
+ "Counting Rod Numerals",
+ "Mathematical Alphanumeric Symbols",
+ "Sutton SignWriting",
+ "Latin Extended-G",
+ "Glagolitic Supplement",
+ "Cyrillic Extended-D",
+ "Nyiakeng Puachue Hmong",
+ "Toto",
+ "Wancho",
+ "Nag Mundari",
+ "Ol Onal",
+ "Tai Yo",
+ "Ethiopic Extended-B",
+ "Mende Kikakui",
+ "Adlam",
+ "Indic Siyaq Numbers",
+ "Ottoman Siyaq Numbers",
+ "Arabic Mathematical Alphabetic Symbols",
+ "Mahjong Tiles",
+ "Domino Tiles",
+ "Playing Cards",
+ "Enclosed Alphanumeric Supplement",
+ "Enclosed Ideographic Supplement",
+ "Miscellaneous Symbols and Pictographs",
+ "Emoticons",
+ "Ornamental Dingbats",
+ "Transport and Map Symbols",
+ "Alchemical Symbols",
+ "Geometric Shapes Extended",
+ "Supplemental Arrows-C",
+ "Supplemental Symbols and Pictographs",
+ "Chess Symbols",
+ "Symbols and Pictographs Extended-A",
+ "Symbols for Legacy Computing",
+ "CJK Unified Ideographs Extension B",
+ "CJK Unified Ideographs Extension C",
+ "CJK Unified Ideographs Extension D",
+ "CJK Unified Ideographs Extension E",
+ "CJK Unified Ideographs Extension F",
+ "CJK Unified Ideographs Extension I",
+ "CJK Compatibility Ideographs Supplement",
+ "CJK Unified Ideographs Extension G",
+ "CJK Unified Ideographs Extension H",
+ "CJK Unified Ideographs Extension J",
+ "Tags",
+ "Variation Selectors Supplement",
+ "Supplementary Private Use Area-A",
+ "Supplementary Private Use Area-B",
+};
+typedef struct {
+ Py_UCS4 start;
+ Py_UCS4 end;
+ unsigned short name;
+} _PyUnicode_Block;
+static const _PyUnicode_Block _PyUnicode_Blocks[] = {
+ {0x0000, 0x007F, 0},
+ {0x0080, 0x00FF, 1},
+ {0x0100, 0x017F, 2},
+ {0x0180, 0x024F, 3},
+ {0x0250, 0x02AF, 4},
+ {0x02B0, 0x02FF, 5},
+ {0x0300, 0x036F, 6},
+ {0x0370, 0x03FF, 7},
+ {0x0400, 0x04FF, 8},
+ {0x0500, 0x052F, 9},
+ {0x0530, 0x058F, 10},
+ {0x0590, 0x05FF, 11},
+ {0x0600, 0x06FF, 12},
+ {0x0700, 0x074F, 13},
+ {0x0750, 0x077F, 14},
+ {0x0780, 0x07BF, 15},
+ {0x07C0, 0x07FF, 16},
+ {0x0800, 0x083F, 17},
+ {0x0840, 0x085F, 18},
+ {0x0860, 0x086F, 19},
+ {0x0870, 0x089F, 20},
+ {0x08A0, 0x08FF, 21},
+ {0x0900, 0x097F, 22},
+ {0x0980, 0x09FF, 23},
+ {0x0A00, 0x0A7F, 24},
+ {0x0A80, 0x0AFF, 25},
+ {0x0B00, 0x0B7F, 26},
+ {0x0B80, 0x0BFF, 27},
+ {0x0C00, 0x0C7F, 28},
+ {0x0C80, 0x0CFF, 29},
+ {0x0D00, 0x0D7F, 30},
+ {0x0D80, 0x0DFF, 31},
+ {0x0E00, 0x0E7F, 32},
+ {0x0E80, 0x0EFF, 33},
+ {0x0F00, 0x0FFF, 34},
+ {0x1000, 0x109F, 35},
+ {0x10A0, 0x10FF, 36},
+ {0x1100, 0x11FF, 37},
+ {0x1200, 0x137F, 38},
+ {0x1380, 0x139F, 39},
+ {0x13A0, 0x13FF, 40},
+ {0x1400, 0x167F, 41},
+ {0x1680, 0x169F, 42},
+ {0x16A0, 0x16FF, 43},
+ {0x1700, 0x171F, 44},
+ {0x1720, 0x173F, 45},
+ {0x1740, 0x175F, 46},
+ {0x1760, 0x177F, 47},
+ {0x1780, 0x17FF, 48},
+ {0x1800, 0x18AF, 49},
+ {0x18B0, 0x18FF, 50},
+ {0x1900, 0x194F, 51},
+ {0x1950, 0x197F, 52},
+ {0x1980, 0x19DF, 53},
+ {0x19E0, 0x19FF, 54},
+ {0x1A00, 0x1A1F, 55},
+ {0x1A20, 0x1AAF, 56},
+ {0x1AB0, 0x1AFF, 57},
+ {0x1B00, 0x1B7F, 58},
+ {0x1B80, 0x1BBF, 59},
+ {0x1BC0, 0x1BFF, 60},
+ {0x1C00, 0x1C4F, 61},
+ {0x1C50, 0x1C7F, 62},
+ {0x1C80, 0x1C8F, 63},
+ {0x1C90, 0x1CBF, 64},
+ {0x1CC0, 0x1CCF, 65},
+ {0x1CD0, 0x1CFF, 66},
+ {0x1D00, 0x1D7F, 67},
+ {0x1D80, 0x1DBF, 68},
+ {0x1DC0, 0x1DFF, 69},
+ {0x1E00, 0x1EFF, 70},
+ {0x1F00, 0x1FFF, 71},
+ {0x2000, 0x206F, 72},
+ {0x2070, 0x209F, 73},
+ {0x20A0, 0x20CF, 74},
+ {0x20D0, 0x20FF, 75},
+ {0x2100, 0x214F, 76},
+ {0x2150, 0x218F, 77},
+ {0x2190, 0x21FF, 78},
+ {0x2200, 0x22FF, 79},
+ {0x2300, 0x23FF, 80},
+ {0x2400, 0x243F, 81},
+ {0x2440, 0x245F, 82},
+ {0x2460, 0x24FF, 83},
+ {0x2500, 0x257F, 84},
+ {0x2580, 0x259F, 85},
+ {0x25A0, 0x25FF, 86},
+ {0x2600, 0x26FF, 87},
+ {0x2700, 0x27BF, 88},
+ {0x27C0, 0x27EF, 89},
+ {0x27F0, 0x27FF, 90},
+ {0x2800, 0x28FF, 91},
+ {0x2900, 0x297F, 92},
+ {0x2980, 0x29FF, 93},
+ {0x2A00, 0x2AFF, 94},
+ {0x2B00, 0x2BFF, 95},
+ {0x2C00, 0x2C5F, 96},
+ {0x2C60, 0x2C7F, 97},
+ {0x2C80, 0x2CFF, 98},
+ {0x2D00, 0x2D2F, 99},
+ {0x2D30, 0x2D7F, 100},
+ {0x2D80, 0x2DDF, 101},
+ {0x2DE0, 0x2DFF, 102},
+ {0x2E00, 0x2E7F, 103},
+ {0x2E80, 0x2EFF, 104},
+ {0x2F00, 0x2FDF, 105},
+ {0x2FF0, 0x2FFF, 106},
+ {0x3000, 0x303F, 107},
+ {0x3040, 0x309F, 108},
+ {0x30A0, 0x30FF, 109},
+ {0x3100, 0x312F, 110},
+ {0x3130, 0x318F, 111},
+ {0x3190, 0x319F, 112},
+ {0x31A0, 0x31BF, 113},
+ {0x31C0, 0x31EF, 114},
+ {0x31F0, 0x31FF, 115},
+ {0x3200, 0x32FF, 116},
+ {0x3300, 0x33FF, 117},
+ {0x3400, 0x4DBF, 118},
+ {0x4DC0, 0x4DFF, 119},
+ {0x4E00, 0x9FFF, 120},
+ {0xA000, 0xA48F, 121},
+ {0xA490, 0xA4CF, 122},
+ {0xA4D0, 0xA4FF, 123},
+ {0xA500, 0xA63F, 124},
+ {0xA640, 0xA69F, 125},
+ {0xA6A0, 0xA6FF, 126},
+ {0xA700, 0xA71F, 127},
+ {0xA720, 0xA7FF, 128},
+ {0xA800, 0xA82F, 129},
+ {0xA830, 0xA83F, 130},
+ {0xA840, 0xA87F, 131},
+ {0xA880, 0xA8DF, 132},
+ {0xA8E0, 0xA8FF, 133},
+ {0xA900, 0xA92F, 134},
+ {0xA930, 0xA95F, 135},
+ {0xA960, 0xA97F, 136},
+ {0xA980, 0xA9DF, 137},
+ {0xA9E0, 0xA9FF, 138},
+ {0xAA00, 0xAA5F, 139},
+ {0xAA60, 0xAA7F, 140},
+ {0xAA80, 0xAADF, 141},
+ {0xAAE0, 0xAAFF, 142},
+ {0xAB00, 0xAB2F, 143},
+ {0xAB30, 0xAB6F, 144},
+ {0xAB70, 0xABBF, 145},
+ {0xABC0, 0xABFF, 146},
+ {0xAC00, 0xD7AF, 147},
+ {0xD7B0, 0xD7FF, 148},
+ {0xD800, 0xDB7F, 149},
+ {0xDB80, 0xDBFF, 150},
+ {0xDC00, 0xDFFF, 151},
+ {0xE000, 0xF8FF, 152},
+ {0xF900, 0xFAFF, 153},
+ {0xFB00, 0xFB4F, 154},
+ {0xFB50, 0xFDFF, 155},
+ {0xFE00, 0xFE0F, 156},
+ {0xFE10, 0xFE1F, 157},
+ {0xFE20, 0xFE2F, 158},
+ {0xFE30, 0xFE4F, 159},
+ {0xFE50, 0xFE6F, 160},
+ {0xFE70, 0xFEFF, 161},
+ {0xFF00, 0xFFEF, 162},
+ {0xFFF0, 0xFFFF, 163},
+ {0x10000, 0x1007F, 164},
+ {0x10080, 0x100FF, 165},
+ {0x10100, 0x1013F, 166},
+ {0x10140, 0x1018F, 167},
+ {0x10190, 0x101CF, 168},
+ {0x101D0, 0x101FF, 169},
+ {0x10280, 0x1029F, 170},
+ {0x102A0, 0x102DF, 171},
+ {0x102E0, 0x102FF, 172},
+ {0x10300, 0x1032F, 173},
+ {0x10330, 0x1034F, 174},
+ {0x10350, 0x1037F, 175},
+ {0x10380, 0x1039F, 176},
+ {0x103A0, 0x103DF, 177},
+ {0x10400, 0x1044F, 178},
+ {0x10450, 0x1047F, 179},
+ {0x10480, 0x104AF, 180},
+ {0x104B0, 0x104FF, 181},
+ {0x10500, 0x1052F, 182},
+ {0x10530, 0x1056F, 183},
+ {0x10570, 0x105BF, 184},
+ {0x105C0, 0x105FF, 185},
+ {0x10600, 0x1077F, 186},
+ {0x10780, 0x107BF, 187},
+ {0x10800, 0x1083F, 188},
+ {0x10840, 0x1085F, 189},
+ {0x10860, 0x1087F, 190},
+ {0x10880, 0x108AF, 191},
+ {0x108E0, 0x108FF, 192},
+ {0x10900, 0x1091F, 193},
+ {0x10920, 0x1093F, 194},
+ {0x10940, 0x1095F, 195},
+ {0x10980, 0x1099F, 196},
+ {0x109A0, 0x109FF, 197},
+ {0x10A00, 0x10A5F, 198},
+ {0x10A60, 0x10A7F, 199},
+ {0x10A80, 0x10A9F, 200},
+ {0x10AC0, 0x10AFF, 201},
+ {0x10B00, 0x10B3F, 202},
+ {0x10B40, 0x10B5F, 203},
+ {0x10B60, 0x10B7F, 204},
+ {0x10B80, 0x10BAF, 205},
+ {0x10C00, 0x10C4F, 206},
+ {0x10C80, 0x10CFF, 207},
+ {0x10D00, 0x10D3F, 208},
+ {0x10D40, 0x10D8F, 209},
+ {0x10E60, 0x10E7F, 210},
+ {0x10E80, 0x10EBF, 211},
+ {0x10EC0, 0x10EFF, 212},
+ {0x10F00, 0x10F2F, 213},
+ {0x10F30, 0x10F6F, 214},
+ {0x10F70, 0x10FAF, 215},
+ {0x10FB0, 0x10FDF, 216},
+ {0x10FE0, 0x10FFF, 217},
+ {0x11000, 0x1107F, 218},
+ {0x11080, 0x110CF, 219},
+ {0x110D0, 0x110FF, 220},
+ {0x11100, 0x1114F, 221},
+ {0x11150, 0x1117F, 222},
+ {0x11180, 0x111DF, 223},
+ {0x111E0, 0x111FF, 224},
+ {0x11200, 0x1124F, 225},
+ {0x11280, 0x112AF, 226},
+ {0x112B0, 0x112FF, 227},
+ {0x11300, 0x1137F, 228},
+ {0x11380, 0x113FF, 229},
+ {0x11400, 0x1147F, 230},
+ {0x11480, 0x114DF, 231},
+ {0x11580, 0x115FF, 232},
+ {0x11600, 0x1165F, 233},
+ {0x11660, 0x1167F, 234},
+ {0x11680, 0x116CF, 235},
+ {0x116D0, 0x116FF, 236},
+ {0x11700, 0x1174F, 237},
+ {0x11800, 0x1184F, 238},
+ {0x118A0, 0x118FF, 239},
+ {0x11900, 0x1195F, 240},
+ {0x119A0, 0x119FF, 241},
+ {0x11A00, 0x11A4F, 242},
+ {0x11A50, 0x11AAF, 243},
+ {0x11AB0, 0x11ABF, 244},
+ {0x11AC0, 0x11AFF, 245},
+ {0x11B00, 0x11B5F, 246},
+ {0x11B60, 0x11B7F, 247},
+ {0x11BC0, 0x11BFF, 248},
+ {0x11C00, 0x11C6F, 249},
+ {0x11C70, 0x11CBF, 250},
+ {0x11D00, 0x11D5F, 251},
+ {0x11D60, 0x11DAF, 252},
+ {0x11DB0, 0x11DEF, 253},
+ {0x11EE0, 0x11EFF, 254},
+ {0x11F00, 0x11F5F, 255},
+ {0x11FB0, 0x11FBF, 256},
+ {0x11FC0, 0x11FFF, 257},
+ {0x12000, 0x123FF, 258},
+ {0x12400, 0x1247F, 259},
+ {0x12480, 0x1254F, 260},
+ {0x12F90, 0x12FFF, 261},
+ {0x13000, 0x1342F, 262},
+ {0x13430, 0x1345F, 263},
+ {0x13460, 0x143FF, 264},
+ {0x14400, 0x1467F, 265},
+ {0x16100, 0x1613F, 266},
+ {0x16800, 0x16A3F, 267},
+ {0x16A40, 0x16A6F, 268},
+ {0x16A70, 0x16ACF, 269},
+ {0x16AD0, 0x16AFF, 270},
+ {0x16B00, 0x16B8F, 271},
+ {0x16D40, 0x16D7F, 272},
+ {0x16E40, 0x16E9F, 273},
+ {0x16EA0, 0x16EDF, 274},
+ {0x16F00, 0x16F9F, 275},
+ {0x16FE0, 0x16FFF, 276},
+ {0x17000, 0x187FF, 277},
+ {0x18800, 0x18AFF, 278},
+ {0x18B00, 0x18CFF, 279},
+ {0x18D00, 0x18D7F, 280},
+ {0x18D80, 0x18DFF, 281},
+ {0x1AFF0, 0x1AFFF, 282},
+ {0x1B000, 0x1B0FF, 283},
+ {0x1B100, 0x1B12F, 284},
+ {0x1B130, 0x1B16F, 285},
+ {0x1B170, 0x1B2FF, 286},
+ {0x1BC00, 0x1BC9F, 287},
+ {0x1BCA0, 0x1BCAF, 288},
+ {0x1CC00, 0x1CEBF, 289},
+ {0x1CEC0, 0x1CEFF, 290},
+ {0x1CF00, 0x1CFCF, 291},
+ {0x1D000, 0x1D0FF, 292},
+ {0x1D100, 0x1D1FF, 293},
+ {0x1D200, 0x1D24F, 294},
+ {0x1D2C0, 0x1D2DF, 295},
+ {0x1D2E0, 0x1D2FF, 296},
+ {0x1D300, 0x1D35F, 297},
+ {0x1D360, 0x1D37F, 298},
+ {0x1D400, 0x1D7FF, 299},
+ {0x1D800, 0x1DAAF, 300},
+ {0x1DF00, 0x1DFFF, 301},
+ {0x1E000, 0x1E02F, 302},
+ {0x1E030, 0x1E08F, 303},
+ {0x1E100, 0x1E14F, 304},
+ {0x1E290, 0x1E2BF, 305},
+ {0x1E2C0, 0x1E2FF, 306},
+ {0x1E4D0, 0x1E4FF, 307},
+ {0x1E5D0, 0x1E5FF, 308},
+ {0x1E6C0, 0x1E6FF, 309},
+ {0x1E7E0, 0x1E7FF, 310},
+ {0x1E800, 0x1E8DF, 311},
+ {0x1E900, 0x1E95F, 312},
+ {0x1EC70, 0x1ECBF, 313},
+ {0x1ED00, 0x1ED4F, 314},
+ {0x1EE00, 0x1EEFF, 315},
+ {0x1F000, 0x1F02F, 316},
+ {0x1F030, 0x1F09F, 317},
+ {0x1F0A0, 0x1F0FF, 318},
+ {0x1F100, 0x1F1FF, 319},
+ {0x1F200, 0x1F2FF, 320},
+ {0x1F300, 0x1F5FF, 321},
+ {0x1F600, 0x1F64F, 322},
+ {0x1F650, 0x1F67F, 323},
+ {0x1F680, 0x1F6FF, 324},
+ {0x1F700, 0x1F77F, 325},
+ {0x1F780, 0x1F7FF, 326},
+ {0x1F800, 0x1F8FF, 327},
+ {0x1F900, 0x1F9FF, 328},
+ {0x1FA00, 0x1FA6F, 329},
+ {0x1FA70, 0x1FAFF, 330},
+ {0x1FB00, 0x1FBFF, 331},
+ {0x20000, 0x2A6DF, 332},
+ {0x2A700, 0x2B73F, 333},
+ {0x2B740, 0x2B81F, 334},
+ {0x2B820, 0x2CEAF, 335},
+ {0x2CEB0, 0x2EBEF, 336},
+ {0x2EBF0, 0x2EE5F, 337},
+ {0x2F800, 0x2FA1F, 338},
+ {0x30000, 0x3134F, 339},
+ {0x31350, 0x323AF, 340},
+ {0x323B0, 0x3347F, 341},
+ {0xE0000, 0xE007F, 342},
+ {0xE0100, 0xE01EF, 343},
+ {0xF0000, 0xFFFFF, 344},
+ {0x100000, 0x10FFFF, 345},
+};
+#define BLOCK_COUNT 346
+
static const char *decomp_prefix[] = {
"",
"",
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 11f626ca0ab..5db850ca2d1 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -60,6 +60,7 @@
CASE_FOLDING = "CaseFolding%s.txt"
GRAPHEME_CLUSTER_BREAK = "auxiliary/GraphemeBreakProperty%s.txt"
EMOJI_DATA = "emoji/emoji-data%s.txt"
+BLOCKS = "Blocks%s.txt"
# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
@@ -392,6 +393,34 @@ def makeunicodedata(unicode, trace):
fprint(" NULL")
fprint("};")
+ # Generate block tables: unique block names plus (start, end, name index) ranges
+ names = []  # unique block names, in first-seen order
+ name_to_index = {}  # block name -> its position in names
+ blocks = []  # (start, end, index into names) per entry of unicode.blocks
+ for start, end, name in unicode.blocks:
+ if name not in name_to_index:
+ name_to_index[name] = len(names)
+ names.append(name)
+ blocks.append((start, end, name_to_index[name]))
+
+ fprint("static const char * const _PyUnicode_BlockNames[] = {")
+ for name in names:
+ fprint(' "%s",' % name)
+ fprint("};")
+
+ fprint("typedef struct {")
+ fprint(" Py_UCS4 start;")
+ fprint(" Py_UCS4 end;")
+ fprint(" unsigned short name;")  # holds the index computed above, not the string
+ fprint("} _PyUnicode_Block;")
+
+ fprint("static const _PyUnicode_Block _PyUnicode_Blocks[] = {")
+ for start, end, name in blocks:  # here name is the integer index, not the string
+ fprint(" {0x%04X, 0x%04X, %d}," % (start, end, name))
+ fprint("};")
+ fprint(f"#define BLOCK_COUNT {len(blocks)}")  # number of range entries emitted above
+ fprint()
+
fprint("static const char *decomp_prefix[] = {")
for name in decomp_prefix:
fprint(" \"%s\"," % name)
@@ -1205,6 +1234,13 @@ def __init__(self, version, ideograph_check=True):
ext_picts[char] = True
self.ext_picts = ext_picts
+ # See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189
+ self.blocks = []  # (start, end, name) tuples; sorted once all records are read
+ for record in UcdFile(BLOCKS, version).records():
+ start_end, name = record  # range field "START..END" (hex) and the block name
+ start, end = [int(c, 16) for c in start_end.split('..')]
+ self.blocks.append((start, end, name))
+ self.blocks.sort()  # tuple order: by start, then end, then name
def uselatin1(self):
# restrict character range to ISO Latin 1