gh-74902: Add Unicode Grapheme Cluster Break algorithm (GH-143076)

Add the unicodedata.iter_graphemes() function to iterate over grapheme
clusters according to rules defined in Unicode Standard Annex #29.

Add unicodedata.grapheme_cluster_break(), unicodedata.indic_conjunct_break()
and unicodedata.extended_pictographic() functions to get the properties
of the character which are related to the above algorithm.

Co-authored-by: Guillaume "Vermeille" Sanchez <guillaume.v.sanchez@gmail.com>
This commit is contained in:
Serhiy Storchaka 2026-01-14 16:37:57 +02:00 committed by GitHub
parent 0e0d51cdce
commit bab1d7a561
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 4350 additions and 3039 deletions

View file

@ -184,6 +184,28 @@ following functions:
'0041 0303'
.. function:: grapheme_cluster_break(chr, /)
Returns the Grapheme_Cluster_Break property assigned to the character.
.. versionadded:: next
.. function:: indic_conjunct_break(chr, /)
Returns the Indic_Conjunct_Break property assigned to the character.
.. versionadded:: next
.. function:: extended_pictographic(chr, /)
Returns ``True`` if the character has the Extended_Pictographic property,
``False`` otherwise.
.. versionadded:: next
.. function:: normalize(form, unistr, /)
Return the normal form *form* for the Unicode string *unistr*. Valid values for
@ -225,6 +247,24 @@ following functions:
.. versionadded:: 3.8
.. function:: iter_graphemes(unistr, start=0, end=sys.maxsize, /)
Returns an iterator to iterate over grapheme clusters.
With optional *start*, iteration begins at that position.
With optional *end*, iteration stops at that position.
Converting an emitted item to a string returns the substring corresponding to
the grapheme cluster.
Its ``start`` and ``end`` attributes denote the start and end of
the grapheme cluster.
It uses extended grapheme cluster rules defined by Unicode
Standard Annex #29, `"Unicode Text Segmentation"
<https://www.unicode.org/reports/tr29/>`_.
.. versionadded:: next
In addition, the module exposes the following constant:
.. data:: unidata_version
@ -234,7 +274,7 @@ In addition, the module exposes the following constant:
.. data:: ucd_3_2_0
This is an object that has the same methods as the entire module, but uses the
This is an object that has most of the methods of the entire module, but uses the
Unicode database version 3.2 instead, for applications that require this
specific version of the Unicode database (such as IDNA).

View file

@ -811,6 +811,16 @@ unicodedata
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
(Contributed by Stan Ulbrych in :gh:`129117`.)
* Add the :func:`~unicodedata.iter_graphemes`
function to iterate over grapheme clusters according to rules defined in
`Unicode Standard Annex #29, "Unicode Text Segmentation"
<https://www.unicode.org/reports/tr29/>`_.
Add :func:`~unicodedata.grapheme_cluster_break`,
:func:`~unicodedata.indic_conjunct_break` and
:func:`~unicodedata.extended_pictographic` functions to query the character
properties used by this algorithm.
(Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)
unittest
--------

View file

@ -616,6 +616,221 @@ def test_isxidcontinue(self):
self.assertRaises(TypeError, self.db.isxidcontinue)
self.assertRaises(TypeError, self.db.isxidcontinue, 'xx')
def test_grapheme_cluster_break(self):
    """Spot-check Grapheme_Cluster_Break property values.

    Cases are grouped by the Unicode version in which the character
    first received the given property value.
    """
    gcb = self.db.grapheme_cluster_break
    cases = [
        (' ', 'Other'),
        ('x', 'Other'),
        ('\U0010FFFF', 'Other'),
        ('\r', 'CR'),
        ('\n', 'LF'),
        ('\0', 'Control'),
        ('\t', 'Control'),
        ('\x1F', 'Control'),
        ('\x7F', 'Control'),
        ('\x9F', 'Control'),
        ('\U000E0001', 'Control'),
        ('\u0300', 'Extend'),
        ('\u200C', 'Extend'),
        ('\U000E01EF', 'Extend'),
        ('\u1159', 'L'),
        ('\u11F9', 'T'),
        ('\uD788', 'LV'),
        ('\uD7A3', 'LVT'),
        # New in 5.0.0
        ('\u05BA', 'Extend'),
        ('\u20EF', 'Extend'),
        # New in 5.1.0
        ('\u2064', 'Control'),
        ('\uAA4D', 'SpacingMark'),
        # New in 5.2.0
        ('\u0816', 'Extend'),
        ('\uA97C', 'L'),
        ('\uD7C6', 'V'),
        ('\uD7FB', 'T'),
        # New in 6.0.0
        ('\u093A', 'Extend'),
        ('\U00011002', 'SpacingMark'),
        # New in 6.1.0
        ('\U000E0FFF', 'Control'),
        ('\U00016F7E', 'SpacingMark'),
        # New in 6.2.0
        ('\U0001F1E6', 'Regional_Indicator'),
        ('\U0001F1FF', 'Regional_Indicator'),
        # New in 6.3.0
        ('\u180E', 'Control'),
        ('\u1A1B', 'Extend'),
        # New in 7.0.0
        ('\u0E33', 'SpacingMark'),
        ('\u0EB3', 'SpacingMark'),
        ('\U0001BCA3', 'Control'),
        ('\U0001E8D6', 'Extend'),
        ('\U0001163E', 'SpacingMark'),
        # New in 8.0.0
        ('\u08E3', 'Extend'),
        ('\U00011726', 'SpacingMark'),
        # New in 9.0.0
        ('\u0600', 'Prepend'),
        ('\U000E007F', 'Extend'),
        ('\U00011CB4', 'SpacingMark'),
        ('\u200D', 'ZWJ'),
        # New in 10.0.0
        ('\U00011D46', 'Prepend'),
        ('\U00011D47', 'Extend'),
        ('\U00011A97', 'SpacingMark'),
        # New in 11.0.0
        ('\U000110CD', 'Prepend'),
        ('\u07FD', 'Extend'),
        ('\U00011EF6', 'SpacingMark'),
        # New in 12.0.0
        ('\U00011A84', 'Prepend'),
        ('\U00013438', 'Control'),
        ('\U0001E2EF', 'Extend'),
        ('\U00016F87', 'SpacingMark'),
        # New in 13.0.0
        ('\U00011941', 'Prepend'),
        ('\U00016FE4', 'Extend'),
        ('\U00011942', 'SpacingMark'),
        # New in 14.0.0
        ('\u0891', 'Prepend'),
        ('\U0001E2AE', 'Extend'),
        # New in 15.0.0
        ('\U00011F02', 'Prepend'),
        ('\U0001343F', 'Control'),
        ('\U0001E4EF', 'Extend'),
        ('\U00011F3F', 'SpacingMark'),
        # New in 16.0.0
        ('\U000113D1', 'Prepend'),
        ('\U0001E5EF', 'Extend'),
        ('\U0001612C', 'SpacingMark'),
        ('\U00016D63', 'V'),
        # New in 17.0.0
        ('\u1AEB', 'Extend'),
        ('\U00011B67', 'SpacingMark'),
    ]
    for char, expected in cases:
        self.assertEqual(gcb(char), expected)
    # The argument must be exactly one str character.
    self.assertRaises(TypeError, gcb)
    self.assertRaises(TypeError, gcb, b'x')
    self.assertRaises(TypeError, gcb, 120)
    self.assertRaises(TypeError, gcb, '')
    self.assertRaises(TypeError, gcb, 'xx')
def test_indic_conjunct_break(self):
    """Spot-check Indic_Conjunct_Break property values.

    Cases are grouped by the Unicode version in which the character
    first received the given property value.
    """
    incb = self.db.indic_conjunct_break
    cases = [
        (' ', 'None'),
        ('x', 'None'),
        ('\U0010FFFF', 'None'),
        # New in 15.1.0
        ('\u094D', 'Linker'),
        ('\u0D4D', 'Linker'),
        ('\u0915', 'Consonant'),
        ('\u0D3A', 'Consonant'),
        ('\u0300', 'Extend'),
        ('\U0001E94A', 'Extend'),
        # New in 16.0.0
        ('\u034F', 'Extend'),
        ('\U000E01EF', 'Extend'),
        # New in 17.0.0
        ('\u1039', 'Linker'),
        ('\U00011F42', 'Linker'),
        ('\u1000', 'Consonant'),
        ('\U00011F33', 'Consonant'),
        ('\U0001E6F5', 'Extend'),
    ]
    for char, expected in cases:
        self.assertEqual(incb(char), expected)
    # The argument must be exactly one str character.
    self.assertRaises(TypeError, incb)
    self.assertRaises(TypeError, incb, b'x')
    self.assertRaises(TypeError, incb, 120)
    self.assertRaises(TypeError, incb, '')
    self.assertRaises(TypeError, incb, 'xx')
def test_extended_pictographic(self):
    """Spot-check the boolean Extended_Pictographic property."""
    ext_pict = self.db.extended_pictographic
    cases = [
        (' ', False),
        ('x', False),
        ('\U0010FFFF', False),
        # New in 13.0.0
        ('\xA9', True),
        ('\u203C', True),
        ('\U0001FAD6', True),
        ('\U0001FFFD', True),
        # New in 17.0.0
        ('\u2388', False),
        ('\U0001FA6D', False),
    ]
    for char, expected in cases:
        self.assertIs(ext_pict(char), expected)
    # The argument must be exactly one str character.
    self.assertRaises(TypeError, ext_pict)
    self.assertRaises(TypeError, ext_pict, b'x')
    self.assertRaises(TypeError, ext_pict, 120)
    self.assertRaises(TypeError, ext_pict, '')
    self.assertRaises(TypeError, ext_pict, 'xx')
def test_grapheme_break(self):
    """Check iter_graphemes() slicing behavior and the UAX #29 GB rules."""
    def clusters(*args):
        return [str(segment) for segment in self.db.iter_graphemes(*args)]

    # Argument errors: a str is required, at most three positional args.
    self.assertRaises(TypeError, self.db.iter_graphemes)
    self.assertRaises(TypeError, self.db.iter_graphemes, b'x')
    self.assertRaises(TypeError, self.db.iter_graphemes, 'x', 0, 0, 0)

    # start/end follow slice semantics (negative values, clamping).
    slicing_cases = [
        (('',), []),
        (('abcd',), ['a', 'b', 'c', 'd']),
        (('abcd', 1), ['b', 'c', 'd']),
        (('abcd', 1, 3), ['b', 'c']),
        (('abcd', -3), ['b', 'c', 'd']),
        (('abcd', 1, -1), ['b', 'c']),
        (('abcd', 3, 1), []),
        (('abcd', 5), []),
        (('abcd', 0, 5), ['a', 'b', 'c', 'd']),
        (('abcd', -5), ['a', 'b', 'c', 'd']),
        (('abcd', 0, -5), []),
    ]
    for args, expected in slicing_cases:
        self.assertEqual(clusters(*args), expected)

    # Samples for the individual break rules of UAX #29.
    rule_cases = [
        ('GB3', '\r\n', ['\r\n']),
        ('GB4', '\r\u0308', ['\r', '\u0308']),
        ('GB4', '\n\u0308', ['\n', '\u0308']),
        ('GB4', '\0\u0308', ['\0', '\u0308']),
        ('GB5', '\u06dd\r', ['\u06dd', '\r']),
        ('GB5', '\u06dd\n', ['\u06dd', '\n']),
        ('GB5', '\u06dd\0', ['\u06dd', '\0']),
        ('GB6', '\u1100\u1160', ['\u1100\u1160']),
        ('GB6', '\u1100\uAC00', ['\u1100\uAC00']),
        ('GB6', '\u1100\uAC01', ['\u1100\uAC01']),
        ('GB7', '\uAC00\u1160', ['\uAC00\u1160']),
        ('GB7', '\uAC00\u11A8', ['\uAC00\u11A8']),
        ('GB7', '\u1160\u1160', ['\u1160\u1160']),
        ('GB7', '\u1160\u11A8', ['\u1160\u11A8']),
        ('GB8', '\uAC01\u11A8', ['\uAC01\u11A8']),
        ('GB8', '\u11A8\u11A8', ['\u11A8\u11A8']),
        ('GB9', 'a\u0300', ['a\u0300']),
        ('GB9', 'a\u200D', ['a\u200D']),
        ('GB9a', '\u0905\u0903', ['\u0905\u0903']),
        ('GB9b', '\u06dd\u0661', ['\u06dd\u0661']),
        ('GB9c', '\u0915\u094d\u0924', ['\u0915\u094d\u0924']),
        ('GB9c', '\u0915\u094D\u094D\u0924', ['\u0915\u094D\u094D\u0924']),
        ('GB9c', '\u0915\u094D\u0924\u094D\u092F',
         ['\u0915\u094D\u0924\u094D\u092F']),
        ('GB11',
         '\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
         '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC',
         ['\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
          '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC']),
        ('GB12', '\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3',
         ['\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3']),
        ('GB13', 'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3',
         ['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3']),
    ]
    for rule, text, expected in rule_cases:
        self.assertEqual(clusters(text), expected, rule)
class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
db = unicodedata.ucd_3_2_0
@ -624,6 +839,11 @@ class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
if quicktest else
'f217b8688d7bdff31db4207e078a96702f091597')
test_grapheme_cluster_break = None
test_indic_conjunct_break = None
test_extended_pictographic = None
test_grapheme_break = None
class UnicodeMiscTest(unittest.TestCase):
db = unicodedata
@ -726,6 +946,17 @@ def test_linebreak_7643(self):
self.assertEqual(len(lines), 1,
r"%a should not be a linebreak" % c)
def test_segment_object(self):
    """Segment items report their bounds and stringify to the substring."""
    segments = list(unicodedata.iter_graphemes('spa\u0300m'))
    self.assertEqual(len(segments), 4, segments)
    accented = segments[2]          # 'a' followed by a combining grave
    self.assertEqual(accented.start, 2)
    self.assertEqual(accented.end, 4)
    self.assertEqual(str(accented), 'a\u0300')
    self.assertEqual(repr(accented), '<Segment 2:4>')
    # Segments are positions into the string, not sequences themselves.
    self.assertRaises(TypeError, iter, accented)
    self.assertRaises(TypeError, len, accented)
class NormalizationTest(unittest.TestCase):
@staticmethod
@ -848,5 +1079,61 @@ class MyStr(str):
self.assertIs(type(normalize(form, MyStr(input_str))), str)
class GraphemeBreakTest(unittest.TestCase):
@staticmethod
def check_version(testfile):
    """Return True if the file's first line mentions the running UCD version."""
    first_line = testfile.readline()
    return unicodedata.unidata_version in first_line
@requires_resource('network')
def test_grapheme_break(self):
    """Validate iter_graphemes() against the official GraphemeBreakTest data."""
    testdatafile = "auxiliary/GraphemeBreakTest.txt"
    testdataurl = (f"https://www.unicode.org/Public/"
                   f"{unicodedata.unidata_version}/ucd/{testdatafile}")
    # Hit the exception early
    try:
        testdata = open_urlresource(testdataurl, encoding="utf-8",
                                    check=self.check_version)
    except PermissionError:
        self.skipTest(f"Permission error when downloading {testdataurl} "
                      f"into the test data directory")
    except (OSError, HTTPException) as exc:
        self.skipTest(f"Failed to download {testdataurl}: {exc}")
    with testdata:
        self.run_grapheme_break_tests(testdata)
def run_grapheme_break_tests(self, testdata):
    """Run every test case in a GraphemeBreakTest.txt-format file.

    Each non-comment line spells out one string together with its
    expected cluster boundaries: code points appear as hex fields,
    '÷' marks a break and '×' a non-break between them.
    """
    for line in testdata:
        # Split off the trailing '#' comment, if any.
        line, _, comment = line.partition('#')
        line = line.strip()
        if not line:
            continue
        comment = comment.strip()
        chunks = []    # expected grapheme clusters
        breaks = []    # expected break positions, in code points
        pos = 0
        # '×' joins code points inside a cluster, so treat it as plain
        # whitespace; each '÷' starts a new expected cluster.
        for field in line.replace('×', ' ').split():
            if field == '÷':
                chunks.append('')
                breaks.append(pos)
            else:
                chunks[-1] += chr(int(field, 16))
                pos += 1
        # Every test line ends with '÷'; drop the resulting empty chunk.
        self.assertEqual(chunks.pop(), '', line)
        input = ''.join(chunks)
        with self.subTest(line):
            result = list(unicodedata.iter_graphemes(input))
            self.assertEqual(list(map(str, result)), chunks, comment)
            self.assertEqual([x.start for x in result], breaks[:-1], comment)
            self.assertEqual([x.end for x in result], breaks[1:], comment)
            # Starting the iteration at any interior break position must
            # yield exactly the remaining clusters.
            for i in range(1, len(breaks) - 1):
                result = list(unicodedata.iter_graphemes(input, breaks[i]))
                self.assertEqual(list(map(str, result)), chunks[i:], comment)
                self.assertEqual([x.start for x in result], breaks[i:-1], comment)
                self.assertEqual([x.end for x in result], breaks[i+1:], comment)
# Allow running this test file directly.
if __name__ == "__main__":
    unittest.main()

View file

@ -1664,6 +1664,7 @@ Victor Salgado
Rich Salz
Kevin Samborn
Adrian Sampson
Guillaume Sanchez
Nevada Sanchez
James Sanders
Ilya Sandler

View file

@ -0,0 +1,8 @@
Add the :func:`~unicodedata.iter_graphemes` function in the
:mod:`unicodedata` module to iterate over grapheme clusters according to
rules defined in `Unicode Standard Annex #29, "Unicode Text Segmentation"
<https://www.unicode.org/reports/tr29/>`_. Add
:func:`~unicodedata.grapheme_cluster_break`,
:func:`~unicodedata.indic_conjunct_break` and
:func:`~unicodedata.extended_pictographic` functions to query the character
properties used by this algorithm.

View file

@ -2,6 +2,7 @@
preserve
[clinic start generated code]*/
#include "pycore_abstract.h" // _PyNumber_Index()
#include "pycore_modsupport.h" // _PyArg_CheckPositional()
PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
@ -621,4 +622,180 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
exit:
return return_value;
}
/*[clinic end generated code: output=c5e56c8f6bb80f93 input=a9049054013a1b77]*/
PyDoc_STRVAR(unicodedata_iter_graphemes__doc__,
"iter_graphemes($module, unistr, start=0, end=sys.maxsize, /)\n"
"--\n"
"\n"
"Returns an iterator to iterate over grapheme clusters.\n"
"\n"
"It uses extended grapheme cluster rules from TR29.");
#define UNICODEDATA_ITER_GRAPHEMES_METHODDEF \
{"iter_graphemes", _PyCFunction_CAST(unicodedata_iter_graphemes), METH_FASTCALL, unicodedata_iter_graphemes__doc__},
static PyObject *
unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
Py_ssize_t start, Py_ssize_t end);
static PyObject *
unicodedata_iter_graphemes(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
{
PyObject *return_value = NULL;
PyObject *unistr;
Py_ssize_t start = 0;
Py_ssize_t end = PY_SSIZE_T_MAX;
if (!_PyArg_CheckPositional("iter_graphemes", nargs, 1, 3)) {
goto exit;
}
if (!PyUnicode_Check(args[0])) {
_PyArg_BadArgument("iter_graphemes", "argument 1", "str", args[0]);
goto exit;
}
unistr = args[0];
if (nargs < 2) {
goto skip_optional;
}
{
Py_ssize_t ival = -1;
PyObject *iobj = _PyNumber_Index(args[1]);
if (iobj != NULL) {
ival = PyLong_AsSsize_t(iobj);
Py_DECREF(iobj);
}
if (ival == -1 && PyErr_Occurred()) {
goto exit;
}
start = ival;
}
if (nargs < 3) {
goto skip_optional;
}
{
Py_ssize_t ival = -1;
PyObject *iobj = _PyNumber_Index(args[2]);
if (iobj != NULL) {
ival = PyLong_AsSsize_t(iobj);
Py_DECREF(iobj);
}
if (ival == -1 && PyErr_Occurred()) {
goto exit;
}
end = ival;
}
skip_optional:
return_value = unicodedata_iter_graphemes_impl(module, unistr, start, end);
exit:
return return_value;
}
PyDoc_STRVAR(unicodedata_grapheme_cluster_break__doc__,
"grapheme_cluster_break($module, chr, /)\n"
"--\n"
"\n"
"Returns the Grapheme_Cluster_Break property assigned to the character.");
#define UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF \
{"grapheme_cluster_break", (PyCFunction)unicodedata_grapheme_cluster_break, METH_O, unicodedata_grapheme_cluster_break__doc__},
static PyObject *
unicodedata_grapheme_cluster_break_impl(PyObject *module, int chr);
static PyObject *
unicodedata_grapheme_cluster_break(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int chr;
if (!PyUnicode_Check(arg)) {
_PyArg_BadArgument("grapheme_cluster_break", "argument", "a unicode character", arg);
goto exit;
}
if (PyUnicode_GET_LENGTH(arg) != 1) {
PyErr_Format(PyExc_TypeError,
"grapheme_cluster_break(): argument must be a unicode character, "
"not a string of length %zd",
PyUnicode_GET_LENGTH(arg));
goto exit;
}
chr = PyUnicode_READ_CHAR(arg, 0);
return_value = unicodedata_grapheme_cluster_break_impl(module, chr);
exit:
return return_value;
}
PyDoc_STRVAR(unicodedata_indic_conjunct_break__doc__,
"indic_conjunct_break($module, chr, /)\n"
"--\n"
"\n"
"Returns the Indic_Conjunct_Break property assigned to the character.");
#define UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF \
{"indic_conjunct_break", (PyCFunction)unicodedata_indic_conjunct_break, METH_O, unicodedata_indic_conjunct_break__doc__},
static PyObject *
unicodedata_indic_conjunct_break_impl(PyObject *module, int chr);
static PyObject *
unicodedata_indic_conjunct_break(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int chr;
if (!PyUnicode_Check(arg)) {
_PyArg_BadArgument("indic_conjunct_break", "argument", "a unicode character", arg);
goto exit;
}
if (PyUnicode_GET_LENGTH(arg) != 1) {
PyErr_Format(PyExc_TypeError,
"indic_conjunct_break(): argument must be a unicode character, "
"not a string of length %zd",
PyUnicode_GET_LENGTH(arg));
goto exit;
}
chr = PyUnicode_READ_CHAR(arg, 0);
return_value = unicodedata_indic_conjunct_break_impl(module, chr);
exit:
return return_value;
}
PyDoc_STRVAR(unicodedata_extended_pictographic__doc__,
"extended_pictographic($module, chr, /)\n"
"--\n"
"\n"
"Returns the Extended_Pictographic property assigned to the character, as boolean.");
#define UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF \
{"extended_pictographic", (PyCFunction)unicodedata_extended_pictographic, METH_O, unicodedata_extended_pictographic__doc__},
static PyObject *
unicodedata_extended_pictographic_impl(PyObject *module, int chr);
static PyObject *
unicodedata_extended_pictographic(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int chr;
if (!PyUnicode_Check(arg)) {
_PyArg_BadArgument("extended_pictographic", "argument", "a unicode character", arg);
goto exit;
}
if (PyUnicode_GET_LENGTH(arg) != 1) {
PyErr_Format(PyExc_TypeError,
"extended_pictographic(): argument must be a unicode character, "
"not a string of length %zd",
PyUnicode_GET_LENGTH(arg));
goto exit;
}
chr = PyUnicode_READ_CHAR(arg, 0);
return_value = unicodedata_extended_pictographic_impl(module, chr);
exit:
return return_value;
}
/*[clinic end generated code: output=6991246310e3f2aa input=a9049054013a1b77]*/

View file

@ -24,6 +24,26 @@
#include <stdbool.h>
#include <stddef.h> // offsetof()
/* helper macro to fixup start/end slice values */
/* Clamp start/end to [0, len], counting negative values from the end of
   the string — the same semantics as Python slicing.  Evaluates its
   arguments multiple times, so pass plain variables only. */
#define ADJUST_INDICES(start, end, len) \
    do { \
        if (end > len) { \
            end = len; \
        } \
        else if (end < 0) { \
            end += len; \
            if (end < 0) { \
                end = 0; \
            } \
        } \
        if (start < 0) { \
            start += len; \
            if (start < 0) { \
                start = 0; \
            } \
        } \
    } while (0)
/*[clinic input]
module unicodedata
class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
@ -42,6 +62,11 @@ typedef struct {
const unsigned char east_asian_width; /* index into
_PyUnicode_EastAsianWidth */
const unsigned char normalization_quick_check; /* see is_normalized() */
const unsigned char grapheme_cluster_break; /* index into
_PyUnicode_GraphemeBreakNames */
const unsigned char incb; /* index into
_PyUnicode_IndicConjunctBreakNames */
const unsigned char ext_pict; /* true if Extended_Pictographic */
} _PyUnicode_DatabaseRecord;
typedef struct change_record {
@ -71,6 +96,19 @@ _getrecord_ex(Py_UCS4 code)
return &_PyUnicode_Database_Records[index];
}
/* Per-module state: the two heap types used by iter_graphemes(). */
typedef struct {
    PyObject *SegmentType;                /* unicodedata.Segment */
    PyObject *GraphemeBreakIteratorType;  /* unicodedata.GraphemeBreakIterator */
} unicodedatastate;

/* Fetch the module state; the module always allocates one. */
static inline unicodedatastate *
get_unicodedata_state(PyObject *module)
{
    void *state = _PyModule_GetState(module);
    assert(state != NULL);
    return (unicodedatastate *)state;
}
/* ------------- Previous-version API ------------------------------------- */
typedef struct previous_version {
PyObject_HEAD
@ -1628,11 +1666,469 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
return PyUnicode_FromOrdinal(code);
}
/* Grapheme Cluster Break algorithm */
/* State machine for rule GB11 (emoji zero-width-joiner sequences). */
enum ExtPictState {
    ExtPictState_Init,
    // \p{Extended_Pictographic} Extend*
    ExtPictState_Started,
    // ... ZWJ
    ExtPictState_ZWJ,
    // ... \p{Extended_Pictographic}
    ExtPictState_Matched,
};

/* State machine for rule GB9c (Indic conjunct clusters). */
enum InCBState {
    InCBState_Init,
    // \p{InCB=Consonant} \p{InCB=Extend}*
    InCBState_Started,
    // ... \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]*
    InCBState_Linker,
    // ... \p{InCB=Consonant}
    InCBState_Matched,
};

/* Incremental grapheme-cluster scanner state over str[start:end). */
typedef struct {
    PyObject *str;              /* string being scanned */
    Py_ssize_t start;           /* start of the current cluster */
    Py_ssize_t pos;             /* next position to examine */
    Py_ssize_t end;             /* scanning stops at this position */
    int gcb;                    /* Grapheme_Cluster_Break of previous char */
    enum ExtPictState ep_state; /* GB11 context */
    enum InCBState incb_state;  /* GB9c context */
    bool ri_flag;               /* GB12/GB13: odd run of Regional_Indicator */
} _PyGraphemeBreak;
/* Advance the GB11 state machine by one character, given its
   Grapheme_Cluster_Break property and Extended_Pictographic flag. */
static inline enum ExtPictState
update_ext_pict_state(enum ExtPictState state, int gcb, bool ext_pict)
{
    if (ext_pict) {
        /* An Extended_Pictographic directly after ZWJ completes the
           sequence; otherwise it (re)starts one. */
        return (state == ExtPictState_ZWJ) ? ExtPictState_Matched : ExtPictState_Started;
    }
    if (state == ExtPictState_Started || state == ExtPictState_Matched) {
        if (gcb == GCB_Extend) {
            return ExtPictState_Started;
        }
        if (gcb == GCB_ZWJ) {
            return ExtPictState_ZWJ;
        }
    }
    return ExtPictState_Init;
}
/* Advance the GB9c state machine by one character, given its
   Indic_Conjunct_Break property. */
static inline enum InCBState
update_incb_state(enum InCBState state, int incb)
{
    if (incb == InCB_Consonant) {
        /* A consonant right after a linker completes the conjunct. */
        return (state == InCBState_Linker) ? InCBState_Matched : InCBState_Started;
    }
    if (state != InCBState_Init) {
        if (incb == InCB_Extend) {
            /* Extend preserves a pending linker, otherwise just keeps
               the sequence going. */
            return (state == InCBState_Linker) ? InCBState_Linker : InCBState_Started;
        }
        if (incb == InCB_Linker) {
            return InCBState_Linker;
        }
    }
    return InCBState_Init;
}
/* Track Regional_Indicator pairing for rules GB12/GB13: the flag is true
   exactly when an odd number of consecutive RI characters has been seen;
   any non-RI character resets the run. */
static inline bool
update_ri_flag(bool flag, int gcb)
{
    return (gcb == GCB_Regional_Indicator) ? !flag : false;
}
/* Report whether a grapheme cluster boundary exists between the previous
   character (Grapheme_Cluster_Break property prev_gcb) and the current
   one (curr_gcb), following rules GB3..GB999 of UAX #29.  ep_state,
   ri_flag and incb_state describe the text up to and including the
   current character and feed rules GB11, GB12/GB13 and GB9c.
   Returns true when the text must break before the current character. */
static inline bool
grapheme_break(int prev_gcb, int curr_gcb, enum ExtPictState ep_state,
               bool ri_flag, enum InCBState incb_state)
{
    /* GB3: keep CR LF together. */
    if (prev_gcb == GCB_CR && curr_gcb == GCB_LF) {
        return false;
    }
    /* GB4: break after controls. */
    if (prev_gcb == GCB_CR ||
        prev_gcb == GCB_LF ||
        prev_gcb == GCB_Control)
    {
        return true;
    }
    /* GB5: break before controls. */
    if (curr_gcb == GCB_CR ||
        curr_gcb == GCB_LF ||
        curr_gcb == GCB_Control)
    {
        return true;
    }
    /* GB6: keep Hangul L with a following L, V, LV or LVT. */
    if (prev_gcb == GCB_L &&
        (curr_gcb == GCB_L ||
         curr_gcb == GCB_V ||
         curr_gcb == GCB_LV ||
         curr_gcb == GCB_LVT))
    {
        return false;
    }
    /* GB7: keep Hangul LV/V with a following V or T. */
    if ((prev_gcb == GCB_LV || prev_gcb == GCB_V) &&
        (curr_gcb == GCB_V || curr_gcb == GCB_T))
    {
        return false;
    }
    /* GB8: keep Hangul LVT/T with a following T. */
    if ((prev_gcb == GCB_LVT || prev_gcb == GCB_T) &&
        curr_gcb == GCB_T)
    {
        return false;
    }
    /* GB9: never break before Extend or ZWJ. */
    if (curr_gcb == GCB_Extend || curr_gcb == GCB_ZWJ) {
        return false;
    }
    /* GB9a: never break before SpacingMark. */
    if (curr_gcb == GCB_SpacingMark) {
        return false;
    }
    /* GB9b: never break after Prepend. */
    if (prev_gcb == GCB_Prepend) {
        return false;
    }
    /* GB9c: keep Indic conjunct sequences together. */
    if (incb_state == InCBState_Matched) {
        return false;
    }
    /* GB11: keep emoji ZWJ sequences together. */
    if (ep_state == ExtPictState_Matched) {
        return false;
    }
    /* GB12 and GB13: break only between pairs of Regional_Indicator;
       ri_flag is true when the current RI starts a new pair. */
    if (prev_gcb == GCB_Regional_Indicator && curr_gcb == prev_gcb) {
        return ri_flag;
    }
    /* GB999: break everywhere else. */
    return true;
}
/* Prepare *iter to scan str[start:end).  The caller must keep a
   reference to str alive for the whole iteration. */
static void
_Py_InitGraphemeBreak(_PyGraphemeBreak *iter, PyObject *str,
                      Py_ssize_t start, Py_ssize_t end)
{
    iter->str = str;
    iter->start = iter->pos = start;
    iter->end = end;
    iter->gcb = 0;              /* no previous character seen yet */
    iter->ep_state = ExtPictState_Init;
    iter->ri_flag = false;
    iter->incb_state = InCBState_Init;
}
/* Return the end position of the next grapheme cluster, or -1 when the
   iteration is exhausted.  On a successful return, iter->start has been
   advanced to the beginning of the *following* cluster, so the cluster
   just found spans [old start, returned position). */
static Py_ssize_t
_Py_NextGraphemeBreak(_PyGraphemeBreak *iter)
{
    if (iter->start >= iter->end) {
        return -1;
    }
    int kind = PyUnicode_KIND(iter->str);
    void *pstr = PyUnicode_DATA(iter->str);
    while (iter->pos < iter->end) {
        Py_UCS4 chr = PyUnicode_READ(kind, pstr, iter->pos);
        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(chr);
        int gcb = record->grapheme_cluster_break;
        /* Feed the current character into the rule-context state
           machines before testing for a break in front of it. */
        iter->ep_state = update_ext_pict_state(iter->ep_state, gcb, record->ext_pict);
        iter->ri_flag = update_ri_flag(iter->ri_flag, gcb);
        iter->incb_state = update_incb_state(iter->incb_state, record->incb);
        int prev_gcb = iter->gcb;
        iter->gcb = gcb;
        /* Never break in front of the first character of the slice
           (sot/eot rules GB1/GB2 fall out of the loop bounds). */
        if (iter->pos != iter->start &&
            grapheme_break(prev_gcb, gcb, iter->ep_state, iter->ri_flag,
                           iter->incb_state))
        {
            iter->start = iter->pos;
            /* Return the boundary; resume scanning after it next call
               (the current character's state is already consumed). */
            return iter->pos++;
        }
        ++iter->pos;
    }
    /* The final cluster runs to the end of the slice. */
    iter->start = iter->pos;
    return iter->pos;
}
/* Text Segment object */

/* A lightweight item yielded by the grapheme iterator: it records the
   [start, end) code point range of one cluster in `string`; str()
   materializes the corresponding substring. */
typedef struct {
    PyObject_HEAD
    PyObject *string;       /* scanned string (owned reference) */
    Py_ssize_t start;       /* first code point of the cluster */
    Py_ssize_t end;         /* one past the last code point */
} SegmentObject;

static void
Segment_dealloc(PyObject *self)
{
    /* Instances of heap types own a strong reference to their type; it
       must be released after the instance memory is freed, otherwise
       every Segment leaks a reference to the type (and its module). */
    PyTypeObject *tp = Py_TYPE(self);
    PyObject_GC_UnTrack(self);
    /* Py_XDECREF: tp_clear may already have dropped the string. */
    Py_XDECREF(((SegmentObject *)self)->string);
    tp->tp_free(self);
    Py_DECREF(tp);
}

static int
Segment_traverse(PyObject *self, visitproc visit, void *arg)
{
    Py_VISIT(((SegmentObject *)self)->string);
    return 0;
}

static int
Segment_clear(PyObject *self)
{
    Py_CLEAR(((SegmentObject *)self)->string);
    return 0;
}

/* str(segment) -> the substring covered by the segment. */
static PyObject *
Segment_str(PyObject *self)
{
    SegmentObject *s = (SegmentObject *)self;
    return PyUnicode_Substring(s->string, s->start, s->end);
}

/* repr(segment) -> '<Segment start:end>'. */
static PyObject *
Segment_repr(PyObject *self)
{
    SegmentObject *s = (SegmentObject *)self;
    return PyUnicode_FromFormat("<Segment %zd:%zd>", s->start, s->end);
}

/* NOTE(review): start/end are exposed writable (flags == 0); consider
   Py_READONLY unless mutation is intended. */
static PyMemberDef Segment_members[] = {
    {"start", Py_T_PYSSIZET, offsetof(SegmentObject, start), 0,
     PyDoc_STR("grapheme start")},
    {"end", Py_T_PYSSIZET, offsetof(SegmentObject, end), 0,
     PyDoc_STR("grapheme end")},
    {NULL} /* Sentinel */
};

static PyType_Slot Segment_slots[] = {
    {Py_tp_dealloc, Segment_dealloc},
    {Py_tp_traverse, Segment_traverse},
    {Py_tp_clear, Segment_clear},
    {Py_tp_str, Segment_str},
    {Py_tp_repr, Segment_repr},
    {Py_tp_members, Segment_members},
    {0, 0},
};

static PyType_Spec Segment_spec = {
    .name = "unicodedata.Segment",
    .basicsize = sizeof(SegmentObject),
    .flags = (
        Py_TPFLAGS_DEFAULT
        | Py_TPFLAGS_HAVE_GC
        | Py_TPFLAGS_DISALLOW_INSTANTIATION
        | Py_TPFLAGS_IMMUTABLETYPE
    ),
    .slots = Segment_slots
};
/* Grapheme Cluster iterator */

/* The iterator object returned by unicodedata.iter_graphemes(). */
typedef struct {
    PyObject_HEAD
    _PyGraphemeBreak iter;      /* scanner state; owns iter.str */
} GraphemeBreakIterator;

static void
GBI_dealloc(PyObject *self)
{
    /* Instances of heap types own a strong reference to their type; it
       must be released after the instance memory is freed, otherwise
       every iterator leaks a reference to the type (and its module). */
    PyTypeObject *tp = Py_TYPE(self);
    PyObject_GC_UnTrack(self);
    /* Py_XDECREF: tp_clear may already have dropped the string. */
    Py_XDECREF(((GraphemeBreakIterator *)self)->iter.str);
    tp->tp_free(self);
    Py_DECREF(tp);
}

static int
GBI_traverse(PyObject *self, visitproc visit, void *arg)
{
    Py_VISIT(((GraphemeBreakIterator *)self)->iter.str);
    return 0;
}

static int
GBI_clear(PyObject *self)
{
    Py_CLEAR(((GraphemeBreakIterator *)self)->iter.str);
    return 0;
}

/* Yield the next grapheme cluster as a Segment object; return NULL
   without an exception set when the iteration is exhausted. */
static PyObject *
GBI_iternext(PyObject *self)
{
    GraphemeBreakIterator *it = (GraphemeBreakIterator *)self;
    Py_ssize_t start = it->iter.start;
    Py_ssize_t pos = _Py_NextGraphemeBreak(&it->iter);
    if (pos < 0) {
        return NULL;
    }
    PyObject *module = PyType_GetModule(Py_TYPE(it));
    PyObject *SegmentType = get_unicodedata_state(module)->SegmentType;
    SegmentObject *s = PyObject_GC_New(SegmentObject,
                                       (PyTypeObject *)SegmentType);
    if (!s) {
        return NULL;
    }
    s->string = Py_NewRef(it->iter.str);
    s->start = start;
    s->end = pos;
    PyObject_GC_Track(s);
    return (PyObject *)s;
}

static PyType_Slot GraphemeBreakIterator_slots[] = {
    {Py_tp_dealloc, GBI_dealloc},
    {Py_tp_iter, PyObject_SelfIter},
    {Py_tp_iternext, GBI_iternext},
    {Py_tp_traverse, GBI_traverse},
    {Py_tp_clear, GBI_clear},
    {0, 0},
};

static PyType_Spec GraphemeBreakIterator_spec = {
    .name = "unicodedata.GraphemeBreakIterator",
    .basicsize = sizeof(GraphemeBreakIterator),
    .flags = (
        Py_TPFLAGS_DEFAULT
        | Py_TPFLAGS_HAVE_GC
        | Py_TPFLAGS_DISALLOW_INSTANTIATION
        | Py_TPFLAGS_IMMUTABLETYPE
    ),
    .slots = GraphemeBreakIterator_slots
};
/*[clinic input]
unicodedata.iter_graphemes

    unistr: unicode
    start: Py_ssize_t = 0
    end: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
    /

Returns an iterator to iterate over grapheme clusters.

It uses extended grapheme cluster rules from TR29.
[clinic start generated code]*/

static PyObject *
unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
                                Py_ssize_t start, Py_ssize_t end)
/*[clinic end generated code: output=b0b831944265d36f input=a1454d9e8135951f]*/
{
    PyObject *GraphemeBreakIteratorType = get_unicodedata_state(module)->GraphemeBreakIteratorType;
    GraphemeBreakIterator *gbi = PyObject_GC_New(GraphemeBreakIterator,
                                                 (PyTypeObject *)GraphemeBreakIteratorType);
    if (!gbi) {
        return NULL;
    }
    Py_ssize_t len = PyUnicode_GET_LENGTH(unistr);
    /* Interpret start/end like slice indices (negatives, clamping). */
    ADJUST_INDICES(start, end, len);
    /* The iterator owns a reference to the string for its lifetime. */
    Py_INCREF(unistr);
    _Py_InitGraphemeBreak(&gbi->iter, unistr, start, end);
    PyObject_GC_Track(gbi);
    return (PyObject*)gbi;
}
/*[clinic input]
unicodedata.grapheme_cluster_break

    chr: int(accept={str})
    /

Returns the Grapheme_Cluster_Break property assigned to the character.
[clinic start generated code]*/

static PyObject *
unicodedata_grapheme_cluster_break_impl(PyObject *module, int chr)
/*[clinic end generated code: output=39542e0f63bba36f input=5da75e86435576fd]*/
{
    Py_UCS4 c = (Py_UCS4)chr;
    /* The database record stores an index into the property name table. */
    int index = (int) _getrecord_ex(c)->grapheme_cluster_break;
    return PyUnicode_FromString(_PyUnicode_GraphemeBreakNames[index]);
}
/*[clinic input]
unicodedata.indic_conjunct_break

    chr: int(accept={str})
    /

Returns the Indic_Conjunct_Break property assigned to the character.
[clinic start generated code]*/

static PyObject *
unicodedata_indic_conjunct_break_impl(PyObject *module, int chr)
/*[clinic end generated code: output=673eff2caf797f08 input=5c730f78e469f2e8]*/
{
    Py_UCS4 c = (Py_UCS4)chr;
    /* The database record stores an index into the property name table. */
    int index = (int) _getrecord_ex(c)->incb;
    return PyUnicode_FromString(_PyUnicode_IndicConjunctBreakNames[index]);
}
/*[clinic input]
@permit_long_summary
unicodedata.extended_pictographic

    chr: int(accept={str})
    /

Returns the Extended_Pictographic property assigned to the character, as boolean.
[clinic start generated code]*/

static PyObject *
unicodedata_extended_pictographic_impl(PyObject *module, int chr)
/*[clinic end generated code: output=b6bbb349427370b1 input=250d7bd988997eb3]*/
{
    Py_UCS4 c = (Py_UCS4)chr;
    /* ext_pict is stored as 0/1 in the database record. */
    int index = (int) _getrecord_ex(c)->ext_pict;
    return PyBool_FromLong(index);
}
// List of functions used to define module functions *AND* unicodedata.UCD
// methods. For module functions, self is the module. For UCD methods, self
// is an UCD instance. The UCD_Check() macro is used to check if self is
// an UCD instance.
static PyMethodDef unicodedata_functions[] = {
// Module only functions.
UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF
UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF
UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF
UNICODEDATA_ITER_GRAPHEMES_METHODDEF
// The following definitions are shared between the module
// and the UCD class.
#define DB_methods (unicodedata_functions + 4)
UNICODEDATA_UCD_DECIMAL_METHODDEF
UNICODEDATA_UCD_DIGIT_METHODDEF
UNICODEDATA_UCD_NUMERIC_METHODDEF
@ -1664,7 +2160,7 @@ static PyType_Slot ucd_type_slots[] = {
{Py_tp_dealloc, ucd_dealloc},
{Py_tp_traverse, _PyObject_VisitType},
{Py_tp_getattro, PyObject_GenericGetAttr},
{Py_tp_methods, unicodedata_functions},
{Py_tp_methods, DB_methods},
{Py_tp_members, DB_members},
{0, 0}
};
@ -1677,6 +2173,7 @@ static PyType_Spec ucd_type_spec = {
.slots = ucd_type_slots
};
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
@ -1686,9 +2183,47 @@ this database is based on the UnicodeData.txt file version\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
/* GC traverse hook (m_traverse): report the heap types owned by the
   per-module state so the cycle collector can see them. */
static int
unicodedata_traverse(PyObject *module, visitproc visit, void *arg)
{
    unicodedatastate *st = get_unicodedata_state(module);
    /* Py_VISIT expands using the `visit`/`arg` parameters and returns
       early on a non-zero visitor result. */
    Py_VISIT(st->SegmentType);
    Py_VISIT(st->GraphemeBreakIteratorType);
    return 0;
}
/* GC clear hook (m_clear): drop the module state's strong references to
   its heap types, breaking any reference cycles through the module. */
static int
unicodedata_clear(PyObject *module)
{
    unicodedatastate *st = get_unicodedata_state(module);
    /* Py_CLEAR nulls the slot before decref'ing, so re-entrant clears
       are safe. */
    Py_CLEAR(st->SegmentType);
    Py_CLEAR(st->GraphemeBreakIteratorType);
    return 0;
}
/* Module free hook (m_free): final deallocation just reuses the clear
   logic to release the state's references. */
static void
unicodedata_free(void *module)
{
    unicodedata_clear((PyObject *)module);
}
static int
unicodedata_exec(PyObject *module)
{
unicodedatastate *state = get_unicodedata_state(module);
PyObject *SegmentType = PyType_FromModuleAndSpec(module, &Segment_spec, NULL);
if (SegmentType == NULL) {
return -1;
}
state->SegmentType = SegmentType;
PyObject *GraphemeBreakIteratorType = PyType_FromModuleAndSpec(module, &GraphemeBreakIterator_spec, NULL);
if (GraphemeBreakIteratorType == NULL) {
return -1;
}
state->GraphemeBreakIteratorType = GraphemeBreakIteratorType;
if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
return -1;
}
@ -1730,9 +2265,12 @@ static struct PyModuleDef unicodedata_module = {
PyModuleDef_HEAD_INIT,
.m_name = "unicodedata",
.m_doc = unicodedata_docstring,
.m_size = 0,
.m_size = sizeof(unicodedatastate),
.m_methods = unicodedata_functions,
.m_slots = unicodedata_slots,
.m_traverse = unicodedata_traverse,
.m_clear = unicodedata_clear,
.m_free = unicodedata_free,
};
PyMODINIT_FUNC

6231
Modules/unicodedata_db.h generated

File diff suppressed because it is too large Load diff

View file

@ -56,6 +56,8 @@
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"
GRAPHEME_CLUSTER_BREAK = "auxiliary/GraphemeBreakProperty%s.txt"
EMOJI_DATA = "emoji/emoji-data%s.txt"
# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
@ -77,6 +79,14 @@
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
"ON", "LRI", "RLI", "FSI", "PDI" ]
# "Other" needs to be the first entry, see the comment in makeunicodedata
GRAPHEME_CLUSTER_NAMES = [ 'Other', 'Prepend', 'CR', 'LF', 'Control',
'Extend', 'Regional_Indicator', 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT',
'ZWJ' ]
# "None" needs to be the first entry, see the comment in makeunicodedata
INDIC_CONJUNCT_BREAK_NAMES = [ 'None', 'Linker', 'Consonant', 'Extend' ]
# "N" needs to be the first entry, see the comment in makeunicodedata
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]
@ -147,7 +157,9 @@ def makeunicodedata(unicode, trace):
# EastAsianWidth.txt
# see https://unicode.org/reports/tr11/#Unassigned
assert EASTASIANWIDTH_NAMES[0] == "N"
dummy = (0, 0, 0, 0, 0, 0)
assert GRAPHEME_CLUSTER_NAMES[0] == "Other"
assert INDIC_CONJUNCT_BREAK_NAMES[0] == "None"
dummy = (0, 0, 0, 0, 0, 0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
@ -160,23 +172,25 @@ def makeunicodedata(unicode, trace):
for char in unicode.chars:
record = unicode.table[char]
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char] or 'N')
graphemebreak = GRAPHEME_CLUSTER_NAMES.index(unicode.grapheme_breaks[char] or 'Other')
extpict = unicode.ext_picts[char]
if record:
# extract database properties
category = CATEGORY_NAMES.index(record.general_category)
combining = int(record.canonical_combining_class)
bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
mirrored = record.bidi_mirrored == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
normalizationquickcheck = record.quick_check
incb = INDIC_CONJUNCT_BREAK_NAMES.index(record.incb)
item = (
category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck
normalizationquickcheck, graphemebreak, incb, extpict,
)
elif unicode.widths[char] is not None:
elif eastasianwidth or graphemebreak or extpict:
# an unassigned but reserved character, with a known
# east_asian_width
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
item = (0, 0, 0, 0, eastasianwidth, 0)
# east_asian_width or grapheme_break or ext_pict
item = (0, 0, 0, 0, eastasianwidth, 0, graphemebreak, 0, extpict)
else:
continue
@ -296,7 +310,7 @@ def makeunicodedata(unicode, trace):
fprint("/* a list of unique database records */")
fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
for item in table:
fprint(" {%d, %d, %d, %d, %d, %d}," % item)
fprint(" {%d, %d, %d, %d, %d, %d, %d, %d, %d}," % item)
fprint("};")
fprint()
@ -337,6 +351,24 @@ def makeunicodedata(unicode, trace):
fprint(" NULL")
fprint("};")
for i, name in enumerate(GRAPHEME_CLUSTER_NAMES):
fprint("#define GCB_%s %d" % (name, i))
fprint("const char * const _PyUnicode_GraphemeBreakNames[] = {")
for name in GRAPHEME_CLUSTER_NAMES:
fprint(' "%s",' % name)
fprint(" NULL")
fprint("};")
for i, name in enumerate(INDIC_CONJUNCT_BREAK_NAMES):
fprint("#define InCB_%s %d" % (name, i))
fprint("const char * const _PyUnicode_IndicConjunctBreakNames[] = {")
for name in INDIC_CONJUNCT_BREAK_NAMES:
fprint(' "%s",' % name)
fprint(" NULL")
fprint("};")
fprint("static const char *decomp_prefix[] = {")
for name in decomp_prefix:
fprint(" \"%s\"," % name)
@ -783,6 +815,10 @@ def merge_old_version(version, new, old):
# normalization quickchecks are not performed
# for older versions
pass
elif k == 18:
# The Indic_Conjunct_Break property did not exist for
# older versions
pass
else:
class Difference(Exception):pass
raise Difference(hex(i), k, old.table[i], new.table[i])
@ -804,7 +840,7 @@ def open_data(template, version):
url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
else:
url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(os.path.dirname(local), exist_ok=True)
urllib.request.urlretrieve(url, filename=local)
if local.endswith('.txt'):
return open(local, encoding='utf-8')
@ -892,9 +928,13 @@ class UcdRecord:
# We store them as a bitmask.
quick_check: int
# The Indic_Conjunct_Break property from DerivedCoreProperties.txt. See:
# https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
incb: str
def from_row(row: List[str]) -> UcdRecord:
    """Build a UcdRecord from a UnicodeData.txt row.

    Fields not present in UnicodeData.txt get their defaults:
    east_asian_width=None, binary_properties=empty set, quick_check=0,
    and incb="None" (the Indic_Conjunct_Break default per
    DerivedCoreProperties.txt).
    """
    # Bug fix: the diff rendering left the pre-change 4-default return
    # line above the updated one, so the old line executed first and the
    # new "incb" default was never passed. Keep only the updated call.
    return UcdRecord(*row, None, set(), 0, "None")
# --------------------------------------------------------------------
@ -992,14 +1032,14 @@ def __init__(self, version, cjk_check=True):
self.widths = widths
for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if propinfo:
# this is not a binary property, ignore it
continue
if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char].binary_properties.add(propname)
if not propinfo:
# binary property
if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char].binary_properties.add(propname)
elif propname == 'InCB': # Indic_Conjunct_Break
table[char].incb, = propinfo
for char_range, value in UcdFile(LINE_BREAK, version):
if value not in MANDATORY_LINE_BREAKS:
@ -1068,6 +1108,19 @@ def __init__(self, version, cjk_check=True):
c = int(data[0], 16)
cf[c] = [int(char, 16) for char in data[2].split()]
if version != "3.2.0":
grapheme_breaks = [None] * 0x110000
for char, (prop,) in UcdFile(GRAPHEME_CLUSTER_BREAK, version).expanded():
grapheme_breaks[char] = prop
self.grapheme_breaks = grapheme_breaks
ext_picts = [False] * 0x110000
for char, (prop,) in UcdFile(EMOJI_DATA, version).expanded():
if prop == 'Extended_Pictographic':
ext_picts[char] = True
self.ext_picts = ext_picts
def uselatin1(self):
    """Restrict processing to the ISO Latin-1 range (code points 0..255)."""
    self.chars = [*range(256)]