gh-74902: Add Unicode Grapheme Cluster Break algorithm (GH-143076)

Add the unicodedata.iter_graphemes() function to iterate over grapheme
clusters according to rules defined in Unicode Standard Annex #29.

Add unicodedata.grapheme_cluster_break(), unicodedata.indic_conjunct_break()
and unicodedata.extended_pictographic() functions to get the properties
of the character which are related to the above algorithm.

Co-authored-by: Guillaume "Vermeille" Sanchez <guillaume.v.sanchez@gmail.com>
This commit is contained in:
Serhiy Storchaka 2026-01-14 16:37:57 +02:00 committed by GitHub
parent 0e0d51cdce
commit bab1d7a561
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 4350 additions and 3039 deletions

View file

@ -184,6 +184,28 @@ following functions:
'0041 0303'
.. function:: grapheme_cluster_break(chr, /)
Returns the Grapheme_Cluster_Break property assigned to the character.
.. versionadded:: next
.. function:: indic_conjunct_break(chr, /)
Returns the Indic_Conjunct_Break property assigned to the character.
.. versionadded:: next
.. function:: extended_pictographic(chr, /)
Returns ``True`` if the character has the Extended_Pictographic property,
``False`` otherwise.
.. versionadded:: next
.. function:: normalize(form, unistr, /)
Return the normal form *form* for the Unicode string *unistr*. Valid values for
@ -225,6 +247,24 @@ following functions:
.. versionadded:: 3.8
.. function:: iter_graphemes(unistr, start=0, end=sys.maxsize, /)
Returns an iterator to iterate over grapheme clusters.
With optional *start*, iteration begins at that position.
With optional *end*, iteration stops at that position.
Converting an emitted item to a string returns the substring corresponding to
the grapheme cluster.
Its ``start`` and ``end`` attributes denote the start and end of
the grapheme cluster.
It uses extended grapheme cluster rules defined by Unicode
Standard Annex #29, `"Unicode Text Segmentation"
<https://www.unicode.org/reports/tr29/>`_.
.. versionadded:: next
In addition, the module exposes the following constant:
.. data:: unidata_version
@ -234,7 +274,7 @@ In addition, the module exposes the following constant:
.. data:: ucd_3_2_0
This is an object that has the same methods as the entire module, but uses the
This is an object that has most of the methods of the entire module, but uses the
Unicode database version 3.2 instead, for applications that require this
specific version of the Unicode database (such as IDNA).

View file

@ -811,6 +811,16 @@ unicodedata
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
(Contributed by Stan Ulbrych in :gh:`129117`.)
* Add the :func:`~unicodedata.iter_graphemes`
function to iterate over grapheme clusters according to rules defined in
`Unicode Standard Annex #29, "Unicode Text Segmentation"
<https://www.unicode.org/reports/tr29/>`_.
Add :func:`~unicodedata.grapheme_cluster_break`,
:func:`~unicodedata.indic_conjunct_break` and
:func:`~unicodedata.extended_pictographic` functions to query the character
properties used by this algorithm.
(Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)
unittest
--------

View file

@ -616,6 +616,221 @@ def test_isxidcontinue(self):
self.assertRaises(TypeError, self.db.isxidcontinue)
self.assertRaises(TypeError, self.db.isxidcontinue, 'xx')
def test_grapheme_cluster_break(self):
    """Spot-check Grapheme_Cluster_Break property values.

    Cases are grouped by the Unicode version in which the character
    first received the given property value.
    """
    gcb = self.db.grapheme_cluster_break
    cases = [
        (' ', 'Other'),
        ('x', 'Other'),
        ('\U0010FFFF', 'Other'),
        ('\r', 'CR'),
        ('\n', 'LF'),
        ('\0', 'Control'),
        ('\t', 'Control'),
        ('\x1F', 'Control'),
        ('\x7F', 'Control'),
        ('\x9F', 'Control'),
        ('\U000E0001', 'Control'),
        ('\u0300', 'Extend'),
        ('\u200C', 'Extend'),
        ('\U000E01EF', 'Extend'),
        ('\u1159', 'L'),
        ('\u11F9', 'T'),
        ('\uD788', 'LV'),
        ('\uD7A3', 'LVT'),
        # New in 5.0.0
        ('\u05BA', 'Extend'),
        ('\u20EF', 'Extend'),
        # New in 5.1.0
        ('\u2064', 'Control'),
        ('\uAA4D', 'SpacingMark'),
        # New in 5.2.0
        ('\u0816', 'Extend'),
        ('\uA97C', 'L'),
        ('\uD7C6', 'V'),
        ('\uD7FB', 'T'),
        # New in 6.0.0
        ('\u093A', 'Extend'),
        ('\U00011002', 'SpacingMark'),
        # New in 6.1.0
        ('\U000E0FFF', 'Control'),
        ('\U00016F7E', 'SpacingMark'),
        # New in 6.2.0
        ('\U0001F1E6', 'Regional_Indicator'),
        ('\U0001F1FF', 'Regional_Indicator'),
        # New in 6.3.0
        ('\u180E', 'Control'),
        ('\u1A1B', 'Extend'),
        # New in 7.0.0
        ('\u0E33', 'SpacingMark'),
        ('\u0EB3', 'SpacingMark'),
        ('\U0001BCA3', 'Control'),
        ('\U0001E8D6', 'Extend'),
        ('\U0001163E', 'SpacingMark'),
        # New in 8.0.0
        ('\u08E3', 'Extend'),
        ('\U00011726', 'SpacingMark'),
        # New in 9.0.0
        ('\u0600', 'Prepend'),
        ('\U000E007F', 'Extend'),
        ('\U00011CB4', 'SpacingMark'),
        ('\u200D', 'ZWJ'),
        # New in 10.0.0
        ('\U00011D46', 'Prepend'),
        ('\U00011D47', 'Extend'),
        ('\U00011A97', 'SpacingMark'),
        # New in 11.0.0
        ('\U000110CD', 'Prepend'),
        ('\u07FD', 'Extend'),
        ('\U00011EF6', 'SpacingMark'),
        # New in 12.0.0
        ('\U00011A84', 'Prepend'),
        ('\U00013438', 'Control'),
        ('\U0001E2EF', 'Extend'),
        ('\U00016F87', 'SpacingMark'),
        # New in 13.0.0
        ('\U00011941', 'Prepend'),
        ('\U00016FE4', 'Extend'),
        ('\U00011942', 'SpacingMark'),
        # New in 14.0.0
        ('\u0891', 'Prepend'),
        ('\U0001E2AE', 'Extend'),
        # New in 15.0.0
        ('\U00011F02', 'Prepend'),
        ('\U0001343F', 'Control'),
        ('\U0001E4EF', 'Extend'),
        ('\U00011F3F', 'SpacingMark'),
        # New in 16.0.0
        ('\U000113D1', 'Prepend'),
        ('\U0001E5EF', 'Extend'),
        ('\U0001612C', 'SpacingMark'),
        ('\U00016D63', 'V'),
        # New in 17.0.0
        ('\u1AEB', 'Extend'),
        ('\U00011B67', 'SpacingMark'),
    ]
    for char, expected in cases:
        self.assertEqual(gcb(char), expected)
    # The argument must be exactly one str character.
    self.assertRaises(TypeError, gcb)
    self.assertRaises(TypeError, gcb, b'x')
    self.assertRaises(TypeError, gcb, 120)
    self.assertRaises(TypeError, gcb, '')
    self.assertRaises(TypeError, gcb, 'xx')
def test_indic_conjunct_break(self):
    """Spot-check Indic_Conjunct_Break property values.

    Cases are grouped by the Unicode version in which the character
    first received the given property value.
    """
    incb = self.db.indic_conjunct_break
    cases = [
        (' ', 'None'),
        ('x', 'None'),
        ('\U0010FFFF', 'None'),
        # New in 15.1.0
        ('\u094D', 'Linker'),
        ('\u0D4D', 'Linker'),
        ('\u0915', 'Consonant'),
        ('\u0D3A', 'Consonant'),
        ('\u0300', 'Extend'),
        ('\U0001E94A', 'Extend'),
        # New in 16.0.0
        ('\u034F', 'Extend'),
        ('\U000E01EF', 'Extend'),
        # New in 17.0.0
        ('\u1039', 'Linker'),
        ('\U00011F42', 'Linker'),
        ('\u1000', 'Consonant'),
        ('\U00011F33', 'Consonant'),
        ('\U0001E6F5', 'Extend'),
    ]
    for char, expected in cases:
        self.assertEqual(incb(char), expected)
    # The argument must be exactly one str character.
    self.assertRaises(TypeError, incb)
    self.assertRaises(TypeError, incb, b'x')
    self.assertRaises(TypeError, incb, 120)
    self.assertRaises(TypeError, incb, '')
    self.assertRaises(TypeError, incb, 'xx')
def test_extended_pictographic(self):
    """Spot-check the boolean Extended_Pictographic property."""
    ext_pict = self.db.extended_pictographic
    cases = [
        (' ', False),
        ('x', False),
        ('\U0010FFFF', False),
        # New in 13.0.0
        ('\xA9', True),
        ('\u203C', True),
        ('\U0001FAD6', True),
        ('\U0001FFFD', True),
        # New in 17.0.0
        ('\u2388', False),
        ('\U0001FA6D', False),
    ]
    for char, expected in cases:
        self.assertIs(ext_pict(char), expected)
    # The argument must be exactly one str character.
    self.assertRaises(TypeError, ext_pict)
    self.assertRaises(TypeError, ext_pict, b'x')
    self.assertRaises(TypeError, ext_pict, 120)
    self.assertRaises(TypeError, ext_pict, '')
    self.assertRaises(TypeError, ext_pict, 'xx')
def test_grapheme_break(self):
    """Check iter_graphemes() slicing behavior and the UAX #29 GB rules."""
    def clusters(*args):
        return [str(segment) for segment in self.db.iter_graphemes(*args)]

    # Argument errors: a str is required, at most three positional args.
    self.assertRaises(TypeError, self.db.iter_graphemes)
    self.assertRaises(TypeError, self.db.iter_graphemes, b'x')
    self.assertRaises(TypeError, self.db.iter_graphemes, 'x', 0, 0, 0)

    # start/end follow slice semantics (negative values, clamping).
    slicing_cases = [
        (('',), []),
        (('abcd',), ['a', 'b', 'c', 'd']),
        (('abcd', 1), ['b', 'c', 'd']),
        (('abcd', 1, 3), ['b', 'c']),
        (('abcd', -3), ['b', 'c', 'd']),
        (('abcd', 1, -1), ['b', 'c']),
        (('abcd', 3, 1), []),
        (('abcd', 5), []),
        (('abcd', 0, 5), ['a', 'b', 'c', 'd']),
        (('abcd', -5), ['a', 'b', 'c', 'd']),
        (('abcd', 0, -5), []),
    ]
    for args, expected in slicing_cases:
        self.assertEqual(clusters(*args), expected)

    # Samples for the individual break rules of UAX #29.
    rule_cases = [
        ('GB3', '\r\n', ['\r\n']),
        ('GB4', '\r\u0308', ['\r', '\u0308']),
        ('GB4', '\n\u0308', ['\n', '\u0308']),
        ('GB4', '\0\u0308', ['\0', '\u0308']),
        ('GB5', '\u06dd\r', ['\u06dd', '\r']),
        ('GB5', '\u06dd\n', ['\u06dd', '\n']),
        ('GB5', '\u06dd\0', ['\u06dd', '\0']),
        ('GB6', '\u1100\u1160', ['\u1100\u1160']),
        ('GB6', '\u1100\uAC00', ['\u1100\uAC00']),
        ('GB6', '\u1100\uAC01', ['\u1100\uAC01']),
        ('GB7', '\uAC00\u1160', ['\uAC00\u1160']),
        ('GB7', '\uAC00\u11A8', ['\uAC00\u11A8']),
        ('GB7', '\u1160\u1160', ['\u1160\u1160']),
        ('GB7', '\u1160\u11A8', ['\u1160\u11A8']),
        ('GB8', '\uAC01\u11A8', ['\uAC01\u11A8']),
        ('GB8', '\u11A8\u11A8', ['\u11A8\u11A8']),
        ('GB9', 'a\u0300', ['a\u0300']),
        ('GB9', 'a\u200D', ['a\u200D']),
        ('GB9a', '\u0905\u0903', ['\u0905\u0903']),
        ('GB9b', '\u06dd\u0661', ['\u06dd\u0661']),
        ('GB9c', '\u0915\u094d\u0924', ['\u0915\u094d\u0924']),
        ('GB9c', '\u0915\u094D\u094D\u0924', ['\u0915\u094D\u094D\u0924']),
        ('GB9c', '\u0915\u094D\u0924\u094D\u092F',
         ['\u0915\u094D\u0924\u094D\u092F']),
        ('GB11',
         '\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
         '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC',
         ['\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
          '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC']),
        ('GB12', '\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3',
         ['\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3']),
        ('GB13', 'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3',
         ['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3']),
    ]
    for rule, text, expected in rule_cases:
        self.assertEqual(clusters(text), expected, rule)
class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
db = unicodedata.ucd_3_2_0
@ -624,6 +839,11 @@ class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
if quicktest else
'f217b8688d7bdff31db4207e078a96702f091597')
test_grapheme_cluster_break = None
test_indic_conjunct_break = None
test_extended_pictographic = None
test_grapheme_break = None
class UnicodeMiscTest(unittest.TestCase):
db = unicodedata
@ -726,6 +946,17 @@ def test_linebreak_7643(self):
self.assertEqual(len(lines), 1,
r"%a should not be a linebreak" % c)
def test_segment_object(self):
    """Segment items report their bounds and stringify to the substring."""
    segments = list(unicodedata.iter_graphemes('spa\u0300m'))
    self.assertEqual(len(segments), 4, segments)
    accented = segments[2]          # 'a' followed by a combining grave
    self.assertEqual(accented.start, 2)
    self.assertEqual(accented.end, 4)
    self.assertEqual(str(accented), 'a\u0300')
    self.assertEqual(repr(accented), '<Segment 2:4>')
    # Segments are positions into the string, not sequences themselves.
    self.assertRaises(TypeError, iter, accented)
    self.assertRaises(TypeError, len, accented)
class NormalizationTest(unittest.TestCase):
@staticmethod
@ -848,5 +1079,61 @@ class MyStr(str):
self.assertIs(type(normalize(form, MyStr(input_str))), str)
class GraphemeBreakTest(unittest.TestCase):
@staticmethod
def check_version(testfile):
    """Return True if the file's first line mentions the running UCD version."""
    first_line = testfile.readline()
    return unicodedata.unidata_version in first_line
@requires_resource('network')
def test_grapheme_break(self):
    """Validate iter_graphemes() against the official GraphemeBreakTest data."""
    testdatafile = "auxiliary/GraphemeBreakTest.txt"
    testdataurl = (f"https://www.unicode.org/Public/"
                   f"{unicodedata.unidata_version}/ucd/{testdatafile}")
    # Hit the exception early
    try:
        testdata = open_urlresource(testdataurl, encoding="utf-8",
                                    check=self.check_version)
    except PermissionError:
        self.skipTest(f"Permission error when downloading {testdataurl} "
                      f"into the test data directory")
    except (OSError, HTTPException) as exc:
        self.skipTest(f"Failed to download {testdataurl}: {exc}")
    with testdata:
        self.run_grapheme_break_tests(testdata)
def run_grapheme_break_tests(self, testdata):
    """Run every test case in a GraphemeBreakTest.txt-format file.

    Each non-comment line spells out one string together with its
    expected cluster boundaries: code points appear as hex fields,
    '÷' marks a break and '×' a non-break between them.
    """
    for line in testdata:
        # Split off the trailing '#' comment, if any.
        line, _, comment = line.partition('#')
        line = line.strip()
        if not line:
            continue
        comment = comment.strip()
        chunks = []    # expected grapheme clusters
        breaks = []    # expected break positions, in code points
        pos = 0
        # '×' joins code points inside a cluster, so treat it as plain
        # whitespace; each '÷' starts a new expected cluster.
        for field in line.replace('×', ' ').split():
            if field == '÷':
                chunks.append('')
                breaks.append(pos)
            else:
                chunks[-1] += chr(int(field, 16))
                pos += 1
        # Every test line ends with '÷'; drop the resulting empty chunk.
        self.assertEqual(chunks.pop(), '', line)
        input = ''.join(chunks)
        with self.subTest(line):
            result = list(unicodedata.iter_graphemes(input))
            self.assertEqual(list(map(str, result)), chunks, comment)
            self.assertEqual([x.start for x in result], breaks[:-1], comment)
            self.assertEqual([x.end for x in result], breaks[1:], comment)
            # Starting the iteration at any interior break position must
            # yield exactly the remaining clusters.
            for i in range(1, len(breaks) - 1):
                result = list(unicodedata.iter_graphemes(input, breaks[i]))
                self.assertEqual(list(map(str, result)), chunks[i:], comment)
                self.assertEqual([x.start for x in result], breaks[i:-1], comment)
                self.assertEqual([x.end for x in result], breaks[i+1:], comment)
# Allow running this test file directly.
if __name__ == "__main__":
    unittest.main()

View file

@ -1664,6 +1664,7 @@ Victor Salgado
Rich Salz
Kevin Samborn
Adrian Sampson
Guillaume Sanchez
Nevada Sanchez
James Sanders
Ilya Sandler

View file

@ -0,0 +1,8 @@
Add the :func:`~unicodedata.iter_graphemes` function in the
:mod:`unicodedata` module to iterate over grapheme clusters according to
rules defined in `Unicode Standard Annex #29, "Unicode Text Segmentation"
<https://www.unicode.org/reports/tr29/>`_. Add
:func:`~unicodedata.grapheme_cluster_break`,
:func:`~unicodedata.indic_conjunct_break` and
:func:`~unicodedata.extended_pictographic` functions to query the character
properties used by this algorithm.

View file

@ -2,6 +2,7 @@
preserve
[clinic start generated code]*/
#include "pycore_abstract.h" // _PyNumber_Index()
#include "pycore_modsupport.h" // _PyArg_CheckPositional()
PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
@ -621,4 +622,180 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
exit:
return return_value;
}
/*[clinic end generated code: output=c5e56c8f6bb80f93 input=a9049054013a1b77]*/
PyDoc_STRVAR(unicodedata_iter_graphemes__doc__,
"iter_graphemes($module, unistr, start=0, end=sys.maxsize, /)\n"
"--\n"
"\n"
"Returns an iterator to iterate over grapheme clusters.\n"
"\n"
"It uses extended grapheme cluster rules from TR29.");
#define UNICODEDATA_ITER_GRAPHEMES_METHODDEF \
{"iter_graphemes", _PyCFunction_CAST(unicodedata_iter_graphemes), METH_FASTCALL, unicodedata_iter_graphemes__doc__},
static PyObject *
unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
Py_ssize_t start, Py_ssize_t end);
static PyObject *
unicodedata_iter_graphemes(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
{
PyObject *return_value = NULL;
PyObject *unistr;
Py_ssize_t start = 0;
Py_ssize_t end = PY_SSIZE_T_MAX;
if (!_PyArg_CheckPositional("iter_graphemes", nargs, 1, 3)) {
goto exit;
}
if (!PyUnicode_Check(args[0])) {
_PyArg_BadArgument("iter_graphemes", "argument 1", "str", args[0]);
goto exit;
}
unistr = args[0];
if (nargs < 2) {
goto skip_optional;
}
{
Py_ssize_t ival = -1;
PyObject *iobj = _PyNumber_Index(args[1]);
if (iobj != NULL) {
ival = PyLong_AsSsize_t(iobj);
Py_DECREF(iobj);
}
if (ival == -1 && PyErr_Occurred()) {
goto exit;
}
start = ival;
}
if (nargs < 3) {
goto skip_optional;
}
{
Py_ssize_t ival = -1;
PyObject *iobj = _PyNumber_Index(args[2]);
if (iobj != NULL) {
ival = PyLong_AsSsize_t(iobj);
Py_DECREF(iobj);
}
if (ival == -1 && PyErr_Occurred()) {
goto exit;
}
end = ival;
}
skip_optional:
return_value = unicodedata_iter_graphemes_impl(module, unistr, start, end);
exit:
return return_value;
}
PyDoc_STRVAR(unicodedata_grapheme_cluster_break__doc__,
"grapheme_cluster_break($module, chr, /)\n"
"--\n"
"\n"
"Returns the Grapheme_Cluster_Break property assigned to the character.");
#define UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF \
{"grapheme_cluster_break", (PyCFunction)unicodedata_grapheme_cluster_break, METH_O, unicodedata_grapheme_cluster_break__doc__},
static PyObject *
unicodedata_grapheme_cluster_break_impl(PyObject *module, int chr);
static PyObject *
unicodedata_grapheme_cluster_break(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int chr;
if (!PyUnicode_Check(arg)) {
_PyArg_BadArgument("grapheme_cluster_break", "argument", "a unicode character", arg);
goto exit;
}
if (PyUnicode_GET_LENGTH(arg) != 1) {
PyErr_Format(PyExc_TypeError,
"grapheme_cluster_break(): argument must be a unicode character, "
"not a string of length %zd",
PyUnicode_GET_LENGTH(arg));
goto exit;
}
chr = PyUnicode_READ_CHAR(arg, 0);
return_value = unicodedata_grapheme_cluster_break_impl(module, chr);
exit:
return return_value;
}
PyDoc_STRVAR(unicodedata_indic_conjunct_break__doc__,
"indic_conjunct_break($module, chr, /)\n"
"--\n"
"\n"
"Returns the Indic_Conjunct_Break property assigned to the character.");
#define UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF \
{"indic_conjunct_break", (PyCFunction)unicodedata_indic_conjunct_break, METH_O, unicodedata_indic_conjunct_break__doc__},
static PyObject *
unicodedata_indic_conjunct_break_impl(PyObject *module, int chr);
static PyObject *
unicodedata_indic_conjunct_break(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int chr;
if (!PyUnicode_Check(arg)) {
_PyArg_BadArgument("indic_conjunct_break", "argument", "a unicode character", arg);
goto exit;
}
if (PyUnicode_GET_LENGTH(arg) != 1) {
PyErr_Format(PyExc_TypeError,
"indic_conjunct_break(): argument must be a unicode character, "
"not a string of length %zd",
PyUnicode_GET_LENGTH(arg));
goto exit;
}
chr = PyUnicode_READ_CHAR(arg, 0);
return_value = unicodedata_indic_conjunct_break_impl(module, chr);
exit:
return return_value;
}
PyDoc_STRVAR(unicodedata_extended_pictographic__doc__,
"extended_pictographic($module, chr, /)\n"
"--\n"
"\n"
"Returns the Extended_Pictographic property assigned to the character, as boolean.");
#define UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF \
{"extended_pictographic", (PyCFunction)unicodedata_extended_pictographic, METH_O, unicodedata_extended_pictographic__doc__},
static PyObject *
unicodedata_extended_pictographic_impl(PyObject *module, int chr);
static PyObject *
unicodedata_extended_pictographic(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int chr;
if (!PyUnicode_Check(arg)) {
_PyArg_BadArgument("extended_pictographic", "argument", "a unicode character", arg);
goto exit;
}
if (PyUnicode_GET_LENGTH(arg) != 1) {
PyErr_Format(PyExc_TypeError,
"extended_pictographic(): argument must be a unicode character, "
"not a string of length %zd",
PyUnicode_GET_LENGTH(arg));
goto exit;
}
chr = PyUnicode_READ_CHAR(arg, 0);
return_value = unicodedata_extended_pictographic_impl(module, chr);
exit:
return return_value;
}
/*[clinic end generated code: output=6991246310e3f2aa input=a9049054013a1b77]*/

View file

@ -24,6 +24,26 @@
#include <stdbool.h>
#include <stddef.h> // offsetof()
/* helper macro to fixup start/end slice values */
/* Clamp start/end to [0, len], counting negative values from the end of
   the string — the same semantics as Python slicing.  Evaluates its
   arguments multiple times, so pass plain variables only. */
#define ADJUST_INDICES(start, end, len) \
    do { \
        if (end > len) { \
            end = len; \
        } \
        else if (end < 0) { \
            end += len; \
            if (end < 0) { \
                end = 0; \
            } \
        } \
        if (start < 0) { \
            start += len; \
            if (start < 0) { \
                start = 0; \
            } \
        } \
    } while (0)
/*[clinic input]
module unicodedata
class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
@ -42,6 +62,11 @@ typedef struct {
const unsigned char east_asian_width; /* index into
_PyUnicode_EastAsianWidth */
const unsigned char normalization_quick_check; /* see is_normalized() */
const unsigned char grapheme_cluster_break; /* index into
_PyUnicode_GraphemeBreakNames */
const unsigned char incb; /* index into
_PyUnicode_IndicConjunctBreakNames */
const unsigned char ext_pict; /* true if Extended_Pictographic */
} _PyUnicode_DatabaseRecord;
typedef struct change_record {
@ -71,6 +96,19 @@ _getrecord_ex(Py_UCS4 code)
return &_PyUnicode_Database_Records[index];
}
/* Per-module state: the two heap types used by iter_graphemes(). */
typedef struct {
    PyObject *SegmentType;                /* unicodedata.Segment */
    PyObject *GraphemeBreakIteratorType;  /* unicodedata.GraphemeBreakIterator */
} unicodedatastate;

/* Fetch the module state; the module always allocates one. */
static inline unicodedatastate *
get_unicodedata_state(PyObject *module)
{
    void *state = _PyModule_GetState(module);
    assert(state != NULL);
    return (unicodedatastate *)state;
}
/* ------------- Previous-version API ------------------------------------- */
typedef struct previous_version {
PyObject_HEAD
@ -1628,11 +1666,469 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
return PyUnicode_FromOrdinal(code);
}
/* Grapheme Cluster Break algorithm */
/* State machine for rule GB11 (emoji zero-width-joiner sequences). */
enum ExtPictState {
    ExtPictState_Init,
    // \p{Extended_Pictographic} Extend*
    ExtPictState_Started,
    // ... ZWJ
    ExtPictState_ZWJ,
    // ... \p{Extended_Pictographic}
    ExtPictState_Matched,
};

/* State machine for rule GB9c (Indic conjunct clusters). */
enum InCBState {
    InCBState_Init,
    // \p{InCB=Consonant} \p{InCB=Extend}*
    InCBState_Started,
    // ... \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]*
    InCBState_Linker,
    // ... \p{InCB=Consonant}
    InCBState_Matched,
};

/* Incremental grapheme-cluster scanner state over str[start:end). */
typedef struct {
    PyObject *str;              /* string being scanned */
    Py_ssize_t start;           /* start of the current cluster */
    Py_ssize_t pos;             /* next position to examine */
    Py_ssize_t end;             /* scanning stops at this position */
    int gcb;                    /* Grapheme_Cluster_Break of previous char */
    enum ExtPictState ep_state; /* GB11 context */
    enum InCBState incb_state;  /* GB9c context */
    bool ri_flag;               /* GB12/GB13: odd run of Regional_Indicator */
} _PyGraphemeBreak;
/* Advance the GB11 state machine by one character, given its
   Grapheme_Cluster_Break property and Extended_Pictographic flag. */
static inline enum ExtPictState
update_ext_pict_state(enum ExtPictState state, int gcb, bool ext_pict)
{
    if (ext_pict) {
        /* An Extended_Pictographic directly after ZWJ completes the
           sequence; otherwise it (re)starts one. */
        return (state == ExtPictState_ZWJ) ? ExtPictState_Matched : ExtPictState_Started;
    }
    if (state == ExtPictState_Started || state == ExtPictState_Matched) {
        if (gcb == GCB_Extend) {
            return ExtPictState_Started;
        }
        if (gcb == GCB_ZWJ) {
            return ExtPictState_ZWJ;
        }
    }
    return ExtPictState_Init;
}
/* Advance the GB9c state machine by one character, given its
   Indic_Conjunct_Break property. */
static inline enum InCBState
update_incb_state(enum InCBState state, int incb)
{
    if (incb == InCB_Consonant) {
        /* A consonant right after a linker completes the conjunct. */
        return (state == InCBState_Linker) ? InCBState_Matched : InCBState_Started;
    }
    if (state != InCBState_Init) {
        if (incb == InCB_Extend) {
            /* Extend preserves a pending linker, otherwise just keeps
               the sequence going. */
            return (state == InCBState_Linker) ? InCBState_Linker : InCBState_Started;
        }
        if (incb == InCB_Linker) {
            return InCBState_Linker;
        }
    }
    return InCBState_Init;
}
/* Track Regional_Indicator pairing for rules GB12/GB13: the flag is true
   exactly when an odd number of consecutive RI characters has been seen;
   any non-RI character resets the run. */
static inline bool
update_ri_flag(bool flag, int gcb)
{
    return (gcb == GCB_Regional_Indicator) ? !flag : false;
}
/* Report whether a grapheme cluster boundary exists between the previous
   character (Grapheme_Cluster_Break property prev_gcb) and the current
   one (curr_gcb), following rules GB3..GB999 of UAX #29.  ep_state,
   ri_flag and incb_state describe the text up to and including the
   current character and feed rules GB11, GB12/GB13 and GB9c.
   Returns true when the text must break before the current character. */
static inline bool
grapheme_break(int prev_gcb, int curr_gcb, enum ExtPictState ep_state,
               bool ri_flag, enum InCBState incb_state)
{
    /* GB3: keep CR LF together. */
    if (prev_gcb == GCB_CR && curr_gcb == GCB_LF) {
        return false;
    }
    /* GB4: break after controls. */
    if (prev_gcb == GCB_CR ||
        prev_gcb == GCB_LF ||
        prev_gcb == GCB_Control)
    {
        return true;
    }
    /* GB5: break before controls. */
    if (curr_gcb == GCB_CR ||
        curr_gcb == GCB_LF ||
        curr_gcb == GCB_Control)
    {
        return true;
    }
    /* GB6: keep Hangul L with a following L, V, LV or LVT. */
    if (prev_gcb == GCB_L &&
        (curr_gcb == GCB_L ||
         curr_gcb == GCB_V ||
         curr_gcb == GCB_LV ||
         curr_gcb == GCB_LVT))
    {
        return false;
    }
    /* GB7: keep Hangul LV/V with a following V or T. */
    if ((prev_gcb == GCB_LV || prev_gcb == GCB_V) &&
        (curr_gcb == GCB_V || curr_gcb == GCB_T))
    {
        return false;
    }
    /* GB8: keep Hangul LVT/T with a following T. */
    if ((prev_gcb == GCB_LVT || prev_gcb == GCB_T) &&
        curr_gcb == GCB_T)
    {
        return false;
    }
    /* GB9: never break before Extend or ZWJ. */
    if (curr_gcb == GCB_Extend || curr_gcb == GCB_ZWJ) {
        return false;
    }
    /* GB9a: never break before SpacingMark. */
    if (curr_gcb == GCB_SpacingMark) {
        return false;
    }
    /* GB9b: never break after Prepend. */
    if (prev_gcb == GCB_Prepend) {
        return false;
    }
    /* GB9c: keep Indic conjunct sequences together. */
    if (incb_state == InCBState_Matched) {
        return false;
    }
    /* GB11: keep emoji ZWJ sequences together. */
    if (ep_state == ExtPictState_Matched) {
        return false;
    }
    /* GB12 and GB13: break only between pairs of Regional_Indicator;
       ri_flag is true when the current RI starts a new pair. */
    if (prev_gcb == GCB_Regional_Indicator && curr_gcb == prev_gcb) {
        return ri_flag;
    }
    /* GB999: break everywhere else. */
    return true;
}
/* Prepare *iter to scan str[start:end).  The caller must keep a
   reference to str alive for the whole iteration. */
static void
_Py_InitGraphemeBreak(_PyGraphemeBreak *iter, PyObject *str,
                      Py_ssize_t start, Py_ssize_t end)
{
    iter->str = str;
    iter->start = iter->pos = start;
    iter->end = end;
    iter->gcb = 0;              /* no previous character seen yet */
    iter->ep_state = ExtPictState_Init;
    iter->ri_flag = false;
    iter->incb_state = InCBState_Init;
}
/* Return the end position of the next grapheme cluster, or -1 when the
   iteration is exhausted.  On a successful return, iter->start has been
   advanced to the beginning of the *following* cluster, so the cluster
   just found spans [old start, returned position). */
static Py_ssize_t
_Py_NextGraphemeBreak(_PyGraphemeBreak *iter)
{
    if (iter->start >= iter->end) {
        return -1;
    }
    int kind = PyUnicode_KIND(iter->str);
    void *pstr = PyUnicode_DATA(iter->str);
    while (iter->pos < iter->end) {
        Py_UCS4 chr = PyUnicode_READ(kind, pstr, iter->pos);
        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(chr);
        int gcb = record->grapheme_cluster_break;
        /* Feed the current character into the rule-context state
           machines before testing for a break in front of it. */
        iter->ep_state = update_ext_pict_state(iter->ep_state, gcb, record->ext_pict);
        iter->ri_flag = update_ri_flag(iter->ri_flag, gcb);
        iter->incb_state = update_incb_state(iter->incb_state, record->incb);
        int prev_gcb = iter->gcb;
        iter->gcb = gcb;
        /* Never break in front of the first character of the slice
           (sot/eot rules GB1/GB2 fall out of the loop bounds). */
        if (iter->pos != iter->start &&
            grapheme_break(prev_gcb, gcb, iter->ep_state, iter->ri_flag,
                           iter->incb_state))
        {
            iter->start = iter->pos;
            /* Return the boundary; resume scanning after it next call
               (the current character's state is already consumed). */
            return iter->pos++;
        }
        ++iter->pos;
    }
    /* The final cluster runs to the end of the slice. */
    iter->start = iter->pos;
    return iter->pos;
}
/* Text Segment object */

/* A lightweight item yielded by the grapheme iterator: it records the
   [start, end) code point range of one cluster in `string`; str()
   materializes the corresponding substring. */
typedef struct {
    PyObject_HEAD
    PyObject *string;       /* scanned string (owned reference) */
    Py_ssize_t start;       /* first code point of the cluster */
    Py_ssize_t end;         /* one past the last code point */
} SegmentObject;

static void
Segment_dealloc(PyObject *self)
{
    /* Instances of heap types own a strong reference to their type; it
       must be released after the instance memory is freed, otherwise
       every Segment leaks a reference to the type (and its module). */
    PyTypeObject *tp = Py_TYPE(self);
    PyObject_GC_UnTrack(self);
    /* Py_XDECREF: tp_clear may already have dropped the string. */
    Py_XDECREF(((SegmentObject *)self)->string);
    tp->tp_free(self);
    Py_DECREF(tp);
}

static int
Segment_traverse(PyObject *self, visitproc visit, void *arg)
{
    Py_VISIT(((SegmentObject *)self)->string);
    return 0;
}

static int
Segment_clear(PyObject *self)
{
    Py_CLEAR(((SegmentObject *)self)->string);
    return 0;
}

/* str(segment) -> the substring covered by the segment. */
static PyObject *
Segment_str(PyObject *self)
{
    SegmentObject *s = (SegmentObject *)self;
    return PyUnicode_Substring(s->string, s->start, s->end);
}

/* repr(segment) -> '<Segment start:end>'. */
static PyObject *
Segment_repr(PyObject *self)
{
    SegmentObject *s = (SegmentObject *)self;
    return PyUnicode_FromFormat("<Segment %zd:%zd>", s->start, s->end);
}

/* NOTE(review): start/end are exposed writable (flags == 0); consider
   Py_READONLY unless mutation is intended. */
static PyMemberDef Segment_members[] = {
    {"start", Py_T_PYSSIZET, offsetof(SegmentObject, start), 0,
     PyDoc_STR("grapheme start")},
    {"end", Py_T_PYSSIZET, offsetof(SegmentObject, end), 0,
     PyDoc_STR("grapheme end")},
    {NULL} /* Sentinel */
};

static PyType_Slot Segment_slots[] = {
    {Py_tp_dealloc, Segment_dealloc},
    {Py_tp_traverse, Segment_traverse},
    {Py_tp_clear, Segment_clear},
    {Py_tp_str, Segment_str},
    {Py_tp_repr, Segment_repr},
    {Py_tp_members, Segment_members},
    {0, 0},
};

static PyType_Spec Segment_spec = {
    .name = "unicodedata.Segment",
    .basicsize = sizeof(SegmentObject),
    .flags = (
        Py_TPFLAGS_DEFAULT
        | Py_TPFLAGS_HAVE_GC
        | Py_TPFLAGS_DISALLOW_INSTANTIATION
        | Py_TPFLAGS_IMMUTABLETYPE
    ),
    .slots = Segment_slots
};
/* Grapheme Cluster iterator */

/* The iterator object returned by unicodedata.iter_graphemes(). */
typedef struct {
    PyObject_HEAD
    _PyGraphemeBreak iter;      /* scanner state; owns iter.str */
} GraphemeBreakIterator;

static void
GBI_dealloc(PyObject *self)
{
    /* Instances of heap types own a strong reference to their type; it
       must be released after the instance memory is freed, otherwise
       every iterator leaks a reference to the type (and its module). */
    PyTypeObject *tp = Py_TYPE(self);
    PyObject_GC_UnTrack(self);
    /* Py_XDECREF: tp_clear may already have dropped the string. */
    Py_XDECREF(((GraphemeBreakIterator *)self)->iter.str);
    tp->tp_free(self);
    Py_DECREF(tp);
}

static int
GBI_traverse(PyObject *self, visitproc visit, void *arg)
{
    Py_VISIT(((GraphemeBreakIterator *)self)->iter.str);
    return 0;
}

static int
GBI_clear(PyObject *self)
{
    Py_CLEAR(((GraphemeBreakIterator *)self)->iter.str);
    return 0;
}

/* Yield the next grapheme cluster as a Segment object; return NULL
   without an exception set when the iteration is exhausted. */
static PyObject *
GBI_iternext(PyObject *self)
{
    GraphemeBreakIterator *it = (GraphemeBreakIterator *)self;
    Py_ssize_t start = it->iter.start;
    Py_ssize_t pos = _Py_NextGraphemeBreak(&it->iter);
    if (pos < 0) {
        return NULL;
    }
    PyObject *module = PyType_GetModule(Py_TYPE(it));
    PyObject *SegmentType = get_unicodedata_state(module)->SegmentType;
    SegmentObject *s = PyObject_GC_New(SegmentObject,
                                       (PyTypeObject *)SegmentType);
    if (!s) {
        return NULL;
    }
    s->string = Py_NewRef(it->iter.str);
    s->start = start;
    s->end = pos;
    PyObject_GC_Track(s);
    return (PyObject *)s;
}

static PyType_Slot GraphemeBreakIterator_slots[] = {
    {Py_tp_dealloc, GBI_dealloc},
    {Py_tp_iter, PyObject_SelfIter},
    {Py_tp_iternext, GBI_iternext},
    {Py_tp_traverse, GBI_traverse},
    {Py_tp_clear, GBI_clear},
    {0, 0},
};

static PyType_Spec GraphemeBreakIterator_spec = {
    .name = "unicodedata.GraphemeBreakIterator",
    .basicsize = sizeof(GraphemeBreakIterator),
    .flags = (
        Py_TPFLAGS_DEFAULT
        | Py_TPFLAGS_HAVE_GC
        | Py_TPFLAGS_DISALLOW_INSTANTIATION
        | Py_TPFLAGS_IMMUTABLETYPE
    ),
    .slots = GraphemeBreakIterator_slots
};
/*[clinic input]
unicodedata.iter_graphemes

    unistr: unicode
    start: Py_ssize_t = 0
    end: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
    /

Returns an iterator to iterate over grapheme clusters.

It uses extended grapheme cluster rules from TR29.
[clinic start generated code]*/

static PyObject *
unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
                                Py_ssize_t start, Py_ssize_t end)
/*[clinic end generated code: output=b0b831944265d36f input=a1454d9e8135951f]*/
{
    PyObject *GraphemeBreakIteratorType = get_unicodedata_state(module)->GraphemeBreakIteratorType;
    GraphemeBreakIterator *gbi = PyObject_GC_New(GraphemeBreakIterator,
                                                 (PyTypeObject *)GraphemeBreakIteratorType);
    if (!gbi) {
        return NULL;
    }
    Py_ssize_t len = PyUnicode_GET_LENGTH(unistr);
    /* Interpret start/end like slice indices (negatives, clamping). */
    ADJUST_INDICES(start, end, len);
    /* The iterator owns a reference to the string for its lifetime. */
    Py_INCREF(unistr);
    _Py_InitGraphemeBreak(&gbi->iter, unistr, start, end);
    PyObject_GC_Track(gbi);
    return (PyObject*)gbi;
}
/*[clinic input]
unicodedata.grapheme_cluster_break

    chr: int(accept={str})
    /

Returns the Grapheme_Cluster_Break property assigned to the character.
[clinic start generated code]*/

static PyObject *
unicodedata_grapheme_cluster_break_impl(PyObject *module, int chr)
/*[clinic end generated code: output=39542e0f63bba36f input=5da75e86435576fd]*/
{
    Py_UCS4 c = (Py_UCS4)chr;
    /* The database record stores an index into the property name table. */
    int index = (int) _getrecord_ex(c)->grapheme_cluster_break;
    return PyUnicode_FromString(_PyUnicode_GraphemeBreakNames[index]);
}
/*[clinic input]
unicodedata.indic_conjunct_break

    chr: int(accept={str})
    /

Returns the Indic_Conjunct_Break property assigned to the character.
[clinic start generated code]*/

static PyObject *
unicodedata_indic_conjunct_break_impl(PyObject *module, int chr)
/*[clinic end generated code: output=673eff2caf797f08 input=5c730f78e469f2e8]*/
{
    Py_UCS4 c = (Py_UCS4)chr;
    /* The database record stores an index into the property name table. */
    int index = (int) _getrecord_ex(c)->incb;
    return PyUnicode_FromString(_PyUnicode_IndicConjunctBreakNames[index]);
}
/*[clinic input]
@permit_long_summary
unicodedata.extended_pictographic

    chr: int(accept={str})
    /

Returns the Extended_Pictographic property assigned to the character, as boolean.
[clinic start generated code]*/

static PyObject *
unicodedata_extended_pictographic_impl(PyObject *module, int chr)
/*[clinic end generated code: output=b6bbb349427370b1 input=250d7bd988997eb3]*/
{
    Py_UCS4 c = (Py_UCS4)chr;
    /* ext_pict is stored as 0/1 in the database record. */
    int index = (int) _getrecord_ex(c)->ext_pict;
    return PyBool_FromLong(index);
}
// List of functions used to define module functions *AND* unicodedata.UCD
// methods. For module functions, self is the module. For UCD methods, self
// is an UCD instance. The UCD_Check() macro is used to check if self is
// an UCD instance.
static PyMethodDef unicodedata_functions[] = {
// Module only functions.
UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF
UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF
UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF
UNICODEDATA_ITER_GRAPHEMES_METHODDEF
// The following definitions are shared between the module
// and the UCD class.
#define DB_methods (unicodedata_functions + 4)
UNICODEDATA_UCD_DECIMAL_METHODDEF
UNICODEDATA_UCD_DIGIT_METHODDEF
UNICODEDATA_UCD_NUMERIC_METHODDEF
@ -1664,7 +2160,7 @@ static PyType_Slot ucd_type_slots[] = {
{Py_tp_dealloc, ucd_dealloc},
{Py_tp_traverse, _PyObject_VisitType},
{Py_tp_getattro, PyObject_GenericGetAttr},
{Py_tp_methods, unicodedata_functions},
{Py_tp_methods, DB_methods},
{Py_tp_members, DB_members},
{0, 0}
};
@ -1677,6 +2173,7 @@ static PyType_Spec ucd_type_spec = {
.slots = ucd_type_slots
};
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
@ -1686,9 +2183,47 @@ this database is based on the UnicodeData.txt file version\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
/* GC traverse hook (m_traverse): report the heap types owned by the
   per-module state so the cycle collector can see them. */
static int
unicodedata_traverse(PyObject *module, visitproc visit, void *arg)
{
    unicodedatastate *st = get_unicodedata_state(module);
    /* Py_VISIT expands using the `visit`/`arg` parameters and returns
       early on a non-zero visitor result. */
    Py_VISIT(st->SegmentType);
    Py_VISIT(st->GraphemeBreakIteratorType);
    return 0;
}
/* GC clear hook (m_clear): drop the module state's strong references to
   its heap types, breaking any reference cycles through the module. */
static int
unicodedata_clear(PyObject *module)
{
    unicodedatastate *st = get_unicodedata_state(module);
    /* Py_CLEAR nulls the slot before decref'ing, so re-entrant clears
       are safe. */
    Py_CLEAR(st->SegmentType);
    Py_CLEAR(st->GraphemeBreakIteratorType);
    return 0;
}
/* Module free hook (m_free): final deallocation just reuses the clear
   logic to release the state's references. */
static void
unicodedata_free(void *module)
{
    unicodedata_clear((PyObject *)module);
}
static int
unicodedata_exec(PyObject *module)
{
unicodedatastate *state = get_unicodedata_state(module);
PyObject *SegmentType = PyType_FromModuleAndSpec(module, &Segment_spec, NULL);
if (SegmentType == NULL) {
return -1;
}
state->SegmentType = SegmentType;
PyObject *GraphemeBreakIteratorType = PyType_FromModuleAndSpec(module, &GraphemeBreakIterator_spec, NULL);
if (GraphemeBreakIteratorType == NULL) {
return -1;
}
state->GraphemeBreakIteratorType = GraphemeBreakIteratorType;
if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
return -1;
}
@ -1730,9 +2265,12 @@ static struct PyModuleDef unicodedata_module = {
PyModuleDef_HEAD_INIT,
.m_name = "unicodedata",
.m_doc = unicodedata_docstring,
.m_size = 0,
.m_size = sizeof(unicodedatastate),
.m_methods = unicodedata_functions,
.m_slots = unicodedata_slots,
.m_traverse = unicodedata_traverse,
.m_clear = unicodedata_clear,
.m_free = unicodedata_free,
};
PyMODINIT_FUNC

6231
Modules/unicodedata_db.h generated

File diff suppressed because it is too large Load diff

View file

@ -56,6 +56,8 @@
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"
GRAPHEME_CLUSTER_BREAK = "auxiliary/GraphemeBreakProperty%s.txt"
EMOJI_DATA = "emoji/emoji-data%s.txt"
# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
@ -77,6 +79,14 @@
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
"ON", "LRI", "RLI", "FSI", "PDI" ]
# "Other" needs to be the first entry, see the comment in makeunicodedata
GRAPHEME_CLUSTER_NAMES = [ 'Other', 'Prepend', 'CR', 'LF', 'Control',
'Extend', 'Regional_Indicator', 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT',
'ZWJ' ]
# "None" needs to be the first entry, see the comment in makeunicodedata
INDIC_CONJUNCT_BREAK_NAMES = [ 'None', 'Linker', 'Consonant', 'Extend' ]
# "N" needs to be the first entry, see the comment in makeunicodedata
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]
@ -147,7 +157,9 @@ def makeunicodedata(unicode, trace):
# EastAsianWidth.txt
# see https://unicode.org/reports/tr11/#Unassigned
assert EASTASIANWIDTH_NAMES[0] == "N"
dummy = (0, 0, 0, 0, 0, 0)
assert GRAPHEME_CLUSTER_NAMES[0] == "Other"
assert INDIC_CONJUNCT_BREAK_NAMES[0] == "None"
dummy = (0, 0, 0, 0, 0, 0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
@ -160,23 +172,25 @@ def makeunicodedata(unicode, trace):
for char in unicode.chars:
record = unicode.table[char]
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char] or 'N')
graphemebreak = GRAPHEME_CLUSTER_NAMES.index(unicode.grapheme_breaks[char] or 'Other')
extpict = unicode.ext_picts[char]
if record:
# extract database properties
category = CATEGORY_NAMES.index(record.general_category)
combining = int(record.canonical_combining_class)
bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
mirrored = record.bidi_mirrored == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
normalizationquickcheck = record.quick_check
incb = INDIC_CONJUNCT_BREAK_NAMES.index(record.incb)
item = (
category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck
normalizationquickcheck, graphemebreak, incb, extpict,
)
elif unicode.widths[char] is not None:
elif eastasianwidth or graphemebreak or extpict:
# an unassigned but reserved character, with a known
# east_asian_width
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
item = (0, 0, 0, 0, eastasianwidth, 0)
# east_asian_width or grapheme_break or ext_pict
item = (0, 0, 0, 0, eastasianwidth, 0, graphemebreak, 0, extpict)
else:
continue
@ -296,7 +310,7 @@ def makeunicodedata(unicode, trace):
fprint("/* a list of unique database records */")
fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
for item in table:
fprint(" {%d, %d, %d, %d, %d, %d}," % item)
fprint(" {%d, %d, %d, %d, %d, %d, %d, %d, %d}," % item)
fprint("};")
fprint()
@ -337,6 +351,24 @@ def makeunicodedata(unicode, trace):
fprint(" NULL")
fprint("};")
for i, name in enumerate(GRAPHEME_CLUSTER_NAMES):
fprint("#define GCB_%s %d" % (name, i))
fprint("const char * const _PyUnicode_GraphemeBreakNames[] = {")
for name in GRAPHEME_CLUSTER_NAMES:
fprint(' "%s",' % name)
fprint(" NULL")
fprint("};")
for i, name in enumerate(INDIC_CONJUNCT_BREAK_NAMES):
fprint("#define InCB_%s %d" % (name, i))
fprint("const char * const _PyUnicode_IndicConjunctBreakNames[] = {")
for name in INDIC_CONJUNCT_BREAK_NAMES:
fprint(' "%s",' % name)
fprint(" NULL")
fprint("};")
fprint("static const char *decomp_prefix[] = {")
for name in decomp_prefix:
fprint(" \"%s\"," % name)
@ -783,6 +815,10 @@ def merge_old_version(version, new, old):
# normalization quickchecks are not performed
# for older versions
pass
elif k == 18:
# The Indic_Conjunct_Break property did not exist for
# older versions
pass
else:
class Difference(Exception):pass
raise Difference(hex(i), k, old.table[i], new.table[i])
@ -804,7 +840,7 @@ def open_data(template, version):
url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
else:
url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(os.path.dirname(local), exist_ok=True)
urllib.request.urlretrieve(url, filename=local)
if local.endswith('.txt'):
return open(local, encoding='utf-8')
@ -892,9 +928,13 @@ class UcdRecord:
# We store them as a bitmask.
quick_check: int
# The Indic_Conjunct_Break property from DerivedCoreProperties.txt. See:
# https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
incb: str
def from_row(row: List[str]) -> UcdRecord:
    """Build a UcdRecord from a UnicodeData.txt row.

    Fields not present in UnicodeData.txt get their defaults:
    east_asian_width=None, binary_properties=empty set, quick_check=0,
    and incb="None" (the Indic_Conjunct_Break default per
    DerivedCoreProperties.txt).
    """
    # Bug fix: the diff rendering left the pre-change 4-default return
    # line above the updated one, so the old line executed first and the
    # new "incb" default was never passed. Keep only the updated call.
    return UcdRecord(*row, None, set(), 0, "None")
# --------------------------------------------------------------------
@ -992,14 +1032,14 @@ def __init__(self, version, cjk_check=True):
self.widths = widths
for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if propinfo:
# this is not a binary property, ignore it
continue
if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char].binary_properties.add(propname)
if not propinfo:
# binary property
if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char].binary_properties.add(propname)
elif propname == 'InCB': # Indic_Conjunct_Break
table[char].incb, = propinfo
for char_range, value in UcdFile(LINE_BREAK, version):
if value not in MANDATORY_LINE_BREAKS:
@ -1068,6 +1108,19 @@ def __init__(self, version, cjk_check=True):
c = int(data[0], 16)
cf[c] = [int(char, 16) for char in data[2].split()]
if version != "3.2.0":
grapheme_breaks = [None] * 0x110000
for char, (prop,) in UcdFile(GRAPHEME_CLUSTER_BREAK, version).expanded():
grapheme_breaks[char] = prop
self.grapheme_breaks = grapheme_breaks
ext_picts = [False] * 0x110000
for char, (prop,) in UcdFile(EMOJI_DATA, version).expanded():
if prop == 'Extended_Pictographic':
ext_picts[char] = True
self.ext_picts = ext_picts
def uselatin1(self):
    """Restrict processing to the ISO Latin-1 range (code points 0..255)."""
    self.chars = [*range(256)]