mirror of
https://github.com/python/cpython.git
synced 2026-01-22 07:08:40 +00:00
gh-74902: Add Unicode Grapheme Cluster Break algorithm (GH-143076)
Add the unicodedata.iter_graphemes() function to iterate over grapheme clusters according to rules defined in Unicode Standard Annex #29. Add unicodedata.grapheme_cluster_break(), unicodedata.indic_conjunct_break() and unicodedata.extended_pictographic() functions to get the properties of the character which are related to the above algorithm. Co-authored-by: Guillaume "Vermeille" Sanchez <guillaume.v.sanchez@gmail.com>
This commit is contained in:
parent
0e0d51cdce
commit
bab1d7a561
9 changed files with 4350 additions and 3039 deletions
|
|
@ -184,6 +184,28 @@ following functions:
|
|||
'0041 0303'
|
||||
|
||||
|
||||
.. function:: grapheme_cluster_break(chr, /)
|
||||
|
||||
Return the Grapheme_Cluster_Break property assigned to the character *chr* as a string.
|
||||
|
||||
.. versionadded:: next
|
||||
|
||||
|
||||
.. function:: indic_conjunct_break(chr, /)
|
||||
|
||||
Return the Indic_Conjunct_Break property assigned to the character *chr* as a string.
|
||||
|
||||
.. versionadded:: next
|
||||
|
||||
|
||||
.. function:: extended_pictographic(chr, /)
|
||||
|
||||
Return ``True`` if the character *chr* has the Extended_Pictographic property,
|
||||
``False`` otherwise.
|
||||
|
||||
.. versionadded:: next
|
||||
|
||||
|
||||
.. function:: normalize(form, unistr, /)
|
||||
|
||||
Return the normal form *form* for the Unicode string *unistr*. Valid values for
|
||||
|
|
@ -225,6 +247,24 @@ following functions:
|
|||
.. versionadded:: 3.8
|
||||
|
||||
|
||||
.. function:: iter_graphemes(unistr, start=0, end=sys.maxsize, /)
|
||||
|
||||
Return an iterator over the grapheme clusters of the string *unistr*.
|
||||
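With optional *start*, iteration begins at that position.
With optional *end*, iteration stops at that position.
Both are interpreted as in slice notation: a negative value counts from
the end of the string, and out-of-range values are clamped.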
|
||||
|
||||
Converting an emitted item to string returns a substring corresponding to
|
||||
the grapheme cluster.
|
||||
Its ``start`` and ``end`` attributes denote the start and end of
|
||||
the grapheme cluster.
|
||||
|
||||
It uses extended grapheme cluster rules defined by Unicode
|
||||
Standard Annex #29, `"Unicode Text Segmentation"
|
||||
<https://www.unicode.org/reports/tr29/>`_.
|
||||
|
||||
.. versionadded:: next
|
||||
|
||||
|
||||
In addition, the module exposes the following constant:
|
||||
|
||||
.. data:: unidata_version
|
||||
|
|
@ -234,7 +274,7 @@ In addition, the module exposes the following constant:
|
|||
|
||||
.. data:: ucd_3_2_0
|
||||
|
||||
This is an object that has the same methods as the entire module, but uses the
|
||||
This is an object that has most of the methods of the entire module, but uses the
|
||||
Unicode database version 3.2 instead, for applications that require this
|
||||
specific version of the Unicode database (such as IDNA).
|
||||
|
||||
|
|
|
|||
|
|
@ -811,6 +811,16 @@ unicodedata
|
|||
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
|
||||
(Contributed by Stan Ulbrych in :gh:`129117`.)
|
||||
|
||||
* Add the :func:`~unicodedata.iter_graphemes`
|
||||
function to iterate over grapheme clusters according to rules defined in
|
||||
`Unicode Standard Annex #29, "Unicode Text Segmentation"
|
||||
<https://www.unicode.org/reports/tr29/>`_.
|
||||
Add :func:`~unicodedata.grapheme_cluster_break`,
|
||||
:func:`~unicodedata.indic_conjunct_break` and
|
||||
:func:`~unicodedata.extended_pictographic` functions to get the properties
|
||||
of the character which are related to the above algorithm.
|
||||
(Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)
|
||||
|
||||
|
||||
unittest
|
||||
--------
|
||||
|
|
|
|||
|
|
@ -616,6 +616,221 @@ def test_isxidcontinue(self):
|
|||
self.assertRaises(TypeError, self.db.isxidcontinue)
|
||||
self.assertRaises(TypeError, self.db.isxidcontinue, 'xx')
|
||||
|
||||
def test_grapheme_cluster_break(self):
    # Spot-check Grapheme_Cluster_Break for sample characters.  The
    # "New in X" markers name the Unicode version that the samples
    # following them are associated with.
    gcb = self.db.grapheme_cluster_break
    samples = (
        (' ', 'Other'),
        ('x', 'Other'),
        ('\U0010FFFF', 'Other'),
        ('\r', 'CR'),
        ('\n', 'LF'),
        ('\0', 'Control'),
        ('\t', 'Control'),
        ('\x1F', 'Control'),
        ('\x7F', 'Control'),
        ('\x9F', 'Control'),
        ('\U000E0001', 'Control'),
        ('\u0300', 'Extend'),
        ('\u200C', 'Extend'),
        ('\U000E01EF', 'Extend'),
        ('\u1159', 'L'),
        ('\u11F9', 'T'),
        ('\uD788', 'LV'),
        ('\uD7A3', 'LVT'),
        # New in 5.0.0
        ('\u05BA', 'Extend'),
        ('\u20EF', 'Extend'),
        # New in 5.1.0
        ('\u2064', 'Control'),
        ('\uAA4D', 'SpacingMark'),
        # New in 5.2.0
        ('\u0816', 'Extend'),
        ('\uA97C', 'L'),
        ('\uD7C6', 'V'),
        ('\uD7FB', 'T'),
        # New in 6.0.0
        ('\u093A', 'Extend'),
        ('\U00011002', 'SpacingMark'),
        # New in 6.1.0
        ('\U000E0FFF', 'Control'),
        ('\U00016F7E', 'SpacingMark'),
        # New in 6.2.0
        ('\U0001F1E6', 'Regional_Indicator'),
        ('\U0001F1FF', 'Regional_Indicator'),
        # New in 6.3.0
        ('\u180E', 'Control'),
        ('\u1A1B', 'Extend'),
        # New in 7.0.0
        ('\u0E33', 'SpacingMark'),
        ('\u0EB3', 'SpacingMark'),
        ('\U0001BCA3', 'Control'),
        ('\U0001E8D6', 'Extend'),
        ('\U0001163E', 'SpacingMark'),
        # New in 8.0.0
        ('\u08E3', 'Extend'),
        ('\U00011726', 'SpacingMark'),
        # New in 9.0.0
        ('\u0600', 'Prepend'),
        ('\U000E007F', 'Extend'),
        ('\U00011CB4', 'SpacingMark'),
        ('\u200D', 'ZWJ'),
        # New in 10.0.0
        ('\U00011D46', 'Prepend'),
        ('\U00011D47', 'Extend'),
        ('\U00011A97', 'SpacingMark'),
        # New in 11.0.0
        ('\U000110CD', 'Prepend'),
        ('\u07FD', 'Extend'),
        ('\U00011EF6', 'SpacingMark'),
        # New in 12.0.0
        ('\U00011A84', 'Prepend'),
        ('\U00013438', 'Control'),
        ('\U0001E2EF', 'Extend'),
        ('\U00016F87', 'SpacingMark'),
        # New in 13.0.0
        ('\U00011941', 'Prepend'),
        ('\U00016FE4', 'Extend'),
        ('\U00011942', 'SpacingMark'),
        # New in 14.0.0
        ('\u0891', 'Prepend'),
        ('\U0001E2AE', 'Extend'),
        # New in 15.0.0
        ('\U00011F02', 'Prepend'),
        ('\U0001343F', 'Control'),
        ('\U0001E4EF', 'Extend'),
        ('\U00011F3F', 'SpacingMark'),
        # New in 16.0.0
        ('\U000113D1', 'Prepend'),
        ('\U0001E5EF', 'Extend'),
        ('\U0001612C', 'SpacingMark'),
        ('\U00016D63', 'V'),
        # New in 17.0.0
        ('\u1AEB', 'Extend'),
        ('\U00011B67', 'SpacingMark'),
    )
    for char, expected in samples:
        self.assertEqual(gcb(char), expected)

    # Exactly one argument, which must be a str of length 1.
    self.assertRaises(TypeError, gcb)
    for bad_arg in (b'x', 120, '', 'xx'):
        self.assertRaises(TypeError, gcb, bad_arg)
|
||||
|
||||
def test_indic_conjunct_break(self):
    # Spot-check Indic_Conjunct_Break for sample characters.  The
    # "New in X" markers name the Unicode version that the samples
    # following them are associated with.
    incb = self.db.indic_conjunct_break
    samples = (
        (' ', 'None'),
        ('x', 'None'),
        ('\U0010FFFF', 'None'),
        # New in 15.1.0
        ('\u094D', 'Linker'),
        ('\u0D4D', 'Linker'),
        ('\u0915', 'Consonant'),
        ('\u0D3A', 'Consonant'),
        ('\u0300', 'Extend'),
        ('\U0001E94A', 'Extend'),
        # New in 16.0.0
        ('\u034F', 'Extend'),
        ('\U000E01EF', 'Extend'),
        # New in 17.0.0
        ('\u1039', 'Linker'),
        ('\U00011F42', 'Linker'),
        ('\u1000', 'Consonant'),
        ('\U00011F33', 'Consonant'),
        ('\U0001E6F5', 'Extend'),
    )
    for char, expected in samples:
        self.assertEqual(incb(char), expected)

    # Exactly one argument, which must be a str of length 1.
    self.assertRaises(TypeError, incb)
    for bad_arg in (b'x', 120, '', 'xx'):
        self.assertRaises(TypeError, incb, bad_arg)
|
||||
|
||||
def test_extended_pictographic(self):
    # extended_pictographic() must return the bool singletons, hence
    # assertIs rather than assertEqual.
    ext_pict = self.db.extended_pictographic
    samples = (
        (' ', False),
        ('x', False),
        ('\U0010FFFF', False),
        # New in 13.0.0
        ('\xA9', True),
        ('\u203C', True),
        ('\U0001FAD6', True),
        ('\U0001FFFD', True),
        # New in 17.0.0
        ('\u2388', False),
        ('\U0001FA6D', False),
    )
    for char, expected in samples:
        self.assertIs(ext_pict(char), expected)

    # Exactly one argument, which must be a str of length 1.
    self.assertRaises(TypeError, ext_pict)
    for bad_arg in (b'x', 120, '', 'xx'):
        self.assertRaises(TypeError, ext_pict, bad_arg)
|
||||
|
||||
def test_grapheme_break(self):
    # Exercise iter_graphemes(): argument validation, slice-style
    # start/end handling, and samples for the UAX #29 rules named in
    # the comments below.
    def graphemes(*args):
        return [str(cluster) for cluster in self.db.iter_graphemes(*args)]

    for bad_args in ((), (b'x',), ('x', 0, 0, 0)):
        self.assertRaises(TypeError, self.db.iter_graphemes, *bad_args)

    self.assertEqual(graphemes(''), [])

    # (extra positional arguments after 'abcd', expected clusters)
    index_cases = (
        ((), ['a', 'b', 'c', 'd']),
        ((1,), ['b', 'c', 'd']),
        ((1, 3), ['b', 'c']),
        ((-3,), ['b', 'c', 'd']),
        ((1, -1), ['b', 'c']),
        ((3, 1), []),
        ((5,), []),
        ((0, 5), ['a', 'b', 'c', 'd']),
        ((-5,), ['a', 'b', 'c', 'd']),
        ((0, -5), []),
    )
    for bounds, expected in index_cases:
        self.assertEqual(graphemes('abcd', *bounds), expected)

    # Each entry is the expected list of clusters; the input string is
    # their concatenation.
    rule_cases = (
        # GB3
        ['\r\n'],
        # GB4
        ['\r', '\u0308'],
        ['\n', '\u0308'],
        ['\0', '\u0308'],
        # GB5
        ['\u06dd', '\r'],
        ['\u06dd', '\n'],
        ['\u06dd', '\0'],
        # GB6
        ['\u1100\u1160'],
        ['\u1100\uAC00'],
        ['\u1100\uAC01'],
        # GB7
        ['\uAC00\u1160'],
        ['\uAC00\u11A8'],
        ['\u1160\u1160'],
        ['\u1160\u11A8'],
        # GB8
        ['\uAC01\u11A8'],
        ['\u11A8\u11A8'],
        # GB9
        ['a\u0300'],
        ['a\u200D'],
        # GB9a
        ['\u0905\u0903'],
        # GB9b
        ['\u06dd\u0661'],
        # GB9c
        ['\u0915\u094d\u0924'],
        ['\u0915\u094D\u094D\u0924'],
        ['\u0915\u094D\u0924\u094D\u092F'],
        # GB11
        ['\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
         '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'],
        # GB12
        ['\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'],
        # GB13
        ['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'],
    )
    for clusters in rule_cases:
        self.assertEqual(graphemes(''.join(clusters)), clusters)
|
||||
|
||||
|
||||
class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
|
||||
db = unicodedata.ucd_3_2_0
|
||||
|
|
@ -624,6 +839,11 @@ class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
|
|||
if quicktest else
|
||||
'f217b8688d7bdff31db4207e078a96702f091597')
|
||||
|
||||
test_grapheme_cluster_break = None
|
||||
test_indic_conjunct_break = None
|
||||
test_extended_pictographic = None
|
||||
test_grapheme_break = None
|
||||
|
||||
|
||||
class UnicodeMiscTest(unittest.TestCase):
|
||||
db = unicodedata
|
||||
|
|
@ -726,6 +946,17 @@ def test_linebreak_7643(self):
|
|||
self.assertEqual(len(lines), 1,
|
||||
r"%a should not be a linebreak" % c)
|
||||
|
||||
def test_segment_object(self):
    # 'a' followed by U+0300 forms a single cluster, so the five code
    # points of the sample yield four segments.
    text = 'spa\u0300m'
    segments = list(unicodedata.iter_graphemes(text))
    self.assertEqual(len(segments), 4, segments)

    accented = segments[2]
    self.assertEqual(accented.start, 2)
    self.assertEqual(accented.end, 4)
    self.assertEqual(str(accented), 'a\u0300')
    self.assertEqual(repr(accented), '<Segment 2:4>')

    # A segment is neither iterable nor sized.
    for func in (iter, len):
        self.assertRaises(TypeError, func, accented)
|
||||
|
||||
|
||||
class NormalizationTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
|
|
@ -848,5 +1079,61 @@ class MyStr(str):
|
|||
self.assertIs(type(normalize(form, MyStr(input_str))), str)
|
||||
|
||||
|
||||
class GraphemeBreakTest(unittest.TestCase):
    """Run iter_graphemes() against the Unicode GraphemeBreakTest.txt data."""

    @staticmethod
    def check_version(testfile):
        # The downloaded file is accepted only if its first line mentions
        # the version of the bundled Unicode database.
        header = testfile.readline()
        return unicodedata.unidata_version in header

    @requires_resource('network')
    def test_grapheme_break(self):
        TESTDATAFILE = "auxiliary/GraphemeBreakTest.txt"
        TESTDATAURL = f"https://www.unicode.org/Public/{unicodedata.unidata_version}/ucd/{TESTDATAFILE}"

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_grapheme_break_tests(testdata)

    def _check_segments(self, segments, chunks, breaks, first, comment):
        # Compare *segments* with the expected clusters from index *first*
        # onwards: their text, start offsets and end offsets.
        self.assertEqual([str(seg) for seg in segments],
                         chunks[first:], comment)
        self.assertEqual([seg.start for seg in segments],
                         breaks[first:-1], comment)
        self.assertEqual([seg.end for seg in segments],
                         breaks[first+1:], comment)

    def run_grapheme_break_tests(self, testdata):
        for raw_line in testdata:
            line, _, comment = raw_line.partition('#')
            line = line.strip()
            if not line:
                continue
            comment = comment.strip()

            # '÷' marks a break opportunity, '×' marks "no break"; every
            # other field is a hexadecimal code point.
            chunks = []
            breaks = []
            pos = 0
            for field in line.replace('×', ' ').split():
                if field == '÷':
                    chunks.append('')
                    breaks.append(pos)
                else:
                    chunks[-1] += chr(int(field, 16))
                    pos += 1
            # The final '÷' opens a cluster that must stay empty.
            self.assertEqual(chunks.pop(), '', line)

            text = ''.join(chunks)
            with self.subTest(line):
                segments = list(unicodedata.iter_graphemes(text))
                self._check_segments(segments, chunks, breaks, 0, comment)
                # Restarting at any interior boundary must produce the
                # remaining clusters with unchanged offsets.
                for i in range(1, len(breaks) - 1):
                    segments = list(unicodedata.iter_graphemes(text, breaks[i]))
                    self._check_segments(segments, chunks, breaks, i, comment)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
|||
|
|
@ -1664,6 +1664,7 @@ Victor Salgado
|
|||
Rich Salz
|
||||
Kevin Samborn
|
||||
Adrian Sampson
|
||||
Guillaume Sanchez
|
||||
Nevada Sanchez
|
||||
James Sanders
|
||||
Ilya Sandler
|
||||
|
|
|
|||
|
|
@ -0,0 +1,8 @@
|
|||
Add the :func:`~unicodedata.iter_graphemes` function in the
|
||||
:mod:`unicodedata` module to iterate over grapheme clusters according to
|
||||
rules defined in `Unicode Standard Annex #29, "Unicode Text Segmentation"
|
||||
<https://www.unicode.org/reports/tr29/>`_. Add
|
||||
:func:`~unicodedata.grapheme_cluster_break`,
|
||||
:func:`~unicodedata.indic_conjunct_break` and
|
||||
:func:`~unicodedata.extended_pictographic` functions to get the properties
|
||||
of the character which are related to the above algorithm.
|
||||
179
Modules/clinic/unicodedata.c.h
generated
179
Modules/clinic/unicodedata.c.h
generated
|
|
@ -2,6 +2,7 @@
|
|||
preserve
|
||||
[clinic start generated code]*/
|
||||
|
||||
#include "pycore_abstract.h" // _PyNumber_Index()
|
||||
#include "pycore_modsupport.h" // _PyArg_CheckPositional()
|
||||
|
||||
PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
|
||||
|
|
@ -621,4 +622,180 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
|
|||
exit:
|
||||
return return_value;
|
||||
}
|
||||
/*[clinic end generated code: output=c5e56c8f6bb80f93 input=a9049054013a1b77]*/
|
||||
|
||||
PyDoc_STRVAR(unicodedata_iter_graphemes__doc__,
|
||||
"iter_graphemes($module, unistr, start=0, end=sys.maxsize, /)\n"
|
||||
"--\n"
|
||||
"\n"
|
||||
"Returns an iterator to iterate over grapheme clusters.\n"
|
||||
"\n"
|
||||
"It uses extended grapheme cluster rules from TR29.");
|
||||
|
||||
#define UNICODEDATA_ITER_GRAPHEMES_METHODDEF \
|
||||
{"iter_graphemes", _PyCFunction_CAST(unicodedata_iter_graphemes), METH_FASTCALL, unicodedata_iter_graphemes__doc__},
|
||||
|
||||
static PyObject *
|
||||
unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
|
||||
Py_ssize_t start, Py_ssize_t end);
|
||||
|
||||
static PyObject *
|
||||
unicodedata_iter_graphemes(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
|
||||
{
|
||||
PyObject *return_value = NULL;
|
||||
PyObject *unistr;
|
||||
Py_ssize_t start = 0;
|
||||
Py_ssize_t end = PY_SSIZE_T_MAX;
|
||||
|
||||
if (!_PyArg_CheckPositional("iter_graphemes", nargs, 1, 3)) {
|
||||
goto exit;
|
||||
}
|
||||
if (!PyUnicode_Check(args[0])) {
|
||||
_PyArg_BadArgument("iter_graphemes", "argument 1", "str", args[0]);
|
||||
goto exit;
|
||||
}
|
||||
unistr = args[0];
|
||||
if (nargs < 2) {
|
||||
goto skip_optional;
|
||||
}
|
||||
{
|
||||
Py_ssize_t ival = -1;
|
||||
PyObject *iobj = _PyNumber_Index(args[1]);
|
||||
if (iobj != NULL) {
|
||||
ival = PyLong_AsSsize_t(iobj);
|
||||
Py_DECREF(iobj);
|
||||
}
|
||||
if (ival == -1 && PyErr_Occurred()) {
|
||||
goto exit;
|
||||
}
|
||||
start = ival;
|
||||
}
|
||||
if (nargs < 3) {
|
||||
goto skip_optional;
|
||||
}
|
||||
{
|
||||
Py_ssize_t ival = -1;
|
||||
PyObject *iobj = _PyNumber_Index(args[2]);
|
||||
if (iobj != NULL) {
|
||||
ival = PyLong_AsSsize_t(iobj);
|
||||
Py_DECREF(iobj);
|
||||
}
|
||||
if (ival == -1 && PyErr_Occurred()) {
|
||||
goto exit;
|
||||
}
|
||||
end = ival;
|
||||
}
|
||||
skip_optional:
|
||||
return_value = unicodedata_iter_graphemes_impl(module, unistr, start, end);
|
||||
|
||||
exit:
|
||||
return return_value;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(unicodedata_grapheme_cluster_break__doc__,
|
||||
"grapheme_cluster_break($module, chr, /)\n"
|
||||
"--\n"
|
||||
"\n"
|
||||
"Returns the Grapheme_Cluster_Break property assigned to the character.");
|
||||
|
||||
#define UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF \
|
||||
{"grapheme_cluster_break", (PyCFunction)unicodedata_grapheme_cluster_break, METH_O, unicodedata_grapheme_cluster_break__doc__},
|
||||
|
||||
static PyObject *
|
||||
unicodedata_grapheme_cluster_break_impl(PyObject *module, int chr);
|
||||
|
||||
static PyObject *
|
||||
unicodedata_grapheme_cluster_break(PyObject *module, PyObject *arg)
|
||||
{
|
||||
PyObject *return_value = NULL;
|
||||
int chr;
|
||||
|
||||
if (!PyUnicode_Check(arg)) {
|
||||
_PyArg_BadArgument("grapheme_cluster_break", "argument", "a unicode character", arg);
|
||||
goto exit;
|
||||
}
|
||||
if (PyUnicode_GET_LENGTH(arg) != 1) {
|
||||
PyErr_Format(PyExc_TypeError,
|
||||
"grapheme_cluster_break(): argument must be a unicode character, "
|
||||
"not a string of length %zd",
|
||||
PyUnicode_GET_LENGTH(arg));
|
||||
goto exit;
|
||||
}
|
||||
chr = PyUnicode_READ_CHAR(arg, 0);
|
||||
return_value = unicodedata_grapheme_cluster_break_impl(module, chr);
|
||||
|
||||
exit:
|
||||
return return_value;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(unicodedata_indic_conjunct_break__doc__,
|
||||
"indic_conjunct_break($module, chr, /)\n"
|
||||
"--\n"
|
||||
"\n"
|
||||
"Returns the Indic_Conjunct_Break property assigned to the character.");
|
||||
|
||||
#define UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF \
|
||||
{"indic_conjunct_break", (PyCFunction)unicodedata_indic_conjunct_break, METH_O, unicodedata_indic_conjunct_break__doc__},
|
||||
|
||||
static PyObject *
|
||||
unicodedata_indic_conjunct_break_impl(PyObject *module, int chr);
|
||||
|
||||
static PyObject *
|
||||
unicodedata_indic_conjunct_break(PyObject *module, PyObject *arg)
|
||||
{
|
||||
PyObject *return_value = NULL;
|
||||
int chr;
|
||||
|
||||
if (!PyUnicode_Check(arg)) {
|
||||
_PyArg_BadArgument("indic_conjunct_break", "argument", "a unicode character", arg);
|
||||
goto exit;
|
||||
}
|
||||
if (PyUnicode_GET_LENGTH(arg) != 1) {
|
||||
PyErr_Format(PyExc_TypeError,
|
||||
"indic_conjunct_break(): argument must be a unicode character, "
|
||||
"not a string of length %zd",
|
||||
PyUnicode_GET_LENGTH(arg));
|
||||
goto exit;
|
||||
}
|
||||
chr = PyUnicode_READ_CHAR(arg, 0);
|
||||
return_value = unicodedata_indic_conjunct_break_impl(module, chr);
|
||||
|
||||
exit:
|
||||
return return_value;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(unicodedata_extended_pictographic__doc__,
|
||||
"extended_pictographic($module, chr, /)\n"
|
||||
"--\n"
|
||||
"\n"
|
||||
"Returns the Extended_Pictographic property assigned to the character, as boolean.");
|
||||
|
||||
#define UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF \
|
||||
{"extended_pictographic", (PyCFunction)unicodedata_extended_pictographic, METH_O, unicodedata_extended_pictographic__doc__},
|
||||
|
||||
static PyObject *
|
||||
unicodedata_extended_pictographic_impl(PyObject *module, int chr);
|
||||
|
||||
static PyObject *
|
||||
unicodedata_extended_pictographic(PyObject *module, PyObject *arg)
|
||||
{
|
||||
PyObject *return_value = NULL;
|
||||
int chr;
|
||||
|
||||
if (!PyUnicode_Check(arg)) {
|
||||
_PyArg_BadArgument("extended_pictographic", "argument", "a unicode character", arg);
|
||||
goto exit;
|
||||
}
|
||||
if (PyUnicode_GET_LENGTH(arg) != 1) {
|
||||
PyErr_Format(PyExc_TypeError,
|
||||
"extended_pictographic(): argument must be a unicode character, "
|
||||
"not a string of length %zd",
|
||||
PyUnicode_GET_LENGTH(arg));
|
||||
goto exit;
|
||||
}
|
||||
chr = PyUnicode_READ_CHAR(arg, 0);
|
||||
return_value = unicodedata_extended_pictographic_impl(module, chr);
|
||||
|
||||
exit:
|
||||
return return_value;
|
||||
}
|
||||
/*[clinic end generated code: output=6991246310e3f2aa input=a9049054013a1b77]*/
|
||||
|
|
|
|||
|
|
@ -24,6 +24,26 @@
|
|||
#include <stdbool.h>
|
||||
#include <stddef.h> // offsetof()
|
||||
|
||||
/* Helper macro to fix up start/end slice values in place.
 *
 * `end` is clamped to `len`; a negative `start` or `end` is offset by
 * `len` and then clamped to 0.  `start` is not clamped to `len`, so it
 * may still be greater than `end` afterwards.  The arguments are
 * evaluated more than once and are assigned to, so they must be plain
 * lvalues without side effects. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if (end > len) {                \
            end = len;                  \
        }                               \
        else if (end < 0) {             \
            end += len;                 \
            if (end < 0) {              \
                end = 0;                \
            }                           \
        }                               \
        if (start < 0) {                \
            start += len;               \
            if (start < 0) {            \
                start = 0;              \
            }                           \
        }                               \
    } while (0)
|
||||
|
||||
/*[clinic input]
|
||||
module unicodedata
|
||||
class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
|
||||
|
|
@ -42,6 +62,11 @@ typedef struct {
|
|||
const unsigned char east_asian_width; /* index into
|
||||
_PyUnicode_EastAsianWidth */
|
||||
const unsigned char normalization_quick_check; /* see is_normalized() */
|
||||
const unsigned char grapheme_cluster_break; /* index into
|
||||
_PyUnicode_GraphemeBreakNames */
|
||||
const unsigned char incb; /* index into
|
||||
_PyUnicode_IndicConjunctBreakNames */
|
||||
const unsigned char ext_pict; /* true if Extended_Pictographic */
|
||||
} _PyUnicode_DatabaseRecord;
|
||||
|
||||
typedef struct change_record {
|
||||
|
|
@ -71,6 +96,19 @@ _getrecord_ex(Py_UCS4 code)
|
|||
return &_PyUnicode_Database_Records[index];
|
||||
}
|
||||
|
||||
/* Per-module state, retrieved with get_unicodedata_state().
 * NOTE(review): the fields presumably hold the heap types created from
 * Segment_spec and the iterator's type spec at module init -- the
 * initialization code is outside this excerpt; confirm there. */
typedef struct {
    PyObject *SegmentType;
    PyObject *GraphemeBreakIteratorType;
} unicodedatastate;
|
||||
|
||||
static inline unicodedatastate *
|
||||
get_unicodedata_state(PyObject *module)
|
||||
{
|
||||
void *state = _PyModule_GetState(module);
|
||||
assert(state != NULL);
|
||||
return (unicodedatastate *)state;
|
||||
}
|
||||
|
||||
/* ------------- Previous-version API ------------------------------------- */
|
||||
typedef struct previous_version {
|
||||
PyObject_HEAD
|
||||
|
|
@ -1628,11 +1666,469 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
|
|||
return PyUnicode_FromOrdinal(code);
|
||||
}
|
||||
|
||||
|
||||
/* Grapheme Cluster Break algorithm */
|
||||
|
||||
/* Progress of matching the emoji ZWJ sequence used by rule GB11 while
 * scanning forward; advanced by update_ext_pict_state(). */
enum ExtPictState {
    // not inside the pattern
    ExtPictState_Init,
    // \p{Extended_Pictographic} Extend*
    ExtPictState_Started,
    // ... ZWJ
    ExtPictState_ZWJ,
    // ... \p{Extended_Pictographic}
    ExtPictState_Matched,
};
|
||||
|
||||
/* Progress of matching the Indic conjunct sequence used by rule GB9c
 * while scanning forward; advanced by update_incb_state(). */
enum InCBState {
    // not inside the pattern
    InCBState_Init,
    // \p{InCB=Consonant} \p{InCB=Extend}*
    InCBState_Started,
    // ... \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]*
    InCBState_Linker,
    // ... \p{InCB=Consonant}
    InCBState_Matched,
};
|
||||
|
||||
/* Scanning state of the grapheme cluster break algorithm.  Set up by
 * _Py_InitGraphemeBreak() and advanced by _Py_NextGraphemeBreak(). */
typedef struct {
    PyObject *str;      /* string being scanned; the init function stores
                           the pointer without taking a new reference */
    Py_ssize_t start;   /* start index of the cluster being scanned */
    Py_ssize_t pos;     /* index of the next code point to examine */
    Py_ssize_t end;     /* scanning stops at this index (exclusive) */
    int gcb;            /* Grapheme_Cluster_Break value of the most
                           recently examined code point */
    enum ExtPictState ep_state;     /* GB11 matching state */
    enum InCBState incb_state;      /* GB9c matching state */
    bool ri_flag;       /* toggled on each consecutive Regional_Indicator,
                           reset by anything else (GB12/GB13) */
} _PyGraphemeBreak;
|
||||
|
||||
static inline enum ExtPictState
|
||||
update_ext_pict_state(enum ExtPictState state, int gcb, bool ext_pict)
|
||||
{
|
||||
if (ext_pict) {
|
||||
return (state == ExtPictState_ZWJ) ? ExtPictState_Matched : ExtPictState_Started;
|
||||
}
|
||||
if (state == ExtPictState_Started || state == ExtPictState_Matched) {
|
||||
if (gcb == GCB_Extend) {
|
||||
return ExtPictState_Started;
|
||||
}
|
||||
if (gcb == GCB_ZWJ) {
|
||||
return ExtPictState_ZWJ;
|
||||
}
|
||||
}
|
||||
return ExtPictState_Init;
|
||||
}
|
||||
|
||||
static inline enum InCBState
|
||||
update_incb_state(enum InCBState state, int incb)
|
||||
{
|
||||
if (incb == InCB_Consonant) {
|
||||
return (state == InCBState_Linker) ? InCBState_Matched : InCBState_Started;
|
||||
}
|
||||
if (state != InCBState_Init) {
|
||||
if (incb == InCB_Extend) {
|
||||
return (state == InCBState_Linker) ? InCBState_Linker : InCBState_Started;
|
||||
}
|
||||
if (incb == InCB_Linker) {
|
||||
return InCBState_Linker;
|
||||
}
|
||||
}
|
||||
return InCBState_Init;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
update_ri_flag(bool flag, int gcb)
|
||||
{
|
||||
if (gcb == GCB_Regional_Indicator) {
|
||||
return !flag;
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool
|
||||
grapheme_break(int prev_gcb, int curr_gcb, enum ExtPictState ep_state,
|
||||
bool ri_flag, enum InCBState incb_state)
|
||||
{
|
||||
/* GB3 */
|
||||
if (prev_gcb == GCB_CR && curr_gcb == GCB_LF) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* GB4 */
|
||||
if (prev_gcb == GCB_CR ||
|
||||
prev_gcb == GCB_LF ||
|
||||
prev_gcb == GCB_Control)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
/* GB5 */
|
||||
if (curr_gcb == GCB_CR ||
|
||||
curr_gcb == GCB_LF ||
|
||||
curr_gcb == GCB_Control)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
/* GB6 */
|
||||
if (prev_gcb == GCB_L &&
|
||||
(curr_gcb == GCB_L ||
|
||||
curr_gcb == GCB_V ||
|
||||
curr_gcb == GCB_LV ||
|
||||
curr_gcb == GCB_LVT))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
/* GB7 */
|
||||
if ((prev_gcb == GCB_LV || prev_gcb == GCB_V) &&
|
||||
(curr_gcb == GCB_V || curr_gcb == GCB_T))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
/* GB8 */
|
||||
if ((prev_gcb == GCB_LVT || prev_gcb == GCB_T) &&
|
||||
curr_gcb == GCB_T)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
/* GB9 */
|
||||
if (curr_gcb == GCB_Extend || curr_gcb == GCB_ZWJ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* GB9a */
|
||||
if (curr_gcb == GCB_SpacingMark) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* GB9b */
|
||||
if (prev_gcb == GCB_Prepend) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* GB9c */
|
||||
if (incb_state == InCBState_Matched) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* GB11 */
|
||||
if (ep_state == ExtPictState_Matched) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* GB12 and GB13 */
|
||||
if (prev_gcb == GCB_Regional_Indicator && curr_gcb == prev_gcb) {
|
||||
return ri_flag;
|
||||
}
|
||||
|
||||
/* GB999 */
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
_Py_InitGraphemeBreak(_PyGraphemeBreak *iter, PyObject *str,
|
||||
Py_ssize_t start, Py_ssize_t end)
|
||||
{
|
||||
iter->str = str;
|
||||
iter->start = iter->pos = start;
|
||||
iter->end = end;
|
||||
iter->gcb = 0;
|
||||
iter->ep_state = ExtPictState_Init;
|
||||
iter->ri_flag = false;
|
||||
iter->incb_state = InCBState_Init;
|
||||
}
|
||||
|
||||
/* Scan forward to the next grapheme cluster boundary.
 *
 * Returns the index at which the current cluster ends (which is also
 * where the next one starts), or -1 once the range is exhausted.
 * iter->start is updated to the returned index. */
static Py_ssize_t
_Py_NextGraphemeBreak(_PyGraphemeBreak *iter)
{
    if (iter->start >= iter->end) {
        return -1;
    }

    int kind = PyUnicode_KIND(iter->str);
    void *pstr = PyUnicode_DATA(iter->str);
    while (iter->pos < iter->end) {
        Py_UCS4 chr = PyUnicode_READ(kind, pstr, iter->pos);
        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(chr);
        int gcb = record->grapheme_cluster_break;
        /* The matchers are updated with the current code point before
           the break decision: grapheme_break() expects their new state. */
        iter->ep_state = update_ext_pict_state(iter->ep_state, gcb, record->ext_pict);
        iter->ri_flag = update_ri_flag(iter->ri_flag, gcb);
        iter->incb_state = update_incb_state(iter->incb_state, record->incb);
        int prev_gcb = iter->gcb;
        iter->gcb = gcb;
        /* The first code point of a cluster has no predecessor inside
           the cluster, so no boundary is tested before it. */
        if (iter->pos != iter->start &&
            grapheme_break(prev_gcb, gcb, iter->ep_state, iter->ri_flag,
                           iter->incb_state))
        {
            iter->start = iter->pos;
            /* The code point at the boundary has already been folded
               into the state, so the next call resumes after it. */
            return iter->pos++;
        }
        ++iter->pos;
    }
    /* Reached the end of the range: the last cluster ends here. */
    iter->start = iter->pos;
    return iter->pos;
}
|
||||
|
||||
|
||||
/* Text Segment object */
|
||||
|
||||
/* A slice of a string, identified by the string and a pair of indices. */
typedef struct {
    PyObject_HEAD
    PyObject *string;   /* owned reference (released in Segment_dealloc) */
    Py_ssize_t start;   /* start index, passed to PyUnicode_Substring() */
    Py_ssize_t end;     /* end index, passed to PyUnicode_Substring() */
} SegmentObject;
|
||||
|
||||
static void
|
||||
Segment_dealloc(PyObject *self)
|
||||
{
|
||||
PyObject_GC_UnTrack(self);
|
||||
Py_DECREF(((SegmentObject *)self)->string);
|
||||
PyObject_GC_Del(self);
|
||||
}
|
||||
|
||||
static int
|
||||
Segment_traverse(PyObject *self, visitproc visit, void *arg)
|
||||
{
|
||||
Py_VISIT(((SegmentObject *)self)->string);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
Segment_clear(PyObject *self)
|
||||
{
|
||||
Py_CLEAR(((SegmentObject *)self)->string);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* str(segment): return the substring string[start:end] as a new string. */
static PyObject *
Segment_str(PyObject *self)
{
    SegmentObject *seg = (SegmentObject *)self;
    PyObject *text = PyUnicode_Substring(seg->string, seg->start, seg->end);
    return text;
}
|
||||
|
||||
/* repr(segment): show only the bounds, formatted as "<Segment START:END>". */
static PyObject *
Segment_repr(PyObject *self)
{
    SegmentObject *seg = (SegmentObject *)self;
    Py_ssize_t first = seg->start;
    Py_ssize_t last = seg->end;
    return PyUnicode_FromFormat("<Segment %zd:%zd>", first, last);
}
|
||||
|
||||
static PyMemberDef Segment_members[] = {
|
||||
{"start", Py_T_PYSSIZET, offsetof(SegmentObject, start), 0,
|
||||
PyDoc_STR("grapheme start")},
|
||||
{"end", Py_T_PYSSIZET, offsetof(SegmentObject, end), 0,
|
||||
PyDoc_STR("grapheme end")},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
static PyType_Slot Segment_slots[] = {
|
||||
{Py_tp_dealloc, Segment_dealloc},
|
||||
{Py_tp_traverse, Segment_traverse},
|
||||
{Py_tp_clear, Segment_clear},
|
||||
{Py_tp_str, Segment_str},
|
||||
{Py_tp_repr, Segment_repr},
|
||||
{Py_tp_members, Segment_members},
|
||||
{0, 0},
|
||||
};
|
||||
|
||||
static PyType_Spec Segment_spec = {
|
||||
.name = "unicodedata.Segment",
|
||||
.basicsize = sizeof(SegmentObject),
|
||||
.flags = (
|
||||
Py_TPFLAGS_DEFAULT
|
||||
| Py_TPFLAGS_HAVE_GC
|
||||
| Py_TPFLAGS_DISALLOW_INSTANTIATION
|
||||
| Py_TPFLAGS_IMMUTABLETYPE
|
||||
),
|
||||
.slots = Segment_slots
|
||||
};
|
||||
|
||||
|
||||
/* Grapheme Cluster iterator */
|
||||
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
_PyGraphemeBreak iter;
|
||||
} GraphemeBreakIterator;
|
||||
|
||||
static void
|
||||
GBI_dealloc(PyObject *self)
|
||||
{
|
||||
PyObject_GC_UnTrack(self);
|
||||
Py_DECREF(((GraphemeBreakIterator *)self)->iter.str);
|
||||
PyObject_GC_Del(self);
|
||||
}
|
||||
|
||||
static int
|
||||
GBI_traverse(PyObject *self, visitproc visit, void *arg)
|
||||
{
|
||||
Py_VISIT(((GraphemeBreakIterator *)self)->iter.str);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
GBI_clear(PyObject *self)
|
||||
{
|
||||
Py_CLEAR(((GraphemeBreakIterator *)self)->iter.str);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* tp_iternext: return the next grapheme cluster as a new Segment object.
 *
 * Returns NULL without setting an exception when the scanner reports that
 * nothing is left, and NULL with an exception set if allocation fails.
 */
static PyObject *
GBI_iternext(PyObject *self)
{
    GraphemeBreakIterator *gbi = (GraphemeBreakIterator *)self;

    /* The cluster starts where the previous one ended; read that position
       before the scanner advances it. */
    const Py_ssize_t seg_start = gbi->iter.start;
    const Py_ssize_t seg_end = _Py_NextGraphemeBreak(&gbi->iter);
    if (seg_end < 0) {
        return NULL;
    }

    /* The Segment type lives in the state of the module that defines the
       iterator type. */
    PyObject *module = PyType_GetModule(Py_TYPE(gbi));
    unicodedatastate *state = get_unicodedata_state(module);
    PyTypeObject *segment_type = (PyTypeObject *)state->SegmentType;

    SegmentObject *segment = PyObject_GC_New(SegmentObject, segment_type);
    if (segment == NULL) {
        return NULL;
    }

    /* Fill in every field before handing the object to the GC. */
    segment->string = Py_NewRef(gbi->iter.str);
    segment->start = seg_start;
    segment->end = seg_end;
    PyObject_GC_Track(segment);
    return (PyObject *)segment;
}
|
||||
|
||||
|
||||
static PyType_Slot GraphemeBreakIterator_slots[] = {
|
||||
{Py_tp_dealloc, GBI_dealloc},
|
||||
{Py_tp_iter, PyObject_SelfIter},
|
||||
{Py_tp_iternext, GBI_iternext},
|
||||
{Py_tp_traverse, GBI_traverse},
|
||||
{Py_tp_clear, GBI_clear},
|
||||
{0, 0},
|
||||
};
|
||||
|
||||
static PyType_Spec GraphemeBreakIterator_spec = {
|
||||
.name = "unicodedata.GraphemeBreakIterator",
|
||||
.basicsize = sizeof(GraphemeBreakIterator),
|
||||
.flags = (
|
||||
Py_TPFLAGS_DEFAULT
|
||||
| Py_TPFLAGS_HAVE_GC
|
||||
| Py_TPFLAGS_DISALLOW_INSTANTIATION
|
||||
| Py_TPFLAGS_IMMUTABLETYPE
|
||||
),
|
||||
.slots = GraphemeBreakIterator_slots
|
||||
};
|
||||
|
||||
|
||||
/*[clinic input]
|
||||
unicodedata.iter_graphemes
|
||||
|
||||
unistr: unicode
|
||||
start: Py_ssize_t = 0
|
||||
end: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
|
||||
/
|
||||
|
||||
Returns an iterator to iterate over grapheme clusters.
|
||||
|
||||
It uses extended grapheme cluster rules from TR29.
|
||||
[clinic start generated code]*/
|
||||
|
||||
static PyObject *
|
||||
unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
|
||||
Py_ssize_t start, Py_ssize_t end)
|
||||
/*[clinic end generated code: output=b0b831944265d36f input=a1454d9e8135951f]*/
|
||||
{
|
||||
PyObject *GraphemeBreakIteratorType = get_unicodedata_state(module)->GraphemeBreakIteratorType;
|
||||
GraphemeBreakIterator *gbi = PyObject_GC_New(GraphemeBreakIterator,
|
||||
(PyTypeObject *)GraphemeBreakIteratorType);
|
||||
if (!gbi) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Py_ssize_t len = PyUnicode_GET_LENGTH(unistr);
|
||||
ADJUST_INDICES(start, end, len);
|
||||
Py_INCREF(unistr);
|
||||
_Py_InitGraphemeBreak(&gbi->iter, unistr, start, end);
|
||||
PyObject_GC_Track(gbi);
|
||||
return (PyObject*)gbi;
|
||||
}
|
||||
|
||||
/*[clinic input]
|
||||
unicodedata.grapheme_cluster_break
|
||||
|
||||
chr: int(accept={str})
|
||||
/
|
||||
|
||||
Returns the Grapheme_Cluster_Break property assigned to the character.
|
||||
[clinic start generated code]*/
|
||||
|
||||
static PyObject *
unicodedata_grapheme_cluster_break_impl(PyObject *module, int chr)
/*[clinic end generated code: output=39542e0f63bba36f input=5da75e86435576fd]*/
{
    /* Look up the database record of the code point and map its
       Grapheme_Cluster_Break field to the property value name. */
    const _PyUnicode_DatabaseRecord *record = _getrecord_ex((Py_UCS4)chr);
    int name_index = (int)record->grapheme_cluster_break;
    const char *name = _PyUnicode_GraphemeBreakNames[name_index];
    return PyUnicode_FromString(name);
}
|
||||
|
||||
/*[clinic input]
|
||||
unicodedata.indic_conjunct_break
|
||||
|
||||
chr: int(accept={str})
|
||||
/
|
||||
|
||||
Returns the Indic_Conjunct_Break property assigned to the character.
|
||||
[clinic start generated code]*/
|
||||
|
||||
static PyObject *
unicodedata_indic_conjunct_break_impl(PyObject *module, int chr)
/*[clinic end generated code: output=673eff2caf797f08 input=5c730f78e469f2e8]*/
{
    /* Look up the database record of the code point and map its
       Indic_Conjunct_Break field to the property value name. */
    const _PyUnicode_DatabaseRecord *record = _getrecord_ex((Py_UCS4)chr);
    int name_index = (int)record->incb;
    const char *name = _PyUnicode_IndicConjunctBreakNames[name_index];
    return PyUnicode_FromString(name);
}
|
||||
|
||||
/*[clinic input]
|
||||
@permit_long_summary
|
||||
unicodedata.extended_pictographic
|
||||
|
||||
chr: int(accept={str})
|
||||
/
|
||||
|
||||
Returns the Extended_Pictographic property assigned to the character, as boolean.
|
||||
[clinic start generated code]*/
|
||||
|
||||
static PyObject *
unicodedata_extended_pictographic_impl(PyObject *module, int chr)
/*[clinic end generated code: output=b6bbb349427370b1 input=250d7bd988997eb3]*/
{
    /* The record stores Extended_Pictographic as a flag; return it as a
       Python bool. */
    const _PyUnicode_DatabaseRecord *record = _getrecord_ex((Py_UCS4)chr);
    int has_ext_pict = (int)record->ext_pict;
    return PyBool_FromLong(has_ext_pict);
}
|
||||
|
||||
|
||||
// List of functions used to define module functions *AND* unicodedata.UCD
|
||||
// methods. For module functions, self is the module. For UCD methods, self
|
||||
// is an UCD instance. The UCD_Check() macro is used to check if self is
|
||||
// an UCD instance.
|
||||
static PyMethodDef unicodedata_functions[] = {
|
||||
// Module only functions.
|
||||
UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF
|
||||
UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF
|
||||
UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF
|
||||
UNICODEDATA_ITER_GRAPHEMES_METHODDEF
|
||||
|
||||
// The following definitions are shared between the module
|
||||
// and the UCD class.
|
||||
#define DB_methods (unicodedata_functions + 4)
|
||||
|
||||
UNICODEDATA_UCD_DECIMAL_METHODDEF
|
||||
UNICODEDATA_UCD_DIGIT_METHODDEF
|
||||
UNICODEDATA_UCD_NUMERIC_METHODDEF
|
||||
|
|
@ -1664,7 +2160,7 @@ static PyType_Slot ucd_type_slots[] = {
|
|||
{Py_tp_dealloc, ucd_dealloc},
|
||||
{Py_tp_traverse, _PyObject_VisitType},
|
||||
{Py_tp_getattro, PyObject_GenericGetAttr},
|
||||
{Py_tp_methods, unicodedata_functions},
|
||||
{Py_tp_methods, DB_methods},
|
||||
{Py_tp_members, DB_members},
|
||||
{0, 0}
|
||||
};
|
||||
|
|
@ -1677,6 +2173,7 @@ static PyType_Spec ucd_type_spec = {
|
|||
.slots = ucd_type_slots
|
||||
};
|
||||
|
||||
|
||||
PyDoc_STRVAR(unicodedata_docstring,
|
||||
"This module provides access to the Unicode Character Database which\n\
|
||||
defines character properties for all Unicode characters. The data in\n\
|
||||
|
|
@ -1686,9 +2183,47 @@ this database is based on the UnicodeData.txt file version\n\
|
|||
The module uses the same names and symbols as defined by the\n\
|
||||
UnicodeData File Format " UNIDATA_VERSION ".");
|
||||
|
||||
static int
|
||||
unicodedata_traverse(PyObject *module, visitproc visit, void *arg)
|
||||
{
|
||||
unicodedatastate *state = get_unicodedata_state(module);
|
||||
Py_VISIT(state->SegmentType);
|
||||
Py_VISIT(state->GraphemeBreakIteratorType);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
unicodedata_clear(PyObject *module)
|
||||
{
|
||||
unicodedatastate *state = get_unicodedata_state(module);
|
||||
Py_CLEAR(state->SegmentType);
|
||||
Py_CLEAR(state->GraphemeBreakIteratorType);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
unicodedata_free(void *module)
|
||||
{
|
||||
unicodedata_clear((PyObject *)module);
|
||||
}
|
||||
|
||||
static int
|
||||
unicodedata_exec(PyObject *module)
|
||||
{
|
||||
unicodedatastate *state = get_unicodedata_state(module);
|
||||
|
||||
PyObject *SegmentType = PyType_FromModuleAndSpec(module, &Segment_spec, NULL);
|
||||
if (SegmentType == NULL) {
|
||||
return -1;
|
||||
}
|
||||
state->SegmentType = SegmentType;
|
||||
|
||||
PyObject *GraphemeBreakIteratorType = PyType_FromModuleAndSpec(module, &GraphemeBreakIterator_spec, NULL);
|
||||
if (GraphemeBreakIteratorType == NULL) {
|
||||
return -1;
|
||||
}
|
||||
state->GraphemeBreakIteratorType = GraphemeBreakIteratorType;
|
||||
|
||||
if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
|
@ -1730,9 +2265,12 @@ static struct PyModuleDef unicodedata_module = {
|
|||
PyModuleDef_HEAD_INIT,
|
||||
.m_name = "unicodedata",
|
||||
.m_doc = unicodedata_docstring,
|
||||
.m_size = 0,
|
||||
.m_size = sizeof(unicodedatastate),
|
||||
.m_methods = unicodedata_functions,
|
||||
.m_slots = unicodedata_slots,
|
||||
.m_traverse = unicodedata_traverse,
|
||||
.m_clear = unicodedata_clear,
|
||||
.m_free = unicodedata_free,
|
||||
};
|
||||
|
||||
PyMODINIT_FUNC
|
||||
|
|
|
|||
6231
Modules/unicodedata_db.h
generated
6231
Modules/unicodedata_db.h
generated
File diff suppressed because it is too large
Load diff
|
|
@ -56,6 +56,8 @@
|
|||
NAMED_SEQUENCES = "NamedSequences%s.txt"
|
||||
SPECIAL_CASING = "SpecialCasing%s.txt"
|
||||
CASE_FOLDING = "CaseFolding%s.txt"
|
||||
GRAPHEME_CLUSTER_BREAK = "auxiliary/GraphemeBreakProperty%s.txt"
|
||||
EMOJI_DATA = "emoji/emoji-data%s.txt"
|
||||
|
||||
# Private Use Areas -- in planes 1, 15, 16
|
||||
PUA_1 = range(0xE000, 0xF900)
|
||||
|
|
@ -77,6 +79,14 @@
|
|||
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
|
||||
"ON", "LRI", "RLI", "FSI", "PDI" ]
|
||||
|
||||
# "Other" needs to be the first entry, see the comment in makeunicodedata
|
||||
GRAPHEME_CLUSTER_NAMES = [ 'Other', 'Prepend', 'CR', 'LF', 'Control',
|
||||
'Extend', 'Regional_Indicator', 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT',
|
||||
'ZWJ' ]
|
||||
|
||||
# "None" needs to be the first entry, see the comment in makeunicodedata
|
||||
INDIC_CONJUNCT_BREAK_NAMES = [ 'None', 'Linker', 'Consonant', 'Extend' ]
|
||||
|
||||
# "N" needs to be the first entry, see the comment in makeunicodedata
|
||||
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]
|
||||
|
||||
|
|
@ -147,7 +157,9 @@ def makeunicodedata(unicode, trace):
|
|||
# EastAsianWidth.txt
|
||||
# see https://unicode.org/reports/tr11/#Unassigned
|
||||
assert EASTASIANWIDTH_NAMES[0] == "N"
|
||||
dummy = (0, 0, 0, 0, 0, 0)
|
||||
assert GRAPHEME_CLUSTER_NAMES[0] == "Other"
|
||||
assert INDIC_CONJUNCT_BREAK_NAMES[0] == "None"
|
||||
dummy = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
table = [dummy]
|
||||
cache = {0: dummy}
|
||||
index = [0] * len(unicode.chars)
|
||||
|
|
@ -160,23 +172,25 @@ def makeunicodedata(unicode, trace):
|
|||
|
||||
for char in unicode.chars:
|
||||
record = unicode.table[char]
|
||||
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char] or 'N')
|
||||
graphemebreak = GRAPHEME_CLUSTER_NAMES.index(unicode.grapheme_breaks[char] or 'Other')
|
||||
extpict = unicode.ext_picts[char]
|
||||
if record:
|
||||
# extract database properties
|
||||
category = CATEGORY_NAMES.index(record.general_category)
|
||||
combining = int(record.canonical_combining_class)
|
||||
bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
|
||||
mirrored = record.bidi_mirrored == "Y"
|
||||
eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
|
||||
normalizationquickcheck = record.quick_check
|
||||
incb = INDIC_CONJUNCT_BREAK_NAMES.index(record.incb)
|
||||
item = (
|
||||
category, combining, bidirectional, mirrored, eastasianwidth,
|
||||
normalizationquickcheck
|
||||
normalizationquickcheck, graphemebreak, incb, extpict,
|
||||
)
|
||||
elif unicode.widths[char] is not None:
|
||||
elif eastasianwidth or graphemebreak or extpict:
|
||||
# an unassigned but reserved character, with a known
|
||||
# east_asian_width
|
||||
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
|
||||
item = (0, 0, 0, 0, eastasianwidth, 0)
|
||||
# east_asian_width or grapheme_break or ext_pict
|
||||
item = (0, 0, 0, 0, eastasianwidth, 0, graphemebreak, 0, extpict)
|
||||
else:
|
||||
continue
|
||||
|
||||
|
|
@ -296,7 +310,7 @@ def makeunicodedata(unicode, trace):
|
|||
fprint("/* a list of unique database records */")
|
||||
fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
|
||||
for item in table:
|
||||
fprint(" {%d, %d, %d, %d, %d, %d}," % item)
|
||||
fprint(" {%d, %d, %d, %d, %d, %d, %d, %d, %d}," % item)
|
||||
fprint("};")
|
||||
fprint()
|
||||
|
||||
|
|
@ -337,6 +351,24 @@ def makeunicodedata(unicode, trace):
|
|||
fprint(" NULL")
|
||||
fprint("};")
|
||||
|
||||
for i, name in enumerate(GRAPHEME_CLUSTER_NAMES):
|
||||
fprint("#define GCB_%s %d" % (name, i))
|
||||
|
||||
fprint("const char * const _PyUnicode_GraphemeBreakNames[] = {")
|
||||
for name in GRAPHEME_CLUSTER_NAMES:
|
||||
fprint(' "%s",' % name)
|
||||
fprint(" NULL")
|
||||
fprint("};")
|
||||
|
||||
for i, name in enumerate(INDIC_CONJUNCT_BREAK_NAMES):
|
||||
fprint("#define InCB_%s %d" % (name, i))
|
||||
|
||||
fprint("const char * const _PyUnicode_IndicConjunctBreakNames[] = {")
|
||||
for name in INDIC_CONJUNCT_BREAK_NAMES:
|
||||
fprint(' "%s",' % name)
|
||||
fprint(" NULL")
|
||||
fprint("};")
|
||||
|
||||
fprint("static const char *decomp_prefix[] = {")
|
||||
for name in decomp_prefix:
|
||||
fprint(" \"%s\"," % name)
|
||||
|
|
@ -783,6 +815,10 @@ def merge_old_version(version, new, old):
|
|||
# normalization quickchecks are not performed
|
||||
# for older versions
|
||||
pass
|
||||
elif k == 18:
|
||||
# The Indic_Conjunct_Break property did not exist for
|
||||
# older versions
|
||||
pass
|
||||
else:
|
||||
class Difference(Exception):pass
|
||||
raise Difference(hex(i), k, old.table[i], new.table[i])
|
||||
|
|
@ -804,7 +840,7 @@ def open_data(template, version):
|
|||
url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
|
||||
else:
|
||||
url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
|
||||
os.makedirs(DATA_DIR, exist_ok=True)
|
||||
os.makedirs(os.path.dirname(local), exist_ok=True)
|
||||
urllib.request.urlretrieve(url, filename=local)
|
||||
if local.endswith('.txt'):
|
||||
return open(local, encoding='utf-8')
|
||||
|
|
@ -892,9 +928,13 @@ class UcdRecord:
|
|||
# We store them as a bitmask.
|
||||
quick_check: int
|
||||
|
||||
# The Indic_Conjunct_Break property from DerivedCoreProperties.txt. See:
|
||||
# https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
|
||||
incb: str
|
||||
|
||||
|
||||
def from_row(row: List[str]) -> UcdRecord:
|
||||
return UcdRecord(*row, None, set(), 0)
|
||||
return UcdRecord(*row, None, set(), 0, "None")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
|
|
@ -992,14 +1032,14 @@ def __init__(self, version, cjk_check=True):
|
|||
self.widths = widths
|
||||
|
||||
for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
|
||||
if propinfo:
|
||||
# this is not a binary property, ignore it
|
||||
continue
|
||||
|
||||
if table[char]:
|
||||
# Some properties (e.g. Default_Ignorable_Code_Point)
|
||||
# apply to unassigned code points; ignore them
|
||||
table[char].binary_properties.add(propname)
|
||||
if not propinfo:
|
||||
# binary property
|
||||
if table[char]:
|
||||
# Some properties (e.g. Default_Ignorable_Code_Point)
|
||||
# apply to unassigned code points; ignore them
|
||||
table[char].binary_properties.add(propname)
|
||||
elif propname == 'InCB': # Indic_Conjunct_Break
|
||||
table[char].incb, = propinfo
|
||||
|
||||
for char_range, value in UcdFile(LINE_BREAK, version):
|
||||
if value not in MANDATORY_LINE_BREAKS:
|
||||
|
|
@ -1068,6 +1108,19 @@ def __init__(self, version, cjk_check=True):
|
|||
c = int(data[0], 16)
|
||||
cf[c] = [int(char, 16) for char in data[2].split()]
|
||||
|
||||
if version != "3.2.0":
|
||||
grapheme_breaks = [None] * 0x110000
|
||||
for char, (prop,) in UcdFile(GRAPHEME_CLUSTER_BREAK, version).expanded():
|
||||
grapheme_breaks[char] = prop
|
||||
self.grapheme_breaks = grapheme_breaks
|
||||
|
||||
ext_picts = [False] * 0x110000
|
||||
for char, (prop,) in UcdFile(EMOJI_DATA, version).expanded():
|
||||
if prop == 'Extended_Pictographic':
|
||||
ext_picts[char] = True
|
||||
self.ext_picts = ext_picts
|
||||
|
||||
|
||||
def uselatin1(self):
|
||||
# restrict character range to ISO Latin 1
|
||||
self.chars = list(range(256))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue