mirror of
https://github.com/python/cpython.git
synced 2025-12-08 06:10:17 +00:00
gh-129117: Add unicodedata.isxidstart() function (#140269)
Expose `_PyUnicode_IsXidContinue/Start` in `unicodedata`: add isxidstart() and isxidcontinue() functions. Co-authored-by: Victor Stinner <vstinner@python.org>
This commit is contained in:
parent
25bd72d683
commit
dbe3950a76
13 changed files with 225 additions and 13 deletions
|
|
@ -144,6 +144,36 @@ following functions:
|
||||||
1
|
1
|
||||||
|
|
||||||
|
|
||||||
|
.. function:: isxidstart(chr, /)
|
||||||
|
|
||||||
|
Return ``True`` if *chr* is a valid identifier start per the
|
||||||
|
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_,
|
||||||
|
that is, it has the ``XID_Start`` property. Return ``False`` otherwise.
|
||||||
|
For example::
|
||||||
|
|
||||||
|
>>> unicodedata.isxidstart('S')
|
||||||
|
True
|
||||||
|
>>> unicodedata.isxidstart('0')
|
||||||
|
False
|
||||||
|
|
||||||
|
.. versionadded:: next
|
||||||
|
|
||||||
|
|
||||||
|
.. function:: isxidcontinue(chr, /)
|
||||||
|
|
||||||
|
Return ``True`` if *chr* can continue an identifier (appear after its first character) per the
|
||||||
|
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_,
|
||||||
|
that is, it has the ``XID_Continue`` property. Return ``False`` otherwise.
|
||||||
|
For example::
|
||||||
|
|
||||||
|
>>> unicodedata.isxidcontinue('S')
|
||||||
|
True
|
||||||
|
>>> unicodedata.isxidcontinue(' ')
|
||||||
|
False
|
||||||
|
|
||||||
|
.. versionadded:: next
|
||||||
|
|
||||||
|
|
||||||
.. function:: decomposition(chr)
|
.. function:: decomposition(chr)
|
||||||
|
|
||||||
Returns the character decomposition mapping assigned to the character
|
Returns the character decomposition mapping assigned to the character
|
||||||
|
|
|
||||||
|
|
@ -794,6 +794,11 @@ unicodedata
|
||||||
|
|
||||||
* The Unicode database has been updated to Unicode 17.0.0.
|
* The Unicode database has been updated to Unicode 17.0.0.
|
||||||
|
|
||||||
|
* Add :func:`unicodedata.isxidstart` and :func:`unicodedata.isxidcontinue`
|
||||||
|
functions to check whether a character can start or continue a
|
||||||
|
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
|
||||||
|
(Contributed by Stan Ulbrych in :gh:`129117`.)
|
||||||
|
|
||||||
|
|
||||||
wave
|
wave
|
||||||
----
|
----
|
||||||
|
|
|
||||||
25
Include/internal/pycore_unicodectype.h
Normal file
25
Include/internal/pycore_unicodectype.h
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
#ifndef Py_INTERNAL_UNICODECTYPE_H
|
||||||
|
#define Py_INTERNAL_UNICODECTYPE_H
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef Py_BUILD_CORE
|
||||||
|
# error "this header requires Py_BUILD_CORE define"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
|
||||||
|
extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
|
||||||
|
extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
|
||||||
|
extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
|
||||||
|
extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
|
||||||
|
extern int _PyUnicode_IsCased(Py_UCS4 ch);
|
||||||
|
|
||||||
|
// Export for 'unicodedata' shared extension.
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch);
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif /* !Py_INTERNAL_UNICODECTYPE_H */
|
||||||
|
|
@ -74,18 +74,6 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* --- Characters Type APIs ----------------------------------------------- */
|
|
||||||
|
|
||||||
extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
|
|
||||||
extern int _PyUnicode_IsXidContinue(Py_UCS4 ch);
|
|
||||||
extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
|
|
||||||
extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
|
|
||||||
extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
|
|
||||||
extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
|
|
||||||
extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
|
|
||||||
extern int _PyUnicode_IsCased(Py_UCS4 ch);
|
|
||||||
|
|
||||||
/* --- Unicode API -------------------------------------------------------- */
|
/* --- Unicode API -------------------------------------------------------- */
|
||||||
|
|
||||||
// Export for '_json' shared extension
|
// Export for '_json' shared extension
|
||||||
|
|
|
||||||
|
|
@ -276,6 +276,33 @@ def test_east_asian_width_9_0_changes(self):
|
||||||
self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
|
self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
|
||||||
self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
|
self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
|
||||||
|
|
||||||
|
def test_isxidstart(self):
|
||||||
|
self.assertTrue(self.db.isxidstart('S'))
|
||||||
|
self.assertTrue(self.db.isxidstart('\u0AD0')) # GUJARATI OM
|
||||||
|
self.assertTrue(self.db.isxidstart('\u0EC6')) # LAO KO LA
|
||||||
|
self.assertTrue(self.db.isxidstart('\u17DC')) # KHMER SIGN AVAKRAHASANYA
|
||||||
|
self.assertTrue(self.db.isxidstart('\uA015')) # YI SYLLABLE WU
|
||||||
|
self.assertTrue(self.db.isxidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM
|
||||||
|
|
||||||
|
self.assertFalse(self.db.isxidstart(' '))
|
||||||
|
self.assertFalse(self.db.isxidstart('0'))
|
||||||
|
self.assertRaises(TypeError, self.db.isxidstart)
|
||||||
|
self.assertRaises(TypeError, self.db.isxidstart, 'xx')
|
||||||
|
|
||||||
|
def test_isxidcontinue(self):
    # Code points expected to be reported as XID_Continue.
    continue_chars = (
        'S',
        '_',
        '0',
        '\u00BA',  # MASCULINE ORDINAL INDICATOR
        '\u0640',  # ARABIC TATWEEL
        '\u0710',  # SYRIAC LETTER ALAPH
        '\u0B3E',  # ORIYA VOWEL SIGN AA
        '\u17D7',  # KHMER SIGN LEK TOO
    )
    for ch in continue_chars:
        self.assertTrue(self.db.isxidcontinue(ch))

    # A space cannot appear inside an identifier.
    self.assertFalse(self.db.isxidcontinue(' '))

    # A missing argument and a multi-character string both raise TypeError.
    with self.assertRaises(TypeError):
        self.db.isxidcontinue()
    with self.assertRaises(TypeError):
        self.db.isxidcontinue('xx')
|
||||||
|
|
||||||
class UnicodeMiscTest(UnicodeDatabaseTest):
|
class UnicodeMiscTest(UnicodeDatabaseTest):
|
||||||
|
|
||||||
@cpython_only
|
@cpython_only
|
||||||
|
|
|
||||||
|
|
@ -1433,6 +1433,7 @@ PYTHON_HEADERS= \
|
||||||
$(srcdir)/Include/internal/pycore_typeobject.h \
|
$(srcdir)/Include/internal/pycore_typeobject.h \
|
||||||
$(srcdir)/Include/internal/pycore_typevarobject.h \
|
$(srcdir)/Include/internal/pycore_typevarobject.h \
|
||||||
$(srcdir)/Include/internal/pycore_ucnhash.h \
|
$(srcdir)/Include/internal/pycore_ucnhash.h \
|
||||||
|
$(srcdir)/Include/internal/pycore_unicodectype.h \
|
||||||
$(srcdir)/Include/internal/pycore_unicodeobject.h \
|
$(srcdir)/Include/internal/pycore_unicodeobject.h \
|
||||||
$(srcdir)/Include/internal/pycore_unicodeobject_generated.h \
|
$(srcdir)/Include/internal/pycore_unicodeobject_generated.h \
|
||||||
$(srcdir)/Include/internal/pycore_unionobject.h \
|
$(srcdir)/Include/internal/pycore_unionobject.h \
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,3 @@
|
||||||
|
:mod:`unicodedata`: Add :func:`~unicodedata.isxidstart` and
|
||||||
|
:func:`~unicodedata.isxidcontinue` functions to check whether a character can
|
||||||
|
start or continue a `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
|
||||||
74
Modules/clinic/unicodedata.c.h
generated
74
Modules/clinic/unicodedata.c.h
generated
|
|
@ -518,6 +518,78 @@ exit:
|
||||||
return return_value;
|
return return_value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PyDoc_STRVAR(unicodedata_UCD_isxidstart__doc__,
|
||||||
|
"isxidstart($self, chr, /)\n"
|
||||||
|
"--\n"
|
||||||
|
"\n"
|
||||||
|
"Return True if the character has the XID_Start property, else False.");
|
||||||
|
|
||||||
|
#define UNICODEDATA_UCD_ISXIDSTART_METHODDEF \
|
||||||
|
{"isxidstart", (PyCFunction)unicodedata_UCD_isxidstart, METH_O, unicodedata_UCD_isxidstart__doc__},
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
unicodedata_UCD_isxidstart_impl(PyObject *self, int chr);
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
unicodedata_UCD_isxidstart(PyObject *self, PyObject *arg)
|
||||||
|
{
|
||||||
|
PyObject *return_value = NULL;
|
||||||
|
int chr;
|
||||||
|
|
||||||
|
if (!PyUnicode_Check(arg)) {
|
||||||
|
_PyArg_BadArgument("isxidstart", "argument", "a unicode character", arg);
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
if (PyUnicode_GET_LENGTH(arg) != 1) {
|
||||||
|
PyErr_Format(PyExc_TypeError,
|
||||||
|
"isxidstart(): argument must be a unicode character, "
|
||||||
|
"not a string of length %zd",
|
||||||
|
PyUnicode_GET_LENGTH(arg));
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
chr = PyUnicode_READ_CHAR(arg, 0);
|
||||||
|
return_value = unicodedata_UCD_isxidstart_impl(self, chr);
|
||||||
|
|
||||||
|
exit:
|
||||||
|
return return_value;
|
||||||
|
}
|
||||||
|
|
||||||
|
PyDoc_STRVAR(unicodedata_UCD_isxidcontinue__doc__,
|
||||||
|
"isxidcontinue($self, chr, /)\n"
|
||||||
|
"--\n"
|
||||||
|
"\n"
|
||||||
|
"Return True if the character has the XID_Continue property, else False.");
|
||||||
|
|
||||||
|
#define UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF \
|
||||||
|
{"isxidcontinue", (PyCFunction)unicodedata_UCD_isxidcontinue, METH_O, unicodedata_UCD_isxidcontinue__doc__},
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr);
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
unicodedata_UCD_isxidcontinue(PyObject *self, PyObject *arg)
|
||||||
|
{
|
||||||
|
PyObject *return_value = NULL;
|
||||||
|
int chr;
|
||||||
|
|
||||||
|
if (!PyUnicode_Check(arg)) {
|
||||||
|
_PyArg_BadArgument("isxidcontinue", "argument", "a unicode character", arg);
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
if (PyUnicode_GET_LENGTH(arg) != 1) {
|
||||||
|
PyErr_Format(PyExc_TypeError,
|
||||||
|
"isxidcontinue(): argument must be a unicode character, "
|
||||||
|
"not a string of length %zd",
|
||||||
|
PyUnicode_GET_LENGTH(arg));
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
chr = PyUnicode_READ_CHAR(arg, 0);
|
||||||
|
return_value = unicodedata_UCD_isxidcontinue_impl(self, chr);
|
||||||
|
|
||||||
|
exit:
|
||||||
|
return return_value;
|
||||||
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(unicodedata_UCD_lookup__doc__,
|
PyDoc_STRVAR(unicodedata_UCD_lookup__doc__,
|
||||||
"lookup($self, name, /)\n"
|
"lookup($self, name, /)\n"
|
||||||
"--\n"
|
"--\n"
|
||||||
|
|
@ -549,4 +621,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
|
||||||
exit:
|
exit:
|
||||||
return return_value;
|
return return_value;
|
||||||
}
|
}
|
||||||
/*[clinic end generated code: output=8a59d430cee41058 input=a9049054013a1b77]*/
|
/*[clinic end generated code: output=c5e56c8f6bb80f93 input=a9049054013a1b77]*/
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,7 @@
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
#include "pycore_object.h" // _PyObject_VisitType()
|
#include "pycore_object.h" // _PyObject_VisitType()
|
||||||
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
|
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
|
||||||
|
#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart()
|
||||||
|
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
#include <stddef.h> // offsetof()
|
#include <stddef.h> // offsetof()
|
||||||
|
|
@ -1525,6 +1526,58 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
|
||||||
return PyUnicode_FromString(name);
|
return PyUnicode_FromString(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*[clinic input]
|
||||||
|
unicodedata.UCD.isxidstart
|
||||||
|
|
||||||
|
self: self
|
||||||
|
chr: int(accept={str})
|
||||||
|
/
|
||||||
|
|
||||||
|
Return True if the character has the XID_Start property, else False.
|
||||||
|
|
||||||
|
[clinic start generated code]*/
|
||||||
|
|
||||||
|
static PyObject *
unicodedata_UCD_isxidstart_impl(PyObject *self, int chr)
/*[clinic end generated code: output=944005823c72c3ef input=9353f88d709c21fb]*/
{
    int is_start;

    /* When self passes UCD_Check(), look at its change record first:
       category_changed == 0 means the code point is unassigned there,
       so the answer is False without consulting the property table. */
    if (UCD_Check(self)
        && get_old_record(self, chr)->category_changed == 0)
    {
        /* unassigned */
        Py_RETURN_FALSE;
    }

    /* Otherwise report the XID_Start lookup result as a bool. */
    is_start = _PyUnicode_IsXidStart(chr);
    return PyBool_FromLong(is_start);
}
|
||||||
|
|
||||||
|
/*[clinic input]
|
||||||
|
unicodedata.UCD.isxidcontinue
|
||||||
|
|
||||||
|
self: self
|
||||||
|
chr: int(accept={str})
|
||||||
|
/
|
||||||
|
|
||||||
|
Return True if the character has the XID_Continue property, else False.
|
||||||
|
|
||||||
|
[clinic start generated code]*/
|
||||||
|
|
||||||
|
static PyObject *
unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr)
/*[clinic end generated code: output=9438dcbff5ca3e41 input=bbb8dd3ac0d2d709]*/
{
    int is_continue;

    /* When self passes UCD_Check(), look at its change record first:
       category_changed == 0 means the code point is unassigned there,
       so the answer is False without consulting the property table. */
    if (UCD_Check(self)
        && get_old_record(self, chr)->category_changed == 0)
    {
        /* unassigned */
        Py_RETURN_FALSE;
    }

    /* Otherwise report the XID_Continue lookup result as a bool. */
    is_continue = _PyUnicode_IsXidContinue(chr);
    return PyBool_FromLong(is_continue);
}
|
||||||
|
|
||||||
/*[clinic input]
|
/*[clinic input]
|
||||||
unicodedata.UCD.lookup
|
unicodedata.UCD.lookup
|
||||||
|
|
||||||
|
|
@ -1590,6 +1643,8 @@ static PyMethodDef unicodedata_functions[] = {
|
||||||
UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
|
UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
|
||||||
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
|
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
|
||||||
UNICODEDATA_UCD_NAME_METHODDEF
|
UNICODEDATA_UCD_NAME_METHODDEF
|
||||||
|
UNICODEDATA_UCD_ISXIDSTART_METHODDEF
|
||||||
|
UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF
|
||||||
UNICODEDATA_UCD_LOOKUP_METHODDEF
|
UNICODEDATA_UCD_LOOKUP_METHODDEF
|
||||||
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
|
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
|
||||||
UNICODEDATA_UCD_NORMALIZE_METHODDEF
|
UNICODEDATA_UCD_NORMALIZE_METHODDEF
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
|
#include "pycore_unicodectype.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue()
|
||||||
|
|
||||||
#define ALPHA_MASK 0x01
|
#define ALPHA_MASK 0x01
|
||||||
#define DECIMAL_MASK 0x02
|
#define DECIMAL_MASK 0x02
|
||||||
|
|
|
||||||
|
|
@ -57,6 +57,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
|
#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
|
||||||
#include "pycore_pystate.h" // _PyInterpreterState_GET()
|
#include "pycore_pystate.h" // _PyInterpreterState_GET()
|
||||||
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
|
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
|
||||||
|
#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart
|
||||||
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
|
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
|
||||||
#include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings()
|
#include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -328,6 +328,7 @@
|
||||||
<ClInclude Include="..\Include\internal\pycore_typevarobject.h" />
|
<ClInclude Include="..\Include\internal\pycore_typevarobject.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_ucnhash.h" />
|
<ClInclude Include="..\Include\internal\pycore_ucnhash.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_unionobject.h" />
|
<ClInclude Include="..\Include\internal\pycore_unionobject.h" />
|
||||||
|
<ClInclude Include="..\Include\internal\pycore_unicodectype.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_unicodeobject.h" />
|
<ClInclude Include="..\Include\internal\pycore_unicodeobject.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_unicodeobject_generated.h" />
|
<ClInclude Include="..\Include\internal\pycore_unicodeobject_generated.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_uniqueid.h" />
|
<ClInclude Include="..\Include\internal\pycore_uniqueid.h" />
|
||||||
|
|
|
||||||
|
|
@ -528,6 +528,9 @@
|
||||||
<ClInclude Include="..\Include\cpython\initconfig.h">
|
<ClInclude Include="..\Include\cpython\initconfig.h">
|
||||||
<Filter>Include\cpython</Filter>
|
<Filter>Include\cpython</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
|
<ClInclude Include="..\Include\internal\pycore_unicodectype.h">
|
||||||
|
<Filter>Include\internal</Filter>
|
||||||
|
</ClInclude>
|
||||||
<ClInclude Include="..\Include\internal\pycore_unicodeobject.h">
|
<ClInclude Include="..\Include\internal\pycore_unicodeobject.h">
|
||||||
<Filter>Include\internal</Filter>
|
<Filter>Include\internal</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue