gh-55531: Implement normalize_encoding in C (#136643)

Closes gh-55531
This commit is contained in:
Stan Ulbrych 2025-10-30 14:31:47 +00:00 committed by GitHub
parent 6826166280
commit a3ce2f77f0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 123 additions and 22 deletions

View file

@ -30,6 +30,7 @@
import codecs
import sys
from _codecs import _normalize_encoding
from . import aliases
_cache = {}
@ -55,18 +56,7 @@ def normalize_encoding(encoding):
if isinstance(encoding, bytes):
encoding = str(encoding, "ascii")
chars = []
punct = False
for c in encoding:
if c.isalnum() or c == '.':
if punct and chars:
chars.append('_')
if c.isascii():
chars.append(c)
punct = False
else:
punct = True
return ''.join(chars)
return _normalize_encoding(encoding)
def search_function(encoding):

View file

@ -0,0 +1,4 @@
:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
by implementing the function in C using the private
``_Py_normalize_encoding`` function, which has been modified to make
lowercase conversion optional.

View file

@ -1018,6 +1018,47 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)
return PyCodec_LookupError(name);
}
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
/*[clinic input]
_codecs._normalize_encoding
encoding: unicode
Normalize an encoding name *encoding*.
Used for encodings.normalize_encoding. Does not convert to lower case.
[clinic start generated code]*/
static PyObject *
_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)
/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/
{
    /* Normalize the str *encoding* with _Py_normalize_encoding(), leaving
       the letter case untouched, and return the result as a new str.
       Returns NULL with an exception set on failure. */
    Py_ssize_t len;
    const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);
    if (cstr == NULL) {
        return NULL;
    }
    /* NOTE(review): PyUnicode_AsUTF8AndSize() accepts strings containing
       embedded NUL characters, while _Py_normalize_encoding() takes no input
       length; presumably it stops at the first NUL -- confirm whether such
       input should be rejected here. */

    /* The output buffer needs len + 1 bytes.  A Py_ssize_t can never be
       greater than PY_SSIZE_T_MAX, so the overflow guard has to use ">=":
       len == PY_SSIZE_T_MAX is the value for which len + 1 overflows. */
    if (len >= PY_SSIZE_T_MAX) {
        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
        return NULL;
    }

    char *normalized = PyMem_Malloc(len + 1);
    if (normalized == NULL) {
        return PyErr_NoMemory();
    }

    /* to_lower == 0: keep the original case. */
    if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) {
        /* 0 means the name did not fit in the buffer.  Set an exception so
           that NULL is never returned without one. */
        PyMem_Free(normalized);
        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
        return NULL;
    }

    PyObject *result = PyUnicode_FromString(normalized);
    PyMem_Free(normalized);
    return result;
}
/* --- Module API --------------------------------------------------------- */
static PyMethodDef _codecs_functions[] = {
@ -1067,6 +1108,7 @@ static PyMethodDef _codecs_functions[] = {
_CODECS_REGISTER_ERROR_METHODDEF
_CODECS__UNREGISTER_ERROR_METHODDEF
_CODECS_LOOKUP_ERROR_METHODDEF
_CODECS__NORMALIZE_ENCODING_METHODDEF
{NULL, NULL} /* sentinel */
};

View file

@ -2779,6 +2779,70 @@ exit:
return return_value;
}
/* Docstring exposed for _codecs._normalize_encoding(). */
PyDoc_STRVAR(_codecs__normalize_encoding__doc__,
"_normalize_encoding($module, /, encoding)\n"
"--\n"
"\n"
"Normalize an encoding name *encoding*.\n"
"\n"
"Used for encodings.normalize_encoding. Does not convert to lower case.");
/* Method table entry: registers the wrapper below under the name
   "_normalize_encoding" with the METH_FASTCALL|METH_KEYWORDS convention. */
#define _CODECS__NORMALIZE_ENCODING_METHODDEF \
{"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__},
/* Forward declaration of the implementation that the wrapper calls. */
static PyObject *
_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding);
/* Argument-parsing wrapper: unpacks the single "encoding" argument
   (positional or keyword), checks that it is a str and forwards it to
   _codecs__normalize_encoding_impl().  Returns NULL if parsing or the type
   check fails; otherwise returns whatever the implementation returns. */
static PyObject *
_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
{
PyObject *return_value = NULL;
/* A statically initialized keyword tuple is only defined when
   Py_BUILD_CORE is set and Py_BUILD_CORE_MODULE is not; otherwise
   KWTUPLE is NULL. */
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
#define NUM_KEYWORDS 1
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
Py_hash_t ob_hash;
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_hash = -1,
.ob_item = { &_Py_ID(encoding), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
#else // !Py_BUILD_CORE
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
/* Parser description: one keyword name, "encoding". */
static const char * const _keywords[] = {"encoding", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "_normalize_encoding",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[1];
PyObject *encoding;
/* Exactly one argument is required (minpos == maxpos == 1). */
args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
/*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
if (!args) {
goto exit;
}
/* Only str objects are accepted for "encoding". */
if (!PyUnicode_Check(args[0])) {
_PyArg_BadArgument("_normalize_encoding", "argument 'encoding'", "str", args[0]);
goto exit;
}
encoding = args[0];
return_value = _codecs__normalize_encoding_impl(module, encoding);
exit:
return return_value;
}
#ifndef _CODECS_MBCS_DECODE_METHODDEF
#define _CODECS_MBCS_DECODE_METHODDEF
#endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */
@ -2802,4 +2866,4 @@ exit:
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
/*[clinic end generated code: output=ed13f20dfb09e306 input=a9049054013a1b77]*/
/*[clinic end generated code: output=a968c493bb28be3e input=a9049054013a1b77]*/

View file

@ -3449,13 +3449,14 @@ PyUnicode_FromEncodedObject(PyObject *obj,
return v;
}
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
also convert to lowercase. Return 1 on success, or 0 on error (encoding is
longer than lower_len-1). */
/* Normalize an encoding name like encodings.normalize_encoding(),
additionally converting it to lowercase if *to_lower* is true.
Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
char *lower,
size_t lower_len)
size_t lower_len,
int to_lower)
{
const char *e;
char *l;
@ -3486,7 +3487,7 @@ _Py_normalize_encoding(const char *encoding,
if (l == l_end) {
return 0;
}
*l++ = Py_TOLOWER(c);
*l++ = to_lower ? Py_TOLOWER(c) : c;
}
else {
punct = 1;
@ -3521,7 +3522,7 @@ PyUnicode_Decode(const char *s,
}
/* Shortcuts for common default encodings */
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
char *lower = buflower;
/* Fast paths */
@ -3778,7 +3779,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
}
/* Shortcuts for common default encodings */
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
char *lower = buflower;
/* Fast paths */

View file

@ -178,7 +178,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
#define USE_FORCE_ASCII
extern int _Py_normalize_encoding(const char *, char *, size_t);
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
and POSIX locale. nl_langinfo(CODESET) announces an alias of the
@ -229,7 +229,7 @@ check_force_ascii(void)
}
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {
goto error;
}