gh-88886: Remove excessive encoding name normalization (GH-137167)

The codecs lookup function now performs only minimal normalization of
the encoding name before passing it to the search functions:
all ASCII letters are converted to lower case, spaces are replaced
with hyphens.

Excessive normalization broke third-party codecs providers, like
python-iconv.

Revert "bpo-37751: Fix codecs.lookup() normalization (GH-15092)"

This reverts commit 20f59fe1f7.
This commit is contained in:
Serhiy Storchaka 2025-09-09 21:07:21 +03:00 committed by GitHub
parent 6b7b9d00a9
commit af58a6f883
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 53 additions and 44 deletions

View file

@ -85,14 +85,15 @@ PyCodec_Unregister(PyObject *search_function)
extern int _Py_normalize_encoding(const char *, char *, size_t);
/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
converted to lower case, spaces and hyphens are replaced with underscores. */
/* Convert a string to a normalized Python string: all ASCII letters are
converted to lower case, spaces are replaced with hyphens. */
static
PyObject *normalizestring(const char *string)
static PyObject*
normalizestring(const char *string)
{
size_t i;
size_t len = strlen(string);
char *encoding;
char *p;
PyObject *v;
if (len > PY_SSIZE_T_MAX) {
@ -100,28 +101,30 @@ PyObject *normalizestring(const char *string)
return NULL;
}
encoding = PyMem_Malloc(len + 1);
if (encoding == NULL)
p = PyMem_Malloc(len + 1);
if (p == NULL)
return PyErr_NoMemory();
if (!_Py_normalize_encoding(string, encoding, len + 1))
{
PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
PyMem_Free(encoding);
return NULL;
for (i = 0; i < len; i++) {
char ch = string[i];
if (ch == ' ')
ch = '-';
else
ch = Py_TOLOWER(Py_CHARMASK(ch));
p[i] = ch;
}
v = PyUnicode_FromString(encoding);
PyMem_Free(encoding);
p[i] = '\0';
v = PyUnicode_FromString(p);
PyMem_Free(p);
return v;
}
/* Lookup the given encoding and return a tuple providing the codec
facilities.
The encoding string is looked up converted to all lower-case
characters. This makes encodings looked up through this mechanism
effectively case-insensitive.
The encoding string is looked up with all ASCII letters converted to
lower case. This makes encodings looked up through this mechanism
effectively case-insensitive. Spaces are replaced with hyphens for
names like "US ASCII" and "ISO 8859-1".
If no codec is found, a LookupError is set and NULL returned.
@ -142,8 +145,8 @@ PyObject *_PyCodec_Lookup(const char *encoding)
assert(interp->codecs.initialized);
/* Convert the encoding to a normalized Python string: all
characters are converted to lower case, spaces and hyphens are
replaced with underscores. */
ASCII letters are converted to lower case, spaces are
replaced with hyphens. */
PyObject *v = normalizestring(encoding);
if (v == NULL) {
return NULL;