gh-88886: Remove excessive encoding name normalization (GH-137167)

The codecs lookup function now performs only minimal normalization of the encoding name before passing it to the search functions: all ASCII letters are converted to lower case, spaces are replaced with hyphens. Excessive normalization broke third-party codecs providers, like python-iconv. Revert "bpo-37751: Fix codecs.lookup() normalization (GH-15092)" This reverts commit 20f59fe1f7.
2025-10-19 16:03:42 +00:00 · 2025-09-09 21:07:21 +03:00 · 2025-09-09 21:07:21 +03:00 · af58a6f883
commit af58a6f883
parent 6b7b9d00a9
5 changed files with 53 additions and 44 deletions
--- a/Python/codecs.c
+++ b/Python/codecs.c
@ -85,14 +85,15 @@ PyCodec_Unregister(PyObject *search_function)

 extern int _Py_normalize_encoding(const char *, char *, size_t);

-/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
-   converted to lower case, spaces and hyphens are replaced with underscores. */
+/* Convert a string to a normalized Python string: all ASCII letters are
+   converted to lower case, spaces are replaced with hyphens. */

-static
-PyObject *normalizestring(const char *string)
+static PyObject*
+normalizestring(const char *string)
 {
+    size_t i;
    size_t len = strlen(string);
-    char *encoding;
+    char *p;
    PyObject *v;

    if (len > PY_SSIZE_T_MAX) {
@ -100,28 +101,30 @@ PyObject *normalizestring(const char *string)
        return NULL;
    }

-    encoding = PyMem_Malloc(len + 1);
-    if (encoding == NULL)
+    p = PyMem_Malloc(len + 1);
+    if (p == NULL)
        return PyErr_NoMemory();
-
-    if (!_Py_normalize_encoding(string, encoding, len + 1))
-    {
-        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
-        PyMem_Free(encoding);
-        return NULL;
+    for (i = 0; i < len; i++) {
+        char ch = string[i];
+        if (ch == ' ')
+            ch = '-';
+        else
+            ch = Py_TOLOWER(Py_CHARMASK(ch));
+        p[i] = ch;
    }
-
-    v = PyUnicode_FromString(encoding);
-    PyMem_Free(encoding);
+    p[i] = '\0';
+    v = PyUnicode_FromString(p);
+    PyMem_Free(p);
    return v;
 }

 /* Lookup the given encoding and return a tuple providing the codec
   facilities.

-   The encoding string is looked up converted to all lower-case
-   characters. This makes encodings looked up through this mechanism
-   effectively case-insensitive.
+   ASCII letters in the encoding string is looked up converted to all
+   lower case. This makes encodings looked up through this mechanism
+   effectively case-insensitive. Spaces are replaced with hyphens for
+   names like "US ASCII" and "ISO 8859-1".

   If no codec is found, a LookupError is set and NULL returned.

@ -142,8 +145,8 @@ PyObject *_PyCodec_Lookup(const char *encoding)
    assert(interp->codecs.initialized);

    /* Convert the encoding to a normalized Python string: all
-       characters are converted to lower case, spaces and hyphens are
-       replaced with underscores. */
+       ASCII letters are converted to lower case, spaces are
+       replaced with hyphens. */
    PyObject *v = normalizestring(encoding);
    if (v == NULL) {
        return NULL;