bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-24843)

Python no longer fails at startup with a fatal error if a command line argument contains an invalid Unicode character. The Py_DecodeLocale() function now escapes byte sequences which would be decoded as Unicode characters outside the [U+0000; U+10ffff] range. Use MAX_UNICODE constant in unicodeobject.c. (cherry picked from commit 9976834f80) Co-authored-by: Victor Stinner <vstinner@python.org>
2025-10-31 13:41:24 +00:00 · 2021-03-17 14:11:14 -07:00 · 2021-03-17 14:11:14 -07:00 · aa967ec4d4
commit aa967ec4d4
parent db73376106
4 changed files with 149 additions and 70 deletions
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@ -33,6 +33,13 @@ extern int winerror_to_errno(int);
 int _Py_open_cloexec_works = -1;
 #endif

+// The value must be the same in unicodeobject.c.
+#define MAX_UNICODE 0x10ffff
+
+// mbstowcs() and mbrtowc() errors
+static const size_t DECODE_ERROR = ((size_t)-1);
+static const size_t INCOMPLETE_CHARACTER = (size_t)-2;
+

 static int
 get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
@ -85,6 +92,57 @@ _Py_device_encoding(int fd)
    Py_RETURN_NONE;
 }

+/* Check whether a decoded wide character is acceptable to the decoding
+   helpers in this file.
+
+   Returns 1 if ch is accepted. Returns 0 if ch is a surrogate code point
+   (Py_UNICODE_IS_SURROGATE) or if ch is greater than MAX_UNICODE.
+
+   NOTE(review): the return type is size_t although only 0 and 1 are ever
+   returned and the callers only test the result as a boolean. */
+static size_t
+is_valid_wide_char(wchar_t ch)
+{
+    if (Py_UNICODE_IS_SURROGATE(ch)) {
+        // Reject lone surrogate characters.
+        return 0;
+    }
+    if (ch > MAX_UNICODE) {
+        // bpo-35883: Reject characters outside the [U+0000; U+10ffff] range.
+        // The glibc mbstowcs() UTF-8 decoder does not respect RFC 3629:
+        // it creates characters outside the [U+0000; U+10ffff] range, see
+        // https://sourceware.org/bugzilla/show_bug.cgi?id=2373
+        return 0;
+    }
+    return 1;
+}
+
+/* mbstowcs() wrapper which also validates the decoded wide characters.
+
+   Calls mbstowcs(dest, src, n). If dest is not NULL and the call did not
+   fail, each of the `count` wide characters reported by mbstowcs() is checked
+   with is_valid_wide_char(), and DECODE_ERROR is returned as soon as one of
+   them is rejected. Otherwise the mbstowcs() result is returned unchanged.
+
+   When dest is NULL there is no output to validate, so the mbstowcs() result
+   is returned as is. */
+static size_t
+_Py_mbstowcs(wchar_t *dest, const char *src, size_t n)
+{
+    size_t count = mbstowcs(dest, src, n);
+    if (dest != NULL && count != DECODE_ERROR) {  // only scan real output
+        for (size_t i=0; i < count; i++) {
+            wchar_t ch = dest[i];
+            if (!is_valid_wide_char(ch)) {
+                return DECODE_ERROR;  // report it like a mbstowcs() failure
+            }
+        }
+    }
+    return count;
+}
+
+/* mbrtowc() wrapper which also validates the decoded wide character.
+   Only compiled when HAVE_MBRTOWC is defined.
+
+   pwc must not be NULL (checked with assert()). Calls
+   mbrtowc(pwc, str, len, pmbs) and returns its result, except that
+   DECODE_ERROR is returned instead when the result is neither 0,
+   DECODE_ERROR nor INCOMPLETE_CHARACTER and is_valid_wide_char() rejects
+   *pwc. The conversion state in pmbs is not reset by this function when a
+   character is rejected. */
+#ifdef HAVE_MBRTOWC
+static size_t
+_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
+{
+    assert(pwc != NULL);
+    size_t count = mbrtowc(pwc, str, len, pmbs);
+    if (count != 0 && count != DECODE_ERROR && count != INCOMPLETE_CHARACTER) {
+        if (!is_valid_wide_char(*pwc)) {
+            return DECODE_ERROR;  // report it like a mbrtowc() failure
+        }
+    }
+    return count;
+}
+#endif
+
+
 #if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS)

 #define USE_FORCE_ASCII
@ -151,8 +209,8 @@ check_force_ascii(void)
        size_t res;

        ch = (unsigned char)0xA7;
-        res = mbstowcs(&wch, (char*)&ch, 1);
-        if (res != (size_t)-1 && wch == L'\xA7') {
+        res = _Py_mbstowcs(&wch, (char*)&ch, 1);
+        if (res != DECODE_ERROR && wch == L'\xA7') {
            /* On HP-UX with the C locale or the POSIX locale,
               nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
               Latin1 encoding in practice. Force ASCII in this case.
@ -199,8 +257,8 @@ check_force_ascii(void)

        unsigned uch = (unsigned char)i;
        ch[0] = (char)uch;
-        res = mbstowcs(wch, ch, 1);
-        if (res != (size_t)-1) {
+        res = _Py_mbstowcs(wch, ch, 1);
+        if (res != DECODE_ERROR) {
            /* decoding a non-ASCII character from the locale encoding succeeded:
               the locale encoding is not ASCII, force ASCII */
            return 1;
@ -390,9 +448,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
     */
    argsize = strlen(arg);
 #else
-    argsize = mbstowcs(NULL, arg, 0);
+    argsize = _Py_mbstowcs(NULL, arg, 0);
 #endif
-    if (argsize != (size_t)-1) {
+    if (argsize != DECODE_ERROR) {
        if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
            return -1;
        }
@ -401,21 +459,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
            return -1;
        }

-        count = mbstowcs(res, arg, argsize + 1);
-        if (count != (size_t)-1) {
-            wchar_t *tmp;
-            /* Only use the result if it contains no
-               surrogate characters. */
-            for (tmp = res; *tmp != 0 &&
-                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
-                ;
-            if (*tmp == 0) {
-                if (wlen != NULL) {
-                    *wlen = count;
-                }
-                *wstr = res;
-                return 0;
+        count = _Py_mbstowcs(res, arg, argsize + 1);
+        if (count != DECODE_ERROR) {
+            *wstr = res;
+            if (wlen != NULL) {
+                *wlen = count;
            }
+            return 0;
        }
        PyMem_RawFree(res);
    }
@ -439,13 +489,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
-        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
+        size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0) {
            /* Reached end of string; null char stored. */
            break;
        }

-        if (converted == (size_t)-2) {
+        if (converted == INCOMPLETE_CHARACTER) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
@ -453,32 +503,22 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
            goto decode_error;
        }

-        if (converted == (size_t)-1) {
+        if (converted == DECODE_ERROR) {
            if (!surrogateescape) {
                goto decode_error;
            }

-            /* Conversion error. Escape as UTF-8b, and start over
-               in the initial shift state. */
+            /* Decoding error. Escape as UTF-8b, and start over in the initial
+               shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }

-        if (Py_UNICODE_IS_SURROGATE(*out)) {
-            if (!surrogateescape) {
-                goto decode_error;
-            }
+        // _Py_mbrtowc() rejects lone surrogate characters
+        assert(!Py_UNICODE_IS_SURROGATE(*out));

-            /* Surrogate character.  Escape the original
-               byte sequence with surrogateescape. */
-            argsize -= converted;
-            while (converted--) {
-                *out++ = 0xdc00 + *in++;
-            }
-            continue;
-        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
@ -655,7 +695,7 @@ encode_current_locale(const wchar_t *text, char **str,
                else {
                    converted = wcstombs(NULL, buf, 0);
                }
-                if (converted == (size_t)-1) {
+                if (converted == DECODE_ERROR) {
                    goto encode_error;
                }
                if (bytes != NULL) {
@ -1374,7 +1414,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
    char cmode[10];
    size_t r;
    r = wcstombs(cmode, mode, 10);
-    if (r == (size_t)-1 || r >= 10) {
+    if (r == DECODE_ERROR || r >= 10) {
        errno = EINVAL;
        return NULL;
    }