mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-24843)
Python no longer fails at startup with a fatal error if a command
line argument contains an invalid Unicode character.
The Py_DecodeLocale() function now escapes byte sequences which would
be decoded as Unicode characters outside the [U+0000; U+10ffff]
range.
Use MAX_UNICODE constant in unicodeobject.c.
(cherry picked from commit 9976834f80)
Co-authored-by: Victor Stinner <vstinner@python.org>
			
			
This commit is contained in:
		
							parent
							
								
									db73376106
								
							
						
					
					
						commit
						aa967ec4d4
					
				
					 4 changed files with 149 additions and 70 deletions
				
			
		|  | @ -33,6 +33,13 @@ extern int winerror_to_errno(int); | |||
| int _Py_open_cloexec_works = -1; | ||||
| #endif | ||||
| 
 | ||||
// Highest valid Unicode code point (U+10FFFF).
// The value must be the same in unicodeobject.c.
#define MAX_UNICODE 0x10ffff

// Special values returned by mbstowcs() and mbrtowc():
// - DECODE_ERROR: the input contains an invalid multibyte sequence
// - INCOMPLETE_CHARACTER: mbrtowc() only, the input ends in the middle of
//   a multibyte character
static const size_t DECODE_ERROR = (size_t)-1;
static const size_t INCOMPLETE_CHARACTER = (size_t)-2;
| 
 | ||||
| 
 | ||||
| static int | ||||
| get_surrogateescape(_Py_error_handler errors, int *surrogateescape) | ||||
|  | @ -85,6 +92,57 @@ _Py_device_encoding(int fd) | |||
|     Py_RETURN_NONE; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| static size_t | ||||
| is_valid_wide_char(wchar_t ch) | ||||
| { | ||||
|     if (Py_UNICODE_IS_SURROGATE(ch)) { | ||||
|         // Reject lone surrogate characters
 | ||||
|         return 0; | ||||
|     } | ||||
|     if (ch > MAX_UNICODE) { | ||||
|         // bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
 | ||||
|         // The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629,
 | ||||
|         // it creates characters outside the [U+0000; U+10ffff] range:
 | ||||
|         // https://sourceware.org/bugzilla/show_bug.cgi?id=2373
 | ||||
|         return 0; | ||||
|     } | ||||
|     return 1; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| static size_t | ||||
| _Py_mbstowcs(wchar_t *dest, const char *src, size_t n) | ||||
| { | ||||
|     size_t count = mbstowcs(dest, src, n); | ||||
|     if (dest != NULL && count != DECODE_ERROR) { | ||||
|         for (size_t i=0; i < count; i++) { | ||||
|             wchar_t ch = dest[i]; | ||||
|             if (!is_valid_wide_char(ch)) { | ||||
|                 return DECODE_ERROR; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|     return count; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
#ifdef HAVE_MBRTOWC
/* Wrapper around mbrtowc() which also validates the decoded character.

   pwc must not be NULL. Return the value returned by mbrtowc(), except when
   a character was decoded and it is rejected by is_valid_wide_char():
   DECODE_ERROR is returned in that case.

   The conversion state pmbs is not reset here when the character is
   rejected. */
static size_t
_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
{
    assert(pwc != NULL);

    const size_t nbytes = mbrtowc(pwc, str, len, pmbs);
    if (nbytes == 0
        || nbytes == DECODE_ERROR
        || nbytes == INCOMPLETE_CHARACTER)
    {
        // End of string or mbrtowc() failure: pass the result through as is
        return nbytes;
    }

    // A character was decoded: only accept it if it is a valid one
    return is_valid_wide_char(*pwc) ? nbytes : DECODE_ERROR;
}
#endif
| 
 | ||||
| 
 | ||||
| #if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS) | ||||
| 
 | ||||
| #define USE_FORCE_ASCII | ||||
|  | @ -151,8 +209,8 @@ check_force_ascii(void) | |||
|         size_t res; | ||||
| 
 | ||||
|         ch = (unsigned char)0xA7; | ||||
|         res = mbstowcs(&wch, (char*)&ch, 1); | ||||
|         if (res != (size_t)-1 && wch == L'\xA7') { | ||||
|         res = _Py_mbstowcs(&wch, (char*)&ch, 1); | ||||
|         if (res != DECODE_ERROR && wch == L'\xA7') { | ||||
|             /* On HP-UX with the C locale or the POSIX locale,
 | ||||
|                nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses | ||||
|                Latin1 encoding in practice. Force ASCII in this case. | ||||
|  | @ -199,8 +257,8 @@ check_force_ascii(void) | |||
| 
 | ||||
|         unsigned uch = (unsigned char)i; | ||||
|         ch[0] = (char)uch; | ||||
|         res = mbstowcs(wch, ch, 1); | ||||
|         if (res != (size_t)-1) { | ||||
|         res = _Py_mbstowcs(wch, ch, 1); | ||||
|         if (res != DECODE_ERROR) { | ||||
|             /* decoding a non-ASCII character from the locale encoding succeeded:
 | ||||
|                the locale encoding is not ASCII, force ASCII */ | ||||
|             return 1; | ||||
|  | @ -390,9 +448,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, | |||
|      */ | ||||
|     argsize = strlen(arg); | ||||
| #else | ||||
|     argsize = mbstowcs(NULL, arg, 0); | ||||
|     argsize = _Py_mbstowcs(NULL, arg, 0); | ||||
| #endif | ||||
|     if (argsize != (size_t)-1) { | ||||
|     if (argsize != DECODE_ERROR) { | ||||
|         if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { | ||||
|             return -1; | ||||
|         } | ||||
|  | @ -401,21 +459,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, | |||
|             return -1; | ||||
|         } | ||||
| 
 | ||||
|         count = mbstowcs(res, arg, argsize + 1); | ||||
|         if (count != (size_t)-1) { | ||||
|             wchar_t *tmp; | ||||
|             /* Only use the result if it contains no
 | ||||
|                surrogate characters. */ | ||||
|             for (tmp = res; *tmp != 0 && | ||||
|                          !Py_UNICODE_IS_SURROGATE(*tmp); tmp++) | ||||
|                 ; | ||||
|             if (*tmp == 0) { | ||||
|                 if (wlen != NULL) { | ||||
|                     *wlen = count; | ||||
|                 } | ||||
|                 *wstr = res; | ||||
|                 return 0; | ||||
|         count = _Py_mbstowcs(res, arg, argsize + 1); | ||||
|         if (count != DECODE_ERROR) { | ||||
|             *wstr = res; | ||||
|             if (wlen != NULL) { | ||||
|                 *wlen = count; | ||||
|             } | ||||
|             return 0; | ||||
|         } | ||||
|         PyMem_RawFree(res); | ||||
|     } | ||||
|  | @ -439,13 +489,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, | |||
|     out = res; | ||||
|     memset(&mbs, 0, sizeof mbs); | ||||
|     while (argsize) { | ||||
|         size_t converted = mbrtowc(out, (char*)in, argsize, &mbs); | ||||
|         size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs); | ||||
|         if (converted == 0) { | ||||
|             /* Reached end of string; null char stored. */ | ||||
|             break; | ||||
|         } | ||||
| 
 | ||||
|         if (converted == (size_t)-2) { | ||||
|         if (converted == INCOMPLETE_CHARACTER) { | ||||
|             /* Incomplete character. This should never happen,
 | ||||
|                since we provide everything that we have - | ||||
|                unless there is a bug in the C library, or I | ||||
|  | @ -453,32 +503,22 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, | |||
|             goto decode_error; | ||||
|         } | ||||
| 
 | ||||
|         if (converted == (size_t)-1) { | ||||
|         if (converted == DECODE_ERROR) { | ||||
|             if (!surrogateescape) { | ||||
|                 goto decode_error; | ||||
|             } | ||||
| 
 | ||||
|             /* Conversion error. Escape as UTF-8b, and start over
 | ||||
|                in the initial shift state. */ | ||||
|             /* Decoding error. Escape as UTF-8b, and start over in the initial
 | ||||
|                shift state. */ | ||||
|             *out++ = 0xdc00 + *in++; | ||||
|             argsize--; | ||||
|             memset(&mbs, 0, sizeof mbs); | ||||
|             continue; | ||||
|         } | ||||
| 
 | ||||
|         if (Py_UNICODE_IS_SURROGATE(*out)) { | ||||
|             if (!surrogateescape) { | ||||
|                 goto decode_error; | ||||
|             } | ||||
|         // _Py_mbrtowc() rejects lone surrogate characters
 | ||||
|         assert(!Py_UNICODE_IS_SURROGATE(*out)); | ||||
| 
 | ||||
|             /* Surrogate character.  Escape the original
 | ||||
|                byte sequence with surrogateescape. */ | ||||
|             argsize -= converted; | ||||
|             while (converted--) { | ||||
|                 *out++ = 0xdc00 + *in++; | ||||
|             } | ||||
|             continue; | ||||
|         } | ||||
|         /* successfully converted some bytes */ | ||||
|         in += converted; | ||||
|         argsize -= converted; | ||||
|  | @ -655,7 +695,7 @@ encode_current_locale(const wchar_t *text, char **str, | |||
|                 else { | ||||
|                     converted = wcstombs(NULL, buf, 0); | ||||
|                 } | ||||
|                 if (converted == (size_t)-1) { | ||||
|                 if (converted == DECODE_ERROR) { | ||||
|                     goto encode_error; | ||||
|                 } | ||||
|                 if (bytes != NULL) { | ||||
|  | @ -1374,7 +1414,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode) | |||
|     char cmode[10]; | ||||
|     size_t r; | ||||
|     r = wcstombs(cmode, mode, 10); | ||||
|     if (r == (size_t)-1 || r >= 10) { | ||||
|     if (r == DECODE_ERROR || r >= 10) { | ||||
|         errno = EINVAL; | ||||
|         return NULL; | ||||
|     } | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Miss Islington (bot)
						Miss Islington (bot)