mirror of
				https://github.com/python/cpython.git
				synced 2025-10-24 18:33:49 +00:00 
			
		
		
		
	Issue #20574: Implement incremental decoder for the cp65001 code page
(Windows code page 65001, Microsoft UTF-8).
This commit is contained in:
		
							parent
							
								
									c49926748b
								
							
						
					
					
						commit
						7d00cc1a64
					
				
					 4 changed files with 22 additions and 43 deletions
				
			
		|  | @ -6817,28 +6817,6 @@ code_page_name(UINT code_page, PyObject **obj) | |||
|     return PyBytes_AS_STRING(*obj); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| is_dbcs_lead_byte(UINT code_page, const char *s, int offset) | ||||
| { | ||||
|     const char *curr = s + offset; | ||||
|     const char *prev; | ||||
| 
 | ||||
|     if (!IsDBCSLeadByteEx(code_page, *curr)) | ||||
|         return 0; | ||||
| 
 | ||||
|     prev = CharPrevExA(code_page, s, curr, 0); | ||||
|     if (prev == curr) | ||||
|         return 1; | ||||
|     /* FIXME: This code is limited to "true" double-byte encodings,
 | ||||
|        as it assumes an incomplete character consists of a single | ||||
|        byte. */ | ||||
|     if (curr - prev == 2) | ||||
|         return 1; | ||||
|     if (!IsDBCSLeadByteEx(code_page, *prev)) | ||||
|         return 1; | ||||
|     return 0; | ||||
| } | ||||
| 
 | ||||
| static DWORD | ||||
| decode_code_page_flags(UINT code_page) | ||||
| { | ||||
|  | @ -6913,7 +6891,7 @@ static int | |||
| decode_code_page_errors(UINT code_page, | ||||
|                         PyObject **v, | ||||
|                         const char *in, const int size, | ||||
|                         const char *errors) | ||||
|                         const char *errors, int final) | ||||
| { | ||||
|     const char *startin = in; | ||||
|     const char *endin = in + size; | ||||
|  | @ -6940,7 +6918,7 @@ decode_code_page_errors(UINT code_page, | |||
|     if (encoding == NULL) | ||||
|         return -1; | ||||
| 
 | ||||
|     if (errors == NULL || strcmp(errors, "strict") == 0) { | ||||
|     if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { | ||||
|         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
 | ||||
|            UnicodeDecodeError. */ | ||||
|         make_decode_exception(&exc, encoding, in, size, 0, 0, reason); | ||||
|  | @ -7003,6 +6981,10 @@ decode_code_page_errors(UINT code_page, | |||
|         if (outsize <= 0) { | ||||
|             Py_ssize_t startinpos, endinpos, outpos; | ||||
| 
 | ||||
|             /* last character in partial decode? */ | ||||
|             if (in + insize >= endin && !final) | ||||
|                 break; | ||||
| 
 | ||||
|             startinpos = in - startin; | ||||
|             endinpos = startinpos + 1; | ||||
|             outpos = out - PyUnicode_AS_UNICODE(*v); | ||||
|  | @ -7031,7 +7013,7 @@ decode_code_page_errors(UINT code_page, | |||
|     assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); | ||||
|     if (unicode_resize(v, outsize) < 0) | ||||
|         goto error; | ||||
|     ret = size; | ||||
|     ret = in - startin; | ||||
| 
 | ||||
| error: | ||||
|     Py_XDECREF(encoding_obj); | ||||
|  | @ -7072,24 +7054,19 @@ decode_code_page_stateful(int code_page, | |||
|             done = 1; | ||||
|         } | ||||
| 
 | ||||
|         /* Skip trailing lead-byte unless 'final' is set */ | ||||
|         if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) | ||||
|             --chunk_size; | ||||
| 
 | ||||
|         if (chunk_size == 0 && done) { | ||||
|             if (v != NULL) | ||||
|                 break; | ||||
|             _Py_RETURN_UNICODE_EMPTY(); | ||||
|         } | ||||
| 
 | ||||
| 
 | ||||
|         converted = decode_code_page_strict(code_page, &v, | ||||
|                                             s, chunk_size); | ||||
|         if (converted == -2) | ||||
|             converted = decode_code_page_errors(code_page, &v, | ||||
|                                                 s, chunk_size, | ||||
|                                                 errors); | ||||
|         assert(converted != 0); | ||||
|                                                 errors, final); | ||||
|         assert(converted != 0 || done); | ||||
| 
 | ||||
|         if (converted < 0) { | ||||
|             Py_XDECREF(v); | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Victor Stinner
						Victor Stinner