mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 21:51:50 +00:00 
			
		
		
		
	This patch changes the behaviour of the UTF-16 codec family. Only the
UTF-16 codec will now interpret and remove a *leading* BOM mark. Subsequent BOM characters are no longer interpreted and removed. UTF-16-LE and -BE pass through all BOM mark characters. These changes should get the UTF-16 codec more in line with what the Unicode FAQ recommends with respect to BOM marks.
This commit is contained in:
		
							parent
							
								
									f52d27e52d
								
							
						
					
					
						commit
						489b56e044
					
				
					 2 changed files with 31 additions and 22 deletions
				
			
		|  | @ -459,10 +459,11 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( | |||
| 	*byteorder == 0:  native order | ||||
| 	*byteorder == 1:  big endian | ||||
| 
 | ||||
|    and then switches according to all BOM marks it finds in the input | ||||
|    data. BOM marks are not copied into the resulting Unicode string. | ||||
|    After completion, *byteorder is set to the current byte order at | ||||
|    the end of input data. | ||||
|    In native mode, the first two bytes of the stream are checked for a | ||||
|    BOM mark. If found, the BOM mark is analysed, the byte order | ||||
|    adjusted and the BOM skipped.  In the other modes, no BOM mark | ||||
|    interpretation is done. After completion, *byteorder is set to the | ||||
|    current byte order at the end of input data. | ||||
| 
 | ||||
|    If byteorder is NULL, the codec starts in native order mode. | ||||
| 
 | ||||
|  |  | |||
|  | @ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, | |||
|     if (byteorder) | ||||
| 	bo = *byteorder; | ||||
| 
 | ||||
|     /* Check for BOM marks (U+FEFF) in the input and adjust current
 | ||||
|        byte order setting accordingly. In native mode, the leading BOM | ||||
|        mark is skipped, in all other modes, it is copied to the output | ||||
|        stream as-is (giving a ZWNBSP character). */ | ||||
|     if (bo == 0) { | ||||
| #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||
| 	if (*q == 0xFEFF) { | ||||
| 	    q++; | ||||
| 	    bo = -1; | ||||
| 	} else if (*q == 0xFFFE) { | ||||
| 	    q++; | ||||
| 	    bo = 1; | ||||
| 	} | ||||
| #else     | ||||
| 	if (*q == 0xFEFF) { | ||||
| 	    q++; | ||||
| 	    bo = 1; | ||||
| 	} else if (*q == 0xFFFE) { | ||||
| 	    q++; | ||||
| 	    bo = -1; | ||||
| 	} | ||||
| #endif | ||||
|     } | ||||
|      | ||||
|     while (q < e) { | ||||
| 	register Py_UNICODE ch = *q++; | ||||
| 
 | ||||
| 	/* Check for BOM marks (U+FEFF) in the input and adjust
 | ||||
| 	   current byte order setting accordingly. Swap input | ||||
| 	   bytes if needed. (This assumes sizeof(Py_UNICODE) == 2 | ||||
| 	   !) */ | ||||
| 	/* Swap input bytes if needed. (This assumes
 | ||||
| 	   sizeof(Py_UNICODE) == 2 !) */ | ||||
| #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||
| 	if (ch == 0xFEFF) { | ||||
| 	    bo = -1; | ||||
| 	    continue; | ||||
| 	} else if (ch == 0xFFFE) { | ||||
| 	    bo = 1; | ||||
| 	    continue; | ||||
| 	} | ||||
| 	if (bo == 1) | ||||
| 	    ch = (ch >> 8) | (ch << 8); | ||||
| #else     | ||||
| 	if (ch == 0xFEFF) { | ||||
| 	    bo = 1; | ||||
| 	    continue; | ||||
| 	} else if (ch == 0xFFFE) { | ||||
| 	    bo = -1; | ||||
| 	    continue; | ||||
| 	} | ||||
| 	if (bo == -1) | ||||
| 	    ch = (ch >> 8) | (ch << 8); | ||||
| #endif | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Marc-André Lemburg
						Marc-André Lemburg