mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.
This commit is contained in:
		
							parent
							
								
									437e6a3b15
								
							
						
					
					
						commit
						6e39080649
					
				
					 12 changed files with 999 additions and 2 deletions
				
			
		|  | @ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode) | |||
| 				NULL); | ||||
| } | ||||
| 
 | ||||
| /* --- UTF-32 Codec ------------------------------------------------------- */ | ||||
| 
 | ||||
| PyObject * | ||||
| PyUnicode_DecodeUTF32(const char *s, | ||||
| 		      Py_ssize_t size, | ||||
| 		      const char *errors, | ||||
| 		      int *byteorder) | ||||
| { | ||||
|     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); | ||||
| } | ||||
| 
 | ||||
| PyObject * | ||||
| PyUnicode_DecodeUTF32Stateful(const char *s, | ||||
| 			      Py_ssize_t size, | ||||
| 			      const char *errors, | ||||
| 			      int *byteorder, | ||||
| 			      Py_ssize_t *consumed) | ||||
| { | ||||
|     const char *starts = s; | ||||
|     Py_ssize_t startinpos; | ||||
|     Py_ssize_t endinpos; | ||||
|     Py_ssize_t outpos; | ||||
|     PyUnicodeObject *unicode; | ||||
|     Py_UNICODE *p; | ||||
| #ifndef Py_UNICODE_WIDE | ||||
|     int i, pairs; | ||||
| #else | ||||
|     const int pairs = 0; | ||||
| #endif | ||||
|     const unsigned char *q, *e; | ||||
|     int bo = 0;       /* assume native ordering by default */ | ||||
|     const char *errmsg = ""; | ||||
|     /* On narrow builds we split characters outside the BMP into two
 | ||||
|        codepoints => count how much extra space we need. */ | ||||
| #ifndef Py_UNICODE_WIDE | ||||
|     for (i = pairs = 0; i < size/4; i++) | ||||
| 	if (((Py_UCS4 *)s)[i] >= 0x10000) | ||||
| 	    pairs++; | ||||
| #endif | ||||
|     /* Offsets from q for retrieving bytes in the right order. */ | ||||
| #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||
|     int iorder[] = {0, 1, 2, 3}; | ||||
| #else | ||||
|     int iorder[] = {3, 2, 1, 0}; | ||||
| #endif | ||||
|     PyObject *errorHandler = NULL; | ||||
|     PyObject *exc = NULL; | ||||
| 
 | ||||
|     /* This might be one to much, because of a BOM */ | ||||
|     unicode = _PyUnicode_New((size+3)/4+pairs); | ||||
|     if (!unicode) | ||||
|         return NULL; | ||||
|     if (size == 0) | ||||
|         return (PyObject *)unicode; | ||||
| 
 | ||||
|     /* Unpack UTF-32 encoded data */ | ||||
|     p = unicode->str; | ||||
|     q = (unsigned char *)s; | ||||
|     e = q + size; | ||||
| 
 | ||||
|     if (byteorder) | ||||
|         bo = *byteorder; | ||||
| 
 | ||||
|     /* Check for BOM marks (U+FEFF) in the input and adjust current
 | ||||
|        byte order setting accordingly. In native mode, the leading BOM | ||||
|        mark is skipped, in all other modes, it is copied to the output | ||||
|        stream as-is (giving a ZWNBSP character). */ | ||||
|     if (bo == 0) { | ||||
|         if (size >= 4) { | ||||
|             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | | ||||
|                                 (q[iorder[1]] << 8) | q[iorder[0]]; | ||||
| #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||
| 	    if (bom == 0x0000FEFF) { | ||||
| 		q += 4; | ||||
| 		bo = -1; | ||||
| 	    } | ||||
| 	    else if (bom == 0xFFFE0000) { | ||||
| 		q += 4; | ||||
| 		bo = 1; | ||||
| 	    } | ||||
| #else | ||||
| 	    if (bom == 0x0000FEFF) { | ||||
| 		q += 4; | ||||
| 		bo = 1; | ||||
| 	    } | ||||
| 	    else if (bom == 0xFFFE0000) { | ||||
| 		q += 4; | ||||
| 		bo = -1; | ||||
| 	    } | ||||
| #endif | ||||
| 	} | ||||
|     } | ||||
| 
 | ||||
|     if (bo == -1) { | ||||
|         /* force LE */ | ||||
|         iorder[0] = 0; | ||||
|         iorder[1] = 1; | ||||
|         iorder[2] = 2; | ||||
|         iorder[3] = 3; | ||||
|     } | ||||
|     else if (bo == 1) { | ||||
|         /* force BE */ | ||||
|         iorder[0] = 3; | ||||
|         iorder[1] = 2; | ||||
|         iorder[2] = 1; | ||||
|         iorder[3] = 0; | ||||
|     } | ||||
| 
 | ||||
|     while (q < e) { | ||||
| 	Py_UCS4 ch; | ||||
| 	/* remaining bytes at the end? (size should be divisible by 4) */ | ||||
| 	if (e-q<4) { | ||||
| 	    if (consumed) | ||||
| 		break; | ||||
| 	    errmsg = "truncated data"; | ||||
| 	    startinpos = ((const char *)q)-starts; | ||||
| 	    endinpos = ((const char *)e)-starts; | ||||
| 	    goto utf32Error; | ||||
| 	    /* The remaining input chars are ignored if the callback
 | ||||
| 	       chooses to skip the input */ | ||||
| 	} | ||||
| 	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | | ||||
| 	     (q[iorder[1]] << 8) | q[iorder[0]]; | ||||
| 
 | ||||
| 	if (ch >= 0x110000) | ||||
| 	{ | ||||
| 	    errmsg = "codepoint not in range(0x110000)"; | ||||
| 	    startinpos = ((const char *)q)-starts; | ||||
| 	    endinpos = startinpos+4; | ||||
| 	    goto utf32Error; | ||||
| 	} | ||||
| #ifndef Py_UNICODE_WIDE | ||||
| 	if (ch >= 0x10000) | ||||
| 	{ | ||||
| 	    *p++ = 0xD800 | ((ch-0x10000) >> 10); | ||||
| 	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); | ||||
| 	} | ||||
| 	else | ||||
| #endif | ||||
| 	    *p++ = ch; | ||||
| 	q += 4; | ||||
| 	continue; | ||||
|     utf32Error: | ||||
| 	outpos = p-PyUnicode_AS_UNICODE(unicode); | ||||
|     if (unicode_decode_call_errorhandler( | ||||
|          errors, &errorHandler, | ||||
|          "utf32", errmsg, | ||||
|          starts, size, &startinpos, &endinpos, &exc, &s, | ||||
|          (PyObject **)&unicode, &outpos, &p)) | ||||
| 	    goto onError; | ||||
|     } | ||||
| 
 | ||||
|     if (byteorder) | ||||
|         *byteorder = bo; | ||||
| 
 | ||||
|     if (consumed) | ||||
| 	*consumed = (const char *)q-starts; | ||||
| 
 | ||||
|     /* Adjust length */ | ||||
|     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) | ||||
|         goto onError; | ||||
| 
 | ||||
|     Py_XDECREF(errorHandler); | ||||
|     Py_XDECREF(exc); | ||||
|     return (PyObject *)unicode; | ||||
| 
 | ||||
| onError: | ||||
|     Py_DECREF(unicode); | ||||
|     Py_XDECREF(errorHandler); | ||||
|     Py_XDECREF(exc); | ||||
|     return NULL; | ||||
| } | ||||
| 
 | ||||
| PyObject * | ||||
| PyUnicode_EncodeUTF32(const Py_UNICODE *s, | ||||
| 		      Py_ssize_t size, | ||||
| 		      const char *errors, | ||||
| 		      int byteorder) | ||||
| { | ||||
|     PyObject *v; | ||||
|     unsigned char *p; | ||||
| #ifndef Py_UNICODE_WIDE | ||||
|     int i, pairs; | ||||
| #else | ||||
|     const int pairs = 0; | ||||
| #endif | ||||
|     /* Offsets from p for storing byte pairs in the right order. */ | ||||
| #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||
|     int iorder[] = {0, 1, 2, 3}; | ||||
| #else | ||||
|     int iorder[] = {3, 2, 1, 0}; | ||||
| #endif | ||||
| 
 | ||||
| #define STORECHAR(CH)                       \ | ||||
|     do {                                    \ | ||||
|         p[iorder[3]] = ((CH) >> 24) & 0xff; \ | ||||
|         p[iorder[2]] = ((CH) >> 16) & 0xff; \ | ||||
|         p[iorder[1]] = ((CH) >> 8) & 0xff;  \ | ||||
|         p[iorder[0]] = (CH) & 0xff;         \ | ||||
|         p += 4;                             \ | ||||
|     } while(0) | ||||
| 
 | ||||
|     /* In narrow builds we can output surrogate pairs as one codepoint,
 | ||||
|        so we need less space. */ | ||||
| #ifndef Py_UNICODE_WIDE | ||||
|     for (i = pairs = 0; i < size-1; i++) | ||||
| 	if (0xD800 <= s[i] && s[i] <= 0xDBFF && | ||||
| 	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) | ||||
| 	    pairs++; | ||||
| #endif | ||||
|     v = PyString_FromStringAndSize(NULL, | ||||
| 		  4 * (size - pairs + (byteorder == 0))); | ||||
|     if (v == NULL) | ||||
|         return NULL; | ||||
| 
 | ||||
|     p = (unsigned char *)PyString_AS_STRING(v); | ||||
|     if (byteorder == 0) | ||||
| 	STORECHAR(0xFEFF); | ||||
|     if (size == 0) | ||||
|         return v; | ||||
| 
 | ||||
|     if (byteorder == -1) { | ||||
|         /* force LE */ | ||||
|         iorder[0] = 0; | ||||
|         iorder[1] = 1; | ||||
|         iorder[2] = 2; | ||||
|         iorder[3] = 3; | ||||
|     } | ||||
|     else if (byteorder == 1) { | ||||
|         /* force BE */ | ||||
|         iorder[0] = 3; | ||||
|         iorder[1] = 2; | ||||
|         iorder[2] = 1; | ||||
|         iorder[3] = 0; | ||||
|     } | ||||
| 
 | ||||
|     while (size-- > 0) { | ||||
| 	Py_UCS4 ch = *s++; | ||||
| #ifndef Py_UNICODE_WIDE | ||||
| 	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { | ||||
| 	    Py_UCS4 ch2 = *s; | ||||
| 	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { | ||||
| 		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; | ||||
| 		s++; | ||||
| 		size--; | ||||
| 	    } | ||||
| 	} | ||||
| #endif | ||||
|         STORECHAR(ch); | ||||
|     } | ||||
|     return v; | ||||
| #undef STORECHAR | ||||
| } | ||||
| 
 | ||||
| PyObject *PyUnicode_AsUTF32String(PyObject *unicode) | ||||
| { | ||||
|     if (!PyUnicode_Check(unicode)) { | ||||
|         PyErr_BadArgument(); | ||||
|         return NULL; | ||||
|     } | ||||
|     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), | ||||
| 				 PyUnicode_GET_SIZE(unicode), | ||||
| 				 NULL, | ||||
| 				 0); | ||||
| } | ||||
| 
 | ||||
| /* --- UTF-16 Codec ------------------------------------------------------- */ | ||||
| 
 | ||||
| PyObject * | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Walter Dörwald
						Walter Dörwald