mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.
This commit is contained in:
		
							parent
							
								
									437e6a3b15
								
							
						
					
					
						commit
						6e39080649
					
				
					 12 changed files with 999 additions and 2 deletions
				
			
		|  | @ -1301,6 +1301,79 @@ These are the UTF-8 codec APIs: | ||||||
|    object.  Error handling is "strict".  Return *NULL* if an exception was raised |    object.  Error handling is "strict".  Return *NULL* if an exception was raised | ||||||
|    by the codec. |    by the codec. | ||||||
| 
 | 
 | ||||||
|  | These are the UTF-32 codec APIs: | ||||||
|  | 
 | ||||||
|  | .. % --- UTF-32 Codecs ------------------------------------------------------ */ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder) | ||||||
|  | 
 | ||||||
|  |    Decode *size* bytes from a UTF-32 encoded buffer string and return the | ||||||
|  |    corresponding Unicode object.  *errors* (if non-*NULL*) defines the error | ||||||
|  |    handling. It defaults to "strict". | ||||||
|  | 
 | ||||||
|  |    If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte | ||||||
|  |    order:: | ||||||
|  | 
 | ||||||
|  |       *byteorder == -1: little endian | ||||||
|  |       *byteorder == 0:  native order | ||||||
|  |       *byteorder == 1:  big endian | ||||||
|  | 
 | ||||||
|  |    and then switches if the first four bytes of the input data are a byte order mark | ||||||
|  |    (BOM) and the specified byte order is native order.  This BOM is not copied into | ||||||
|  |    the resulting Unicode string.  After completion, *\*byteorder* is set to the | ||||||
|  |    current byte order at the end of input data. | ||||||
|  | 
 | ||||||
|  |    In a narrow build codepoints outside the BMP will be decoded as surrogate pairs. | ||||||
|  | 
 | ||||||
|  |    If *byteorder* is *NULL*, the codec starts in native order mode. | ||||||
|  | 
 | ||||||
|  |    Return *NULL* if an exception was raised by the codec. | ||||||
|  | 
 | ||||||
|  |    .. versionadded:: 2.6 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed) | ||||||
|  | 
 | ||||||
|  |    If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If | ||||||
|  |    *consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat | ||||||
|  |    trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible | ||||||
|  |    by four) as an error. Those bytes will not be decoded and the number of bytes | ||||||
|  |    that have been decoded will be stored in *consumed*. | ||||||
|  | 
 | ||||||
|  |    .. versionadded:: 2.6 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder) | ||||||
|  | 
 | ||||||
|  |    Return a Python string object holding the UTF-32 encoded value of the Unicode | ||||||
|  |    data in *s*.  Output is written according to the following byte order, | ||||||
|  |    selected by *byteorder*:: | ||||||
|  | 
 | ||||||
|  |       byteorder == -1: little endian | ||||||
|  |       byteorder == 0:  native byte order (writes a BOM mark) | ||||||
|  |       byteorder == 1:  big endian | ||||||
|  | 
 | ||||||
|  |    If *byteorder* is ``0``, the output string will always start with the Unicode BOM | ||||||
|  |    mark (U+FEFF). In the other two modes, no BOM mark is prepended. | ||||||
|  | 
 | ||||||
|  |    If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output | ||||||
|  |    as a single codepoint. | ||||||
|  | 
 | ||||||
|  |    Return *NULL* if an exception was raised by the codec. | ||||||
|  | 
 | ||||||
|  |    .. versionadded:: 2.6 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | .. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode) | ||||||
|  | 
 | ||||||
|  |    Return a Python string using the UTF-32 encoding in native byte order. The | ||||||
|  |    string always starts with a BOM mark.  Error handling is "strict".  Return | ||||||
|  |    *NULL* if an exception was raised by the codec. | ||||||
|  | 
 | ||||||
|  |    .. versionadded:: 2.6 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| These are the UTF-16 codec APIs: | These are the UTF-16 codec APIs: | ||||||
| 
 | 
 | ||||||
| .. % --- UTF-16 Codecs ------------------------------------------------------ */ | .. % --- UTF-16 Codecs ------------------------------------------------------ */ | ||||||
|  |  | ||||||
|  | @ -1045,6 +1045,12 @@ particular, the following variants typically exist: | ||||||
| | shift_jisx0213  | shiftjisx0213, sjisx0213,      | Japanese                       | | | shift_jisx0213  | shiftjisx0213, sjisx0213,      | Japanese                       | | ||||||
| |                 | s_jisx0213                     |                                | | |                 | s_jisx0213                     |                                | | ||||||
| +-----------------+--------------------------------+--------------------------------+ | +-----------------+--------------------------------+--------------------------------+ | ||||||
|  | | utf_32          | U32, utf32                     | all languages                  | | ||||||
|  | +-----------------+--------------------------------+--------------------------------+ | ||||||
|  | | utf_32_be       | UTF-32BE                       | all languages                  | | ||||||
|  | +-----------------+--------------------------------+--------------------------------+ | ||||||
|  | | utf_32_le       | UTF-32LE                       | all languages                  | | ||||||
|  | +-----------------+--------------------------------+--------------------------------+ | ||||||
| | utf_16          | U16, utf16                     | all languages                  | | | utf_16          | U16, utf16                     | all languages                  | | ||||||
| +-----------------+--------------------------------+--------------------------------+ | +-----------------+--------------------------------+--------------------------------+ | ||||||
| | utf_16_be       | UTF-16BE                       | all languages (BMP only)       | | | utf_16_be       | UTF-16BE                       | all languages (BMP only)       | | ||||||
|  |  | ||||||
|  | @ -145,6 +145,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | ||||||
| # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString | # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString | ||||||
| # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String | # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String | ||||||
| # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString | # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString | ||||||
|  | # define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String | ||||||
| # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String | # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String | ||||||
| # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String | # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String | ||||||
| # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode | # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode | ||||||
|  | @ -159,6 +160,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | ||||||
| # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap | # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap | ||||||
| # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 | # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 | ||||||
| # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape | # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape | ||||||
|  | # define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 | ||||||
|  | # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful | ||||||
| # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 | # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 | ||||||
| # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful | # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful | ||||||
| # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 | # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 | ||||||
|  | @ -170,6 +173,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | ||||||
| # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal | # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal | ||||||
| # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 | # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 | ||||||
| # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape | # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape | ||||||
|  | # define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 | ||||||
| # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 | # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 | ||||||
| # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 | # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 | ||||||
| # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape | # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape | ||||||
|  | @ -223,6 +227,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | ||||||
| # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString | # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString | ||||||
| # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String | # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String | ||||||
| # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString | # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString | ||||||
|  | # define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String | ||||||
| # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String | # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String | ||||||
| # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String | # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String | ||||||
| # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode | # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode | ||||||
|  | @ -237,6 +242,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | ||||||
| # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap | # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap | ||||||
| # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 | # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 | ||||||
| # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape | # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape | ||||||
|  | # define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 | ||||||
|  | # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful | ||||||
| # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 | # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 | ||||||
| # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful | # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful | ||||||
| # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 | # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 | ||||||
|  | @ -248,6 +255,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | ||||||
| # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal | # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal | ||||||
| # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 | # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 | ||||||
| # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape | # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape | ||||||
|  | # define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 | ||||||
| # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 | # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 | ||||||
| # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 | # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 | ||||||
| # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape | # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape | ||||||
|  | @ -701,6 +709,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( | ||||||
|     const char *errors		/* error handling */ |     const char *errors		/* error handling */ | ||||||
|     ); |     ); | ||||||
| 
 | 
 | ||||||
|  | /* --- UTF-32 Codecs ------------------------------------------------------ */ | ||||||
|  | 
 | ||||||
|  | /* Decodes length bytes from a UTF-32 encoded buffer string and returns
 | ||||||
|  |    the corresponding Unicode object. | ||||||
|  | 
 | ||||||
|  |    errors (if non-NULL) defines the error handling. It defaults | ||||||
|  |    to "strict".  | ||||||
|  | 
 | ||||||
|  |    If byteorder is non-NULL, the decoder starts decoding using the | ||||||
|  |    given byte order: | ||||||
|  | 
 | ||||||
|  | 	*byteorder == -1: little endian | ||||||
|  | 	*byteorder == 0:  native order | ||||||
|  | 	*byteorder == 1:  big endian | ||||||
|  | 
 | ||||||
|  |    In native mode, the first four bytes of the stream are checked for a | ||||||
|  |    BOM mark. If found, the BOM mark is analysed, the byte order | ||||||
|  |    adjusted and the BOM skipped.  In the other modes, no BOM mark | ||||||
|  |    interpretation is done. After completion, *byteorder is set to the | ||||||
|  |    current byte order at the end of input data. | ||||||
|  | 
 | ||||||
|  |    If byteorder is NULL, the codec starts in native order mode. | ||||||
|  | 
 | ||||||
|  | */ | ||||||
|  | 
 | ||||||
|  | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( | ||||||
|  |     const char *string, 	/* UTF-32 encoded string */ | ||||||
|  |     Py_ssize_t length,	 	/* size of string */ | ||||||
|  |     const char *errors,		/* error handling */ | ||||||
|  |     int *byteorder		/* pointer to byteorder to use
 | ||||||
|  | 				   0=native;-1=LE,1=BE; updated on | ||||||
|  | 				   exit */ | ||||||
|  |     ); | ||||||
|  | 
 | ||||||
|  | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( | ||||||
|  |     const char *string, 	/* UTF-32 encoded string */ | ||||||
|  |     Py_ssize_t length,	 	/* size of string */ | ||||||
|  |     const char *errors,		/* error handling */ | ||||||
|  |     int *byteorder,		/* pointer to byteorder to use
 | ||||||
|  | 				   0=native;-1=LE,1=BE; updated on | ||||||
|  | 				   exit */ | ||||||
|  |     Py_ssize_t *consumed	/* bytes consumed */ | ||||||
|  |     ); | ||||||
|  | 
 | ||||||
|  | /* Returns a Python string using the UTF-32 encoding in native byte
 | ||||||
|  |    order. The string always starts with a BOM mark.  */ | ||||||
|  | 
 | ||||||
|  | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( | ||||||
|  |     PyObject *unicode	 	/* Unicode object */ | ||||||
|  |     ); | ||||||
|  | 
 | ||||||
|  | /* Returns a Python string object holding the UTF-32 encoded value of
 | ||||||
|  |    the Unicode data. | ||||||
|  | 
 | ||||||
|  |    If byteorder is not 0, output is written according to the following | ||||||
|  |    byte order: | ||||||
|  | 
 | ||||||
|  |    byteorder == -1: little endian | ||||||
|  |    byteorder == 0:  native byte order (writes a BOM mark) | ||||||
|  |    byteorder == 1:  big endian | ||||||
|  | 
 | ||||||
|  |    If byteorder is 0, the output string will always start with the | ||||||
|  |    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is | ||||||
|  |    prepended. | ||||||
|  | 
 | ||||||
|  | */ | ||||||
|  | 
 | ||||||
|  | PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( | ||||||
|  |     const Py_UNICODE *data, 	/* Unicode char buffer */ | ||||||
|  |     Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */ | ||||||
|  |     const char *errors,		/* error handling */ | ||||||
|  |     int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */ | ||||||
|  |     ); | ||||||
|  | 
 | ||||||
| /* --- UTF-16 Codecs ------------------------------------------------------ */ | /* --- UTF-16 Codecs ------------------------------------------------------ */ | ||||||
| 
 | 
 | ||||||
| /* Decodes length bytes from a UTF-16 encoded buffer string and returns
 | /* Decodes length bytes from a UTF-16 encoded buffer string and returns
 | ||||||
|  |  | ||||||
|  | @ -490,6 +490,16 @@ | ||||||
|     'unicodelittleunmarked' : 'utf_16_le', |     'unicodelittleunmarked' : 'utf_16_le', | ||||||
|     'utf_16le'           : 'utf_16_le', |     'utf_16le'           : 'utf_16_le', | ||||||
| 
 | 
 | ||||||
|  |     # utf_32 codec | ||||||
|  |     'u32'                : 'utf_32', | ||||||
|  |     'utf32'              : 'utf_32', | ||||||
|  | 
 | ||||||
|  |     # utf_32_be codec | ||||||
|  |     'utf_32be'           : 'utf_32_be', | ||||||
|  | 
 | ||||||
|  |     # utf_32_le codec | ||||||
|  |     'utf_32le'           : 'utf_32_le', | ||||||
|  | 
 | ||||||
|     # utf_7 codec |     # utf_7 codec | ||||||
|     'u7'                 : 'utf_7', |     'u7'                 : 'utf_7', | ||||||
|     'utf7'               : 'utf_7', |     'utf7'               : 'utf_7', | ||||||
|  |  | ||||||
							
								
								
									
										144
									
								
								Lib/encodings/utf_32.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										144
									
								
								Lib/encodings/utf_32.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,144 @@ | ||||||
|  | """ | ||||||
|  | Python 'utf-32' Codec | ||||||
|  | """ | ||||||
|  | import codecs, sys | ||||||
|  | 
 | ||||||
|  | ### Codec APIs | ||||||
|  | 
 | ||||||
|  | encode = codecs.utf_32_encode | ||||||
|  | 
 | ||||||
|  | def decode(input, errors='strict'): | ||||||
|  |     return codecs.utf_32_decode(input, errors, True) | ||||||
|  | 
 | ||||||
|  | class IncrementalEncoder(codecs.IncrementalEncoder): | ||||||
|  |     def __init__(self, errors='strict'): | ||||||
|  |         codecs.IncrementalEncoder.__init__(self, errors) | ||||||
|  |         self.encoder = None | ||||||
|  | 
 | ||||||
|  |     def encode(self, input, final=False): | ||||||
|  |         if self.encoder is None: | ||||||
|  |             result = codecs.utf_32_encode(input, self.errors)[0] | ||||||
|  |             if sys.byteorder == 'little': | ||||||
|  |                 self.encoder = codecs.utf_32_le_encode | ||||||
|  |             else: | ||||||
|  |                 self.encoder = codecs.utf_32_be_encode | ||||||
|  |             return result | ||||||
|  |         return self.encoder(input, self.errors)[0] | ||||||
|  | 
 | ||||||
|  |     def reset(self): | ||||||
|  |         codecs.IncrementalEncoder.reset(self) | ||||||
|  |         self.encoder = None | ||||||
|  | 
 | ||||||
|  |     def getstate(self): | ||||||
|  |         # state info we return to the caller: | ||||||
|  |         # 0: stream is in natural order for this platform | ||||||
|  |         # 2: endianness hasn't been determined yet | ||||||
|  |         # (we're never writing in unnatural order) | ||||||
|  |         return (2 if self.encoder is None else 0) | ||||||
|  | 
 | ||||||
|  |     def setstate(self, state): | ||||||
|  |         if state: | ||||||
|  |             self.encoder = None | ||||||
|  |         else: | ||||||
|  |             if sys.byteorder == 'little': | ||||||
|  |                 self.encoder = codecs.utf_32_le_encode | ||||||
|  |             else: | ||||||
|  |                 self.encoder = codecs.utf_32_be_encode | ||||||
|  | 
 | ||||||
|  | class IncrementalDecoder(codecs.BufferedIncrementalDecoder): | ||||||
|  |     def __init__(self, errors='strict'): | ||||||
|  |         codecs.BufferedIncrementalDecoder.__init__(self, errors) | ||||||
|  |         self.decoder = None | ||||||
|  | 
 | ||||||
|  |     def _buffer_decode(self, input, errors, final): | ||||||
|  |         if self.decoder is None: | ||||||
|  |             (output, consumed, byteorder) = \ | ||||||
|  |                 codecs.utf_32_ex_decode(input, errors, 0, final) | ||||||
|  |             if byteorder == -1: | ||||||
|  |                 self.decoder = codecs.utf_32_le_decode | ||||||
|  |             elif byteorder == 1: | ||||||
|  |                 self.decoder = codecs.utf_32_be_decode | ||||||
|  |             elif consumed >= 4: | ||||||
|  |                 raise UnicodeError("UTF-32 stream does not start with BOM") | ||||||
|  |             return (output, consumed) | ||||||
|  |         return self.decoder(input, self.errors, final) | ||||||
|  | 
 | ||||||
|  |     def reset(self): | ||||||
|  |         codecs.BufferedIncrementalDecoder.reset(self) | ||||||
|  |         self.decoder = None | ||||||
|  | 
 | ||||||
|  |     def getstate(self): | ||||||
|  |         # additional state info from the base class must be None here, | ||||||
|  |         # as it isn't passed along to the caller | ||||||
|  |         state = codecs.BufferedIncrementalDecoder.getstate(self)[0] | ||||||
|  |         # additional state info we pass to the caller: | ||||||
|  |         # 0: stream is in natural order for this platform | ||||||
|  |         # 1: stream is in unnatural order | ||||||
|  |         # 2: endianness hasn't been determined yet | ||||||
|  |         if self.decoder is None: | ||||||
|  |             return (state, 2) | ||||||
|  |         addstate = int((sys.byteorder == "big") != | ||||||
|  |                        (self.decoder is codecs.utf_32_be_decode)) | ||||||
|  |         return (state, addstate) | ||||||
|  | 
 | ||||||
|  |     def setstate(self, state): | ||||||
|  |         # state[1] will be ignored by BufferedIncrementalDecoder.setstate() | ||||||
|  |         codecs.BufferedIncrementalDecoder.setstate(self, state) | ||||||
|  |         state = state[1] | ||||||
|  |         if state == 0: | ||||||
|  |             self.decoder = (codecs.utf_32_be_decode | ||||||
|  |                             if sys.byteorder == "big" | ||||||
|  |                             else codecs.utf_32_le_decode) | ||||||
|  |         elif state == 1: | ||||||
|  |             self.decoder = (codecs.utf_32_le_decode | ||||||
|  |                             if sys.byteorder == "big" | ||||||
|  |                             else codecs.utf_32_be_decode) | ||||||
|  |         else: | ||||||
|  |             self.decoder = None | ||||||
|  | 
 | ||||||
|  | class StreamWriter(codecs.StreamWriter): | ||||||
|  |     def __init__(self, stream, errors='strict'): | ||||||
|  |         self.bom_written = False | ||||||
|  |         codecs.StreamWriter.__init__(self, stream, errors) | ||||||
|  | 
 | ||||||
|  |     def encode(self, input, errors='strict'): | ||||||
|  |         self.bom_written = True | ||||||
|  |         result = codecs.utf_32_encode(input, errors) | ||||||
|  |         if sys.byteorder == 'little': | ||||||
|  |             self.encode = codecs.utf_32_le_encode | ||||||
|  |         else: | ||||||
|  |             self.encode = codecs.utf_32_be_encode | ||||||
|  |         return result | ||||||
|  | 
 | ||||||
|  | class StreamReader(codecs.StreamReader): | ||||||
|  | 
 | ||||||
|  |     def reset(self): | ||||||
|  |         codecs.StreamReader.reset(self) | ||||||
|  |         try: | ||||||
|  |             del self.decode | ||||||
|  |         except AttributeError: | ||||||
|  |             pass | ||||||
|  | 
 | ||||||
|  |     def decode(self, input, errors='strict'): | ||||||
|  |         (object, consumed, byteorder) = \ | ||||||
|  |             codecs.utf_32_ex_decode(input, errors, 0, False) | ||||||
|  |         if byteorder == -1: | ||||||
|  |             self.decode = codecs.utf_32_le_decode | ||||||
|  |         elif byteorder == 1: | ||||||
|  |             self.decode = codecs.utf_32_be_decode | ||||||
|  |         elif consumed>=4: | ||||||
|  |             raise UnicodeError,"UTF-32 stream does not start with BOM" | ||||||
|  |         return (object, consumed) | ||||||
|  | 
 | ||||||
|  | ### encodings module API | ||||||
|  | 
 | ||||||
|  | def getregentry(): | ||||||
|  |     return codecs.CodecInfo( | ||||||
|  |         name='utf-32', | ||||||
|  |         encode=encode, | ||||||
|  |         decode=decode, | ||||||
|  |         incrementalencoder=IncrementalEncoder, | ||||||
|  |         incrementaldecoder=IncrementalDecoder, | ||||||
|  |         streamreader=StreamReader, | ||||||
|  |         streamwriter=StreamWriter, | ||||||
|  |     ) | ||||||
							
								
								
									
										37
									
								
								Lib/encodings/utf_32_be.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								Lib/encodings/utf_32_be.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,37 @@ | ||||||
|  | """ | ||||||
|  | Python 'utf-32-be' Codec | ||||||
|  | """ | ||||||
|  | import codecs | ||||||
|  | 
 | ||||||
|  | ### Codec APIs | ||||||
|  | 
 | ||||||
|  | encode = codecs.utf_32_be_encode | ||||||
|  | 
 | ||||||
|  | def decode(input, errors='strict'): | ||||||
|  |     return codecs.utf_32_be_decode(input, errors, True) | ||||||
|  | 
 | ||||||
|  | class IncrementalEncoder(codecs.IncrementalEncoder): | ||||||
|  |     def encode(self, input, final=False): | ||||||
|  |         return codecs.utf_32_be_encode(input, self.errors)[0] | ||||||
|  | 
 | ||||||
|  | class IncrementalDecoder(codecs.BufferedIncrementalDecoder): | ||||||
|  |     _buffer_decode = codecs.utf_32_be_decode | ||||||
|  | 
 | ||||||
|  | class StreamWriter(codecs.StreamWriter): | ||||||
|  |     encode = codecs.utf_32_be_encode | ||||||
|  | 
 | ||||||
|  | class StreamReader(codecs.StreamReader): | ||||||
|  |     decode = codecs.utf_32_be_decode | ||||||
|  | 
 | ||||||
|  | ### encodings module API | ||||||
|  | 
 | ||||||
|  | def getregentry(): | ||||||
|  |     return codecs.CodecInfo( | ||||||
|  |         name='utf-32-be', | ||||||
|  |         encode=encode, | ||||||
|  |         decode=decode, | ||||||
|  |         incrementalencoder=IncrementalEncoder, | ||||||
|  |         incrementaldecoder=IncrementalDecoder, | ||||||
|  |         streamreader=StreamReader, | ||||||
|  |         streamwriter=StreamWriter, | ||||||
|  |     ) | ||||||
							
								
								
									
										37
									
								
								Lib/encodings/utf_32_le.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								Lib/encodings/utf_32_le.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,37 @@ | ||||||
|  | """ | ||||||
|  | Python 'utf-32-le' Codec | ||||||
|  | """ | ||||||
|  | import codecs | ||||||
|  | 
 | ||||||
|  | ### Codec APIs | ||||||
|  | 
 | ||||||
|  | encode = codecs.utf_32_le_encode | ||||||
|  | 
 | ||||||
|  | def decode(input, errors='strict'): | ||||||
|  |     return codecs.utf_32_le_decode(input, errors, True) | ||||||
|  | 
 | ||||||
|  | class IncrementalEncoder(codecs.IncrementalEncoder): | ||||||
|  |     def encode(self, input, final=False): | ||||||
|  |         return codecs.utf_32_le_encode(input, self.errors)[0] | ||||||
|  | 
 | ||||||
|  | class IncrementalDecoder(codecs.BufferedIncrementalDecoder): | ||||||
|  |     _buffer_decode = codecs.utf_32_le_decode | ||||||
|  | 
 | ||||||
|  | class StreamWriter(codecs.StreamWriter): | ||||||
|  |     encode = codecs.utf_32_le_encode | ||||||
|  | 
 | ||||||
|  | class StreamReader(codecs.StreamReader): | ||||||
|  |     decode = codecs.utf_32_le_decode | ||||||
|  | 
 | ||||||
|  | ### encodings module API | ||||||
|  | 
 | ||||||
|  | def getregentry(): | ||||||
|  |     return codecs.CodecInfo( | ||||||
|  |         name='utf-32-le', | ||||||
|  |         encode=encode, | ||||||
|  |         decode=decode, | ||||||
|  |         incrementalencoder=IncrementalEncoder, | ||||||
|  |         incrementaldecoder=IncrementalDecoder, | ||||||
|  |         streamreader=StreamReader, | ||||||
|  |         streamwriter=StreamWriter, | ||||||
|  |     ) | ||||||
|  | @ -285,7 +285,8 @@ def handler2(exc): | ||||||
| 
 | 
 | ||||||
|     def test_longstrings(self): |     def test_longstrings(self): | ||||||
|         # test long strings to check for memory overflow problems |         # test long strings to check for memory overflow problems | ||||||
|         errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"] |         errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", | ||||||
|  |                    "backslashreplace"] | ||||||
|         # register the handlers under different names, |         # register the handlers under different names, | ||||||
|         # to prevent the codec from recognizing the name |         # to prevent the codec from recognizing the name | ||||||
|         for err in errors: |         for err in errors: | ||||||
|  | @ -293,7 +294,8 @@ def test_longstrings(self): | ||||||
|         l = 1000 |         l = 1000 | ||||||
|         errors += [ "test." + err for err in errors ] |         errors += [ "test." + err for err in errors ] | ||||||
|         for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]: |         for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]: | ||||||
|             for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"): |             for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", | ||||||
|  |                         "utf-8", "utf-7", "utf-16", "utf-32"): | ||||||
|                 for err in errors: |                 for err in errors: | ||||||
|                     try: |                     try: | ||||||
|                         uni.encode(enc, err) |                         uni.encode(enc, err) | ||||||
|  |  | ||||||
|  | @ -244,6 +244,137 @@ def test_bug1098990_b(self): | ||||||
|         self.assertEqual(reader.readline(), s5) |         self.assertEqual(reader.readline(), s5) | ||||||
|         self.assertEqual(reader.readline(), u"") |         self.assertEqual(reader.readline(), u"") | ||||||
| 
 | 
 | ||||||
class UTF32Test(ReadTest):
    # Tests for the generic "utf-32" codec: the encoder emits a BOM and the
    # decoder uses a leading BOM to choose the byte order.
    encoding = "utf-32"

    # u"spam" written twice, preceded by exactly one BOM, in little-endian
    # and big-endian layout respectively.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    # A stream writer must emit the BOM once, not once per write() call.
    def test_only_one_bom(self):
        _enc, _dec, make_reader, make_writer = codecs.lookup(self.encoding)
        # Encode two chunks through one stream writer.
        sink = StringIO.StringIO()
        out = make_writer(sink)
        for _ in range(2):
            out.write(u"spam")
        encoded = sink.getvalue()
        # Either byte order is acceptable, but only a single BOM may appear.
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # The encoded bytes must decode back to the concatenated text.
        source = StringIO.StringIO(encoded)
        inp = make_reader(source)
        self.assertEqual(inp.read(), u"spamspam")

    # Input made only of 0xFF bytes (one or two 4-byte units) must be
    # rejected by the stream reader.
    def test_badbom(self):
        for length in (4, 8):
            source = StringIO.StringIO(length * "\xff")
            inp = codecs.getreader(self.encoding)(source)
            self.assertRaises(UnicodeError, inp.read)

    # Expected output after each successive input byte, as the original
    # per-entry notes describe (check_partial is provided by the base class;
    # presumably it feeds the encoded text one byte at a time -- verify).
    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Bytes 1-4 are the BOM (byte order known after the fourth); the
        # next three bytes are still an incomplete first character.
        expected = [u""] * 4
        expected += [u""] * 3
        # Each later byte either completes a character or leaves the output
        # unchanged, so every proper prefix shows up four times in a row.
        for count in range(1, len(text)):
            expected += [text[:count]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    # A lone byte in final mode is truncated input and must raise.
    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)
|  | 
 | ||||||
class UTF32LETest(ReadTest):
    # Tests for the fixed little-endian "utf-32-le" codec (no BOM involved).
    encoding = "utf-32-le"

    # Expected output after each successive input byte (check_partial is
    # provided by the base class; presumably it feeds the encoded text one
    # byte at a time -- verify).
    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Nothing can be produced from the first three bytes.
        expected = [u""] * 3
        # From then on every proper prefix is reported four times: once when
        # its last character completes and three times while the next one is
        # still incomplete.
        for count in range(1, len(text)):
            expected += [text[:count]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    # A non-BMP code point is stored least significant byte first.
    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    # A lone byte in final mode is truncated input and must raise.
    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)
|  | 
 | ||||||
class UTF32BETest(ReadTest):
    # Tests for the fixed big-endian "utf-32-be" codec (no BOM involved).
    encoding = "utf-32-be"

    # Expected output after each successive input byte (check_partial is
    # provided by the base class; presumably it feeds the encoded text one
    # byte at a time -- verify).
    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # Nothing can be produced from the first three bytes.
        expected = [u""] * 3
        # From then on every proper prefix is reported four times: once when
        # its last character completes and three times while the next one is
        # still incomplete.
        for count in range(1, len(text)):
            expected += [text[:count]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    # A non-BMP code point is stored most significant byte first.
    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    # A lone byte in final mode is truncated input and must raise.
    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)
|  | 
 | ||||||
| class UTF16Test(ReadTest): | class UTF16Test(ReadTest): | ||||||
|     encoding = "utf-16" |     encoding = "utf-16" | ||||||
| 
 | 
 | ||||||
|  | @ -1278,6 +1409,9 @@ def test_streamreaderwriter(self): | ||||||
| 
 | 
 | ||||||
| def test_main(): | def test_main(): | ||||||
|     test_support.run_unittest( |     test_support.run_unittest( | ||||||
|  |         UTF32Test, | ||||||
|  |         UTF32LETest, | ||||||
|  |         UTF32BETest, | ||||||
|         UTF16Test, |         UTF16Test, | ||||||
|         UTF16LETest, |         UTF16LETest, | ||||||
|         UTF16BETest, |         UTF16BETest, | ||||||
|  |  | ||||||
|  | @ -243,6 +243,8 @@ Library | ||||||
| - GB18030 codec now can encode additional two-byte characters that | - GB18030 codec now can encode additional two-byte characters that | ||||||
|   are missing in GBK. |   are missing in GBK. | ||||||
| 
 | 
 | ||||||
|  | - Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE. | ||||||
|  | 
 | ||||||
| - Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot | - Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot | ||||||
|   represent the result in a single character. |   represent the result in a single character. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -391,6 +391,126 @@ utf_16_ex_decode(PyObject *self, | ||||||
|     return tuple; |     return tuple; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static PyObject * | ||||||
|  | utf_32_decode(PyObject *self, | ||||||
|  | 	    PyObject *args) | ||||||
|  | { | ||||||
|  |     const char *data; | ||||||
|  |     Py_ssize_t size; | ||||||
|  |     const char *errors = NULL; | ||||||
|  |     int byteorder = 0; | ||||||
|  |     int final = 0; | ||||||
|  |     Py_ssize_t consumed; | ||||||
|  |     PyObject *decoded; | ||||||
|  | 
 | ||||||
|  |     if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode", | ||||||
|  | 			  &data, &size, &errors, &final)) | ||||||
|  | 	return NULL; | ||||||
|  |     if (size < 0) { | ||||||
|  | 	    PyErr_SetString(PyExc_ValueError, "negative argument"); | ||||||
|  | 	    return 0; | ||||||
|  |     } | ||||||
|  |     consumed = size; /* This is overwritten unless final is true. */ | ||||||
|  |     decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder, | ||||||
|  | 					    final ? NULL : &consumed); | ||||||
|  |     if (decoded == NULL) | ||||||
|  | 	return NULL; | ||||||
|  |     return codec_tuple(decoded, consumed); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static PyObject * | ||||||
|  | utf_32_le_decode(PyObject *self, | ||||||
|  | 		 PyObject *args) | ||||||
|  | { | ||||||
|  |     const char *data; | ||||||
|  |     Py_ssize_t size; | ||||||
|  |     const char *errors = NULL; | ||||||
|  |     int byteorder = -1; | ||||||
|  |     int final = 0; | ||||||
|  |     Py_ssize_t consumed; | ||||||
|  |     PyObject *decoded = NULL; | ||||||
|  | 
 | ||||||
|  |     if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode", | ||||||
|  | 			  &data, &size, &errors, &final)) | ||||||
|  | 	return NULL; | ||||||
|  | 
 | ||||||
|  |     if (size < 0) { | ||||||
|  |           PyErr_SetString(PyExc_ValueError, "negative argument"); | ||||||
|  |           return 0; | ||||||
|  |     } | ||||||
|  |     consumed = size; /* This is overwritten unless final is true. */ | ||||||
|  |     decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, | ||||||
|  | 	&byteorder, final ? NULL : &consumed); | ||||||
|  |     if (decoded == NULL) | ||||||
|  | 	return NULL; | ||||||
|  |     return codec_tuple(decoded, consumed); | ||||||
|  | 
 | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static PyObject * | ||||||
|  | utf_32_be_decode(PyObject *self, | ||||||
|  | 		 PyObject *args) | ||||||
|  | { | ||||||
|  |     const char *data; | ||||||
|  |     Py_ssize_t size; | ||||||
|  |     const char *errors = NULL; | ||||||
|  |     int byteorder = 1; | ||||||
|  |     int final = 0; | ||||||
|  |     Py_ssize_t consumed; | ||||||
|  |     PyObject *decoded = NULL; | ||||||
|  | 
 | ||||||
|  |     if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode", | ||||||
|  | 			  &data, &size, &errors, &final)) | ||||||
|  | 	return NULL; | ||||||
|  |     if (size < 0) { | ||||||
|  |           PyErr_SetString(PyExc_ValueError, "negative argument"); | ||||||
|  |           return 0; | ||||||
|  |     } | ||||||
|  |     consumed = size; /* This is overwritten unless final is true. */ | ||||||
|  |     decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, | ||||||
|  | 	&byteorder, final ? NULL : &consumed); | ||||||
|  |     if (decoded == NULL) | ||||||
|  | 	return NULL; | ||||||
|  |     return codec_tuple(decoded, consumed); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* This non-standard version also provides access to the byteorder
 | ||||||
|  |    parameter of the builtin UTF-32 codec. | ||||||
|  | 
 | ||||||
|  |    It returns a tuple (unicode, bytesread, byteorder) with byteorder | ||||||
|  |    being the value in effect at the end of data. | ||||||
|  | 
 | ||||||
|  | */ | ||||||
|  | 
 | ||||||
|  | static PyObject * | ||||||
|  | utf_32_ex_decode(PyObject *self, | ||||||
|  | 		 PyObject *args) | ||||||
|  | { | ||||||
|  |     const char *data; | ||||||
|  |     Py_ssize_t size; | ||||||
|  |     const char *errors = NULL; | ||||||
|  |     int byteorder = 0; | ||||||
|  |     PyObject *unicode, *tuple; | ||||||
|  |     int final = 0; | ||||||
|  |     Py_ssize_t consumed; | ||||||
|  | 
 | ||||||
|  |     if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode", | ||||||
|  | 			  &data, &size, &errors, &byteorder, &final)) | ||||||
|  | 	return NULL; | ||||||
|  |     if (size < 0) { | ||||||
|  | 	    PyErr_SetString(PyExc_ValueError, "negative argument"); | ||||||
|  | 	    return 0; | ||||||
|  |     } | ||||||
|  |     consumed = size; /* This is overwritten unless final is true. */ | ||||||
|  |     unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder, | ||||||
|  | 					    final ? NULL : &consumed); | ||||||
|  |     if (unicode == NULL) | ||||||
|  | 	return NULL; | ||||||
|  |     tuple = Py_BuildValue("Oni", unicode, consumed, byteorder); | ||||||
|  |     Py_DECREF(unicode); | ||||||
|  |     return tuple; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static PyObject * | static PyObject * | ||||||
| unicode_escape_decode(PyObject *self, | unicode_escape_decode(PyObject *self, | ||||||
| 		     PyObject *args) | 		     PyObject *args) | ||||||
|  | @ -683,6 +803,83 @@ utf_16_be_encode(PyObject *self, | ||||||
|     return v; |     return v; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /* This version provides access to the byteorder parameter of the
 | ||||||
|  |    builtin UTF-32 codecs as optional third argument. It defaults to 0 | ||||||
|  |    which means: use the native byte order and prepend the data with a | ||||||
|  |    BOM mark. | ||||||
|  | 
 | ||||||
|  | */ | ||||||
|  | 
 | ||||||
|  | static PyObject * | ||||||
|  | utf_32_encode(PyObject *self, | ||||||
|  | 	    PyObject *args) | ||||||
|  | { | ||||||
|  |     PyObject *str, *v; | ||||||
|  |     const char *errors = NULL; | ||||||
|  |     int byteorder = 0; | ||||||
|  | 
 | ||||||
|  |     if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode", | ||||||
|  | 			  &str, &errors, &byteorder)) | ||||||
|  | 	return NULL; | ||||||
|  | 
 | ||||||
|  |     str = PyUnicode_FromObject(str); | ||||||
|  |     if (str == NULL) | ||||||
|  | 	return NULL; | ||||||
|  |     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), | ||||||
|  | 					  PyUnicode_GET_SIZE(str), | ||||||
|  | 					  errors, | ||||||
|  | 					  byteorder), | ||||||
|  | 		    PyUnicode_GET_SIZE(str)); | ||||||
|  |     Py_DECREF(str); | ||||||
|  |     return v; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static PyObject * | ||||||
|  | utf_32_le_encode(PyObject *self, | ||||||
|  | 		 PyObject *args) | ||||||
|  | { | ||||||
|  |     PyObject *str, *v; | ||||||
|  |     const char *errors = NULL; | ||||||
|  | 
 | ||||||
|  |     if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode", | ||||||
|  | 			  &str, &errors)) | ||||||
|  | 	return NULL; | ||||||
|  | 
 | ||||||
|  |     str = PyUnicode_FromObject(str); | ||||||
|  |     if (str == NULL) | ||||||
|  | 	return NULL; | ||||||
|  |     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), | ||||||
|  | 					     PyUnicode_GET_SIZE(str), | ||||||
|  | 					     errors, | ||||||
|  | 					     -1), | ||||||
|  | 		       PyUnicode_GET_SIZE(str)); | ||||||
|  |     Py_DECREF(str); | ||||||
|  |     return v; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static PyObject * | ||||||
|  | utf_32_be_encode(PyObject *self, | ||||||
|  | 		 PyObject *args) | ||||||
|  | { | ||||||
|  |     PyObject *str, *v; | ||||||
|  |     const char *errors = NULL; | ||||||
|  | 
 | ||||||
|  |     if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode", | ||||||
|  | 			  &str, &errors)) | ||||||
|  | 	return NULL; | ||||||
|  | 
 | ||||||
|  |     str = PyUnicode_FromObject(str); | ||||||
|  |     if (str == NULL) | ||||||
|  | 	return NULL; | ||||||
|  |     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), | ||||||
|  | 					  PyUnicode_GET_SIZE(str), | ||||||
|  | 					  errors, | ||||||
|  | 					  +1), | ||||||
|  | 		    PyUnicode_GET_SIZE(str)); | ||||||
|  |     Py_DECREF(str); | ||||||
|  |     return v; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static PyObject * | static PyObject * | ||||||
| unicode_escape_encode(PyObject *self, | unicode_escape_encode(PyObject *self, | ||||||
| 		     PyObject *args) | 		     PyObject *args) | ||||||
|  | @ -901,6 +1098,13 @@ static PyMethodDef _codecs_functions[] = { | ||||||
|     {"utf_16_le_decode",	utf_16_le_decode,		METH_VARARGS}, |     {"utf_16_le_decode",	utf_16_le_decode,		METH_VARARGS}, | ||||||
|     {"utf_16_be_decode",	utf_16_be_decode,		METH_VARARGS}, |     {"utf_16_be_decode",	utf_16_be_decode,		METH_VARARGS}, | ||||||
|     {"utf_16_ex_decode",	utf_16_ex_decode,		METH_VARARGS}, |     {"utf_16_ex_decode",	utf_16_ex_decode,		METH_VARARGS}, | ||||||
|  |     {"utf_32_encode",		utf_32_encode,			METH_VARARGS}, | ||||||
|  |     {"utf_32_le_encode",	utf_32_le_encode,		METH_VARARGS}, | ||||||
|  |     {"utf_32_be_encode",	utf_32_be_encode,		METH_VARARGS}, | ||||||
|  |     {"utf_32_decode",		utf_32_decode,			METH_VARARGS}, | ||||||
|  |     {"utf_32_le_decode",	utf_32_le_decode,		METH_VARARGS}, | ||||||
|  |     {"utf_32_be_decode",	utf_32_be_decode,		METH_VARARGS}, | ||||||
|  |     {"utf_32_ex_decode",	utf_32_ex_decode,		METH_VARARGS}, | ||||||
|     {"unicode_escape_encode",	unicode_escape_encode,		METH_VARARGS}, |     {"unicode_escape_encode",	unicode_escape_encode,		METH_VARARGS}, | ||||||
|     {"unicode_escape_decode",	unicode_escape_decode,		METH_VARARGS}, |     {"unicode_escape_decode",	unicode_escape_decode,		METH_VARARGS}, | ||||||
|     {"unicode_internal_encode",	unicode_internal_encode,	METH_VARARGS}, |     {"unicode_internal_encode",	unicode_internal_encode,	METH_VARARGS}, | ||||||
|  |  | ||||||
|  | @ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode) | ||||||
| 				NULL); | 				NULL); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /* --- UTF-32 Codec ------------------------------------------------------- */ | ||||||
|  | 
 | ||||||
|  | PyObject * | ||||||
|  | PyUnicode_DecodeUTF32(const char *s, | ||||||
|  | 		      Py_ssize_t size, | ||||||
|  | 		      const char *errors, | ||||||
|  | 		      int *byteorder) | ||||||
|  | { | ||||||
|  |     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | PyObject * | ||||||
|  | PyUnicode_DecodeUTF32Stateful(const char *s, | ||||||
|  | 			      Py_ssize_t size, | ||||||
|  | 			      const char *errors, | ||||||
|  | 			      int *byteorder, | ||||||
|  | 			      Py_ssize_t *consumed) | ||||||
|  | { | ||||||
|  |     const char *starts = s; | ||||||
|  |     Py_ssize_t startinpos; | ||||||
|  |     Py_ssize_t endinpos; | ||||||
|  |     Py_ssize_t outpos; | ||||||
|  |     PyUnicodeObject *unicode; | ||||||
|  |     Py_UNICODE *p; | ||||||
|  | #ifndef Py_UNICODE_WIDE | ||||||
|  |     int i, pairs; | ||||||
|  | #else | ||||||
|  |     const int pairs = 0; | ||||||
|  | #endif | ||||||
|  |     const unsigned char *q, *e; | ||||||
|  |     int bo = 0;       /* assume native ordering by default */ | ||||||
|  |     const char *errmsg = ""; | ||||||
|  |     /* On narrow builds we split characters outside the BMP into two
 | ||||||
|  |        codepoints => count how much extra space we need. */ | ||||||
|  | #ifndef Py_UNICODE_WIDE | ||||||
|  |     for (i = pairs = 0; i < size/4; i++) | ||||||
|  | 	if (((Py_UCS4 *)s)[i] >= 0x10000) | ||||||
|  | 	    pairs++; | ||||||
|  | #endif | ||||||
|  |     /* Offsets from q for retrieving bytes in the right order. */ | ||||||
|  | #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||||
|  |     int iorder[] = {0, 1, 2, 3}; | ||||||
|  | #else | ||||||
|  |     int iorder[] = {3, 2, 1, 0}; | ||||||
|  | #endif | ||||||
|  |     PyObject *errorHandler = NULL; | ||||||
|  |     PyObject *exc = NULL; | ||||||
|  | 
 | ||||||
|  |     /* This might be one to much, because of a BOM */ | ||||||
|  |     unicode = _PyUnicode_New((size+3)/4+pairs); | ||||||
|  |     if (!unicode) | ||||||
|  |         return NULL; | ||||||
|  |     if (size == 0) | ||||||
|  |         return (PyObject *)unicode; | ||||||
|  | 
 | ||||||
|  |     /* Unpack UTF-32 encoded data */ | ||||||
|  |     p = unicode->str; | ||||||
|  |     q = (unsigned char *)s; | ||||||
|  |     e = q + size; | ||||||
|  | 
 | ||||||
|  |     if (byteorder) | ||||||
|  |         bo = *byteorder; | ||||||
|  | 
 | ||||||
|  |     /* Check for BOM marks (U+FEFF) in the input and adjust current
 | ||||||
|  |        byte order setting accordingly. In native mode, the leading BOM | ||||||
|  |        mark is skipped, in all other modes, it is copied to the output | ||||||
|  |        stream as-is (giving a ZWNBSP character). */ | ||||||
|  |     if (bo == 0) { | ||||||
|  |         if (size >= 4) { | ||||||
|  |             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | | ||||||
|  |                                 (q[iorder[1]] << 8) | q[iorder[0]]; | ||||||
|  | #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||||
|  | 	    if (bom == 0x0000FEFF) { | ||||||
|  | 		q += 4; | ||||||
|  | 		bo = -1; | ||||||
|  | 	    } | ||||||
|  | 	    else if (bom == 0xFFFE0000) { | ||||||
|  | 		q += 4; | ||||||
|  | 		bo = 1; | ||||||
|  | 	    } | ||||||
|  | #else | ||||||
|  | 	    if (bom == 0x0000FEFF) { | ||||||
|  | 		q += 4; | ||||||
|  | 		bo = 1; | ||||||
|  | 	    } | ||||||
|  | 	    else if (bom == 0xFFFE0000) { | ||||||
|  | 		q += 4; | ||||||
|  | 		bo = -1; | ||||||
|  | 	    } | ||||||
|  | #endif | ||||||
|  | 	} | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (bo == -1) { | ||||||
|  |         /* force LE */ | ||||||
|  |         iorder[0] = 0; | ||||||
|  |         iorder[1] = 1; | ||||||
|  |         iorder[2] = 2; | ||||||
|  |         iorder[3] = 3; | ||||||
|  |     } | ||||||
|  |     else if (bo == 1) { | ||||||
|  |         /* force BE */ | ||||||
|  |         iorder[0] = 3; | ||||||
|  |         iorder[1] = 2; | ||||||
|  |         iorder[2] = 1; | ||||||
|  |         iorder[3] = 0; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     while (q < e) { | ||||||
|  | 	Py_UCS4 ch; | ||||||
|  | 	/* remaining bytes at the end? (size should be divisible by 4) */ | ||||||
|  | 	if (e-q<4) { | ||||||
|  | 	    if (consumed) | ||||||
|  | 		break; | ||||||
|  | 	    errmsg = "truncated data"; | ||||||
|  | 	    startinpos = ((const char *)q)-starts; | ||||||
|  | 	    endinpos = ((const char *)e)-starts; | ||||||
|  | 	    goto utf32Error; | ||||||
|  | 	    /* The remaining input chars are ignored if the callback
 | ||||||
|  | 	       chooses to skip the input */ | ||||||
|  | 	} | ||||||
|  | 	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | | ||||||
|  | 	     (q[iorder[1]] << 8) | q[iorder[0]]; | ||||||
|  | 
 | ||||||
|  | 	if (ch >= 0x110000) | ||||||
|  | 	{ | ||||||
|  | 	    errmsg = "codepoint not in range(0x110000)"; | ||||||
|  | 	    startinpos = ((const char *)q)-starts; | ||||||
|  | 	    endinpos = startinpos+4; | ||||||
|  | 	    goto utf32Error; | ||||||
|  | 	} | ||||||
|  | #ifndef Py_UNICODE_WIDE | ||||||
|  | 	if (ch >= 0x10000) | ||||||
|  | 	{ | ||||||
|  | 	    *p++ = 0xD800 | ((ch-0x10000) >> 10); | ||||||
|  | 	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); | ||||||
|  | 	} | ||||||
|  | 	else | ||||||
|  | #endif | ||||||
|  | 	    *p++ = ch; | ||||||
|  | 	q += 4; | ||||||
|  | 	continue; | ||||||
|  |     utf32Error: | ||||||
|  | 	outpos = p-PyUnicode_AS_UNICODE(unicode); | ||||||
|  |     if (unicode_decode_call_errorhandler( | ||||||
|  |          errors, &errorHandler, | ||||||
|  |          "utf32", errmsg, | ||||||
|  |          starts, size, &startinpos, &endinpos, &exc, &s, | ||||||
|  |          (PyObject **)&unicode, &outpos, &p)) | ||||||
|  | 	    goto onError; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (byteorder) | ||||||
|  |         *byteorder = bo; | ||||||
|  | 
 | ||||||
|  |     if (consumed) | ||||||
|  | 	*consumed = (const char *)q-starts; | ||||||
|  | 
 | ||||||
|  |     /* Adjust length */ | ||||||
|  |     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) | ||||||
|  |         goto onError; | ||||||
|  | 
 | ||||||
|  |     Py_XDECREF(errorHandler); | ||||||
|  |     Py_XDECREF(exc); | ||||||
|  |     return (PyObject *)unicode; | ||||||
|  | 
 | ||||||
|  | onError: | ||||||
|  |     Py_DECREF(unicode); | ||||||
|  |     Py_XDECREF(errorHandler); | ||||||
|  |     Py_XDECREF(exc); | ||||||
|  |     return NULL; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | PyObject * | ||||||
|  | PyUnicode_EncodeUTF32(const Py_UNICODE *s, | ||||||
|  | 		      Py_ssize_t size, | ||||||
|  | 		      const char *errors, | ||||||
|  | 		      int byteorder) | ||||||
|  | { | ||||||
|  |     PyObject *v; | ||||||
|  |     unsigned char *p; | ||||||
|  | #ifndef Py_UNICODE_WIDE | ||||||
|  |     int i, pairs; | ||||||
|  | #else | ||||||
|  |     const int pairs = 0; | ||||||
|  | #endif | ||||||
|  |     /* Offsets from p for storing byte pairs in the right order. */ | ||||||
|  | #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||||
|  |     int iorder[] = {0, 1, 2, 3}; | ||||||
|  | #else | ||||||
|  |     int iorder[] = {3, 2, 1, 0}; | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #define STORECHAR(CH)                       \ | ||||||
|  |     do {                                    \ | ||||||
|  |         p[iorder[3]] = ((CH) >> 24) & 0xff; \ | ||||||
|  |         p[iorder[2]] = ((CH) >> 16) & 0xff; \ | ||||||
|  |         p[iorder[1]] = ((CH) >> 8) & 0xff;  \ | ||||||
|  |         p[iorder[0]] = (CH) & 0xff;         \ | ||||||
|  |         p += 4;                             \ | ||||||
|  |     } while(0) | ||||||
|  | 
 | ||||||
|  |     /* In narrow builds we can output surrogate pairs as one codepoint,
 | ||||||
|  |        so we need less space. */ | ||||||
|  | #ifndef Py_UNICODE_WIDE | ||||||
|  |     for (i = pairs = 0; i < size-1; i++) | ||||||
|  | 	if (0xD800 <= s[i] && s[i] <= 0xDBFF && | ||||||
|  | 	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) | ||||||
|  | 	    pairs++; | ||||||
|  | #endif | ||||||
|  |     v = PyString_FromStringAndSize(NULL, | ||||||
|  | 		  4 * (size - pairs + (byteorder == 0))); | ||||||
|  |     if (v == NULL) | ||||||
|  |         return NULL; | ||||||
|  | 
 | ||||||
|  |     p = (unsigned char *)PyString_AS_STRING(v); | ||||||
|  |     if (byteorder == 0) | ||||||
|  | 	STORECHAR(0xFEFF); | ||||||
|  |     if (size == 0) | ||||||
|  |         return v; | ||||||
|  | 
 | ||||||
|  |     if (byteorder == -1) { | ||||||
|  |         /* force LE */ | ||||||
|  |         iorder[0] = 0; | ||||||
|  |         iorder[1] = 1; | ||||||
|  |         iorder[2] = 2; | ||||||
|  |         iorder[3] = 3; | ||||||
|  |     } | ||||||
|  |     else if (byteorder == 1) { | ||||||
|  |         /* force BE */ | ||||||
|  |         iorder[0] = 3; | ||||||
|  |         iorder[1] = 2; | ||||||
|  |         iorder[2] = 1; | ||||||
|  |         iorder[3] = 0; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     while (size-- > 0) { | ||||||
|  | 	Py_UCS4 ch = *s++; | ||||||
|  | #ifndef Py_UNICODE_WIDE | ||||||
|  | 	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { | ||||||
|  | 	    Py_UCS4 ch2 = *s; | ||||||
|  | 	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { | ||||||
|  | 		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; | ||||||
|  | 		s++; | ||||||
|  | 		size--; | ||||||
|  | 	    } | ||||||
|  | 	} | ||||||
|  | #endif | ||||||
|  |         STORECHAR(ch); | ||||||
|  |     } | ||||||
|  |     return v; | ||||||
|  | #undef STORECHAR | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | PyObject *PyUnicode_AsUTF32String(PyObject *unicode) | ||||||
|  | { | ||||||
|  |     if (!PyUnicode_Check(unicode)) { | ||||||
|  |         PyErr_BadArgument(); | ||||||
|  |         return NULL; | ||||||
|  |     } | ||||||
|  |     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), | ||||||
|  | 				 PyUnicode_GET_SIZE(unicode), | ||||||
|  | 				 NULL, | ||||||
|  | 				 0); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /* --- UTF-16 Codec ------------------------------------------------------- */ | /* --- UTF-16 Codec ------------------------------------------------------- */ | ||||||
| 
 | 
 | ||||||
| PyObject * | PyObject * | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Walter Dörwald
						Walter Dörwald