Mirror of https://github.com/python/cpython.git
	Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.
parent 437e6a3b15
commit 6e39080649

12 changed files with 999 additions and 2 deletions
		|  | @ -1301,6 +1301,79 @@ These are the UTF-8 codec APIs: | |||
|    object.  Error handling is "strict".  Return *NULL* if an exception was raised | ||||
|    by the codec. | ||||
| 
 | ||||
| These are the UTF-32 codec APIs: | ||||
| 
 | ||||
| .. % --- UTF-32 Codecs ------------------------------------------------------ */ | ||||
| 
 | ||||
| 
 | ||||
| .. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder) | ||||
| 
 | ||||
|    Decode *size* bytes from a UTF-32 encoded buffer string and return the | ||||
|    corresponding Unicode object.  *errors* (if non-*NULL*) defines the error | ||||
|    handling. It defaults to "strict". | ||||
| 
 | ||||
|    If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte | ||||
|    order:: | ||||
| 
 | ||||
|       *byteorder == -1: little endian | ||||
|       *byteorder == 0:  native order | ||||
|       *byteorder == 1:  big endian | ||||
| 
 | ||||
|    and then switches if the first four bytes of the input data are a byte order mark | ||||
|    (BOM) and the specified byte order is native order.  This BOM is not copied into | ||||
|    the resulting Unicode string.  After completion, *\*byteorder* is set to the | ||||
|    current byte order at the end of input data. | ||||
| 
 | ||||
|    In a narrow build, code points outside the BMP will be decoded as surrogate pairs. | ||||
| 
 | ||||
|    If *byteorder* is *NULL*, the codec starts in native order mode. | ||||
| 
 | ||||
|    Return *NULL* if an exception was raised by the codec. | ||||
| 
 | ||||
|    .. versionadded:: 2.6 | ||||
| 
 | ||||
| 
 | ||||
| .. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed) | ||||
| 
 | ||||
|    If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If | ||||
|    *consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat | ||||
|    trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible | ||||
|    by four) as an error. Those bytes will not be decoded and the number of bytes | ||||
|    that have been decoded will be stored in *consumed*. | ||||
| 
 | ||||
|    .. versionadded:: 2.6 | ||||
| 
 | ||||
| 
 | ||||
| .. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder) | ||||
| 
 | ||||
|    Return a Python string object holding the UTF-32 encoded value of the Unicode | ||||
|    data in *s*.  If *byteorder* is not ``0``, output is written according to the | ||||
|    following byte order:: | ||||
| 
 | ||||
|       byteorder == -1: little endian | ||||
|       byteorder == 0:  native byte order (writes a BOM mark) | ||||
|       byteorder == 1:  big endian | ||||
| 
 | ||||
|    If *byteorder* is ``0``, the output string will always start with the Unicode BOM | ||||
|    mark (U+FEFF). In the other two modes, no BOM mark is prepended. | ||||
| 
 | ||||
|    If *Py_UNICODE_WIDE* is not defined, each surrogate pair in the input will be | ||||
|    output as a single code point. | ||||
| 
 | ||||
|    Return *NULL* if an exception was raised by the codec. | ||||
| 
 | ||||
|    .. versionadded:: 2.6 | ||||
| 
 | ||||
| 
 | ||||
| .. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode) | ||||
| 
 | ||||
|    Return a Python string using the UTF-32 encoding in native byte order. The | ||||
|    string always starts with a BOM mark.  Error handling is "strict".  Return | ||||
|    *NULL* if an exception was raised by the codec. | ||||
| 
 | ||||
|    .. versionadded:: 2.6 | ||||
| 
 | ||||
| 
 | ||||
| These are the UTF-16 codec APIs: | ||||
| 
 | ||||
| .. % --- UTF-16 Codecs ------------------------------------------------------ */ | ||||
|  |  | |||
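The byte-order handling these C APIs describe is also exposed at the Python level through the _codecs wrappers added later in this commit. A minimal sketch, assuming a little-endian input with a BOM (the literal bytes are illustrative only):

    import codecs

    # UTF-32-LE BOM followed by u"A" (0x41) encoded little endian.
    data = "\xff\xfe\x00\x00" + "A\x00\x00\x00"

    # byteorder=0 (native): the BOM is detected and consumed, and the detected
    # order is reported back, mirroring what PyUnicode_DecodeUTF32 does with
    # its *byteorder argument.
    text, consumed, order = codecs.utf_32_ex_decode(data, "strict", 0, True)
    assert (text, consumed, order) == (u"A", 8, -1)   # -1 means little endian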
|  | @ -1045,6 +1045,12 @@ particular, the following variants typically exist: | |||
| | shift_jisx0213  | shiftjisx0213, sjisx0213,      | Japanese                       | | ||||
| |                 | s_jisx0213                     |                                | | ||||
| +-----------------+--------------------------------+--------------------------------+ | ||||
| | utf_32          | U32, utf32                     | all languages                  | | ||||
| +-----------------+--------------------------------+--------------------------------+ | ||||
| | utf_32_be       | UTF-32BE                       | all languages                  | | ||||
| +-----------------+--------------------------------+--------------------------------+ | ||||
| | utf_32_le       | UTF-32LE                       | all languages                  | | ||||
| +-----------------+--------------------------------+--------------------------------+ | ||||
| | utf_16          | U16, utf16                     | all languages                  | | ||||
| +-----------------+--------------------------------+--------------------------------+ | ||||
| | utf_16_be       | UTF-16BE                       | all languages (BMP only)       | | ||||
|  |  | |||
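The aliases listed in the table map straight onto the new codec names, so any of the spellings selects the same encoder. A quick check, assuming the new encodings from this commit are installed:

    # All of these name the same codec; alias lookup is case-insensitive and
    # ignores punctuation differences.
    assert u"spam".encode("U32") == u"spam".encode("utf32") == u"spam".encode("utf-32")
    assert u"spam".encode("UTF-32BE") == u"spam".encode("utf_32_be")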
|  | @ -145,6 +145,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | |||
| # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString | ||||
| # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String | ||||
| # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString | ||||
| # define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String | ||||
| # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String | ||||
| # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String | ||||
| # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode | ||||
|  | @ -159,6 +160,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | |||
| # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap | ||||
| # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 | ||||
| # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape | ||||
| # define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 | ||||
| # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful | ||||
| # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 | ||||
| # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful | ||||
| # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 | ||||
|  | @ -170,6 +173,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | |||
| # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal | ||||
| # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 | ||||
| # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape | ||||
| # define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 | ||||
| # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 | ||||
| # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 | ||||
| # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape | ||||
|  | @ -223,6 +227,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | |||
| # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString | ||||
| # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String | ||||
| # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString | ||||
| # define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String | ||||
| # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String | ||||
| # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String | ||||
| # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode | ||||
|  | @ -237,6 +242,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | |||
| # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap | ||||
| # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 | ||||
| # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape | ||||
| # define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 | ||||
| # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful | ||||
| # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 | ||||
| # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful | ||||
| # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 | ||||
|  | @ -248,6 +255,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | |||
| # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal | ||||
| # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 | ||||
| # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape | ||||
| # define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 | ||||
| # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 | ||||
| # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 | ||||
| # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape | ||||
|  | @ -701,6 +709,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( | |||
|     const char *errors		/* error handling */ | ||||
|     ); | ||||
| 
 | ||||
| /* --- UTF-32 Codecs ------------------------------------------------------ */ | ||||
| 
 | ||||
| /* Decodes length bytes from a UTF-32 encoded buffer string and returns | ||||
|    the corresponding Unicode object. | ||||
| 
 | ||||
|    errors (if non-NULL) defines the error handling. It defaults | ||||
|    to "strict".  | ||||
| 
 | ||||
|    If byteorder is non-NULL, the decoder starts decoding using the | ||||
|    given byte order: | ||||
| 
 | ||||
| 	*byteorder == -1: little endian | ||||
| 	*byteorder == 0:  native order | ||||
| 	*byteorder == 1:  big endian | ||||
| 
 | ||||
|    In native mode, the first four bytes of the stream are checked for a | ||||
|    BOM mark. If found, the BOM mark is analysed, the byte order | ||||
|    adjusted and the BOM skipped.  In the other modes, no BOM mark | ||||
|    interpretation is done. After completion, *byteorder is set to the | ||||
|    current byte order at the end of input data. | ||||
| 
 | ||||
|    If byteorder is NULL, the codec starts in native order mode. | ||||
| 
 | ||||
| */ | ||||
| 
 | ||||
| PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( | ||||
|     const char *string, 	/* UTF-32 encoded string */ | ||||
|     Py_ssize_t length,	 	/* size of string */ | ||||
|     const char *errors,		/* error handling */ | ||||
|     int *byteorder		/* pointer to byteorder to use | ||||
| 				   0=native;-1=LE,1=BE; updated on | ||||
| 				   exit */ | ||||
|     ); | ||||
| 
 | ||||
| PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( | ||||
|     const char *string, 	/* UTF-32 encoded string */ | ||||
|     Py_ssize_t length,	 	/* size of string */ | ||||
|     const char *errors,		/* error handling */ | ||||
|     int *byteorder,		/* pointer to byteorder to use | ||||
| 				   0=native;-1=LE,1=BE; updated on | ||||
| 				   exit */ | ||||
|     Py_ssize_t *consumed	/* bytes consumed */ | ||||
|     ); | ||||
| 
 | ||||
| /* Returns a Python string using the UTF-32 encoding in native byte | ||||
|    order. The string always starts with a BOM mark.  */ | ||||
| 
 | ||||
| PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( | ||||
|     PyObject *unicode	 	/* Unicode object */ | ||||
|     ); | ||||
| 
 | ||||
| /* Returns a Python string object holding the UTF-32 encoded value of | ||||
|    the Unicode data. | ||||
| 
 | ||||
|    If byteorder is not 0, output is written according to the following | ||||
|    byte order: | ||||
| 
 | ||||
|    byteorder == -1: little endian | ||||
|    byteorder == 0:  native byte order (writes a BOM mark) | ||||
|    byteorder == 1:  big endian | ||||
| 
 | ||||
|    If byteorder is 0, the output string will always start with the | ||||
|    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is | ||||
|    prepended. | ||||
| 
 | ||||
| */ | ||||
| 
 | ||||
| PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( | ||||
|     const Py_UNICODE *data, 	/* Unicode char buffer */ | ||||
|     Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */ | ||||
|     const char *errors,		/* error handling */ | ||||
|     int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */ | ||||
|     ); | ||||
| 
 | ||||
| /* --- UTF-16 Codecs ------------------------------------------------------ */ | ||||
| 
 | ||||
| /* Decodes length bytes from a UTF-16 encoded buffer string and returns | ||||
|  |  | |||
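The *consumed* handling declared for PyUnicode_DecodeUTF32Stateful is what the Python-level codecs.utf_32_decode wrapper (added below in Modules/_codecsmodule.c) relies on when its final flag is false. A small sketch of the intended behaviour, with illustrative byte values:

    import codecs

    # BOM + u"a" (little endian), plus two stray bytes of an unfinished code point.
    chunk = "\xff\xfe\x00\x00" + "a\x00\x00\x00" + "\x62\x00"

    # With final=False the trailing partial sequence is not an error; it is left
    # unconsumed for a later call, as the returned byte count reports.
    text, consumed = codecs.utf_32_decode(chunk, "strict", False)
    assert (text, consumed) == (u"a", 8)

    # With final=True the same trailing bytes are reported as truncated data.
    try:
        codecs.utf_32_decode(chunk, "strict", True)
    except UnicodeDecodeError:
        pass
    else:
        raise AssertionError("expected a truncated-data error")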
|  | @ -490,6 +490,16 @@ | |||
|     'unicodelittleunmarked' : 'utf_16_le', | ||||
|     'utf_16le'           : 'utf_16_le', | ||||
| 
 | ||||
|     # utf_32 codec | ||||
|     'u32'                : 'utf_32', | ||||
|     'utf32'              : 'utf_32', | ||||
| 
 | ||||
|     # utf_32_be codec | ||||
|     'utf_32be'           : 'utf_32_be', | ||||
| 
 | ||||
|     # utf_32_le codec | ||||
|     'utf_32le'           : 'utf_32_le', | ||||
| 
 | ||||
|     # utf_7 codec | ||||
|     'u7'                 : 'utf_7', | ||||
|     'utf7'               : 'utf_7', | ||||
|  |  | |||
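codecs.lookup() resolves these aliases through the encodings package, so every spelling ends up at the same CodecInfo. A sketch, assuming the new utf_32 modules from this commit are importable:

    import codecs

    # 'U32' is lowercased by codecs.lookup, mapped to 'utf_32' by the alias table
    # above, and served by Lib/encodings/utf_32.py's getregentry().
    assert codecs.lookup("U32").name == "utf-32"
    assert codecs.lookup("UTF-32LE").name == "utf-32-le"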
							
								
								
									
144 Lib/encodings/utf_32.py (Normal file)
|  | @ -0,0 +1,144 @@ | |||
| """ | ||||
| Python 'utf-32' Codec | ||||
| """ | ||||
| import codecs, sys | ||||
| 
 | ||||
| ### Codec APIs | ||||
| 
 | ||||
| encode = codecs.utf_32_encode | ||||
| 
 | ||||
| def decode(input, errors='strict'): | ||||
|     return codecs.utf_32_decode(input, errors, True) | ||||
| 
 | ||||
| class IncrementalEncoder(codecs.IncrementalEncoder): | ||||
|     def __init__(self, errors='strict'): | ||||
|         codecs.IncrementalEncoder.__init__(self, errors) | ||||
|         self.encoder = None | ||||
| 
 | ||||
|     def encode(self, input, final=False): | ||||
|         if self.encoder is None: | ||||
|             result = codecs.utf_32_encode(input, self.errors)[0] | ||||
|             if sys.byteorder == 'little': | ||||
|                 self.encoder = codecs.utf_32_le_encode | ||||
|             else: | ||||
|                 self.encoder = codecs.utf_32_be_encode | ||||
|             return result | ||||
|         return self.encoder(input, self.errors)[0] | ||||
| 
 | ||||
|     def reset(self): | ||||
|         codecs.IncrementalEncoder.reset(self) | ||||
|         self.encoder = None | ||||
| 
 | ||||
|     def getstate(self): | ||||
|         # state info we return to the caller: | ||||
|         # 0: stream is in natural order for this platform | ||||
|         # 2: endianness hasn't been determined yet | ||||
|         # (we're never writing in unnatural order) | ||||
|         return (2 if self.encoder is None else 0) | ||||
| 
 | ||||
|     def setstate(self, state): | ||||
|         if state: | ||||
|             self.encoder = None | ||||
|         else: | ||||
|             if sys.byteorder == 'little': | ||||
|                 self.encoder = codecs.utf_32_le_encode | ||||
|             else: | ||||
|                 self.encoder = codecs.utf_32_be_encode | ||||
| 
 | ||||
| class IncrementalDecoder(codecs.BufferedIncrementalDecoder): | ||||
|     def __init__(self, errors='strict'): | ||||
|         codecs.BufferedIncrementalDecoder.__init__(self, errors) | ||||
|         self.decoder = None | ||||
| 
 | ||||
|     def _buffer_decode(self, input, errors, final): | ||||
|         if self.decoder is None: | ||||
|             (output, consumed, byteorder) = \ | ||||
|                 codecs.utf_32_ex_decode(input, errors, 0, final) | ||||
|             if byteorder == -1: | ||||
|                 self.decoder = codecs.utf_32_le_decode | ||||
|             elif byteorder == 1: | ||||
|                 self.decoder = codecs.utf_32_be_decode | ||||
|             elif consumed >= 4: | ||||
|                 raise UnicodeError("UTF-32 stream does not start with BOM") | ||||
|             return (output, consumed) | ||||
|         return self.decoder(input, self.errors, final) | ||||
| 
 | ||||
|     def reset(self): | ||||
|         codecs.BufferedIncrementalDecoder.reset(self) | ||||
|         self.decoder = None | ||||
| 
 | ||||
|     def getstate(self): | ||||
|         # additional state info from the base class must be None here, | ||||
|         # as it isn't passed along to the caller | ||||
|         state = codecs.BufferedIncrementalDecoder.getstate(self)[0] | ||||
|         # additional state info we pass to the caller: | ||||
|         # 0: stream is in natural order for this platform | ||||
|         # 1: stream is in unnatural order | ||||
|         # 2: endianness hasn't been determined yet | ||||
|         if self.decoder is None: | ||||
|             return (state, 2) | ||||
|         addstate = int((sys.byteorder == "big") != | ||||
|                        (self.decoder is codecs.utf_32_be_decode)) | ||||
|         return (state, addstate) | ||||
| 
 | ||||
|     def setstate(self, state): | ||||
|         # state[1] will be ignored by BufferedIncrementalDecoder.setstate() | ||||
|         codecs.BufferedIncrementalDecoder.setstate(self, state) | ||||
|         state = state[1] | ||||
|         if state == 0: | ||||
|             self.decoder = (codecs.utf_32_be_decode | ||||
|                             if sys.byteorder == "big" | ||||
|                             else codecs.utf_32_le_decode) | ||||
|         elif state == 1: | ||||
|             self.decoder = (codecs.utf_32_le_decode | ||||
|                             if sys.byteorder == "big" | ||||
|                             else codecs.utf_32_be_decode) | ||||
|         else: | ||||
|             self.decoder = None | ||||
| 
 | ||||
| class StreamWriter(codecs.StreamWriter): | ||||
|     def __init__(self, stream, errors='strict'): | ||||
|         self.bom_written = False | ||||
|         codecs.StreamWriter.__init__(self, stream, errors) | ||||
| 
 | ||||
|     def encode(self, input, errors='strict'): | ||||
|         self.bom_written = True | ||||
|         result = codecs.utf_32_encode(input, errors) | ||||
|         if sys.byteorder == 'little': | ||||
|             self.encode = codecs.utf_32_le_encode | ||||
|         else: | ||||
|             self.encode = codecs.utf_32_be_encode | ||||
|         return result | ||||
| 
 | ||||
| class StreamReader(codecs.StreamReader): | ||||
| 
 | ||||
|     def reset(self): | ||||
|         codecs.StreamReader.reset(self) | ||||
|         try: | ||||
|             del self.decode | ||||
|         except AttributeError: | ||||
|             pass | ||||
| 
 | ||||
|     def decode(self, input, errors='strict'): | ||||
|         (object, consumed, byteorder) = \ | ||||
|             codecs.utf_32_ex_decode(input, errors, 0, False) | ||||
|         if byteorder == -1: | ||||
|             self.decode = codecs.utf_32_le_decode | ||||
|         elif byteorder == 1: | ||||
|             self.decode = codecs.utf_32_be_decode | ||||
|         elif consumed >= 4: | ||||
|             raise UnicodeError("UTF-32 stream does not start with BOM") | ||||
|         return (object, consumed) | ||||
| 
 | ||||
| ### encodings module API | ||||
| 
 | ||||
| def getregentry(): | ||||
|     return codecs.CodecInfo( | ||||
|         name='utf-32', | ||||
|         encode=encode, | ||||
|         decode=decode, | ||||
|         incrementalencoder=IncrementalEncoder, | ||||
|         incrementaldecoder=IncrementalDecoder, | ||||
|         streamreader=StreamReader, | ||||
|         streamwriter=StreamWriter, | ||||
|     ) | ||||
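The IncrementalEncoder above writes a BOM only on its first call (via codecs.utf_32_encode in native order) and then pins itself to the matching single-endian encoder. Roughly:

    import codecs

    enc = codecs.getincrementalencoder("utf-32")()
    first = enc.encode(u"a")      # 4-byte BOM + one 4-byte code point
    second = enc.encode(u"b")     # no BOM, same byte order as the first chunk
    assert len(first) == 8 and len(second) == 4
    assert first[:4] in ("\xff\xfe\x00\x00", "\x00\x00\xfe\xff")

    enc.reset()                   # forgets the chosen encoder...
    assert enc.encode(u"c")[:4] == first[:4]   # ...so the next chunk gets a BOM again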
							
								
								
									
37 Lib/encodings/utf_32_be.py (Normal file)
|  | @ -0,0 +1,37 @@ | |||
| """ | ||||
| Python 'utf-32-be' Codec | ||||
| """ | ||||
| import codecs | ||||
| 
 | ||||
| ### Codec APIs | ||||
| 
 | ||||
| encode = codecs.utf_32_be_encode | ||||
| 
 | ||||
| def decode(input, errors='strict'): | ||||
|     return codecs.utf_32_be_decode(input, errors, True) | ||||
| 
 | ||||
| class IncrementalEncoder(codecs.IncrementalEncoder): | ||||
|     def encode(self, input, final=False): | ||||
|         return codecs.utf_32_be_encode(input, self.errors)[0] | ||||
| 
 | ||||
| class IncrementalDecoder(codecs.BufferedIncrementalDecoder): | ||||
|     _buffer_decode = codecs.utf_32_be_decode | ||||
| 
 | ||||
| class StreamWriter(codecs.StreamWriter): | ||||
|     encode = codecs.utf_32_be_encode | ||||
| 
 | ||||
| class StreamReader(codecs.StreamReader): | ||||
|     decode = codecs.utf_32_be_decode | ||||
| 
 | ||||
| ### encodings module API | ||||
| 
 | ||||
| def getregentry(): | ||||
|     return codecs.CodecInfo( | ||||
|         name='utf-32-be', | ||||
|         encode=encode, | ||||
|         decode=decode, | ||||
|         incrementalencoder=IncrementalEncoder, | ||||
|         incrementaldecoder=IncrementalDecoder, | ||||
|         streamreader=StreamReader, | ||||
|         streamwriter=StreamWriter, | ||||
|     ) | ||||
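utf_32_be (and its little-endian twin below) are thin wrappers around the fixed-endian C codecs, so there is no BOM and the byte layout is fully determined. The new tests exercise exactly this; for example:

    # Big endian, no BOM: each code point becomes four bytes, most significant first.
    assert u"\U00010203".encode("utf-32-be") == "\x00\x01\x02\x03"
    assert "\x00\x01\x02\x03".decode("utf-32-be") == u"\U00010203"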
							
								
								
									
37 Lib/encodings/utf_32_le.py (Normal file)
|  | @ -0,0 +1,37 @@ | |||
| """ | ||||
| Python 'utf-32-le' Codec | ||||
| """ | ||||
| import codecs | ||||
| 
 | ||||
| ### Codec APIs | ||||
| 
 | ||||
| encode = codecs.utf_32_le_encode | ||||
| 
 | ||||
| def decode(input, errors='strict'): | ||||
|     return codecs.utf_32_le_decode(input, errors, True) | ||||
| 
 | ||||
| class IncrementalEncoder(codecs.IncrementalEncoder): | ||||
|     def encode(self, input, final=False): | ||||
|         return codecs.utf_32_le_encode(input, self.errors)[0] | ||||
| 
 | ||||
| class IncrementalDecoder(codecs.BufferedIncrementalDecoder): | ||||
|     _buffer_decode = codecs.utf_32_le_decode | ||||
| 
 | ||||
| class StreamWriter(codecs.StreamWriter): | ||||
|     encode = codecs.utf_32_le_encode | ||||
| 
 | ||||
| class StreamReader(codecs.StreamReader): | ||||
|     decode = codecs.utf_32_le_decode | ||||
| 
 | ||||
| ### encodings module API | ||||
| 
 | ||||
| def getregentry(): | ||||
|     return codecs.CodecInfo( | ||||
|         name='utf-32-le', | ||||
|         encode=encode, | ||||
|         decode=decode, | ||||
|         incrementalencoder=IncrementalEncoder, | ||||
|         incrementaldecoder=IncrementalDecoder, | ||||
|         streamreader=StreamReader, | ||||
|         streamwriter=StreamWriter, | ||||
|     ) | ||||
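The stream classes in utf_32.py replace their own encode/decode methods once the byte order is settled, so a written stream carries exactly one BOM. This is the pattern the new test_only_one_bom test checks; a sketch of the same round trip:

    import codecs, StringIO

    buf = StringIO.StringIO()
    writer = codecs.getwriter("utf-32")(buf)
    writer.write(u"spam")
    writer.write(u"spam")      # reuses the fixed-endian encoder: no second BOM
    data = buf.getvalue()
    assert data.count("\xff\xfe\x00\x00") + data.count("\x00\x00\xfe\xff") == 1

    reader = codecs.getreader("utf-32")(StringIO.StringIO(data))
    assert reader.read() == u"spamspam"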
|  | @ -285,7 +285,8 @@ def handler2(exc): | |||
| 
 | ||||
|     def test_longstrings(self): | ||||
|         # test long strings to check for memory overflow problems | ||||
|         errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"] | ||||
|         errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", | ||||
|                    "backslashreplace"] | ||||
|         # register the handlers under different names, | ||||
|         # to prevent the codec from recognizing the name | ||||
|         for err in errors: | ||||
|  | @ -293,7 +294,8 @@ def test_longstrings(self): | |||
|         l = 1000 | ||||
|         errors += [ "test." + err for err in errors ] | ||||
|         for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]: | ||||
|             for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"): | ||||
|             for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", | ||||
|                         "utf-8", "utf-7", "utf-16", "utf-32"): | ||||
|                 for err in errors: | ||||
|                     try: | ||||
|                         uni.encode(enc, err) | ||||
|  |  | |||
|  | @ -244,6 +244,137 @@ def test_bug1098990_b(self): | |||
|         self.assertEqual(reader.readline(), s5) | ||||
|         self.assertEqual(reader.readline(), u"") | ||||
| 
 | ||||
| class UTF32Test(ReadTest): | ||||
|     encoding = "utf-32" | ||||
| 
 | ||||
|     spamle = ('\xff\xfe\x00\x00' | ||||
|               's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' | ||||
|               's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00') | ||||
|     spambe = ('\x00\x00\xfe\xff' | ||||
|               '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' | ||||
|               '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') | ||||
| 
 | ||||
|     def test_only_one_bom(self): | ||||
|         _,_,reader,writer = codecs.lookup(self.encoding) | ||||
|         # encode some stream | ||||
|         s = StringIO.StringIO() | ||||
|         f = writer(s) | ||||
|         f.write(u"spam") | ||||
|         f.write(u"spam") | ||||
|         d = s.getvalue() | ||||
|         # check whether there is exactly one BOM in it | ||||
|         self.assert_(d == self.spamle or d == self.spambe) | ||||
|         # try to read it back | ||||
|         s = StringIO.StringIO(d) | ||||
|         f = reader(s) | ||||
|         self.assertEquals(f.read(), u"spamspam") | ||||
| 
 | ||||
|     def test_badbom(self): | ||||
|         s = StringIO.StringIO(4*"\xff") | ||||
|         f = codecs.getreader(self.encoding)(s) | ||||
|         self.assertRaises(UnicodeError, f.read) | ||||
| 
 | ||||
|         s = StringIO.StringIO(8*"\xff") | ||||
|         f = codecs.getreader(self.encoding)(s) | ||||
|         self.assertRaises(UnicodeError, f.read) | ||||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             u"\x00\xff\u0100\uffff", | ||||
|             [ | ||||
|                 u"", # first byte of BOM read | ||||
|                 u"", # second byte of BOM read | ||||
|                 u"", # third byte of BOM read | ||||
|                 u"", # fourth byte of BOM read => byteorder known | ||||
|                 u"", | ||||
|                 u"", | ||||
|                 u"", | ||||
|                 u"\x00", | ||||
|                 u"\x00", | ||||
|                 u"\x00", | ||||
|                 u"\x00", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100\uffff", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|     def test_errors(self): | ||||
|         self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, | ||||
|                           "\xff", "strict", True) | ||||
| 
 | ||||
| class UTF32LETest(ReadTest): | ||||
|     encoding = "utf-32-le" | ||||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             u"\x00\xff\u0100\uffff", | ||||
|             [ | ||||
|                 u"", | ||||
|                 u"", | ||||
|                 u"", | ||||
|                 u"\x00", | ||||
|                 u"\x00", | ||||
|                 u"\x00", | ||||
|                 u"\x00", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100\uffff", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|     def test_simple(self): | ||||
|         self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00") | ||||
| 
 | ||||
|     def test_errors(self): | ||||
|         self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, | ||||
|                           "\xff", "strict", True) | ||||
| 
 | ||||
| class UTF32BETest(ReadTest): | ||||
|     encoding = "utf-32-be" | ||||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             u"\x00\xff\u0100\uffff", | ||||
|             [ | ||||
|                 u"", | ||||
|                 u"", | ||||
|                 u"", | ||||
|                 u"\x00", | ||||
|                 u"\x00", | ||||
|                 u"\x00", | ||||
|                 u"\x00", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100", | ||||
|                 u"\x00\xff\u0100\uffff", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|     def test_simple(self): | ||||
|         self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03") | ||||
| 
 | ||||
|     def test_errors(self): | ||||
|         self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, | ||||
|                           "\xff", "strict", True) | ||||
| 
 | ||||
| class UTF16Test(ReadTest): | ||||
|     encoding = "utf-16" | ||||
| 
 | ||||
|  | @ -1278,6 +1409,9 @@ def test_streamreaderwriter(self): | |||
| 
 | ||||
| def test_main(): | ||||
|     test_support.run_unittest( | ||||
|         UTF32Test, | ||||
|         UTF32LETest, | ||||
|         UTF32BETest, | ||||
|         UTF16Test, | ||||
|         UTF16LETest, | ||||
|         UTF16BETest, | ||||
|  |  | |||
|  | @ -243,6 +243,8 @@ Library | |||
| - GB18030 codec now can encode additional two-byte characters that | ||||
|   are missing in GBK. | ||||
| 
 | ||||
| - Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE. | ||||
| 
 | ||||
| - Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot | ||||
|   represent the result in a single character. | ||||
| 
 | ||||
|  |  | |||
|  | @ -391,6 +391,126 @@ utf_16_ex_decode(PyObject *self, | |||
|     return tuple; | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| utf_32_decode(PyObject *self, | ||||
| 	    PyObject *args) | ||||
| { | ||||
|     const char *data; | ||||
|     Py_ssize_t size; | ||||
|     const char *errors = NULL; | ||||
|     int byteorder = 0; | ||||
|     int final = 0; | ||||
|     Py_ssize_t consumed; | ||||
|     PyObject *decoded; | ||||
| 
 | ||||
|     if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode", | ||||
| 			  &data, &size, &errors, &final)) | ||||
| 	return NULL; | ||||
|     if (size < 0) { | ||||
| 	    PyErr_SetString(PyExc_ValueError, "negative argument"); | ||||
| 	    return 0; | ||||
|     } | ||||
|     consumed = size; /* This is overwritten unless final is true. */ | ||||
|     decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder, | ||||
| 					    final ? NULL : &consumed); | ||||
|     if (decoded == NULL) | ||||
| 	return NULL; | ||||
|     return codec_tuple(decoded, consumed); | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| utf_32_le_decode(PyObject *self, | ||||
| 		 PyObject *args) | ||||
| { | ||||
|     const char *data; | ||||
|     Py_ssize_t size; | ||||
|     const char *errors = NULL; | ||||
|     int byteorder = -1; | ||||
|     int final = 0; | ||||
|     Py_ssize_t consumed; | ||||
|     PyObject *decoded = NULL; | ||||
| 
 | ||||
|     if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode", | ||||
| 			  &data, &size, &errors, &final)) | ||||
| 	return NULL; | ||||
| 
 | ||||
|     if (size < 0) { | ||||
|           PyErr_SetString(PyExc_ValueError, "negative argument"); | ||||
|           return 0; | ||||
|     } | ||||
|     consumed = size; /* This is overwritten unless final is true. */ | ||||
|     decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, | ||||
| 	&byteorder, final ? NULL : &consumed); | ||||
|     if (decoded == NULL) | ||||
| 	return NULL; | ||||
|     return codec_tuple(decoded, consumed); | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| utf_32_be_decode(PyObject *self, | ||||
| 		 PyObject *args) | ||||
| { | ||||
|     const char *data; | ||||
|     Py_ssize_t size; | ||||
|     const char *errors = NULL; | ||||
|     int byteorder = 1; | ||||
|     int final = 0; | ||||
|     Py_ssize_t consumed; | ||||
|     PyObject *decoded = NULL; | ||||
| 
 | ||||
|     if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode", | ||||
| 			  &data, &size, &errors, &final)) | ||||
| 	return NULL; | ||||
|     if (size < 0) { | ||||
|           PyErr_SetString(PyExc_ValueError, "negative argument"); | ||||
|           return 0; | ||||
|     } | ||||
|     consumed = size; /* This is overwritten unless final is true. */ | ||||
|     decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, | ||||
| 	&byteorder, final ? NULL : &consumed); | ||||
|     if (decoded == NULL) | ||||
| 	return NULL; | ||||
|     return codec_tuple(decoded, consumed); | ||||
| } | ||||
| 
 | ||||
| /* This non-standard version also provides access to the byteorder | ||||
|    parameter of the builtin UTF-32 codec. | ||||
| 
 | ||||
|    It returns a tuple (unicode, bytesread, byteorder) with byteorder | ||||
|    being the value in effect at the end of data. | ||||
| 
 | ||||
| */ | ||||
| 
 | ||||
| static PyObject * | ||||
| utf_32_ex_decode(PyObject *self, | ||||
| 		 PyObject *args) | ||||
| { | ||||
|     const char *data; | ||||
|     Py_ssize_t size; | ||||
|     const char *errors = NULL; | ||||
|     int byteorder = 0; | ||||
|     PyObject *unicode, *tuple; | ||||
|     int final = 0; | ||||
|     Py_ssize_t consumed; | ||||
| 
 | ||||
|     if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode", | ||||
| 			  &data, &size, &errors, &byteorder, &final)) | ||||
| 	return NULL; | ||||
|     if (size < 0) { | ||||
| 	    PyErr_SetString(PyExc_ValueError, "negative argument"); | ||||
| 	    return 0; | ||||
|     } | ||||
|     consumed = size; /* This is overwritten unless final is true. */ | ||||
|     unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder, | ||||
| 					    final ? NULL : &consumed); | ||||
|     if (unicode == NULL) | ||||
| 	return NULL; | ||||
|     tuple = Py_BuildValue("Oni", unicode, consumed, byteorder); | ||||
|     Py_DECREF(unicode); | ||||
|     return tuple; | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| unicode_escape_decode(PyObject *self, | ||||
| 		     PyObject *args) | ||||
|  | @ -683,6 +803,83 @@ utf_16_be_encode(PyObject *self, | |||
|     return v; | ||||
| } | ||||
| 
 | ||||
| /* This version provides access to the byteorder parameter of the | ||||
|    builtin UTF-32 codecs as optional third argument. It defaults to 0 | ||||
|    which means: use the native byte order and prepend the data with a | ||||
|    BOM mark. | ||||
| 
 | ||||
| */ | ||||
| 
 | ||||
| static PyObject * | ||||
| utf_32_encode(PyObject *self, | ||||
| 	    PyObject *args) | ||||
| { | ||||
|     PyObject *str, *v; | ||||
|     const char *errors = NULL; | ||||
|     int byteorder = 0; | ||||
| 
 | ||||
|     if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode", | ||||
| 			  &str, &errors, &byteorder)) | ||||
| 	return NULL; | ||||
| 
 | ||||
|     str = PyUnicode_FromObject(str); | ||||
|     if (str == NULL) | ||||
| 	return NULL; | ||||
|     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), | ||||
| 					  PyUnicode_GET_SIZE(str), | ||||
| 					  errors, | ||||
| 					  byteorder), | ||||
| 		    PyUnicode_GET_SIZE(str)); | ||||
|     Py_DECREF(str); | ||||
|     return v; | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| utf_32_le_encode(PyObject *self, | ||||
| 		 PyObject *args) | ||||
| { | ||||
|     PyObject *str, *v; | ||||
|     const char *errors = NULL; | ||||
| 
 | ||||
|     if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode", | ||||
| 			  &str, &errors)) | ||||
| 	return NULL; | ||||
| 
 | ||||
|     str = PyUnicode_FromObject(str); | ||||
|     if (str == NULL) | ||||
| 	return NULL; | ||||
|     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), | ||||
| 					     PyUnicode_GET_SIZE(str), | ||||
| 					     errors, | ||||
| 					     -1), | ||||
| 		       PyUnicode_GET_SIZE(str)); | ||||
|     Py_DECREF(str); | ||||
|     return v; | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| utf_32_be_encode(PyObject *self, | ||||
| 		 PyObject *args) | ||||
| { | ||||
|     PyObject *str, *v; | ||||
|     const char *errors = NULL; | ||||
| 
 | ||||
|     if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode", | ||||
| 			  &str, &errors)) | ||||
| 	return NULL; | ||||
| 
 | ||||
|     str = PyUnicode_FromObject(str); | ||||
|     if (str == NULL) | ||||
| 	return NULL; | ||||
|     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), | ||||
| 					  PyUnicode_GET_SIZE(str), | ||||
| 					  errors, | ||||
| 					  +1), | ||||
| 		    PyUnicode_GET_SIZE(str)); | ||||
|     Py_DECREF(str); | ||||
|     return v; | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| unicode_escape_encode(PyObject *self, | ||||
| 		     PyObject *args) | ||||
|  | @ -901,6 +1098,13 @@ static PyMethodDef _codecs_functions[] = { | |||
|     {"utf_16_le_decode",	utf_16_le_decode,		METH_VARARGS}, | ||||
|     {"utf_16_be_decode",	utf_16_be_decode,		METH_VARARGS}, | ||||
|     {"utf_16_ex_decode",	utf_16_ex_decode,		METH_VARARGS}, | ||||
|     {"utf_32_encode",		utf_32_encode,			METH_VARARGS}, | ||||
|     {"utf_32_le_encode",	utf_32_le_encode,		METH_VARARGS}, | ||||
|     {"utf_32_be_encode",	utf_32_be_encode,		METH_VARARGS}, | ||||
|     {"utf_32_decode",		utf_32_decode,			METH_VARARGS}, | ||||
|     {"utf_32_le_decode",	utf_32_le_decode,		METH_VARARGS}, | ||||
|     {"utf_32_be_decode",	utf_32_be_decode,		METH_VARARGS}, | ||||
|     {"utf_32_ex_decode",	utf_32_ex_decode,		METH_VARARGS}, | ||||
|     {"unicode_escape_encode",	unicode_escape_encode,		METH_VARARGS}, | ||||
|     {"unicode_escape_decode",	unicode_escape_decode,		METH_VARARGS}, | ||||
|     {"unicode_internal_encode",	unicode_internal_encode,	METH_VARARGS}, | ||||
|  |  | |||
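The module-level encode wrappers registered above take the byte order as an optional third argument, as the comment in utf_32_encode explains. A short illustration of the three modes, assuming the module is built with this patch:

    import codecs

    # byteorder: -1 little endian, +1 big endian, 0 (the default) means
    # native order with a BOM prepended.
    assert codecs.utf_32_encode(u"A", "strict", -1)[0] == "A\x00\x00\x00"
    assert codecs.utf_32_encode(u"A", "strict", +1)[0] == "\x00\x00\x00A"
    assert codecs.utf_32_encode(u"A")[0][:4] in ("\xff\xfe\x00\x00", "\x00\x00\xfe\xff")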
|  | @ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode) | |||
| 				NULL); | ||||
| } | ||||
| 
 | ||||
| /* --- UTF-32 Codec ------------------------------------------------------- */ | ||||
| 
 | ||||
| PyObject * | ||||
| PyUnicode_DecodeUTF32(const char *s, | ||||
| 		      Py_ssize_t size, | ||||
| 		      const char *errors, | ||||
| 		      int *byteorder) | ||||
| { | ||||
|     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); | ||||
| } | ||||
| 
 | ||||
| PyObject * | ||||
| PyUnicode_DecodeUTF32Stateful(const char *s, | ||||
| 			      Py_ssize_t size, | ||||
| 			      const char *errors, | ||||
| 			      int *byteorder, | ||||
| 			      Py_ssize_t *consumed) | ||||
| { | ||||
|     const char *starts = s; | ||||
|     Py_ssize_t startinpos; | ||||
|     Py_ssize_t endinpos; | ||||
|     Py_ssize_t outpos; | ||||
|     PyUnicodeObject *unicode; | ||||
|     Py_UNICODE *p; | ||||
| #ifndef Py_UNICODE_WIDE | ||||
|     int i, pairs; | ||||
| #else | ||||
|     const int pairs = 0; | ||||
| #endif | ||||
|     const unsigned char *q, *e; | ||||
|     int bo = 0;       /* assume native ordering by default */ | ||||
|     const char *errmsg = ""; | ||||
|     /* On narrow builds we split characters outside the BMP into two | ||||
|        codepoints => count how much extra space we need. */ | ||||
| #ifndef Py_UNICODE_WIDE | ||||
|     for (i = pairs = 0; i < size/4; i++) | ||||
| 	if (((Py_UCS4 *)s)[i] >= 0x10000) | ||||
| 	    pairs++; | ||||
| #endif | ||||
|     /* Offsets from q for retrieving bytes in the right order. */ | ||||
| #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||
|     int iorder[] = {0, 1, 2, 3}; | ||||
| #else | ||||
|     int iorder[] = {3, 2, 1, 0}; | ||||
| #endif | ||||
|     PyObject *errorHandler = NULL; | ||||
|     PyObject *exc = NULL; | ||||
| 
 | ||||
|     /* This might be one too many, because of a BOM */ | ||||
|     unicode = _PyUnicode_New((size+3)/4+pairs); | ||||
|     if (!unicode) | ||||
|         return NULL; | ||||
|     if (size == 0) | ||||
|         return (PyObject *)unicode; | ||||
| 
 | ||||
|     /* Unpack UTF-32 encoded data */ | ||||
|     p = unicode->str; | ||||
|     q = (unsigned char *)s; | ||||
|     e = q + size; | ||||
| 
 | ||||
|     if (byteorder) | ||||
|         bo = *byteorder; | ||||
| 
 | ||||
|     /* Check for BOM marks (U+FEFF) in the input and adjust current | ||||
|        byte order setting accordingly. In native mode, the leading BOM | ||||
|        mark is skipped, in all other modes, it is copied to the output | ||||
|        stream as-is (giving a ZWNBSP character). */ | ||||
|     if (bo == 0) { | ||||
|         if (size >= 4) { | ||||
|             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | | ||||
|                                 (q[iorder[1]] << 8) | q[iorder[0]]; | ||||
| #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||
| 	    if (bom == 0x0000FEFF) { | ||||
| 		q += 4; | ||||
| 		bo = -1; | ||||
| 	    } | ||||
| 	    else if (bom == 0xFFFE0000) { | ||||
| 		q += 4; | ||||
| 		bo = 1; | ||||
| 	    } | ||||
| #else | ||||
| 	    if (bom == 0x0000FEFF) { | ||||
| 		q += 4; | ||||
| 		bo = 1; | ||||
| 	    } | ||||
| 	    else if (bom == 0xFFFE0000) { | ||||
| 		q += 4; | ||||
| 		bo = -1; | ||||
| 	    } | ||||
| #endif | ||||
| 	} | ||||
|     } | ||||
| 
 | ||||
|     if (bo == -1) { | ||||
|         /* force LE */ | ||||
|         iorder[0] = 0; | ||||
|         iorder[1] = 1; | ||||
|         iorder[2] = 2; | ||||
|         iorder[3] = 3; | ||||
|     } | ||||
|     else if (bo == 1) { | ||||
|         /* force BE */ | ||||
|         iorder[0] = 3; | ||||
|         iorder[1] = 2; | ||||
|         iorder[2] = 1; | ||||
|         iorder[3] = 0; | ||||
|     } | ||||
| 
 | ||||
|     while (q < e) { | ||||
| 	Py_UCS4 ch; | ||||
| 	/* remaining bytes at the end? (size should be divisible by 4) */ | ||||
| 	if (e-q<4) { | ||||
| 	    if (consumed) | ||||
| 		break; | ||||
| 	    errmsg = "truncated data"; | ||||
| 	    startinpos = ((const char *)q)-starts; | ||||
| 	    endinpos = ((const char *)e)-starts; | ||||
| 	    goto utf32Error; | ||||
| 	    /* The remaining input chars are ignored if the callback | ||||
| 	       chooses to skip the input */ | ||||
| 	} | ||||
| 	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | | ||||
| 	     (q[iorder[1]] << 8) | q[iorder[0]]; | ||||
| 
 | ||||
| 	if (ch >= 0x110000) | ||||
| 	{ | ||||
| 	    errmsg = "codepoint not in range(0x110000)"; | ||||
| 	    startinpos = ((const char *)q)-starts; | ||||
| 	    endinpos = startinpos+4; | ||||
| 	    goto utf32Error; | ||||
| 	} | ||||
| #ifndef Py_UNICODE_WIDE | ||||
| 	if (ch >= 0x10000) | ||||
| 	{ | ||||
| 	    *p++ = 0xD800 | ((ch-0x10000) >> 10); | ||||
| 	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); | ||||
| 	} | ||||
| 	else | ||||
| #endif | ||||
| 	    *p++ = ch; | ||||
| 	q += 4; | ||||
| 	continue; | ||||
|     utf32Error: | ||||
| 	outpos = p-PyUnicode_AS_UNICODE(unicode); | ||||
|     if (unicode_decode_call_errorhandler( | ||||
|          errors, &errorHandler, | ||||
|          "utf32", errmsg, | ||||
|          starts, size, &startinpos, &endinpos, &exc, &s, | ||||
|          (PyObject **)&unicode, &outpos, &p)) | ||||
| 	    goto onError; | ||||
|     } | ||||
| 
 | ||||
|     if (byteorder) | ||||
|         *byteorder = bo; | ||||
| 
 | ||||
|     if (consumed) | ||||
| 	*consumed = (const char *)q-starts; | ||||
| 
 | ||||
|     /* Adjust length */ | ||||
|     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) | ||||
|         goto onError; | ||||
| 
 | ||||
|     Py_XDECREF(errorHandler); | ||||
|     Py_XDECREF(exc); | ||||
|     return (PyObject *)unicode; | ||||
| 
 | ||||
| onError: | ||||
|     Py_DECREF(unicode); | ||||
|     Py_XDECREF(errorHandler); | ||||
|     Py_XDECREF(exc); | ||||
|     return NULL; | ||||
| } | ||||
| 
 | ||||
| PyObject * | ||||
| PyUnicode_EncodeUTF32(const Py_UNICODE *s, | ||||
| 		      Py_ssize_t size, | ||||
| 		      const char *errors, | ||||
| 		      int byteorder) | ||||
| { | ||||
|     PyObject *v; | ||||
|     unsigned char *p; | ||||
| #ifndef Py_UNICODE_WIDE | ||||
|     int i, pairs; | ||||
| #else | ||||
|     const int pairs = 0; | ||||
| #endif | ||||
|     /* Offsets from p for storing byte pairs in the right order. */ | ||||
| #ifdef BYTEORDER_IS_LITTLE_ENDIAN | ||||
|     int iorder[] = {0, 1, 2, 3}; | ||||
| #else | ||||
|     int iorder[] = {3, 2, 1, 0}; | ||||
| #endif | ||||
| 
 | ||||
| #define STORECHAR(CH)                       \ | ||||
|     do {                                    \ | ||||
|         p[iorder[3]] = ((CH) >> 24) & 0xff; \ | ||||
|         p[iorder[2]] = ((CH) >> 16) & 0xff; \ | ||||
|         p[iorder[1]] = ((CH) >> 8) & 0xff;  \ | ||||
|         p[iorder[0]] = (CH) & 0xff;         \ | ||||
|         p += 4;                             \ | ||||
|     } while(0) | ||||
| 
 | ||||
|     /* In narrow builds we can output surrogate pairs as one codepoint, | ||||
|        so we need less space. */ | ||||
| #ifndef Py_UNICODE_WIDE | ||||
|     for (i = pairs = 0; i < size-1; i++) | ||||
| 	if (0xD800 <= s[i] && s[i] <= 0xDBFF && | ||||
| 	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) | ||||
| 	    pairs++; | ||||
| #endif | ||||
|     v = PyString_FromStringAndSize(NULL, | ||||
| 		  4 * (size - pairs + (byteorder == 0))); | ||||
|     if (v == NULL) | ||||
|         return NULL; | ||||
| 
 | ||||
|     p = (unsigned char *)PyString_AS_STRING(v); | ||||
|     if (byteorder == 0) | ||||
| 	STORECHAR(0xFEFF); | ||||
|     if (size == 0) | ||||
|         return v; | ||||
| 
 | ||||
|     if (byteorder == -1) { | ||||
|         /* force LE */ | ||||
|         iorder[0] = 0; | ||||
|         iorder[1] = 1; | ||||
|         iorder[2] = 2; | ||||
|         iorder[3] = 3; | ||||
|     } | ||||
|     else if (byteorder == 1) { | ||||
|         /* force BE */ | ||||
|         iorder[0] = 3; | ||||
|         iorder[1] = 2; | ||||
|         iorder[2] = 1; | ||||
|         iorder[3] = 0; | ||||
|     } | ||||
| 
 | ||||
|     while (size-- > 0) { | ||||
| 	Py_UCS4 ch = *s++; | ||||
| #ifndef Py_UNICODE_WIDE | ||||
| 	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { | ||||
| 	    Py_UCS4 ch2 = *s; | ||||
| 	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { | ||||
| 		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; | ||||
| 		s++; | ||||
| 		size--; | ||||
| 	    } | ||||
| 	} | ||||
| #endif | ||||
|         STORECHAR(ch); | ||||
|     } | ||||
|     return v; | ||||
| #undef STORECHAR | ||||
| } | ||||
| 
 | ||||
| PyObject *PyUnicode_AsUTF32String(PyObject *unicode) | ||||
| { | ||||
|     if (!PyUnicode_Check(unicode)) { | ||||
|         PyErr_BadArgument(); | ||||
|         return NULL; | ||||
|     } | ||||
|     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), | ||||
| 				 PyUnicode_GET_SIZE(unicode), | ||||
| 				 NULL, | ||||
| 				 0); | ||||
| } | ||||
| 
 | ||||
| /* --- UTF-16 Codec ------------------------------------------------------- */ | ||||
| 
 | ||||
| PyObject * | ||||
|  |  | |||
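On narrow (UCS-2) builds the encoder above recombines a surrogate pair into one 4-byte code point, and the decoder splits a supplementary code point back into a pair, so round trips agree across build types. Sketched at the Python level:

    import codecs

    # u"\U00010203" is one code point on wide builds and the surrogate pair
    # u"\ud800\ude03" on narrow builds; either way it encodes to four bytes.
    encoded = u"\U00010203".encode("utf-32-be")
    assert encoded == "\x00\x01\x02\x03"

    # Decoding returns the build's natural representation, and the comparison
    # holds on both builds because the literal follows the same rule.
    decoded, consumed = codecs.utf_32_be_decode(encoded, "strict", True)
    assert decoded == u"\U00010203" and consumed == 4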
Walter Dörwald