mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	Module codecs -- Python Codec Registry, API and helpers. Written by
Marc-Andre Lemburg.
This commit is contained in:
		
							parent
							
								
									b5f2f1bb6f
								
							
						
					
					
						commit
						0612d84155
					
				
					 1 changed files with 414 additions and 0 deletions
				
			
		
							
								
								
									
										414
									
								
								Lib/codecs.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										414
									
								
								Lib/codecs.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,414 @@ | |||
| """ codecs -- Python Codec Registry, API and helpers. | ||||
| 
 | ||||
| 
 | ||||
| Written by Marc-Andre Lemburg (mal@lemburg.com). | ||||
| 
 | ||||
| (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | ||||
| 
 | ||||
| """#" | ||||
| 
 | ||||
| import struct,types,__builtin__ | ||||
| 
 | ||||
| ### Registry and builtin stateless codec functions | ||||
| 
 | ||||
| from _codecs import * | ||||
| 
 | ||||
| ### Constants | ||||
| 
 | ||||
#
# Byte Order Mark (BOM): the value 0xFEFF packed as an unsigned 16-bit
# integer in the byte order of the running platform, followed by the
# two byte sequences it can take (BOM_BE, BOM_LE)
#
BOM = struct.pack('=H', 0xFEFF)
#
BOM_BE = '\376\377'
BOM32_BE = BOM_BE
#	big endian spelling: the bytes FE FF, i.e. Unicode U+FEFF
#	(ZERO WIDTH NO-BREAK SPACE) in UTF-16 with the high byte first
BOM_LE = '\377\376'
BOM32_LE = BOM_LE
#	little endian spelling: the same two bytes swapped; read with
#	the high byte first they would give U+FFFE, which is defined as
#	being an illegal Unicode character

#
# Byte Order Marks for UCS-4 (each value below is four bytes long)
#
BOM64_BE = '\000\000\376\377'
#	U+0000FEFF in UCS-4, high byte first
BOM64_LE = '\377\376\000\000'
#	the same four bytes in reversed (little endian) order
| 
 | ||||
| 
 | ||||
| ### Codec base classes (defining the API) | ||||
| 
 | ||||
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may implement different error
        handling schemes by providing the errors argument. These
        string values are defined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs.

    """
    def encode(self,input,errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses have to override it.

        """
        raise NotImplementedError

    def decode(self,input,errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses have to override it.

        """
        raise NotImplementedError
| 
 | ||||
| # | ||||
| # The StreamWriter and StreamReader classes provide generic working | ||||
| # interfaces which can be used to implement new encoding submodules | ||||
| # very easily. See encodings/utf_8.py for an example of how this is | ||||
| # done. | ||||
| #  | ||||
| 
 | ||||
class StreamWriter(Codec):

    """ Encodes objects with self.encode() and writes the result to
        a wrapped stream; attributes not defined here are looked up
        on that stream.

    """
    def __init__(self,stream,errors='strict'):

        """ Sets up a StreamWriter on top of stream.

            stream has to be a file-like object which was opened for
            writing (binary) data.

            errors selects the error handling scheme which is handed
            to .encode() on every .write(). These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.errors = errors
        self.stream = stream

    def write(self,object):

        """ Encodes object using the error handling stored in
            self.errors and writes the encoded data to self.stream.
        """
        encoded, consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    # XXX .writelines() ?

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After calling this method the output should be in a clean
            state, so that fresh data can be appended without having
            to rescan the whole stream to recover state.

            The base class keeps no state, so nothing is done here.

        """
        return None

    def __getattr__(self,name,

                    getattr=getattr):

        """ Delegates every attribute not found on the writer to the
            underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)
| 
 | ||||
| ### | ||||
| 
 | ||||
class StreamReader(Codec):

    """ Reads data from a wrapped stream and decodes it with
        self.decode(); attributes not defined here are looked up on
        that stream.

    """
    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self,size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            Decoding uses the error handling scheme stored in
            self.errors. A ValueError raised by the decoder is passed
            on to the caller once the stream is exhausted or more than
            10 extra bytes have been tried.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        errors = self.errors
        data = read(size)
        i = 0
        while 1:
            try:
                object, decodedbytes = decode(data, errors)
            except ValueError:
                # The slice may have ended in the middle of a
                # multi-byte sequence: extend it one byte at a time
                # and retry.  This method is slow but should work
                # under pretty much all conditions; at most 10 tries
                # are made
                i = i + 1
                newdata = read(1)
                if not newdata or i > 10:
                    raise
                data = data + newdata
            else:
                return object

    # XXX .readline() and .readlines() (these are hard to implement
    #     without using buffers for keeping read-ahead data)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream,name)
| 
 | ||||
| ### | ||||
| 
 | ||||
class StreamReaderWriter:

    """ Bundles a reader and a writer working on the same stream
        behind one file-like object.

    """
    def __init__(self,stream,Reader,Writer,errors='strict'):

        """ Sets up a StreamReaderWriter on top of stream.

            stream has to be a Stream-like object.

            Reader and Writer are factory functions or classes which
            provide the StreamReader resp. StreamWriter interface.
            Each one is called with (stream, errors).

            errors is handled in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        reader = Reader(stream, errors)
        writer = Writer(stream, errors)
        self.reader = reader
        self.writer = writer
        self.errors = errors

    def read(self,size=-1):

        """ Hands the call over to the reader and returns its result.
        """
        reader = self.reader
        return reader.read(size)

    def write(self,data):

        """ Hands the call over to the writer and returns its result.
        """
        writer = self.writer
        return writer.write(data)

    def reset(self):

        """ Resets the reader first and then the writer.
        """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Delegates every attribute not found on this object to the
            underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)
| 
 | ||||
| ### | ||||
| 
 | ||||
class StreamRecoder:

    """ File-like object which translates between a frontend and a
        backend encoding while reading from and writing to a stream.

    """
    def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):

        """ Sets up a StreamRecoder which converts in two directions:
            encode and decode work on the frontend (the output of
            .read() and the input to .write()) while Reader and Writer
            work on the backend (reading from and writing to the
            stream).

            These objects allow transparent direct recodings, e.g.
            from latin-1 to utf-8 and back.

            stream has to be a file-like object.

            encode and decode have to adhere to the Codec interface;
            Reader and Writer are factory functions or classes which
            provide the StreamReader resp. StreamWriter interface.
            Each of the two is called with (stream, errors).

            The frontend translation is done by encode and decode, the
            backend translation by Reader and Writer. Unicode is used
            as intermediate encoding.

            errors is handled in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        reader = Reader(stream, errors)
        writer = Writer(stream, errors)
        self.reader = reader
        self.writer = writer
        self.errors = errors

    def read(self,size=-1):

        """ Reads via the backend reader and returns the data after
            passing it through the frontend encode.
        """
        intermediate = self.reader.read(size)
        recoded, bytesencoded = self.encode(intermediate, self.errors)
        return recoded

    def write(self,data):

        """ Passes data through the frontend decode and writes the
            result via the backend writer, returning its result.
        """
        intermediate, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(intermediate)

    # .writelines(), .readline() and .readlines() are not provided;
    # see the notes on these methods in the stream classes above.

    def reset(self):

        """ Resets the reader first and then the writer.
        """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Delegates every attribute not found on this object to the
            underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)
| 
 | ||||
| ### Shortcuts | ||||
| 
 | ||||
def open(filename, mode, encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped. Otherwise the file
        is always opened in binary mode.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        If looking up the encoding or creating the wrapper fails, the
        file is closed again before the error is passed on.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e,d,sr,sw) = lookup(encoding)
        return StreamReaderWriter(file, sr, sw, errors)
    except:
        # Don't leak the already opened file when the codec lookup or
        # the wrapper construction fails; the catch-all is deliberate
        # since the error is re-raised unchanged.
        file.close()
        raise
| 
 | ||||
def EncodedFile(file, input, output=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates transparently
        between two encodings.

        Strings written to the wrapped file are interpreted according
        to the input encoding and are then written to the original
        file as strings in the output encoding. The intermediate
        encoding will usually be Unicode but depends on the specified
        codecs.

        output defaults to input if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

    """
    if output is None:
        output = input
    # The stateless functions come from the input codec ...
    frontend = lookup(input)[:2]
    encode, decode = frontend
    # ... the stream classes from the output codec
    backend = lookup(output)[2:]
    Reader, Writer = backend
    return StreamRecoder(file, encode, decode, Reader, Writer, errors)
| 
 | ||||
| ### Tests | ||||
|      | ||||
if __name__ == '__main__':

    import sys

    # Replace stdout by a recoding wrapper which is set up with
    # Latin-1 as input and Unicode-Escape as output encoding
    recoder = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')
    sys.stdout = recoder
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Guido van Rossum
						Guido van Rossum