| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | """ codecs -- Python Codec Registry, API and helpers.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Written by Marc-Andre Lemburg (mal@lemburg.com). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | """#"
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-12-12 17:37:50 +00:00
										 |  |  | import __builtin__, sys | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | ### Registry and builtin stateless codec functions | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-31 17:25:23 +00:00
# Pull every public name of the builtin _codecs module into this namespace.
# A failing import is re-raised as SystemError, with the original
# ImportError embedded in the message.
try:
    from _codecs import *
except ImportError, why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
# Names exported by "from codecs import *": registry functions, file
# helpers, the BOM constants and the error handler callbacks.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
					
						
							| 
									
										
										
										
											2001-01-20 19:54:20 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | ### Constants | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2002-06-04 15:16:29 +00:00
										 |  |  | # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) | 
					
						
							|  |  |  | # and its possible byte string values | 
					
						
							|  |  |  | # for UTF8/UTF16/UTF32 output and little/big endian machines | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | # | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-06-04 15:16:29 +00:00
										 |  |  | # UTF-8 | 
					
						
							|  |  |  | BOM_UTF8 = '\xef\xbb\xbf' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # UTF-16, little endian | 
					
						
							|  |  |  | BOM_LE = BOM_UTF16_LE = '\xff\xfe' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # UTF-16, big endian | 
					
						
							|  |  |  | BOM_BE = BOM_UTF16_BE = '\xfe\xff' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # UTF-32, little endian | 
					
						
							|  |  |  | BOM_UTF32_LE = '\xff\xfe\x00\x00' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # UTF-32, big endian | 
					
						
							|  |  |  | BOM_UTF32_BE = '\x00\x00\xfe\xff' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-12-12 17:37:50 +00:00
										 |  |  | if sys.byteorder == 'little': | 
					
						
							| 
									
										
										
										
											2002-06-04 15:16:29 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-12-12 17:37:50 +00:00
										 |  |  |     # UTF-16, native endianness | 
					
						
							|  |  |  |     BOM = BOM_UTF16 = BOM_UTF16_LE | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # UTF-32, native endianness | 
					
						
							|  |  |  |     BOM_UTF32 = BOM_UTF32_LE | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | else: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # UTF-16, native endianness | 
					
						
							|  |  |  |     BOM = BOM_UTF16 = BOM_UTF16_BE | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # UTF-32, native endianness | 
					
						
							|  |  |  |     BOM_UTF32 = BOM_UTF32_BE | 
					
						
							| 
									
										
										
										
											2002-06-04 15:16:29 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Old broken names (don't use in new code) | 
					
						
							|  |  |  | BOM32_LE = BOM_UTF16_LE | 
					
						
							|  |  |  | BOM32_BE = BOM_UTF16_BE | 
					
						
							|  |  |  | BOM64_LE = BOM_UTF32_LE | 
					
						
							|  |  |  | BOM64_BE = BOM_UTF32_BE | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ### Codec base classes (defining the API) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  | class CodecInfo(tuple): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __new__(cls, encode, decode, streamreader=None, streamwriter=None, | 
					
						
							|  |  |  |         incrementalencoder=None, incrementaldecoder=None, name=None): | 
					
						
							|  |  |  |         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) | 
					
						
							|  |  |  |         self.name = name | 
					
						
							|  |  |  |         self.encode = encode | 
					
						
							|  |  |  |         self.decode = decode | 
					
						
							|  |  |  |         self.incrementalencoder = incrementalencoder | 
					
						
							|  |  |  |         self.incrementaldecoder = incrementaldecoder | 
					
						
							|  |  |  |         self.streamwriter = streamwriter | 
					
						
							|  |  |  |         self.streamreader = streamreader | 
					
						
							|  |  |  |         return self | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __repr__(self): | 
					
						
							|  |  |  |         return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self)) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | class Codec: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """ Defines the interface for stateless encoders/decoders.
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-11-19 21:42:53 +00:00
										 |  |  |         The .encode()/.decode() methods may use different error | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |         handling schemes by providing the errors argument. These | 
					
						
							| 
									
										
										
										
											2002-11-19 21:42:53 +00:00
										 |  |  |         string values are predefined: | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-24 22:14:19 +00:00
										 |  |  |          'strict' - raise a ValueError error (or a subclass) | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |          'ignore' - ignore the character and continue with the next | 
					
						
							|  |  |  |          'replace' - replace with a suitable replacement character; | 
					
						
							|  |  |  |                     Python will use the official U+FFFD REPLACEMENT | 
					
						
							| 
									
										
										
										
											2002-11-19 21:42:53 +00:00
										 |  |  |                     CHARACTER for the builtin Unicode codecs on | 
					
						
							|  |  |  |                     decoding and '?' on encoding. | 
					
						
							|  |  |  |          'xmlcharrefreplace' - Replace with the appropriate XML | 
					
						
							|  |  |  |                                character reference (only for encoding). | 
					
						
							|  |  |  |          'backslashreplace'  - Replace with backslashed escape sequences | 
					
						
							|  |  |  |                                (only for encoding). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         The set of allowed values can be extended via register_error. | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def encode(self, input, errors='strict'): | 
					
						
							| 
									
										
										
										
											2000-04-11 15:41:38 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-17 15:40:35 +00:00
										 |  |  |         """ Encodes the object input and returns a tuple (output
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |             object, length consumed). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             errors defines the error handling to apply. It defaults to | 
					
						
							|  |  |  |             'strict' handling. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             The method may not store state in the Codec instance. Use | 
					
						
							|  |  |  |             StreamCodec for codecs which have to keep state in order to | 
					
						
							|  |  |  |             make encoding/decoding efficient. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             The encoder must be able to handle zero length input and | 
					
						
							|  |  |  |             return an empty object of the output object type in this | 
					
						
							|  |  |  |             situation. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         raise NotImplementedError | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def decode(self, input, errors='strict'): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         """ Decodes the object input and returns a tuple (output
 | 
					
						
							|  |  |  |             object, length consumed). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             input must be an object which provides the bf_getreadbuf | 
					
						
							|  |  |  |             buffer slot. Python strings, buffer objects and memory | 
					
						
							|  |  |  |             mapped files are examples of objects providing this slot. | 
					
						
							| 
									
										
										
										
											2000-04-11 15:41:38 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |             errors defines the error handling to apply. It defaults to | 
					
						
							|  |  |  |             'strict' handling. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             The method may not store state in the Codec instance. Use | 
					
						
							|  |  |  |             StreamCodec for codecs which have to keep state in order to | 
					
						
							|  |  |  |             make encoding/decoding efficient. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             The decoder must be able to handle zero length input and | 
					
						
							|  |  |  |             return an empty object of the output object type in this | 
					
						
							|  |  |  |             situation. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-04-11 15:41:38 +00:00
										 |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |         raise NotImplementedError | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  | class IncrementalEncoder(object): | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2006-03-16 07:49:19 +00:00
										 |  |  |     An IncrementalEncoder encodes an input in multiple steps. The input can be | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  |     passed piece by piece to the encode() method. The IncrementalEncoder remembers | 
					
						
							|  |  |  |     the state of the Encoding process between calls to encode(). | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     def __init__(self, errors='strict'): | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2006-03-16 07:49:19 +00:00
										 |  |  |         Creates an IncrementalEncoder instance. | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         The IncrementalEncoder may use different error handling schemes by | 
					
						
							|  |  |  |         providing the errors keyword argument. See the module docstring | 
					
						
							|  |  |  |         for a list of possible values. | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         self.errors = errors | 
					
						
							|  |  |  |         self.buffer = "" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def encode(self, input, final=False): | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         Encodes input and returns the resulting object. | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         raise NotImplementedError | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def reset(self): | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         Resets the encoder to the initial state. | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2006-04-14 18:25:39 +00:00
										 |  |  | class BufferedIncrementalEncoder(IncrementalEncoder): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     This subclass of IncrementalEncoder can be used as the baseclass for an | 
					
						
							|  |  |  |     incremental encoder if the encoder must keep some of the output in a | 
					
						
							|  |  |  |     buffer between calls to encode(). | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     def __init__(self, errors='strict'): | 
					
						
							|  |  |  |         IncrementalEncoder.__init__(self, errors) | 
					
						
							|  |  |  |         self.buffer = "" # unencoded input that is kept between calls to encode() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _buffer_encode(self, input, errors, final): | 
					
						
							|  |  |  |         # Overwrite this method in subclasses: It must encode input | 
					
						
							|  |  |  |         # and return an (output, length consumed) tuple | 
					
						
							|  |  |  |         raise NotImplementedError | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def encode(self, input, final=False): | 
					
						
							|  |  |  |         # encode input (taking the buffer into account) | 
					
						
							|  |  |  |         data = self.buffer + input | 
					
						
							|  |  |  |         (result, consumed) = self._buffer_encode(data, self.errors, final) | 
					
						
							|  |  |  |         # keep unencoded input until the next call | 
					
						
							|  |  |  |         self.buffer = data[consumed:] | 
					
						
							|  |  |  |         return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def reset(self): | 
					
						
							|  |  |  |         IncrementalEncoder.reset(self) | 
					
						
							|  |  |  |         self.buffer = "" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  | class IncrementalDecoder(object): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     An IncrementalDecoder decodes an input in multiple steps. The input can be | 
					
						
							|  |  |  |     passed piece by piece to the decode() method. The IncrementalDecoder | 
					
						
							|  |  |  |     remembers the state of the decoding process between calls to decode(). | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     def __init__(self, errors='strict'): | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         Creates a IncrementalDecoder instance. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         The IncrementalDecoder may use different error handling schemes by | 
					
						
							|  |  |  |         providing the errors keyword argument. See the module docstring | 
					
						
							|  |  |  |         for a list of possible values. | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         self.errors = errors | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def decode(self, input, final=False): | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         Decodes input and returns the resulting object. | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         raise NotImplementedError | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def reset(self): | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         Resets the decoder to the initial state. | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class BufferedIncrementalDecoder(IncrementalDecoder): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     This subclass of IncrementalDecoder can be used as the baseclass for an | 
					
						
							|  |  |  |     incremental decoder if the decoder must be able to handle incomplete byte | 
					
						
							|  |  |  |     sequences. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     def __init__(self, errors='strict'): | 
					
						
							|  |  |  |         IncrementalDecoder.__init__(self, errors) | 
					
						
							|  |  |  |         self.buffer = "" # undecoded input that is kept between calls to decode() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _buffer_decode(self, input, errors, final): | 
					
						
							|  |  |  |         # Overwrite this method in subclasses: It must decode input | 
					
						
							|  |  |  |         # and return an (output, length consumed) tuple | 
					
						
							|  |  |  |         raise NotImplementedError | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def decode(self, input, final=False): | 
					
						
							|  |  |  |         # decode input (taking the buffer into account) | 
					
						
							|  |  |  |         data = self.buffer + input | 
					
						
							|  |  |  |         (result, consumed) = self._buffer_decode(data, self.errors, final) | 
					
						
							|  |  |  |         # keep undecoded input until the next call | 
					
						
							|  |  |  |         self.buffer = data[consumed:] | 
					
						
							|  |  |  |         return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def reset(self): | 
					
						
							|  |  |  |         IncrementalDecoder.reset(self) | 
					
						
							| 
									
										
										
										
											2006-04-14 15:40:54 +00:00
										 |  |  |         self.buffer = "" | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | # | 
					
						
							|  |  |  | # The StreamWriter and StreamReader class provide generic working | 
					
						
							| 
									
										
										
										
											2001-09-18 20:29:48 +00:00
										 |  |  | # interfaces which can be used to implement new encoding submodules | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | # very easily. See encodings/utf_8.py for an example on how this is | 
					
						
							|  |  |  | # done. | 
					
						
							| 
									
										
										
										
											2000-04-11 15:41:38 +00:00
										 |  |  | # | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | class StreamWriter(Codec): | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def __init__(self, stream, errors='strict'): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         """ Creates a StreamWriter instance.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             stream must be a file-like object open for writing | 
					
						
							|  |  |  |             (binary) data. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-11-19 21:42:53 +00:00
										 |  |  |             The StreamWriter may use different error handling | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |             schemes by providing the errors keyword argument. These | 
					
						
							| 
									
										
										
										
											2002-11-19 21:42:53 +00:00
										 |  |  |             parameters are predefined: | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |              'strict' - raise a ValueError (or a subclass) | 
					
						
							|  |  |  |              'ignore' - ignore the character and continue with the next | 
					
						
							|  |  |  |              'replace'- replace with a suitable replacement character | 
					
						
							| 
									
										
										
										
											2002-11-19 21:42:53 +00:00
										 |  |  |              'xmlcharrefreplace' - Replace with the appropriate XML | 
					
						
							|  |  |  |                                    character reference. | 
					
						
							|  |  |  |              'backslashreplace'  - Replace with backslashed escape | 
					
						
							|  |  |  |                                    sequences (only for encoding). | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-11-19 21:42:53 +00:00
										 |  |  |             The set of allowed parameter values can be extended via | 
					
						
							|  |  |  |             register_error. | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |         """
 | 
					
						
							|  |  |  |         self.stream = stream | 
					
						
							|  |  |  |         self.errors = errors | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  |     def write(self, object): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         """ Writes the object's contents encoded to self.stream.
 | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |         data, consumed = self.encode(object, self.errors) | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |         self.stream.write(data) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  |     def writelines(self, list): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """ Writes the concatenated list of strings to the stream
 | 
					
						
							|  |  |  |             using .write(). | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         self.write(''.join(list)) | 
					
						
							| 
									
										
										
										
											2000-04-11 15:41:38 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |     def reset(self): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """ Flushes and resets the codec buffers used for keeping state.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             Calling this method should ensure that the data on the | 
					
						
							|  |  |  |             output is put into a clean state, that allows appending | 
					
						
							|  |  |  |             of new fresh data without having to rescan the whole | 
					
						
							|  |  |  |             stream to recover state. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def __getattr__(self, name, | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |                     getattr=getattr): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """ Inherit all other methods from the underlying stream.
 | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |         return getattr(self.stream, name) | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | ### | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class StreamReader(Codec): | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def __init__(self, stream, errors='strict'): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         """ Creates a StreamReader instance.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             stream must be a file-like object open for reading | 
					
						
							|  |  |  |             (binary) data. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-11-19 21:42:53 +00:00
										 |  |  |             The StreamReader may use different error handling | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |             schemes by providing the errors keyword argument. These | 
					
						
							| 
									
										
										
										
											2002-11-19 21:42:53 +00:00
										 |  |  |             parameters are predefined: | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |              'strict' - raise a ValueError (or a subclass) | 
					
						
							|  |  |  |              'ignore' - ignore the character and continue with the next | 
					
						
							|  |  |  |              'replace'- replace with a suitable replacement character; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-11-19 21:42:53 +00:00
										 |  |  |             The set of allowed parameter values can be extended via | 
					
						
							|  |  |  |             register_error. | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |         """
 | 
					
						
							|  |  |  |         self.stream = stream | 
					
						
							|  |  |  |         self.errors = errors | 
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
										 |  |  |         self.bytebuffer = "" | 
					
						
							| 
									
										
										
										
											2005-07-20 22:15:39 +00:00
										 |  |  |         # For str->str decoding this will stay a str | 
					
						
							|  |  |  |         # For str->unicode decoding the first read will promote it to unicode | 
					
						
							|  |  |  |         self.charbuffer = "" | 
					
						
							| 
									
										
										
										
											2005-09-18 08:34:39 +00:00
										 |  |  |         self.linebuffer = None | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
    def decode(self, input, errors='strict'):
        """ Placeholder for the codec's decoding function.

            This implementation always raises NotImplementedError.
            read() calls self.decode(data, self.errors) and unpacks the
            result as a pair: the decoded object and the number of
            input items that were consumed.
        """
        raise NotImplementedError
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-08-24 07:38:12 +00:00
										 |  |  |     def read(self, size=-1, chars=-1, firstline=False): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         """ Decodes data from the stream self.stream and returns the
 | 
					
						
							|  |  |  |             resulting object. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
										 |  |  |             chars indicates the number of characters to read from the | 
					
						
							|  |  |  |             stream. read() will never return more than chars | 
					
						
							|  |  |  |             characters, but it might return less, if there are not enough | 
					
						
							|  |  |  |             characters available. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |             size indicates the approximate maximum number of bytes to | 
					
						
							|  |  |  |             read from the stream for decoding purposes. The decoder | 
					
						
							|  |  |  |             can modify this setting as appropriate. The default value | 
					
						
							|  |  |  |             -1 indicates to read and decode as much as possible.  size | 
					
						
							|  |  |  |             is intended to prevent having to decode huge files in one | 
					
						
							|  |  |  |             step. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-08-24 07:38:12 +00:00
										 |  |  |             If firstline is true, and a UnicodeDecodeError happens | 
					
						
							|  |  |  |             after the first line terminator in the input only the first line | 
					
						
							|  |  |  |             will be returned, the rest of the input will be kept until the | 
					
						
							|  |  |  |             next call to read(). | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |             The method should use a greedy read strategy meaning that | 
					
						
							|  |  |  |             it should read as much data as is allowed within the | 
					
						
							|  |  |  |             definition of the encoding and the given size, e.g.  if | 
					
						
							|  |  |  |             optional encoding endings or state markers are available | 
					
						
							|  |  |  |             on the stream, these should be read too. | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2005-09-18 08:34:39 +00:00
										 |  |  |         # If we have lines cached, first merge them back into characters | 
					
						
							|  |  |  |         if self.linebuffer: | 
					
						
							|  |  |  |             self.charbuffer = "".join(self.linebuffer) | 
					
						
							|  |  |  |             self.linebuffer = None | 
					
						
							| 
									
										
										
										
											2005-12-25 23:18:31 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
										 |  |  |         # read until we get the required number of characters (if available) | 
					
						
							|  |  |  |         while True: | 
					
						
							|  |  |  |             # can the request can be satisfied from the character buffer? | 
					
						
							|  |  |  |             if chars < 0: | 
					
						
							| 
									
										
										
										
											2006-03-06 22:39:12 +00:00
										 |  |  |                 if size < 0: | 
					
						
							|  |  |  |                     if self.charbuffer: | 
					
						
							|  |  |  |                         break | 
					
						
							|  |  |  |                 elif len(self.charbuffer) >= size: | 
					
						
							| 
									
										
										
										
											2004-12-21 22:24:00 +00:00
										 |  |  |                     break | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |             else: | 
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
										 |  |  |                 if len(self.charbuffer) >= chars: | 
					
						
							|  |  |  |                     break | 
					
						
							|  |  |  |             # we need more data | 
					
						
							|  |  |  |             if size < 0: | 
					
						
							|  |  |  |                 newdata = self.stream.read() | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 newdata = self.stream.read(size) | 
					
						
							| 
									
										
										
										
											2004-12-21 22:24:00 +00:00
										 |  |  |             # decode bytes (those remaining from the last call included) | 
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
										 |  |  |             data = self.bytebuffer + newdata | 
					
						
							| 
									
										
										
										
											2005-08-24 07:38:12 +00:00
										 |  |  |             try: | 
					
						
							|  |  |  |                 newchars, decodedbytes = self.decode(data, self.errors) | 
					
						
							|  |  |  |             except UnicodeDecodeError, exc: | 
					
						
							|  |  |  |                 if firstline: | 
					
						
							|  |  |  |                     newchars, decodedbytes = self.decode(data[:exc.start], self.errors) | 
					
						
							|  |  |  |                     lines = newchars.splitlines(True) | 
					
						
							|  |  |  |                     if len(lines)<=1: | 
					
						
							|  |  |  |                         raise | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     raise | 
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
										 |  |  |             # keep undecoded bytes until the next call | 
					
						
							|  |  |  |             self.bytebuffer = data[decodedbytes:] | 
					
						
							|  |  |  |             # put new characters in the character buffer | 
					
						
							| 
									
										
										
										
											2004-12-21 22:24:00 +00:00
										 |  |  |             self.charbuffer += newchars | 
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
										 |  |  |             # there was no data available | 
					
						
							|  |  |  |             if not newdata: | 
					
						
							| 
									
										
										
										
											2004-12-21 22:24:00 +00:00
										 |  |  |                 break | 
					
						
							|  |  |  |         if chars < 0: | 
					
						
							|  |  |  |             # Return everything we've got | 
					
						
							|  |  |  |             result = self.charbuffer | 
					
						
							| 
									
										
										
										
											2005-07-20 22:15:39 +00:00
										 |  |  |             self.charbuffer = "" | 
					
						
							| 
									
										
										
										
											2004-12-21 22:24:00 +00:00
										 |  |  |         else: | 
					
						
							|  |  |  |             # Return the first chars characters | 
					
						
							|  |  |  |             result = self.charbuffer[:chars] | 
					
						
							|  |  |  |             self.charbuffer = self.charbuffer[chars:] | 
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
										 |  |  |         return result | 
					
						
							|  |  |  | 
 | 
					
						
    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method, and the stream is then only tried once.
            Without size, chunks of 72 are requested first and the
            chunk size is doubled on every further attempt as long as
            it is below 8000.

            If keepends is false the line ending is stripped from the
            returned line.

            Lines that were split off and cached by an earlier call are
            returned before any new data is read. An empty string is
            returned when no data could be read at all.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        # A missing size (None) as well as a size of 0 selects the default
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            # firstline=True lets read() tolerate a decoding error that
            # sits behind the first line of the chunk
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            # Accumulate everything read so far and look for line ends
            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        # (whatever is still in charbuffer follows the last
                        # of them; charbuffer is unused while lines are cached)
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                # Exactly one entry: it is only a complete line if stripping
                # the line end actually changes it
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # No complete line yet: ask for a bigger chunk next time
            if readsize<8000:
                readsize *= 2
        return line
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def readlines(self, sizehint=None, keepends=True): | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         """ Read all lines available on the input stream
 | 
					
						
							|  |  |  |             and return them as list of lines. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             Line breaks are implemented using the codec's decoder | 
					
						
							|  |  |  |             method and are included in the list entries. | 
					
						
							| 
									
										
										
										
											2000-04-11 15:41:38 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2004-02-26 15:22:17 +00:00
										 |  |  |             sizehint, if given, is ignored since there is no efficient | 
					
						
							|  |  |  |             way to finding the true end-of-line. | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2004-09-07 20:24:22 +00:00
										 |  |  |         data = self.read() | 
					
						
							| 
									
										
										
										
											2004-10-17 23:51:21 +00:00
										 |  |  |         return data.splitlines(keepends) | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def reset(self): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """ Resets the codec buffers used for keeping state.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             Note that no stream repositioning should take place. | 
					
						
							| 
									
										
										
										
											2000-07-16 12:04:32 +00:00
										 |  |  |             This method is primarily intended to be able to recover | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |             from decoding errors. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2005-03-14 19:06:30 +00:00
										 |  |  |         self.bytebuffer = "" | 
					
						
							|  |  |  |         self.charbuffer = u"" | 
					
						
							| 
									
										
										
										
											2005-09-18 08:34:39 +00:00
										 |  |  |         self.linebuffer = None | 
					
						
							| 
									
										
										
										
											2005-03-14 19:06:30 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-03-14 19:25:41 +00:00
    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.

            offset and whence are passed on unchanged to the seek()
            method of the underlying stream.
        """
        # Buffered data belongs to the old position, so drop it before
        # the stream is moved
        self.reset()
        self.stream.seek(offset, whence)
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-11-06 16:53:44 +00:00
										 |  |  |     def next(self): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """ Return the next decoded line from the input stream.""" | 
					
						
							|  |  |  |         line = self.readline() | 
					
						
							|  |  |  |         if line: | 
					
						
							|  |  |  |             return line | 
					
						
							|  |  |  |         raise StopIteration | 
					
						
							|  |  |  | 
 | 
					
						
    def __iter__(self):
        """ Return self; the lines are produced by the next() method."""
        return self
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Python only calls __getattr__ when the normal attribute
            lookup fails, so every name not found on this object is
            looked up on self.stream instead.

            The getattr default argument binds the builtin function as
            a local name of the method.
        """
        return getattr(self.stream, name)
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | ### | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class StreamReaderWriter: | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-04-13 14:11:21 +00:00
										 |  |  |     """ StreamReaderWriter instances allow wrapping streams which
 | 
					
						
							|  |  |  |         work in both read and write modes. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         The design is such that one can use the factory functions | 
					
						
							| 
									
										
										
										
											2000-07-16 12:04:32 +00:00
										 |  |  |         returned by the codec.lookup() function to construct the | 
					
						
							| 
									
										
										
										
											2000-04-13 14:11:21 +00:00
										 |  |  |         instance. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  |     # Optional attributes set by the file wrappers below | 
					
						
							|  |  |  |     encoding = 'unknown' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def __init__(self, stream, Reader, Writer, errors='strict'): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         """ Creates a StreamReaderWriter instance.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             stream must be a Stream-like object. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             Reader, Writer must be factory functions or classes | 
					
						
							|  |  |  |             providing the StreamReader, StreamWriter interface resp. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             Error handling is done in the same way as defined for the | 
					
						
							|  |  |  |             StreamWriter/Readers. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         self.stream = stream | 
					
						
							|  |  |  |         self.reader = Reader(stream, errors) | 
					
						
							|  |  |  |         self.writer = Writer(stream, errors) | 
					
						
							|  |  |  |         self.errors = errors | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def read(self, size=-1): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         return self.reader.read(size) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-05-01 16:17:32 +00:00
										 |  |  |     def readline(self, size=None): | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         return self.reader.readline(size) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-05-01 16:17:32 +00:00
										 |  |  |     def readlines(self, sizehint=None): | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         return self.reader.readlines(sizehint) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-11-06 16:53:44 +00:00
										 |  |  |     def next(self): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """ Return the next decoded line from the input stream.""" | 
					
						
							|  |  |  |         return self.reader.next() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __iter__(self): | 
					
						
							|  |  |  |         return self | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def write(self, data): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         return self.writer.write(data) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def writelines(self, list): | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         return self.writer.writelines(list) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |     def reset(self): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         self.reader.reset() | 
					
						
							|  |  |  |         self.writer.reset() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def __getattr__(self, name, | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |                     getattr=getattr): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """ Inherit all other methods from the underlying stream.
 | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |         return getattr(self.stream, name) | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | ### | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class StreamRecoder: | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-04-13 14:11:21 +00:00
										 |  |  |     """ StreamRecoder instances provide a frontend - backend
 | 
					
						
							|  |  |  |         view of encoding data. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         They use the complete set of APIs returned by the | 
					
						
							|  |  |  |         codecs.lookup() function to implement their task. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Data written to the stream is first decoded into an | 
					
						
							|  |  |  |         intermediate format (which is dependent on the given codec | 
					
						
							|  |  |  |         combination) and then written to the stream using an instance | 
					
						
							|  |  |  |         of the provided Writer class. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         In the other direction, data is read from the stream using a | 
					
						
							|  |  |  |         Reader instance and then return encoded data to the caller. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  |     # Optional attributes set by the file wrappers below | 
					
						
							|  |  |  |     data_encoding = 'unknown' | 
					
						
							|  |  |  |     file_encoding = 'unknown' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data the caller sees, e.g. what .read() returns) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        # Frontend codec functions, stored as plain instance attributes
        self.encode = encode
        self.decode = decode
        # Backend reader and writer; both wrap the same stream and get
        # the same errors setting
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def read(self, size=-1): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         data = self.reader.read(size) | 
					
						
							|  |  |  |         data, bytesencoded = self.encode(data, self.errors) | 
					
						
							|  |  |  |         return data | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def readline(self, size=None): | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if size is None: | 
					
						
							|  |  |  |             data = self.reader.readline() | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             data = self.reader.readline(size) | 
					
						
							|  |  |  |         data, bytesencoded = self.encode(data, self.errors) | 
					
						
							|  |  |  |         return data | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def readlines(self, sizehint=None): | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2004-02-26 15:22:17 +00:00
										 |  |  |         data = self.reader.read() | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  |         data, bytesencoded = self.encode(data, self.errors) | 
					
						
							|  |  |  |         return data.splitlines(1) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-11-06 16:53:44 +00:00
										 |  |  |     def next(self): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         """ Return the next decoded line from the input stream.""" | 
					
						
							| 
									
										
										
										
											2005-09-01 11:56:53 +00:00
										 |  |  |         data = self.reader.next() | 
					
						
							|  |  |  |         data, bytesencoded = self.encode(data, self.errors) | 
					
						
							|  |  |  |         return data | 
					
						
							| 
									
										
										
										
											2002-11-06 16:53:44 +00:00
										 |  |  | 
 | 
					
						
    def __iter__(self):

        """ Return the recoder itself as its own iterator.
        """
        return self
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def write(self, data): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         data, bytesdecoded = self.decode(data, self.errors) | 
					
						
							|  |  |  |         return self.writer.write(data) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
										 |  |  |     def writelines(self, list): | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         data = ''.join(list) | 
					
						
							|  |  |  |         data, bytesdecoded = self.decode(data, self.errors) | 
					
						
							|  |  |  |         return self.writer.write(data) | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
    def reset(self):

        """ Reset the backend codecs by calling .reset() on the
            Reader first and then on the Writer.
        """
        self.reader.reset()
        self.writer.reset()
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-15 17:19:16 +00:00
    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Any attribute that is not found on the recoder itself is
            looked up on self.stream instead.

            The getattr default argument binds the builtin getattr()
            as a local name; it is not meant to be passed by callers.
        """
        return getattr(self.stream, name)
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | ### Shortcuts | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-21 21:21:04 +00:00
										 |  |  | def open(filename, mode='rb', encoding=None, errors='strict', buffering=1): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     """ Open an encoded file using the given mode and return
 | 
					
						
							|  |  |  |         a wrapped version providing transparent encoding/decoding. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Note: The wrapped version will only accept the object format | 
					
						
							|  |  |  |         defined by the codecs, i.e. Unicode objects for most builtin | 
					
						
							| 
									
										
										
										
											2005-03-16 03:51:56 +00:00
										 |  |  |         codecs. Output is also codec dependent and will usually be | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |         Unicode as well. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-21 21:21:04 +00:00
										 |  |  |         Files are always opened in binary mode, even if no binary mode | 
					
						
							| 
									
										
										
										
											2003-02-02 23:08:27 +00:00
										 |  |  |         was specified. This is done to avoid data loss due to encodings | 
					
						
							| 
									
										
										
										
											2000-06-21 21:21:04 +00:00
										 |  |  |         using 8-bit values. The default file mode is 'rb' meaning to | 
					
						
							|  |  |  |         open the file in binary read mode. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |         encoding specifies the encoding which is to be used for the | 
					
						
							| 
									
										
										
										
											2003-02-02 23:08:27 +00:00
										 |  |  |         file. | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         errors may be given to define the error handling. It defaults | 
					
						
							|  |  |  |         to 'strict' which causes ValueErrors to be raised in case an | 
					
						
							|  |  |  |         encoding error occurs. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         buffering has the same meaning as for the builtin open() API. | 
					
						
							|  |  |  |         It defaults to line buffered. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-04-13 14:11:21 +00:00
										 |  |  |         The returned wrapped file object provides an extra attribute | 
					
						
							|  |  |  |         .encoding which allows querying the used encoding. This | 
					
						
							|  |  |  |         attribute is only available if an encoding was specified as | 
					
						
							|  |  |  |         parameter. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     if encoding is not None and \ | 
					
						
							|  |  |  |        'b' not in mode: | 
					
						
							|  |  |  |         # Force opening of the file in binary mode | 
					
						
							|  |  |  |         mode = mode + 'b' | 
					
						
							|  |  |  |     file = __builtin__.open(filename, mode, buffering) | 
					
						
							|  |  |  |     if encoding is None: | 
					
						
							|  |  |  |         return file | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  |     info = lookup(encoding) | 
					
						
							|  |  |  |     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors) | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  |     # Add attributes to simplify introspection | 
					
						
							|  |  |  |     srw.encoding = encoding | 
					
						
							|  |  |  |     return srw | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  | def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     """ Return a wrapped version of file which provides transparent
 | 
					
						
							|  |  |  |         encoding translation. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Strings written to the wrapped file are interpreted according | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  |         to the given data_encoding and then written to the original | 
					
						
							|  |  |  |         file as string using file_encoding. The intermediate encoding | 
					
						
							|  |  |  |         will usually be Unicode but depends on the specified codecs. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Strings are read from the file using file_encoding and then | 
					
						
							|  |  |  |         passed back to the caller as string using data_encoding. | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  |         If file_encoding is not given, it defaults to data_encoding. | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         errors may be given to define the error handling. It defaults | 
					
						
							|  |  |  |         to 'strict' which causes ValueErrors to be raised in case an | 
					
						
							|  |  |  |         encoding error occurs. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-04-13 14:11:21 +00:00
										 |  |  |         The returned wrapped file object provides two extra attributes | 
					
						
							|  |  |  |         .data_encoding and .file_encoding which reflect the given | 
					
						
							|  |  |  |         parameters of the same name. The attributes can be used for | 
					
						
							|  |  |  |         introspection by Python programs. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  |     if file_encoding is None: | 
					
						
							|  |  |  |         file_encoding = data_encoding | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  |     info = lookup(data_encoding) | 
					
						
							|  |  |  |     sr = StreamRecoder(file, info.encode, info.decode, | 
					
						
							|  |  |  |                        info.streamreader, info.streamwriter, errors) | 
					
						
							| 
									
										
										
										
											2000-04-11 15:37:43 +00:00
										 |  |  |     # Add attributes to simplify introspection | 
					
						
							|  |  |  |     sr.data_encoding = data_encoding | 
					
						
							|  |  |  |     sr.file_encoding = file_encoding | 
					
						
							|  |  |  |     return sr | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-09-19 11:24:48 +00:00
										 |  |  | ### Helpers for codec lookup | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def getencoder(encoding): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """ Lookup up the codec for the given encoding and return
 | 
					
						
							|  |  |  |         its encoder function. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Raises a LookupError in case the encoding cannot be found. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  |     return lookup(encoding).encode | 
					
						
							| 
									
										
										
										
											2001-09-19 11:24:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | def getdecoder(encoding): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """ Lookup up the codec for the given encoding and return
 | 
					
						
							|  |  |  |         its decoder function. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Raises a LookupError in case the encoding cannot be found. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  |     return lookup(encoding).decode | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def getincrementalencoder(encoding): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """ Lookup up the codec for the given encoding and return
 | 
					
						
							|  |  |  |         its IncrementalEncoder class or factory function. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Raises a LookupError in case the encoding cannot be found | 
					
						
							|  |  |  |         or the codecs doesn't provide an incremental encoder. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     encoder = lookup(encoding).incrementalencoder | 
					
						
							|  |  |  |     if encoder is None: | 
					
						
							|  |  |  |         raise LookupError(encoding) | 
					
						
							|  |  |  |     return encoder | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def getincrementaldecoder(encoding): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """ Lookup up the codec for the given encoding and return
 | 
					
						
							|  |  |  |         its IncrementalDecoder class or factory function. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Raises a LookupError in case the encoding cannot be found | 
					
						
							|  |  |  |         or the codecs doesn't provide an incremental decoder. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     decoder = lookup(encoding).incrementaldecoder | 
					
						
							|  |  |  |     if decoder is None: | 
					
						
							|  |  |  |         raise LookupError(encoding) | 
					
						
							|  |  |  |     return decoder | 
					
						
							| 
									
										
										
										
											2001-09-19 11:24:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | def getreader(encoding): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """ Lookup up the codec for the given encoding and return
 | 
					
						
							|  |  |  |         its StreamReader class or factory function. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Raises a LookupError in case the encoding cannot be found. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  |     return lookup(encoding).streamreader | 
					
						
							| 
									
										
										
										
											2001-09-19 11:24:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | def getwriter(encoding): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """ Lookup up the codec for the given encoding and return
 | 
					
						
							|  |  |  |         its StreamWriter class or factory function. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Raises a LookupError in case the encoding cannot be found. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2006-03-15 11:35:15 +00:00
										 |  |  |     return lookup(encoding).streamwriter | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def iterencode(iterator, encoding, errors='strict', **kwargs): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Encoding iterator. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Encodes the input strings from the iterator using a IncrementalEncoder. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     errors and kwargs are passed through to the IncrementalEncoder | 
					
						
							|  |  |  |     constructor. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     encoder = getincrementalencoder(encoding)(errors, **kwargs) | 
					
						
							|  |  |  |     for input in iterator: | 
					
						
							|  |  |  |         output = encoder.encode(input) | 
					
						
							|  |  |  |         if output: | 
					
						
							|  |  |  |             yield output | 
					
						
							|  |  |  |     output = encoder.encode("", True) | 
					
						
							|  |  |  |     if output: | 
					
						
							|  |  |  |         yield output | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def iterdecode(iterator, encoding, errors='strict', **kwargs): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Decoding iterator. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Decodes the input strings from the iterator using a IncrementalDecoder. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     errors and kwargs are passed through to the IncrementalDecoder | 
					
						
							|  |  |  |     constructor. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     decoder = getincrementaldecoder(encoding)(errors, **kwargs) | 
					
						
							|  |  |  |     for input in iterator: | 
					
						
							|  |  |  |         output = decoder.decode(input) | 
					
						
							|  |  |  |         if output: | 
					
						
							|  |  |  |             yield output | 
					
						
							|  |  |  |     output = decoder.decode("", True) | 
					
						
							|  |  |  |     if output: | 
					
						
							|  |  |  |         yield output | 
					
						
							| 
									
										
										
										
											2001-09-19 11:24:48 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  | ### Helpers for charmap-based codecs | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def make_identity_dict(rng): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """ make_identity_dict(rng) -> dict
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Return a dictionary where elements of the rng sequence are | 
					
						
							|  |  |  |         mapped to themselves. | 
					
						
							| 
									
										
										
										
											2001-01-14 23:36:06 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     res = {} | 
					
						
							|  |  |  |     for i in rng: | 
					
						
							|  |  |  |         res[i]=i | 
					
						
							|  |  |  |     return res | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-16 09:41:45 +00:00
										 |  |  | def make_encoding_map(decoding_map): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """ Creates an encoding map from a decoding map.
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2003-02-02 23:08:27 +00:00
										 |  |  |         If a target mapping in the decoding map occurs multiple | 
					
						
							| 
									
										
										
										
											2001-05-16 09:41:45 +00:00
										 |  |  |         times, then that target is mapped to None (undefined mapping), | 
					
						
							|  |  |  |         causing an exception when encountered by the charmap codec | 
					
						
							|  |  |  |         during translation. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         One example where this happens is cp875.py which decodes | 
					
						
							|  |  |  |         multiple character to \u001a. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     m = {} | 
					
						
							|  |  |  |     for k,v in decoding_map.items(): | 
					
						
							| 
									
										
										
										
											2002-06-01 14:18:47 +00:00
										 |  |  |         if not v in m: | 
					
						
							| 
									
										
										
										
											2001-05-16 09:41:45 +00:00
										 |  |  |             m[v] = k | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             m[v] = None | 
					
						
							|  |  |  |     return m | 
					
						
							| 
									
										
										
										
											2001-05-29 06:06:54 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-09-02 13:14:32 +00:00
										 |  |  | ### error handlers | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-03-08 15:03:08 +00:00
# Publish the standard error handling callbacks under module-level
# names. If any of the lookups fails, all five names are set to None.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
					
						
							| 
									
										
										
										
											2002-09-02 13:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-07-31 08:54:55 +00:00
# Tell modulefinder that using codecs probably needs the encodings
# package. The import below is never executed because _false is 0;
# it only has to be present in the source text.
_false = 0
if _false:
    import encodings
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
										 |  |  | ### Tests | 
					
						
							| 
									
										
										
										
											2000-04-11 15:41:38 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 23:20:43 +00:00
# When run as a script, wrap the standard streams in EncodedFile
# recoders.
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')