mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 23:21:29 +00:00 
			
		
		
		
	svn+ssh://pythondev@svn.python.org/python/trunk ........ r59044 | neal.norwitz | 2007-11-18 17:46:20 -0800 (Sun, 18 Nov 2007) | 1 line Use a slightly more recent version than 1.5.2b2. ........ r59047 | walter.doerwald | 2007-11-19 04:14:05 -0800 (Mon, 19 Nov 2007) | 2 lines Fix typo in comment. ........ r59049 | walter.doerwald | 2007-11-19 04:41:10 -0800 (Mon, 19 Nov 2007) | 4 lines Fix for #1444: utf_8_sig.StreamReader was (indirectly through decode()) calling codecs.utf_8_decode() with final==True, which failed with incomplete byte sequences. Fix and test by James G. Sack. ........ r59051 | nick.coghlan | 2007-11-19 05:56:27 -0800 (Mon, 19 Nov 2007) | 1 line Enable some test_cmd_line_script debugging output to investigate failure on Mac OSX buildbot ........ r59053 | facundo.batista | 2007-11-19 08:30:24 -0800 (Mon, 19 Nov 2007) | 3 lines Fixed detail in add_type() explanation (issue 1463). ........ r59054 | guido.van.rossum | 2007-11-19 09:35:24 -0800 (Mon, 19 Nov 2007) | 2 lines Make this work stand-alone, too. ........ r59055 | guido.van.rossum | 2007-11-19 09:50:22 -0800 (Mon, 19 Nov 2007) | 3 lines Fix the OSX failures in this test -- they were due to /tmp being a symlink to /private/tmp. Adding a call to os.path.realpath() to temp_dir() fixed it. ........
		
			
				
	
	
		
			130 lines
		
	
	
	
		
			4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			130 lines
		
	
	
	
		
			4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
""" Python 'utf-8-sig' Codec
 | 
						|
This works similarly to UTF-8, with the following changes:
 | 
						|
 | 
						|
* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
 | 
						|
  first three bytes.
 | 
						|
 | 
						|
* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
 | 
						|
  bytes will be skipped.
 | 
						|
"""
 | 
						|
import codecs
 | 
						|
 | 
						|
### Codec APIs
 | 
						|
 | 
						|
def encode(input, errors='strict'):
    """Encode *input* as UTF-8 preceded by the UTF-8 encoded BOM.

    *errors* is handed unchanged to ``codecs.utf_8_encode``.

    Returns a ``(data, length)`` tuple where *data* is
    ``codecs.BOM_UTF8`` followed by the encoded text and *length* is
    ``len(input)``.
    """
    payload, _ = codecs.utf_8_encode(input, errors)
    return (codecs.BOM_UTF8 + payload, len(input))
 | 
						|
 | 
						|
def decode(input, errors='strict'):
    """Decode a complete UTF-8 byte string, skipping a leading BOM.

    The input is always treated as final (``codecs.utf_8_decode`` is
    called with its third argument set to ``True``), so incomplete
    trailing byte sequences are handed to the *errors* handler.

    Returns a ``(text, consumed)`` tuple; *consumed* includes the three
    BOM bytes when a BOM was skipped.
    """
    if input[:3] != codecs.BOM_UTF8:
        # No signature present: decode everything as ordinary UTF-8.
        text, used = codecs.utf_8_decode(input, errors, True)
        return (text, used)
    # Signature present: decode what follows it and count it as consumed.
    text, used = codecs.utf_8_decode(input[3:], errors, True)
    return (text, used + 3)
 | 
						|
 | 
						|
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental 'utf-8-sig' encoder.

    The UTF-8 encoded BOM is put in front of the output of the first
    ``encode()`` call made after construction or ``reset()``.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # Truthy while the BOM still has to be emitted.
        self.first = 1

    def encode(self, input, final=False):
        """Return *input* encoded as UTF-8, with the BOM on the first call.

        *final* is accepted for interface compatibility; it is not used.
        """
        bom_pending = self.first
        if bom_pending:
            # The flag is cleared before encoding is attempted.
            self.first = 0
        chunk = codecs.utf_8_encode(input, self.errors)[0]
        if bom_pending:
            return codecs.BOM_UTF8 + chunk
        return chunk

    def reset(self):
        """Return to the initial state: the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the "BOM still pending" flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore the flag from a value produced by ``getstate()``."""
        self.first = state
 | 
						|
 | 
						|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental 'utf-8-sig' decoder.

    A UTF-8 encoded BOM at the very start of the stream is skipped; all
    other data is decoded as ordinary UTF-8.  Bytes that cannot be
    decoded yet are kept by ``codecs.BufferedIncrementalDecoder`` and
    passed in again, together with the new data, on the next call.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # Truthy until it has been decided whether the stream starts
        # with a BOM.
        self.first = 1

    def _buffer_decode(self, input, errors, final):
        """Decode *input* and return a ``(text, consumed)`` tuple.

        While the BOM decision is pending and fewer than three bytes are
        available, nothing is consumed if the bytes seen so far could
        still be the beginning of a BOM and more data may follow.  When
        *final* is true no more data can arrive, so such a truncated BOM
        is passed on to ``codecs.utf_8_decode`` and handled according to
        *errors* (``UnicodeDecodeError`` under ``'strict'``) instead of
        being dropped silently.
        """
        if self.first:
            if len(input) < 3:
                if codecs.BOM_UTF8.startswith(input):
                    if not (final and input):
                        # not enough data to decide if this really is a BOM
                        # => try again on the next call
                        return ("", 0)
                    # End of input with a partial BOM: it can never be
                    # completed, so fall through and let the UTF-8
                    # decoder report the incomplete byte sequence.
                self.first = 0
            else:
                self.first = 0
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = \
                       codecs.utf_8_decode(input[3:], errors, final)
                    # account for the skipped BOM bytes
                    return (output, consumed+3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Return to the initial state: a leading BOM is skipped again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return ``(buffered_bytes, first_flag)`` as the decoder state."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # state[1] must be 0 here, as it isn't passed along to the caller
        return (state[0], self.first)

    def setstate(self, state):
        """Restore a state previously returned by ``getstate()``."""
        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = state[1]
 | 
						|
 | 
						|
class StreamWriter(codecs.StreamWriter):
    """Stream writer for 'utf-8-sig': only the first chunk carries a BOM."""

    def reset(self):
        """Reset the writer so that the next chunk is prefixed with a BOM."""
        codecs.StreamWriter.reset(self)
        try:
            # Drop the per-instance override installed by encode(), which
            # makes the BOM-writing method of the class visible again.
            delattr(self, 'encode')
        except AttributeError:
            # Nothing to drop: no override is installed on this instance.
            pass

    def encode(self, input, errors='strict'):
        """Encode the first chunk with a BOM.

        Later calls on this instance go straight to
        ``codecs.utf_8_encode`` because the method is shadowed by an
        instance attribute before the chunk is encoded.
        """
        self.encode = codecs.utf_8_encode
        return encode(input, errors)
 | 
						|
 | 
						|
class StreamReader(codecs.StreamReader):
    """Stream reader for 'utf-8-sig': a BOM at the start is skipped."""

    def reset(self):
        """Reset the reader so that a leading BOM is looked for again."""
        codecs.StreamReader.reset(self)
        try:
            # Drop the per-instance override installed by decode(), which
            # makes the BOM-detecting method of the class visible again.
            delattr(self, 'decode')
        except AttributeError:
            # Nothing to drop: no override is installed on this instance.
            pass

    def decode(self, input, errors='strict'):
        """Decode the start of the stream, skipping a BOM if there is one.

        Returns a ``(text, consumed)`` tuple.  Once it is known whether a
        BOM is present, the method is shadowed by ``codecs.utf_8_decode``
        on this instance, so later chunks are decoded as plain UTF-8.
        """
        bom = codecs.BOM_UTF8
        if len(input) < 3 and bom.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return ("", 0)
        # The decision can be made now; later calls bypass this method.
        self.decode = codecs.utf_8_decode
        if input[:3] == bom:
            text, used = codecs.utf_8_decode(input[3:], errors)
            return (text, used + 3)
        # no BOM present
        return codecs.utf_8_decode(input, errors)
 | 
						|
 | 
						|
### encodings module API
 | 
						|
 | 
						|
def getregentry():
    """Return the ``codecs.CodecInfo`` entry describing 'utf-8-sig'.

    The entry bundles the stateless functions and the incremental and
    stream classes defined in this module.
    """
    entry = dict(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
    return codecs.CodecInfo(**entry)
 |