Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.

2025-10-31 13:41:24 +00:00 · 2007-08-17 16:41:28 +00:00 · 2007-08-17 16:41:28 +00:00 · 6e39080649
commit 6e39080649
parent 437e6a3b15
12 changed files with 999 additions and 2 deletions
--- a/Doc/c-api/concrete.rst
+++ b/Doc/c-api/concrete.rst
@ -1301,6 +1301,79 @@ These are the UTF-8 codec APIs:
   object.  Error handling is "strict".  Return *NULL* if an exception was raised
   by the codec.
 These are the UTF-32 codec APIs:
 .. % --- UTF-32 Codecs ------------------------------------------------------ */
 .. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder)
   Decode *size* bytes from a UTF-32 encoded buffer string and return the
   corresponding Unicode object.  *errors* (if non-*NULL*) defines the error
   handling. It defaults to "strict".
   If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte
   order::
      *byteorder == -1: little endian
      *byteorder == 0:  native order
      *byteorder == 1:  big endian
   and then switches if the first four bytes of the input data are a byte order mark
   (BOM) and the specified byte order is native order.  This BOM is not copied into
   the resulting Unicode string.  After completion, *\*byteorder* is set to the
   current byte order at the end of input data.
   In a narrow build codepoints outside the BMP will be decoded as surrogate pairs.
   If *byteorder* is *NULL*, the codec starts in native order mode.
   Return *NULL* if an exception was raised by the codec.
   .. versionadded:: 2.6
 .. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed)
   If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If
   *consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat
   trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible
   by four) as an error. Those bytes will not be decoded and the number of bytes
   that have been decoded will be stored in *consumed*.
   .. versionadded:: 2.6
 .. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder)
   Return a Python string object holding the UTF-32 encoded value of the Unicode
   data in *s*.  If *byteorder* is not ``0``, output is written according to the
   following byte order::
      byteorder == -1: little endian
      byteorder == 0:  native byte order (writes a BOM mark)
      byteorder == 1:  big endian
   If byteorder is ``0``, the output string will always start with the Unicode BOM
   mark (U+FEFF). In the other two modes, no BOM mark is prepended.
   If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output
   as a single codepoint.
   Return *NULL* if an exception was raised by the codec.
   .. versionadded:: 2.6
 .. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode)
   Return a Python string using the UTF-32 encoding in native byte order. The
   string always starts with a BOM mark.  Error handling is "strict".  Return
   *NULL* if an exception was raised by the codec.
   .. versionadded:: 2.6
 These are the UTF-16 codec APIs:
 .. % --- UTF-16 Codecs ------------------------------------------------------ */
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@ -1045,6 +1045,12 @@ particular, the following variants typically exist:
 | shift_jisx0213  | shiftjisx0213, sjisx0213,      | Japanese                       |
 |                 | s_jisx0213                     |                                |
 +-----------------+--------------------------------+--------------------------------+
 | utf_32          | U32, utf32                     | all languages                  |
 +-----------------+--------------------------------+--------------------------------+
 | utf_32_be       | UTF-32BE                       | all languages                  |
 +-----------------+--------------------------------+--------------------------------+
 | utf_32_le       | UTF-32LE                       | all languages                  |
 +-----------------+--------------------------------+--------------------------------+
 | utf_16          | U16, utf16                     | all languages                  |
 +-----------------+--------------------------------+--------------------------------+
 | utf_16_be       | UTF-16BE                       | all languages (BMP only)       |
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -145,6 +145,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
 # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
 # define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
 # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
 # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
 # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
@ -159,6 +160,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
 # define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
 # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
 # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
 # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
@ -170,6 +173,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
 # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
 # define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
@ -223,6 +227,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
 # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
 # define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
 # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
 # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
 # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
@ -237,6 +242,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
 # define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
 # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
 # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
 # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
@ -248,6 +255,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
 # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
 # define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
@ -701,6 +709,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
    const char *errors		/* error handling */
    );
 /* --- UTF-32 Codecs ------------------------------------------------------ */
 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
   the corresponding Unicode object.
   errors (if non-NULL) defines the error handling. It defaults
   to "strict". 
   If byteorder is non-NULL, the decoder starts decoding using the
   given byte order:
 	*byteorder == -1: little endian
 	*byteorder == 0:  native order
 	*byteorder == 1:  big endian
   In native mode, the first four bytes of the stream are checked for a
   BOM mark. If found, the BOM mark is analysed, the byte order
   adjusted and the BOM skipped.  In the other modes, no BOM mark
   interpretation is done. After completion, *byteorder is set to the
   current byte order at the end of input data.
   If byteorder is NULL, the codec starts in native order mode.
 */
 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
    const char *string, 	/* UTF-32 encoded string */
    Py_ssize_t length,	 	/* size of string */
    const char *errors,		/* error handling */
    int *byteorder		/* pointer to byteorder to use
 				   0=native;-1=LE,1=BE; updated on
 				   exit */
    );
 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
    const char *string, 	/* UTF-32 encoded string */
    Py_ssize_t length,	 	/* size of string */
    const char *errors,		/* error handling */
    int *byteorder,		/* pointer to byteorder to use
 				   0=native;-1=LE,1=BE; updated on
 				   exit */
    Py_ssize_t *consumed	/* bytes consumed */
    );
 /* Returns a Python string using the UTF-32 encoding in native byte
   order. The string always starts with a BOM mark.  */
 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
    PyObject *unicode	 	/* Unicode object */
    );
 /* Returns a Python string object holding the UTF-32 encoded value of
   the Unicode data.
   If byteorder is not 0, output is written according to the following
   byte order:
   byteorder == -1: little endian
   byteorder == 0:  native byte order (writes a BOM mark)
   byteorder == 1:  big endian
   If byteorder is 0, the output string will always start with the
   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
   prepended.
 */
 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
    const Py_UNICODE *data, 	/* Unicode char buffer */
    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
    const char *errors,		/* error handling */
    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
    );
 /* --- UTF-16 Codecs ------------------------------------------------------ */
 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
--- a/Lib/encodings/aliases.py
+++ b/Lib/encodings/aliases.py
@ -490,6 +490,16 @@
    'unicodelittleunmarked' : 'utf_16_le',
    'utf_16le'           : 'utf_16_le',
    # utf_32 codec
    'u32'                : 'utf_32',
    'utf32'              : 'utf_32',
    # utf_32_be codec
    'utf_32be'           : 'utf_32_be',
    # utf_32_le codec
    'utf_32le'           : 'utf_32_le',
    # utf_7 codec
    'u7'                 : 'utf_7',
    'utf7'               : 'utf_7',
--- a/Lib/encodings/utf_32.py
+++ b/Lib/encodings/utf_32.py
@ -0,0 +1,144 @@
 """
 Python 'utf-32' Codec
 """
 import codecs, sys
 ### Codec APIs
 encode = codecs.utf_32_encode
 def decode(input, errors='strict'):
    """Decode a complete UTF-32 byte string in one shot.

    Forwards to codecs.utf_32_decode with the "final" flag set and hands
    back whatever that call returns.
    """
    final = True  # stateless API: the input is the whole message
    result = codecs.utf_32_decode(input, errors, final)
    return result
 class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-32 encoder.

    The first chunk goes through codecs.utf_32_encode; every later chunk
    goes through the fixed-order encoder matching sys.byteorder.
    self.encoder is None until the first chunk has been encoded.
    """
    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.encoder = None
    def encode(self, input, final=False):
        if self.encoder is not None:
            return self.encoder(input, self.errors)[0]
        # First chunk: encode before committing to a fixed-order encoder,
        # so a failing encode leaves the state untouched.
        data = codecs.utf_32_encode(input, self.errors)[0]
        self.encoder = (codecs.utf_32_le_encode
                        if sys.byteorder == 'little'
                        else codecs.utf_32_be_encode)
        return data
    def reset(self):
        codecs.IncrementalEncoder.reset(self)
        self.encoder = None
    def getstate(self):
        # State reported to the caller:
        #   0 - output continues in the platform's natural order
        #   2 - nothing encoded yet, byte order not committed
        # (unnatural order is never written, so 1 is never returned)
        if self.encoder is None:
            return 2
        return 0
    def setstate(self, state):
        if not state:
            self.encoder = (codecs.utf_32_le_encode
                            if sys.byteorder == 'little'
                            else codecs.utf_32_be_encode)
        else:
            self.encoder = None
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-32 decoder that learns its byte order from the input.

    self.decoder is None until codecs.utf_32_ex_decode reports a byte order
    of -1 or 1; from then on it holds the matching fixed-order decode
    function and all further input is routed through it.
    """
    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.decoder = None
    def _buffer_decode(self, input, errors, final):
        if self.decoder is None:
            # Start with byteorder 0 and let the C decoder report the byte
            # order in effect once it has looked at the data.
            (output, consumed, byteorder) = \
                codecs.utf_32_ex_decode(input, errors, 0, final)
            if byteorder == -1:
                self.decoder = codecs.utf_32_le_decode
            elif byteorder == 1:
                self.decoder = codecs.utf_32_be_decode
            elif consumed >= 4:
                # Four or more bytes were consumed but the reported byte
                # order is neither -1 nor 1: the stream had no BOM.
                raise UnicodeError("UTF-32 stream does not start with BOM")
            return (output, consumed)
        return self.decoder(input, self.errors, final)
    def reset(self):
        codecs.BufferedIncrementalDecoder.reset(self)
        self.decoder = None
    def getstate(self):
        # additional state info from the base class must be None here,
        # as it isn't passed along to the caller
        state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
        # additional state info we pass to the caller:
        # 0: stream is in natural order for this platform
        # 1: stream is in unnatural order
        # 2: endianness hasn't been determined yet
        if self.decoder is None:
            return (state, 2)
        addstate = int((sys.byteorder == "big") !=
                       (self.decoder is codecs.utf_32_be_decode))
        return (state, addstate)
    def setstate(self, state):
        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        # From here on only the byte-order flag (0, 1 or 2; see getstate)
        # is of interest.
        state = state[1]
        if state == 0:
            self.decoder = (codecs.utf_32_be_decode
                            if sys.byteorder == "big"
                            else codecs.utf_32_le_decode)
        elif state == 1:
            self.decoder = (codecs.utf_32_le_decode
                            if sys.byteorder == "big"
                            else codecs.utf_32_be_decode)
        else:
            self.decoder = None
 class StreamWriter(codecs.StreamWriter):
    """Stream writer for UTF-32 that emits a BOM with the first write only.

    The first call to encode() goes through codecs.utf_32_encode and then
    shadows encode on the instance with the fixed-order encoder matching
    sys.byteorder, so later writes add no further BOM.  reset() removes
    that shadow again so that a reset stream starts over with a BOM.
    """
    def __init__(self, stream, errors='strict'):
        self.bom_written = False
        codecs.StreamWriter.__init__(self, stream, errors)
    def reset(self):
        codecs.StreamWriter.reset(self)
        self.bom_written = False
        # Drop the instance-level override installed by encode(); this is
        # the same technique StreamReader.reset() uses for decode.
        try:
            del self.encode
        except AttributeError:
            # encode() has not run yet, nothing to undo
            pass
    def encode(self, input, errors='strict'):
        self.bom_written = True
        result = codecs.utf_32_encode(input, errors)
        # Every later call bypasses this method and uses the fixed-order
        # encoder directly.
        if sys.byteorder == 'little':
            self.encode = codecs.utf_32_le_encode
        else:
            self.encode = codecs.utf_32_be_encode
        return result
 class StreamReader(codecs.StreamReader):
    """Stream reader for UTF-32 that takes its byte order from the BOM.

    The first successful decode() that learns the byte order shadows
    decode on the instance with the matching fixed-order decoder; reset()
    removes that shadow so detection starts over.
    """
    def reset(self):
        codecs.StreamReader.reset(self)
        try:
            del self.decode
        except AttributeError:
            # byte order was never determined, nothing to undo
            pass
    def decode(self, input, errors='strict'):
        (output, consumed, byteorder) = \
            codecs.utf_32_ex_decode(input, errors, 0, False)
        if byteorder == -1:
            self.decode = codecs.utf_32_le_decode
        elif byteorder == 1:
            self.decode = codecs.utf_32_be_decode
        elif consumed >= 4:
            # Four or more bytes were consumed but the reported byte order
            # is neither -1 nor 1: the stream had no BOM.
            raise UnicodeError("UTF-32 stream does not start with BOM")
        return (output, consumed)
 ### encodings module API
 def getregentry():
    """Build the codecs.CodecInfo record describing the 'utf-32' codec."""
    entry = codecs.CodecInfo(
        name='utf-32',
        encode=encode,
        decode=decode,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
        incrementaldecoder=IncrementalDecoder,
        incrementalencoder=IncrementalEncoder,
    )
    return entry
--- a/Lib/encodings/utf_32_be.py
+++ b/Lib/encodings/utf_32_be.py
@ -0,0 +1,37 @@
 """
 Python 'utf-32-be' Codec
 """
 import codecs
 ### Codec APIs
 encode = codecs.utf_32_be_encode
 def decode(input, errors='strict'):
    """Decode a complete byte string via codecs.utf_32_be_decode (final=True)."""
    return codecs.utf_32_be_decode(input, errors, True)
 class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that passes each chunk to codecs.utf_32_be_encode."""
    def encode(self, input, final=False):
        # [0] keeps only the encoded data from the encoder's result
        return codecs.utf_32_be_encode(input, self.errors)[0]
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Buffered incremental decoder backed by codecs.utf_32_be_decode."""
    # The C function is used directly as the class attribute.
    _buffer_decode = codecs.utf_32_be_decode
 class StreamWriter(codecs.StreamWriter):
    """Stream writer whose encode is codecs.utf_32_be_encode itself."""
    encode = codecs.utf_32_be_encode
 class StreamReader(codecs.StreamReader):
    """Stream reader whose decode is codecs.utf_32_be_decode itself."""
    decode = codecs.utf_32_be_decode
 ### encodings module API
 def getregentry():
    """Return the codecs.CodecInfo record for the 'utf-32-be' codec."""
    return codecs.CodecInfo(
        name='utf-32-be',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
--- a/Lib/encodings/utf_32_le.py
+++ b/Lib/encodings/utf_32_le.py
@ -0,0 +1,37 @@
 """
 Python 'utf-32-le' Codec
 """
 import codecs
 ### Codec APIs
 encode = codecs.utf_32_le_encode
 def decode(input, errors='strict'):
    """Decode a complete byte string via codecs.utf_32_le_decode (final=True)."""
    return codecs.utf_32_le_decode(input, errors, True)
 class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that passes each chunk to codecs.utf_32_le_encode."""
    def encode(self, input, final=False):
        # [0] keeps only the encoded data from the encoder's result
        return codecs.utf_32_le_encode(input, self.errors)[0]
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Buffered incremental decoder backed by codecs.utf_32_le_decode."""
    # The C function is used directly as the class attribute.
    _buffer_decode = codecs.utf_32_le_decode
 class StreamWriter(codecs.StreamWriter):
    """Stream writer whose encode is codecs.utf_32_le_encode itself."""
    encode = codecs.utf_32_le_encode
 class StreamReader(codecs.StreamReader):
    """Stream reader whose decode is codecs.utf_32_le_decode itself."""
    decode = codecs.utf_32_le_decode
 ### encodings module API
 def getregentry():
    """Return the codecs.CodecInfo record for the 'utf-32-le' codec."""
    return codecs.CodecInfo(
        name='utf-32-le',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@ -285,7 +285,8 @@ def handler2(exc):
    def test_longstrings(self):
        # test long strings to check for memory overflow problems
-        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
+        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
                   "backslashreplace"]
        # register the handlers under different names,
        # to prevent the codec from recognizing the name
        for err in errors:
@ -293,7 +294,8 @@ def test_longstrings(self):
        l = 1000
        errors += [ "test." + err for err in errors ]
        for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
-            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
+            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
                        "utf-8", "utf-7", "utf-16", "utf-32"):
                for err in errors:
                    try:
                        uni.encode(enc, err)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -244,6 +244,137 @@ def test_bug1098990_b(self):
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
 class UTF32Test(ReadTest):
    # Tests for the BOM-aware "utf-32" codec.
    encoding = "utf-32"
    # u"spamspam" preceded by exactly one little-endian BOM (FF FE 00 00)
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    # u"spamspam" preceded by exactly one big-endian BOM (00 00 FE FF)
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
    def test_only_one_bom(self):
        # Two writes through one stream writer must produce a single BOM.
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")
    def test_badbom(self):
        # Streams made only of 0xFF bytes start with neither BOM above;
        # reading must raise for a 4-byte and for an 8-byte stream.
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)
        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)
    def test_partial(self):
        # Expected cumulative output, one entry per step of check_partial
        # (presumably one input byte per step -- see ReadTest).
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )
    def test_errors(self):
        # A single byte with final=True is a truncated sequence.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)
 class UTF32LETest(ReadTest):
    # Tests for the fixed little-endian "utf-32-le" codec.
    encoding = "utf-32-le"
    def test_partial(self):
        # Expected cumulative output, one entry per step of check_partial
        # (presumably one input byte per step -- see ReadTest).
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )
    def test_simple(self):
        # U+10203 must come out least significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")
    def test_errors(self):
        # A single byte with final=True is a truncated sequence.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)
 class UTF32BETest(ReadTest):
    # Tests for the fixed big-endian "utf-32-be" codec.
    encoding = "utf-32-be"
    def test_partial(self):
        # Expected cumulative output, one entry per step of check_partial
        # (presumably one input byte per step -- see ReadTest).
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )
    def test_simple(self):
        # U+10203 must come out most significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")
    def test_errors(self):
        # A single byte with final=True is a truncated sequence.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)
 class UTF16Test(ReadTest):
    encoding = "utf-16"
@ -1278,6 +1409,9 @@ def test_streamreaderwriter(self):
 def test_main():
    test_support.run_unittest(
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -243,6 +243,8 @@ Library
 - GB18030 codec now can encode additional two-byte characters that
  are missing in GBK.
 - Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE.
 - Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot
  represent the result in a single character.
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@ -391,6 +391,126 @@ utf_16_ex_decode(PyObject *self,
    return tuple;
 }
 /* utf_32_decode(data[, errors[, final]])
    Decodes a UTF-32 buffer starting with byteorder 0 and returns
    codec_tuple(decoded, consumed).  Returns NULL with an exception set
    on failure. */
 static PyObject *
 utf_32_decode(PyObject *self, PyObject *args)
 {
    const char *buf;
    Py_ssize_t buflen;
    const char *errors = NULL;
    int final = 0;
    int order = 0;
    Py_ssize_t used;
    PyObject *text;
    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode",
                          &buf, &buflen, &errors, &final))
        return NULL;
    if (buflen < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }
    /* When final is true the decoder gets no "consumed" pointer, so the
       full input length is what gets reported. */
    used = buflen;
    text = PyUnicode_DecodeUTF32Stateful(buf, buflen, errors, &order,
                                         final ? NULL : &used);
    if (text == NULL)
        return NULL;
    return codec_tuple(text, used);
 }
 /* utf_32_le_decode(data[, errors[, final]])
    Decodes a UTF-32 buffer with the byte order fixed to -1 (little
    endian) and returns codec_tuple(decoded, consumed).  Returns NULL
    with an exception set on failure. */
 static PyObject *
 utf_32_le_decode(PyObject *self, PyObject *args)
 {
    const char *buf;
    Py_ssize_t buflen;
    const char *errors = NULL;
    int final = 0;
    int order = -1;
    Py_ssize_t used;
    PyObject *text = NULL;
    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode",
                          &buf, &buflen, &errors, &final))
        return NULL;
    if (buflen < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }
    /* When final is true the decoder gets no "consumed" pointer, so the
       full input length is what gets reported. */
    used = buflen;
    text = PyUnicode_DecodeUTF32Stateful(buf, buflen, errors, &order,
                                         final ? NULL : &used);
    if (text == NULL)
        return NULL;
    return codec_tuple(text, used);
 }
 /* utf_32_be_decode(data[, errors[, final]])
    Decodes a UTF-32 buffer with the byte order fixed to 1 (big endian)
    and returns codec_tuple(decoded, consumed).  Returns NULL with an
    exception set on failure. */
 static PyObject *
 utf_32_be_decode(PyObject *self, PyObject *args)
 {
    const char *buf;
    Py_ssize_t buflen;
    const char *errors = NULL;
    int final = 0;
    int order = 1;
    Py_ssize_t used;
    PyObject *text = NULL;
    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode",
                          &buf, &buflen, &errors, &final))
        return NULL;
    if (buflen < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }
    /* When final is true the decoder gets no "consumed" pointer, so the
       full input length is what gets reported. */
    used = buflen;
    text = PyUnicode_DecodeUTF32Stateful(buf, buflen, errors, &order,
                                         final ? NULL : &used);
    if (text == NULL)
        return NULL;
    return codec_tuple(text, used);
 }
 /* This non-standard version also provides access to the byteorder
   parameter of the builtin UTF-32 codec.
   It returns a tuple (unicode, bytesread, byteorder) with byteorder
   being the value in effect at the end of data.
 */
 static PyObject *
 utf_32_ex_decode(PyObject *self, PyObject *args)
 {
    const char *buf;
    Py_ssize_t buflen;
    const char *errors = NULL;
    int order = 0;
    int final = 0;
    Py_ssize_t used;
    PyObject *text;
    PyObject *result;
    if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode",
                          &buf, &buflen, &errors, &order, &final))
        return NULL;
    if (buflen < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }
    /* When final is true the decoder gets no "consumed" pointer, so the
       full input length is what gets reported. */
    used = buflen;
    text = PyUnicode_DecodeUTF32Stateful(buf, buflen, errors, &order,
                                         final ? NULL : &used);
    if (text == NULL)
        return NULL;
    /* The "O" format is followed by an explicit Py_DECREF of our own
       reference to the decoded object. */
    result = Py_BuildValue("Oni", text, used, order);
    Py_DECREF(text);
    return result;
 }
 static PyObject *
 unicode_escape_decode(PyObject *self,
 		     PyObject *args)
@ -683,6 +803,83 @@ utf_16_be_encode(PyObject *self,
    return v;
 }
 /* This version provides access to the byteorder parameter of the
   builtin UTF-32 codecs as optional third argument. It defaults to 0
   which means: use the native byte order and prepend the data with a
   BOM mark.
 */
 static PyObject *
 utf_32_encode(PyObject *self, PyObject *args)
 {
    PyObject *input;
    PyObject *ustr;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    int byteorder = 0;
    if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
                          &input, &errors, &byteorder))
        return NULL;
    /* Work on a unicode object; the reference obtained here is dropped
       with Py_DECREF before returning. */
    ustr = PyUnicode_FromObject(input);
    if (ustr == NULL)
        return NULL;
    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(ustr),
                                    PyUnicode_GET_SIZE(ustr),
                                    errors,
                                    byteorder);
    /* The encoder result is handed to codec_tuple unchecked, together
       with the number of Py_UNICODE units in the input. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
    Py_DECREF(ustr);
    return result;
 }
 /* utf_32_le_encode(obj[, errors])
    Encodes obj with PyUnicode_EncodeUTF32 using byteorder -1 (little
    endian) and wraps the result with codec_tuple. */
 static PyObject *
 utf_32_le_encode(PyObject *self, PyObject *args)
 {
    PyObject *input;
    PyObject *ustr;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
                          &input, &errors))
        return NULL;
    /* Work on a unicode object; the reference obtained here is dropped
       with Py_DECREF before returning. */
    ustr = PyUnicode_FromObject(input);
    if (ustr == NULL)
        return NULL;
    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(ustr),
                                    PyUnicode_GET_SIZE(ustr),
                                    errors,
                                    -1);
    /* The encoder result is handed to codec_tuple unchecked, together
       with the number of Py_UNICODE units in the input. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
    Py_DECREF(ustr);
    return result;
 }
 /* utf_32_be_encode(obj[, errors])
    Encodes obj with PyUnicode_EncodeUTF32 using byteorder +1 (big
    endian) and wraps the result with codec_tuple. */
 static PyObject *
 utf_32_be_encode(PyObject *self, PyObject *args)
 {
    PyObject *input;
    PyObject *ustr;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
                          &input, &errors))
        return NULL;
    /* Work on a unicode object; the reference obtained here is dropped
       with Py_DECREF before returning. */
    ustr = PyUnicode_FromObject(input);
    if (ustr == NULL)
        return NULL;
    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(ustr),
                                    PyUnicode_GET_SIZE(ustr),
                                    errors,
                                    +1);
    /* The encoder result is handed to codec_tuple unchecked, together
       with the number of Py_UNICODE units in the input. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
    Py_DECREF(ustr);
    return result;
 }
 static PyObject *
 unicode_escape_encode(PyObject *self,
 		     PyObject *args)
@ -901,6 +1098,13 @@ static PyMethodDef _codecs_functions[] = {
    {"utf_16_le_decode",	utf_16_le_decode,		METH_VARARGS},
    {"utf_16_be_decode",	utf_16_be_decode,		METH_VARARGS},
    {"utf_16_ex_decode",	utf_16_ex_decode,		METH_VARARGS},
    {"utf_32_encode",		utf_32_encode,			METH_VARARGS},
    {"utf_32_le_encode",	utf_32_le_encode,		METH_VARARGS},
    {"utf_32_be_encode",	utf_32_be_encode,		METH_VARARGS},
    {"utf_32_decode",		utf_32_decode,			METH_VARARGS},
    {"utf_32_le_decode",	utf_32_le_decode,		METH_VARARGS},
    {"utf_32_be_decode",	utf_32_be_decode,		METH_VARARGS},
    {"utf_32_ex_decode",	utf_32_ex_decode,		METH_VARARGS},
    {"unicode_escape_encode",	unicode_escape_encode,		METH_VARARGS},
    {"unicode_escape_decode",	unicode_escape_decode,		METH_VARARGS},
    {"unicode_internal_encode",	unicode_internal_encode,	METH_VARARGS},
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 				NULL);
 }
 /* --- UTF-32 Codec ------------------------------------------------------- */
 /* Stateless UTF-32 decode: forwards every argument to
    PyUnicode_DecodeUTF32Stateful() with `consumed` set to NULL and returns
    whatever that call returns. */
 PyObject *
 PyUnicode_DecodeUTF32(const char *s,
                       Py_ssize_t size,
                       const char *errors,
                       int *byteorder)
 {
     PyObject *decoded;

     decoded = PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder,
                                             NULL);
     return decoded;
 }
 /* Decode `size` bytes of UTF-32 data starting at `s` into a new Unicode
    object.

    errors    - error handler name forwarded to the decode error callback.
    byteorder - if non-NULL, *byteorder selects the initial byte order
                (-1 little endian, 0 native, +1 big endian) and receives the
                byte order in effect when decoding finishes.  If NULL,
                decoding starts in native order.
    consumed  - if non-NULL, a trailing group of fewer than four bytes is
                left undecoded rather than reported as "truncated data", and
                *consumed receives the number of bytes that were decoded.

    Returns a new reference, or NULL if the error handler reported a failure
    or memory could not be allocated.  For size == 0 an empty object is
    returned and neither *byteorder nor *consumed is written. */
 PyObject *
 PyUnicode_DecodeUTF32Stateful(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               int *byteorder,
                               Py_ssize_t *consumed)
 {
     const char *starts = s;
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
     PyUnicodeObject *unicode;
     Py_UNICODE *p;
 #ifndef Py_UNICODE_WIDE
     Py_ssize_t pairs = 0;
     const unsigned char *qq;
 #else
     const int pairs = 0;
 #endif
     const unsigned char *q, *e;
     int bo = 0;       /* assume native ordering by default */
     const char *errmsg = "";
     /* Offsets from q for retrieving bytes in the right order. */
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
     int iorder[] = {0, 1, 2, 3};
 #else
     int iorder[] = {3, 2, 1, 0};
 #endif
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;

     q = (const unsigned char *)s;
     e = q + size;

     if (byteorder)
         bo = *byteorder;

     /* Check for BOM marks (U+FEFF) in the input and adjust current
        byte order setting accordingly. In native mode, the leading BOM
        mark is skipped, in all other modes, it is copied to the output
        stream as-is (giving a ZWNBSP character). */
     if (bo == 0) {
         if (size >= 4) {
             /* Widen each byte before shifting so that a byte >= 0x80 is
                never shifted into the sign bit of a promoted int. */
             const Py_UCS4 bom = ((Py_UCS4)q[iorder[3]] << 24) |
                                 ((Py_UCS4)q[iorder[2]] << 16) |
                                 ((Py_UCS4)q[iorder[1]] << 8) |
                                 (Py_UCS4)q[iorder[0]];
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
             if (bom == 0x0000FEFF) {
                 q += 4;
                 bo = -1;
             }
             else if (bom == 0xFFFE0000) {
                 q += 4;
                 bo = 1;
             }
 #else
             if (bom == 0x0000FEFF) {
                 q += 4;
                 bo = 1;
             }
             else if (bom == 0xFFFE0000) {
                 q += 4;
                 bo = -1;
             }
 #endif
         }
     }

     if (bo == -1) {
         /* force LE */
         iorder[0] = 0;
         iorder[1] = 1;
         iorder[2] = 2;
         iorder[3] = 3;
     }
     else if (bo == 1) {
         /* force BE */
         iorder[0] = 3;
         iorder[1] = 2;
         iorder[2] = 1;
         iorder[3] = 0;
     }

     /* On narrow builds we split characters outside the BMP into two
        codepoints => count how much extra space we need.  The count has to
        use the byte order that will actually be decoded, so it runs only
        after the BOM / *byteorder handling above.  Inspecting the two high
        bytes individually also avoids a possibly misaligned 32-bit read. */
 #ifndef Py_UNICODE_WIDE
     for (qq = q; e - qq >= 4; qq += 4)
         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
             pairs++;
 #endif

     /* This might be one too much, because of a BOM */
     unicode = _PyUnicode_New((size+3)/4+pairs);
     if (!unicode)
         return NULL;
     if (size == 0)
         return (PyObject *)unicode;

     /* Unpack UTF-32 encoded data */
     p = unicode->str;

     while (q < e) {
         Py_UCS4 ch;
         /* remaining bytes at the end? (size should be divisible by 4) */
         if (e-q<4) {
             if (consumed)
                 break;
             errmsg = "truncated data";
             startinpos = ((const char *)q)-starts;
             endinpos = ((const char *)e)-starts;
             goto utf32Error;
             /* The remaining input chars are ignored if the callback
                chooses to skip the input */
         }
         ch = ((Py_UCS4)q[iorder[3]] << 24) |
              ((Py_UCS4)q[iorder[2]] << 16) |
              ((Py_UCS4)q[iorder[1]] << 8) |
              (Py_UCS4)q[iorder[0]];

         if (ch >= 0x110000)
         {
             errmsg = "codepoint not in range(0x110000)";
             startinpos = ((const char *)q)-starts;
             endinpos = startinpos+4;
             goto utf32Error;
         }
 #ifndef Py_UNICODE_WIDE
         if (ch >= 0x10000)
         {
             *p++ = 0xD800 | ((ch-0x10000) >> 10);
             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
         }
         else
 #endif
             *p++ = ch;
         q += 4;
         continue;
     utf32Error:
         outpos = p-PyUnicode_AS_UNICODE(unicode);
         /* Pass the live read pointer q (not s) as the input pointer, so
            the position chosen by the error handler is where the loop
            resumes.  NOTE(review): assumes the callback stores the resume
            position through this argument -- confirm against
            unicode_decode_call_errorhandler(). */
         if (unicode_decode_call_errorhandler(
                 errors, &errorHandler,
                 "utf32", errmsg,
                 starts, size, &startinpos, &endinpos, &exc,
                 (const char **)&q,
                 (PyObject **)&unicode, &outpos, &p))
             goto onError;
     }

     if (byteorder)
         *byteorder = bo;

     if (consumed)
         *consumed = (const char *)q-starts;

     /* Adjust length */
     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
         goto onError;

     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
     return (PyObject *)unicode;

 onError:
     Py_DECREF(unicode);
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
     return NULL;
 }
 /* Encode `size` Py_UNICODE code units starting at `s` as UTF-32 and return
    the bytes as a new string object.

    byteorder - -1 writes little endian, +1 writes big endian, 0 writes
                native order preceded by a BOM (U+FEFF).
    errors    - accepted for interface compatibility; this implementation
                does not consult it.

    On narrow builds a high surrogate immediately followed by a low
    surrogate is combined into a single code point.  Returns NULL with
    MemoryError set if the output size would overflow Py_ssize_t, or NULL if
    the result string could not be allocated. */
 PyObject *
 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
                       Py_ssize_t size,
                       const char *errors,
                       int byteorder)
 {
     PyObject *v;
     unsigned char *p;
     Py_ssize_t nsize;
 #ifndef Py_UNICODE_WIDE
     /* Same width as `size`, so the counters cannot wrap on long input. */
     Py_ssize_t i, pairs;
 #else
     const int pairs = 0;
 #endif
     /* Offsets from p for storing byte pairs in the right order. */
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
     int iorder[] = {0, 1, 2, 3};
 #else
     int iorder[] = {3, 2, 1, 0};
 #endif

 #define STORECHAR(CH)                       \
     do {                                    \
         p[iorder[3]] = ((CH) >> 24) & 0xff; \
         p[iorder[2]] = ((CH) >> 16) & 0xff; \
         p[iorder[1]] = ((CH) >> 8) & 0xff;  \
         p[iorder[0]] = (CH) & 0xff;         \
         p += 4;                             \
     } while(0)

     /* In narrow builds we can output surrogate pairs as one codepoint,
        so we need less space. */
 #ifndef Py_UNICODE_WIDE
     for (i = pairs = 0; i < size-1; i++)
         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
             pairs++;
 #endif

     /* Number of 4-byte units to emit: one per code point, plus one for
        the BOM in native mode.  Check before multiplying so the byte count
        cannot overflow Py_ssize_t. */
     nsize = size - pairs + (byteorder == 0);
     if (nsize > PY_SSIZE_T_MAX / 4)
         return PyErr_NoMemory();

     v = PyString_FromStringAndSize(NULL, 4 * nsize);
     if (v == NULL)
         return NULL;

     p = (unsigned char *)PyString_AS_STRING(v);
     /* The BOM is written while iorder still holds the native order. */
     if (byteorder == 0)
         STORECHAR(0xFEFF);
     if (size == 0)
         return v;

     if (byteorder == -1) {
         /* force LE */
         iorder[0] = 0;
         iorder[1] = 1;
         iorder[2] = 2;
         iorder[3] = 3;
     }
     else if (byteorder == 1) {
         /* force BE */
         iorder[0] = 3;
         iorder[1] = 2;
         iorder[2] = 1;
         iorder[3] = 0;
     }

     while (size-- > 0) {
         Py_UCS4 ch = *s++;
 #ifndef Py_UNICODE_WIDE
         /* Combine a high surrogate with a following low surrogate; an
            unpaired surrogate is stored unchanged. */
         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
             Py_UCS4 ch2 = *s;
             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
                 s++;
                 size--;
             }
         }
 #endif
         STORECHAR(ch);
     }
     return v;
 #undef STORECHAR
 }
 /* Encode a Unicode object as UTF-32 by calling PyUnicode_EncodeUTF32() with
    a NULL `errors` argument and byte order 0.  If `unicode` fails
    PyUnicode_Check(), PyErr_BadArgument() is called and NULL is returned. */
 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
 {
     if (PyUnicode_Check(unicode)) {
         return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
                                      PyUnicode_GET_SIZE(unicode),
                                      NULL,
                                      0);
     }

     PyErr_BadArgument();
     return NULL;
 }
 /* --- UTF-16 Codec ------------------------------------------------------- */
 PyObject *