bpo-36819: Fix crashes in built-in encoders with weird error handlers (GH-28593)

If the error handler returned a position less than or equal to the starting position of the non-encodable characters, most of the built-in encoders didn't properly resize the output buffer. This led to out-of-bounds writes and segfaults.
2025-10-31 21:51:50 +00:00 · 2022-05-02 12:37:48 +03:00 · 2022-05-02 12:37:48 +03:00 · 18b07d773e
commit 18b07d773e
parent 614420df97
4 changed files with 222 additions and 32 deletions
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@ -1,5 +1,6 @@
 import codecs
 import html.entities
 import itertools
 import sys
 import unicodedata
 import unittest
@ -22,6 +23,18 @@ def handle(self, exc):
            self.pos = len(exc.object)
        return ("<?>", oldpos)
 class RepeatedPosReturn:
    """Codec error handler that can resume at a fixed position repeatedly.

    While ``count`` is positive, every call to ``handle`` decrements it and
    answers ``(repl, pos)``.  Once ``count`` is used up, the handler answers
    ``(repl, exc.end)`` instead.
    """
    def __init__(self, repl="<?>"):
        self.repl, self.pos, self.count = repl, 0, 0
    def handle(self, exc):
        """Return the ``(replacement, resume position)`` pair for *exc*."""
        if self.count <= 0:
            # No repeats left: continue right after the reported error.
            return (self.repl, exc.end)
        self.count -= 1
        return (self.repl, self.pos)
 # A UnicodeEncodeError object with a bad start attribute
 class BadStartUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
@ -783,20 +796,104 @@ def test_lookup(self):
            codecs.lookup_error("namereplace")
        )
-    def test_unencodablereplacement(self):
+    def test_encode_nonascii_replacement(self):
        # Checks how encoders treat a str replacement returned by an error
        # handler when that replacement is itself non-ASCII.
        def handle(exc):
            # ``repl`` is looked up when the handler is called, so it is the
            # value bound by whichever loop below is currently running.
            if isinstance(exc, UnicodeEncodeError):
                return (repl, exc.end)
            raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.replacing", handle)
        # Each replacement here is expected to appear in the output, encoded
        # with the same codec, in place of the failing character.
        for enc, input, repl in (
                ("ascii", "[¤]", "abc"),
                ("iso-8859-1", "[€]", "½¾"),
                ("iso-8859-15", "[¤]", "œŸ"),
        ):
            res = input.encode(enc, "test.replacing")
            self.assertEqual(res, ("[" + repl + "]").encode(enc))
        # For the UTF codecs the test expects the non-ASCII replacement to be
        # refused: a UnicodeEncodeError covering the lone surrogate at index 1
        # of the original input must be raised.
        for enc, input, repl in (
                ("utf-8", "[\udc80]", "\U0001f40d"),
                ("utf-16", "[\udc80]", "\U0001f40d"),
                ("utf-32", "[\udc80]", "\U0001f40d"),
        ):
            with self.subTest(encoding=enc):
                with self.assertRaises(UnicodeEncodeError) as cm:
                    input.encode(enc, "test.replacing")
                exc = cm.exception
                self.assertEqual(exc.start, 1)
                self.assertEqual(exc.end, 2)
                self.assertEqual(exc.object, input)
    def test_encode_unencodable_replacement(self):
        def unencrepl(exc):
            if isinstance(exc, UnicodeEncodeError):
-                return ("\u4242", exc.end)
+                return (repl, exc.end)
            else:
                raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.unencreplhandler", unencrepl)
-        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
+
-            self.assertRaises(
+        for enc, input, repl in (
-                UnicodeEncodeError,
+                ("ascii", "[¤]", "½"),
-                "\u4242".encode,
+                ("iso-8859-1", "[€]", "œ"),
-                enc,
+                ("iso-8859-15", "[¤]", "½"),
-                "test.unencreplhandler"
+                ("utf-8", "[\udc80]", "\udcff"),
-            )
+                ("utf-16", "[\udc80]", "\udcff"),
                ("utf-32", "[\udc80]", "\udcff"),
        ):
            with self.subTest(encoding=enc):
                with self.assertRaises(UnicodeEncodeError) as cm:
                    input.encode(enc, "test.unencreplhandler")
                exc = cm.exception
                self.assertEqual(exc.start, 1)
                self.assertEqual(exc.end, 2)
                self.assertEqual(exc.object, input)
    def test_encode_bytes_replacement(self):
        def handle(exc):
            if isinstance(exc, UnicodeEncodeError):
                return (repl, exc.end)
            raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.replacing", handle)
        # It works even if the bytes sequence is not decodable.
        for enc, input, repl in (
                ("ascii", "[¤]", b"\xbd\xbe"),
                ("iso-8859-1", "[€]", b"\xbd\xbe"),
                ("iso-8859-15", "[¤]", b"\xbd\xbe"),
                ("utf-8", "[\udc80]", b"\xbd\xbe"),
                ("utf-16le", "[\udc80]", b"\xbd\xbe"),
                ("utf-16be", "[\udc80]", b"\xbd\xbe"),
                ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
                ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
        ):
            with self.subTest(encoding=enc):
                res = input.encode(enc, "test.replacing")
                self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))
    def test_encode_odd_bytes_replacement(self):
        def handle(exc):
            if isinstance(exc, UnicodeEncodeError):
                return (repl, exc.end)
            raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.replacing", handle)
        input = "[\udc80]"
        # Tests in which the replacement bytestring contains not whole number
        # of code units.
        for enc, repl in (
            *itertools.product(("utf-16le", "utf-16be"),
                               [b"a", b"abc"]),
            *itertools.product(("utf-32le", "utf-32be"),
                               [b"a", b"ab", b"abc", b"abcde"]),
        ):
            with self.subTest(encoding=enc, repl=repl):
                with self.assertRaises(UnicodeEncodeError) as cm:
                    input.encode(enc, "test.replacing")
                exc = cm.exception
                self.assertEqual(exc.start, 1)
                self.assertEqual(exc.end, 2)
                self.assertEqual(exc.object, input)
                self.assertEqual(exc.reason, "surrogates not allowed")
    def test_badregistercall(self):
        # enhance coverage of:
@ -940,6 +1037,68 @@ def __getitem__(self, key):
            self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
            self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
    def test_decodehelper_bug36819(self):
        # The handler sends the decoder back to position 0 ``repeats`` times,
        # so "abcd" plus the replacement "x" must show up repeats + 1 times.
        repeats = 50
        handler = RepeatedPosReturn("x")
        codecs.register_error("test.bug36819", handler.handle)
        # Encoding name -> byte sequence appended to the encoded "abcd" to
        # trigger the error handler.
        undecodable = {
            "ascii": b"\xff",
            "utf-8": b"\xff",
            "utf-16be": b'\xdc\x80',
            "utf-32be": b'\x00\x00\xdc\x80',
            "iso-8859-6": b"\xff",
        }
        for enc, bad in undecodable.items():
            data = "abcd".encode(enc) + bad
            with self.subTest(encoding=enc):
                handler.count = repeats
                result = data.decode(enc, "test.bug36819")
                self.assertEqual(result, 'abcdx' * (repeats + 1))
    def test_encodehelper_bug36819(self):
        # Regression test for bpo-36819: the error handler keeps sending the
        # encoder back to position 0, so the already encoded prefix is
        # produced again and the output buffer has to grow.
        handler = RepeatedPosReturn()
        codecs.register_error("test.bug36819", handler.handle)
        input = "abcd\udc80"
        encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"]  # built-in
        encodings += ["iso-8859-15"]  # charmap codec
        if sys.platform == 'win32':
            # Extend (rather than replace) the list so that the
            # platform-independent codecs above are exercised on Windows too.
            encodings += ["mbcs", "oem"]  # code page codecs
        # First pass: the replacement is itself a lone surrogate, so every
        # encoder is expected to give up with an error that points at the
        # original unencodable character.
        handler.repl = "\udcff"
        for enc in encodings:
            with self.subTest(encoding=enc):
                handler.count = 50
                with self.assertRaises(UnicodeEncodeError) as cm:
                    input.encode(enc, "test.bug36819")
                exc = cm.exception
                self.assertEqual(exc.start, 4)
                self.assertEqual(exc.end, 5)
                self.assertEqual(exc.object, input)
        if sys.platform == "win32":
            handler.count = 50
            with self.assertRaises(UnicodeEncodeError) as cm:
                codecs.code_page_encode(437, input, "test.bug36819")
            exc = cm.exception
            self.assertEqual(exc.start, 4)
            self.assertEqual(exc.end, 5)
            self.assertEqual(exc.object, input)
        # Second pass: an encodable replacement.  After 50 restarts from
        # position 0 the output must contain "abcdx" 51 times.
        handler.repl = "x"
        for enc in encodings:
            with self.subTest(encoding=enc):
                # The interpreter should segfault after a handful of attempts.
                # 50 was chosen to try to ensure a segfault without a fix,
                # but not OOM a machine with one.
                handler.count = 50
                encoded = input.encode(enc, "test.bug36819")
                self.assertEqual(encoded.decode(enc), "abcdx" * 51)
        if sys.platform == "win32":
            handler.count = 50
            # code_page_encode() returns a (bytes, consumed length) pair.
            encoded = codecs.code_page_encode(437, input, "test.bug36819")
            self.assertEqual(encoded[0].decode(), "abcdx" * 51)
            self.assertEqual(encoded[1], len(input))
    def test_translatehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
--- a/Builtins/2021-09-28-10-58-30.bpo-36819.cyV50C.rst
+++ b/Builtins/2021-09-28-10-58-30.bpo-36819.cyV50C.rst
@ -0,0 +1,2 @@
 Fix crashes in built-in encoders with error handlers that return a position
 less than or equal to the starting position of non-encodable characters.
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@ -387,8 +387,19 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
                if (!rep)
                    goto error;
                if (newpos < startpos) {
                    writer->overallocate = 1;
                    p = _PyBytesWriter_Prepare(writer, p,
                                               max_char_size * (startpos - newpos));
                    if (p == NULL)
                        goto error;
                }
                else {
                    /* subtract preallocated bytes */
                    writer->min_size -= max_char_size * (newpos - startpos);
                    /* Only overallocate the buffer if it's not the last write */
                    writer->overallocate = (newpos < size);
                }
                if (PyBytes_Check(rep)) {
                    p = _PyBytesWriter_WriteBytes(writer, p,
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -5868,7 +5868,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
    pos = 0;
    while (pos < len) {
-        Py_ssize_t repsize, moreunits;
+        Py_ssize_t newpos, repsize, moreunits;
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
@ -5885,7 +5885,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
-                str, &exc, pos, pos + 1, &pos);
+                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;
@ -5893,7 +5893,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
-                                       str, pos - 1, pos,
+                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
@ -5906,28 +5906,30 @@ _PyUnicode_EncodeUTF32(PyObject *str,
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
-                                       str, pos - 1, pos,
+                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        moreunits += pos - newpos;
        pos = newpos;
        /* four bytes are reserved for each surrogate */
-        if (moreunits > 1) {
+        if (moreunits > 0) {
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
-            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
+            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                goto error;
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }
        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
-            out += moreunits;
+            out += repsize / 4;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
@ -6205,7 +6207,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
    pos = 0;
    while (pos < len) {
-        Py_ssize_t repsize, moreunits;
+        Py_ssize_t newpos, repsize, moreunits;
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
@ -6222,7 +6224,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
-                str, &exc, pos, pos + 1, &pos);
+                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;
@ -6230,7 +6232,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
-                                       str, pos - 1, pos,
+                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
@ -6243,28 +6245,30 @@ _PyUnicode_EncodeUTF16(PyObject *str,
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
-                                       str, pos - 1, pos,
+                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        moreunits += pos - newpos;
        pos = newpos;
        /* two bytes are reserved for each surrogate */
-        if (moreunits > 1) {
+        if (moreunits > 0) {
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
-            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
+            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
                goto error;
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }
        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
-            out += moreunits;
+            out += repsize / 2;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
@ -7167,8 +7171,19 @@ unicode_encode_ucs1(PyObject *unicode,
                if (rep == NULL)
                    goto onError;
                if (newpos < collstart) {
                    writer.overallocate = 1;
                    str = _PyBytesWriter_Prepare(&writer, str,
                                                 collstart - newpos);
                    if (str == NULL)
                        goto onError;
                }
                else {
                    /* subtract preallocated bytes */
                    writer.min_size -= newpos - collstart;
                    /* Only overallocate the buffer if it's not the last write */
                    writer.overallocate = (newpos < size);
                }
                if (PyBytes_Check(rep)) {
                    /* Directly copy bytes result to output. */
@ -7944,13 +7959,14 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;
        Py_ssize_t morebytes = pos - newpos;
        if (PyBytes_Check(rep)) {
            outsize = PyBytes_GET_SIZE(rep);
-            if (outsize != 1) {
+            morebytes += outsize;
            if (morebytes > 0) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
-                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
@ -7971,9 +7987,10 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
            }
            outsize = PyUnicode_GET_LENGTH(rep);
-            if (outsize != 1) {
+            morebytes += outsize;
            if (morebytes > 0) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
-                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
@ -7996,6 +8013,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
                out++;
            }
        }
        pos = newpos;
        Py_DECREF(rep);
    }
    /* write a NUL byte */
		`@ -0,0 +1,2 @@`
							`Fix crashes in built-in encoders with error handlers that return a position`
							`less than or equal to the starting position of non-encodable characters.`