[3.10] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) (GH-134345)

If the error handler is used, a new bytes object is created to set as
the object attribute of UnicodeDecodeError, and that bytes object then
replaces the original data. A pointer to the decoded data will become invalid
after destroying that temporary bytes object. So we need another way to return
the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().

_PyBytes_DecodeEscape() does not have such an issue, because it does not
use the error handlers registry, but it should be changed for compatibility
with _PyUnicode_DecodeUnicodeEscapeInternal().
(cherry picked from commit 9f69a58623)
(cherry picked from commit 6279eb8c07)
(cherry picked from commit a75953b347)
(cherry picked from commit 0c33e5baed)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Serhiy Storchaka 2025-06-02 18:55:48 +03:00 committed by GitHub
parent f85e71a008
commit ab9893c406
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 163 additions and 40 deletions

View file

@ -1124,7 +1124,7 @@ def test_bug828737(self):
text = 'abc<def>ghi'*n
text.translate(charmap)
def test_mutatingdecodehandler(self):
def test_mutating_decode_handler(self):
baddata = [
("ascii", b"\xff"),
("utf-7", b"++"),
@ -1159,6 +1159,40 @@ def mutating(exc):
for (encoding, data) in baddata:
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
def test_mutating_decode_handler_unicode_escape(self):
# Regression test for gh-133767: decode with an error handler that replaces
# exc.object, then verify that the decoded text, the consumed length and the
# "invalid escape sequence" DeprecationWarning are all still correct.
decode = codecs.unicode_escape_decode
def mutating(exc):
# Error handler: look up the prefix of exc.object up to exc.end in `data`.
# On a hit, replace exc.object with a new bytes object (new prefix + the
# untouched remainder) and return ('\u0404', r[1]), i.e. the replacement
# text and the second item of the `data` entry.
# Anything else is a test bug, hence the AssertionError.
if isinstance(exc, UnicodeDecodeError):
r = data.get(exc.object[:exc.end])
if r is not None:
exc.object = r[0] + exc.object[exc.end:]
return ('\u0404', r[1])
raise AssertionError("don't know how to handle %r" % exc)
codecs.register_error('test.mutating2', mutating)
# Keys are truncated \x escapes; values are (bytes that replace the key in
# exc.object, integer returned as the second item of the handler result).
# NOTE(review): that integer is presumably the resume position within the
# new exc.object, per the codecs error-handler protocol -- confirm.
data = {
br'\x0': (b'\\', 0),
br'\x3': (b'xxx\\', 3),
br'\x5': (b'x\\', 1),
}
def check(input, expected, msg):
# Decode `input` with the mutating handler and require: the exact decoded
# text, len(input) reported as consumed, and a DeprecationWarning whose
# text contains `msg`.
with self.assertWarns(DeprecationWarning) as cm:
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
self.assertIn(msg, str(cm.warning))
# In every case the invalid escape that must be reported is '\z', which is
# only reached after the handler has swapped exc.object.
check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
# memoryview slices: the input is a view that stops short of the end of the
# underlying bytes object.
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
# issue32583
def test_crashing_decode_handler(self):
# better generating one more character to fill the extra space slot

View file

@ -1181,20 +1181,32 @@ def test_escape(self):
check(br"[\501]", b"[A]")
check(br"[\x41]", b"[A]")
check(br"[\x410]", b"[A0]")
def test_warnings(self):
# Checks that codecs.escape_decode emits a DeprecationWarning for
# unrecognized backslash escapes and that the message names the escape.
# NOTE(review): this view is a diff with its +/- markers and indentation
# stripped. Each bare `with self.assertWarns(DeprecationWarning):` that is
# immediately followed by `with self.assertWarnsRegex(...)` looks like the
# removed half of a replaced pair, not live code -- confirm against the file.
decode = codecs.escape_decode
check = coding_checker(self, decode)
# 97..122 are the ASCII lowercase letters.
for i in range(97, 123):
b = bytes([i])
# Letters in this set are excluded from the lowercase warning check.
if b not in b'abfnrtvx':
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\%c'" % i):
check(b"\\" + b, b"\\" + b)
# i-32 is the code of the matching uppercase letter (b.upper()).
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\%c'" % (i-32)):
check(b"\\" + b.upper(), b"\\" + b.upper())
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\8'"):
check(br"\8", b"\\8")
with self.assertWarns(DeprecationWarning):
check(br"\9", b"\\9")
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\\xfa'") as cm:
check(b"\\\xfa", b"\\\xfa")
# With the 'ignore' handler the expected output omits the truncated \x,
# keeps \z verbatim, reports all 4 input bytes as consumed, and the warning
# must still name '\z'.
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\z'"):
self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
def test_errors(self):
decode = codecs.escape_decode
self.assertRaises(ValueError, decode, br"\x")
@ -2408,20 +2420,31 @@ def test_escape_decode(self):
check(br"[\x410]", "[A0]")
check(br"\u20ac", "\u20ac")
check(br"\U0001d120", "\U0001d120")
def test_decode_warnings(self):
# Checks that codecs.unicode_escape_decode emits a DeprecationWarning for
# unrecognized backslash escapes and that the message names the escape.
# NOTE(review): this view is a diff with its +/- markers and indentation
# stripped. Each bare `with self.assertWarns(DeprecationWarning):` that is
# immediately followed by `with self.assertWarnsRegex(...)` looks like the
# removed half of a replaced pair, not live code -- confirm against the file.
decode = codecs.unicode_escape_decode
check = coding_checker(self, decode)
# 97..122 are the ASCII lowercase letters.
for i in range(97, 123):
b = bytes([i])
# Letters in this set are excluded from the lowercase warning check.
if b not in b'abfnrtuvx':
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\%c'" % i):
check(b"\\" + b, "\\" + chr(i))
# Uppercase U and N are excluded from the uppercase warning check; i-32 is
# the code of the matching uppercase letter.
if b.upper() not in b'UN':
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\%c'" % (i-32)):
check(b"\\" + b.upper(), "\\" + chr(i-32))
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\8'"):
check(br"\8", "\\8")
with self.assertWarns(DeprecationWarning):
check(br"\9", "\\9")
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\\xfa'") as cm:
check(b"\\\xfa", "\\\xfa")
# With the 'ignore' handler the expected output omits the truncated \x,
# keeps \z verbatim, reports all 4 input bytes as consumed, and the warning
# must still name '\z'.
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\z'"):
self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
def test_decode_errors(self):
decode = codecs.unicode_escape_decode