From c5ec267311f9312c75ebef330dd210cc64ec0c24 Mon Sep 17 00:00:00 2001
From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com>
Date: Tue, 14 Oct 2025 15:48:29 +0100
Subject: [PATCH] [3.13] gh-101828: Fix `jisx0213` codecs removing null characters (gh-139340) (gh-140112)

* [3.13] gh-101828: Fix `jisx0213` codecs removing null characters (gh-139340)
(cherry picked from commit 87eadce3e0309d80a95e85d70a00028b5dca9907)

Co-authored-by: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com>

* Accidentally removed line
---
 Lib/test/multibytecodec_support.py              | 17 +++++++++++++++++
 ...25-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst |  3 +++
 Modules/cjkcodecs/_codecs_iso2022.c             | 11 +++++++----
 Modules/cjkcodecs/_codecs_jp.c                  |  9 +++++++--
 4 files changed, 34 insertions(+), 6 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst

diff --git a/Lib/test/multibytecodec_support.py b/Lib/test/multibytecodec_support.py
index dbf0cc428e3..6b4c57d0b4b 100644
--- a/Lib/test/multibytecodec_support.py
+++ b/Lib/test/multibytecodec_support.py
@@ -282,6 +282,23 @@ def test_incrementalencoder_del_segfault(self):
         with self.assertRaises(AttributeError):
             del e.errors
 
+    def test_null_terminator(self):
+        # see gh-101828
+        text = "フルーツ"
+        try:
+            text.encode(self.encoding)
+        except UnicodeEncodeError:
+            text = "Python is cool"
+        encode_w_null = (text + "\0").encode(self.encoding)
+        encode_plus_null = text.encode(self.encoding) + "\0".encode(self.encoding)
+        self.assertTrue(encode_w_null.endswith(b'\x00'))
+        self.assertEqual(encode_w_null, encode_plus_null)
+
+        encode_w_null_2 = (text + "\0" + text + "\0").encode(self.encoding)
+        encode_plus_null_2 = encode_plus_null + encode_plus_null
+        self.assertEqual(encode_w_null_2.count(b'\x00'), 2)
+        self.assertEqual(encode_w_null_2, encode_plus_null_2)
+
 
 class TestBase_Mapping(unittest.TestCase):
     pass_enctest = []
diff --git a/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst
new file mode 100644
index 00000000000..1d100180c07
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst
@@ -0,0 +1,3 @@
+Fix ``'shift_jisx0213'``, ``'shift_jis_2004'``, ``'euc_jisx0213'`` and
+``'euc_jis_2004'`` codecs truncating null chars
+as they were treated as part of multi-character sequences.
diff --git a/Modules/cjkcodecs/_codecs_iso2022.c b/Modules/cjkcodecs/_codecs_iso2022.c
index e8835ad0909..bdbaca2c421 100644
--- a/Modules/cjkcodecs/_codecs_iso2022.c
+++ b/Modules/cjkcodecs/_codecs_iso2022.c
@@ -802,10 +802,13 @@ jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
         return coded;
 
     case 2: /* second character of unicode pair */
-        coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
-                                jisx0213_pair_encmap, JISX0213_ENCPAIRS);
-        if (coded != DBCINV)
-            return coded;
+        if (data[1] != 0) { /* Don't consume null char as part of pair */
+            coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
+                                    jisx0213_pair_encmap, JISX0213_ENCPAIRS);
+            if (coded != DBCINV) {
+                return coded;
+            }
+        }
         /* fall through */
 
     case -1: /* flush unterminated */
diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c
index f7127487aa5..cd77888d551 100644
--- a/Modules/cjkcodecs/_codecs_jp.c
+++ b/Modules/cjkcodecs/_codecs_jp.c
@@ -192,8 +192,11 @@ ENCODER(euc_jis_2004)
                                     JISX0213_ENCPAIRS);
                                 if (code == DBCINV)
                                     return 1;
-                            } else
+                            }
+                            else if (c2 != 0) {
+                                /* Don't consume null char as part of pair */
                                 insize = 2;
+                            }
                         }
                     }
                 }
@@ -611,8 +614,10 @@ ENCODER(shift_jis_2004)
                             if (code == DBCINV)
                                 return 1;
                         }
-                        else
+                        else if (ch2 != 0) {
+                            /* Don't consume null char as part of pair */
                             insize = 2;
+                        }
                     }
                 }
             }