gh-101828: Fix jisx0213 codecs removing null characters (gh-139340)

This commit is contained in:
Stan Ulbrych 2025-10-14 14:55:00 +01:00 committed by GitHub
parent ded59f7e8e
commit 87eadce3e0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 34 additions and 6 deletions

View file

@ -282,6 +282,23 @@ def test_incrementalencoder_del_segfault(self):
with self.assertRaises(AttributeError): with self.assertRaises(AttributeError):
del e.errors del e.errors
def test_null_terminator(self):
# see gh-101828
text = "フルーツ"
try:
text.encode(self.encoding)
except UnicodeEncodeError:
text = "Python is cool"
encode_w_null = (text + "\0").encode(self.encoding)
encode_plus_null = text.encode(self.encoding) + "\0".encode(self.encoding)
self.assertTrue(encode_w_null.endswith(b'\x00'))
self.assertEqual(encode_w_null, encode_plus_null)
encode_w_null_2 = (text + "\0" + text + "\0").encode(self.encoding)
encode_plus_null_2 = encode_plus_null + encode_plus_null
self.assertEqual(encode_w_null_2.count(b'\x00'), 2)
self.assertEqual(encode_w_null_2, encode_plus_null_2)
class TestBase_Mapping(unittest.TestCase): class TestBase_Mapping(unittest.TestCase):
pass_enctest = [] pass_enctest = []

View file

@ -0,0 +1,3 @@
Fix the ``'shift_jisx0213'``, ``'shift_jis_2004'``, ``'euc_jisx0213'`` and
``'euc_jis_2004'`` codecs removing null characters from encoded output:
a null character was wrongly treated as part of a multi-character sequence.

View file

@ -802,10 +802,13 @@ jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
return coded; return coded;
case 2: /* second character of unicode pair */ case 2: /* second character of unicode pair */
coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1], if (data[1] != 0) { /* Don't consume null char as part of pair */
jisx0213_pair_encmap, JISX0213_ENCPAIRS); coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
if (coded != DBCINV) jisx0213_pair_encmap, JISX0213_ENCPAIRS);
return coded; if (coded != DBCINV) {
return coded;
}
}
_Py_FALLTHROUGH; _Py_FALLTHROUGH;
case -1: /* flush unterminated */ case -1: /* flush unterminated */

View file

@ -192,8 +192,11 @@ ENCODER(euc_jis_2004)
JISX0213_ENCPAIRS); JISX0213_ENCPAIRS);
if (code == DBCINV) if (code == DBCINV)
return 1; return 1;
} else }
else if (c2 != 0) {
/* Don't consume null char as part of pair */
insize = 2; insize = 2;
}
} }
} }
} }
@ -611,8 +614,10 @@ ENCODER(shift_jis_2004)
if (code == DBCINV) if (code == DBCINV)
return 1; return 1;
} }
else else if (ch2 != 0) {
/* Don't consume null char as part of pair */
insize = 2; insize = 2;
}
} }
} }
} }