Resurrect unicode_errors of the Packer. (#379)

This commit is contained in:
Inada Naoki 2019-12-03 20:53:11 +09:00 committed by GitHub
parent a0480c7602
commit 83ebb63c44
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 50 additions and 13 deletions

View file

@ -5,7 +5,7 @@ Release Date: TBD
* Remove Python 2 support from the ``msgpack/_cmsgpack``. * Remove Python 2 support from the ``msgpack/_cmsgpack``.
``msgpack/fallback`` still supports Python 2. ``msgpack/fallback`` still supports Python 2.
* Remove encoding and unicode_errors options from the Packer. * Remove ``encoding`` option from the Packer.
0.6.2 0.6.2

View file

@ -89,9 +89,15 @@ cdef class Packer(object):
Additionally tuples will not be serialized as lists. Additionally tuples will not be serialized as lists.
This is useful when trying to implement accurate serialization This is useful when trying to implement accurate serialization
for python types. for python types.
:param str unicode_errors:
The error handler for encoding unicode. (default: 'strict')
DO NOT USE THIS!! This option is kept for very specific usage.
""" """
cdef msgpack_packer pk cdef msgpack_packer pk
cdef object _default cdef object _default
cdef object _berrors
cdef const char *unicode_errors
cdef bint strict_types cdef bint strict_types
cdef bool use_float cdef bool use_float
cdef bint autoreset cdef bint autoreset
@ -104,10 +110,8 @@ cdef class Packer(object):
self.pk.buf_size = buf_size self.pk.buf_size = buf_size
self.pk.length = 0 self.pk.length = 0
def __init__(self, default=None, def __init__(self, *, default=None, unicode_errors=None,
bint use_single_float=False, bint use_single_float=False, bint autoreset=True, bint use_bin_type=False,
bint autoreset=True,
bint use_bin_type=False,
bint strict_types=False): bint strict_types=False):
self.use_float = use_single_float self.use_float = use_single_float
self.strict_types = strict_types self.strict_types = strict_types
@ -118,6 +122,12 @@ cdef class Packer(object):
raise TypeError("default must be a callable.") raise TypeError("default must be a callable.")
self._default = default self._default = default
self._berrors = unicode_errors
if unicode_errors is None:
self.unicode_errors = NULL
else:
self.unicode_errors = self._berrors
def __dealloc__(self): def __dealloc__(self):
PyMem_Free(self.pk.buf) PyMem_Free(self.pk.buf)
self.pk.buf = NULL self.pk.buf = NULL
@ -183,9 +193,19 @@ cdef class Packer(object):
if ret == 0: if ret == 0:
ret = msgpack_pack_raw_body(&self.pk, rawval, L) ret = msgpack_pack_raw_body(&self.pk, rawval, L)
elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o): elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o):
ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT); if self.unicode_errors == NULL:
if ret == -2: ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT);
raise ValueError("unicode string is too large") if ret == -2:
raise ValueError("unicode string is too large")
else:
o = PyUnicode_AsEncodedString(o, NULL, self.unicode_errors)
L = Py_SIZE(o)
if L > ITEM_LIMIT:
raise ValueError("unicode string is too large")
ret = msgpack_pack_raw(&self.pk, L)
if ret == 0:
rawval = o
ret = msgpack_pack_raw_body(&self.pk, rawval, L)
elif PyDict_CheckExact(o): elif PyDict_CheckExact(o):
d = <dict>o d = <dict>o
L = len(d) L = len(d)

View file

@ -667,7 +667,7 @@ class Unpacker(object):
elif self._raw: elif self._raw:
obj = bytes(obj) obj = bytes(obj)
else: else:
obj = obj.decode('utf_8') obj = obj.decode('utf_8', self._unicode_errors)
return obj return obj
if typ == TYPE_EXT: if typ == TYPE_EXT:
return self._ext_hook(n, bytes(obj)) return self._ext_hook(n, bytes(obj))
@ -752,14 +752,19 @@ class Packer(object):
Additionally tuples will not be serialized as lists. Additionally tuples will not be serialized as lists.
This is useful when trying to implement accurate serialization This is useful when trying to implement accurate serialization
for python types. for python types.
:param str unicode_errors:
The error handler for encoding unicode. (default: 'strict')
DO NOT USE THIS!! This option is kept for very specific usage.
""" """
def __init__(self, default=None, def __init__(self, default=None, unicode_errors=None,
use_single_float=False, autoreset=True, use_bin_type=False, use_single_float=False, autoreset=True, use_bin_type=False,
strict_types=False): strict_types=False):
self._strict_types = strict_types self._strict_types = strict_types
self._use_float = use_single_float self._use_float = use_single_float
self._autoreset = autoreset self._autoreset = autoreset
self._use_bin_type = use_bin_type self._use_bin_type = use_bin_type
self._unicode_errors = unicode_errors or "strict"
self._buffer = StringIO() self._buffer = StringIO()
if default is not None: if default is not None:
if not callable(default): if not callable(default):
@ -816,7 +821,7 @@ class Packer(object):
self._pack_bin_header(n) self._pack_bin_header(n)
return self._buffer.write(obj) return self._buffer.write(obj)
if check(obj, unicode): if check(obj, unicode):
obj = obj.encode("utf-8") obj = obj.encode("utf-8", self._unicode_errors)
n = len(obj) n = len(obj)
if n >= 2**32: if n >= 2**32:
raise ValueError("String is too large") raise ValueError("String is too large")

View file

@ -5,6 +5,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
from collections import OrderedDict from collections import OrderedDict
from io import BytesIO from io import BytesIO
import struct import struct
import sys
import pytest import pytest
from pytest import raises, xfail from pytest import raises, xfail
@ -54,13 +55,24 @@ def testPackByteArrays():
for td in test_data: for td in test_data:
check(td) check(td)
@pytest.mark.skipif(sys.version_info < (3,0), reason="Python 2 passes invalid surrogates")
def testIgnoreUnicodeErrors():
re = unpackb(packb(b'abc\xeddef', use_bin_type=False),
raw=False, unicode_errors='ignore')
assert re == "abcdef"
def testStrictUnicodeUnpack(): def testStrictUnicodeUnpack():
packed = packb(b'abc\xeddef') packed = packb(b'abc\xeddef', use_bin_type=False)
with pytest.raises(UnicodeDecodeError): with pytest.raises(UnicodeDecodeError):
unpackb(packed, raw=False, use_list=1) unpackb(packed, raw=False, use_list=1)
@pytest.mark.skipif(sys.version_info < (3,0), reason="Python 2 passes invalid surrogates")
def testIgnoreErrorsPack():
re = unpackb(packb(u"abc\uDC80\uDCFFdef", use_bin_type=True, unicode_errors='ignore'), raw=False, use_list=1)
assert re == "abcdef"
def testDecodeBinary(): def testDecodeBinary():
re = unpackb(packb(b"abc"), encoding=None, use_list=1) re = unpackb(packb(b"abc"), use_list=1)
assert re == b"abc" assert re == b"abc"
def testPackFloat(): def testPackFloat():