Add raw_as_bytes option to Unpacker. (#265)

This commit is contained in:
INADA Naoki 2018-01-11 17:02:41 +09:00 committed by GitHub
parent 50ea49c86f
commit 5534d0c7af
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 199 additions and 93 deletions

View file

@ -8,7 +8,8 @@ cython:
.PHONY: test .PHONY: test
test: test:
py.test -v test pytest -v test
MSGPACK_PUREPYTHON=1 pytest -v test
.PHONY: serve-doc .PHONY: serve-doc
serve-doc: all serve-doc: all

View file

@ -10,8 +10,21 @@ MessagePack for Python
:target: https://msgpack-python.readthedocs.io/en/latest/?badge=latest :target: https://msgpack-python.readthedocs.io/en/latest/?badge=latest
:alt: Documentation Status :alt: Documentation Status
IMPORTANT: Upgrading from msgpack-0.4
-------------------------------------- What's this
-----------
`MessagePack <https://msgpack.org/>`_ is an efficient binary serialization format.
It lets you exchange data among multiple languages like JSON.
But it's faster and smaller.
This package provides CPython bindings for reading and writing MessagePack data.
Very important notes for existing users
---------------------------------------
PyPI package name
^^^^^^^^^^^^^^^^^
TL;DR: When upgrading from msgpack-0.4 or earlier, don't do `pip install -U msgpack-python`. TL;DR: When upgrading from msgpack-0.4 or earlier, don't do `pip install -U msgpack-python`.
Do `pip uninstall msgpack-python; pip install msgpack` instead. Do `pip uninstall msgpack-python; pip install msgpack` instead.
@ -24,13 +37,37 @@ Sadly, this doesn't work for upgrade install. After `pip install -U msgpack-pyt
msgpack is removed and `import msgpack` fail. msgpack is removed and `import msgpack` fail.
What's this Deprecating encoding option
----------- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
encoding and unicode_errors options are deprecated.
For the packer, always use UTF-8. Storing text in any encoding other than UTF-8 is not recommended.
For backward compatibility, you can use ``use_bin_type=False`` and pack ``bytes``
object into msgpack raw type.
For the unpacker, there is a new ``raw_as_bytes`` option. It is ``True`` by default
for backward compatibility, but it will be changed to ``False`` in the near future.
You can use ``raw_as_bytes=False`` instead of ``encoding='utf-8'``.
Planned backward incompatible changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
For msgpack 1.0, I am planning these breaking changes:
* packer and unpacker: Remove ``encoding`` and ``unicode_errors`` options.
* packer: Change default of ``use_bin_type`` option from False to True.
* unpacker: Change default of ``raw_as_bytes`` option from True to False.
* unpacker: Reduce all ``max_xxx_len`` options for typical usage.
* unpacker: Remove ``write_bytes`` option from all methods.
To prevent these breaking changes from breaking your application, please:
* Don't use deprecated options.
* Pass ``use_bin_type`` and ``raw_as_bytes`` options explicitly.
* If your application handles large (>1MB) data, specify ``max_xxx_len`` options too.
`MessagePack <https://msgpack.org/>`_ is an efficient binary serialization format.
It lets you exchange data among multiple languages like JSON.
But it's faster and smaller.
This package provides CPython bindings for reading and writing MessagePack data.
Install Install
------- -------
@ -76,14 +113,14 @@ msgpack provides ``dumps`` and ``loads`` as an alias for compatibility with
>>> import msgpack >>> import msgpack
>>> msgpack.packb([1, 2, 3], use_bin_type=True) >>> msgpack.packb([1, 2, 3], use_bin_type=True)
'\x93\x01\x02\x03' '\x93\x01\x02\x03'
>>> msgpack.unpackb(_) >>> msgpack.unpackb(_, raw_as_bytes=False)
[1, 2, 3] [1, 2, 3]
``unpack`` unpacks msgpack's array to Python's list, but can also unpack to tuple: ``unpack`` unpacks msgpack's array to Python's list, but can also unpack to tuple:
.. code-block:: pycon .. code-block:: pycon
>>> msgpack.unpackb(b'\x93\x01\x02\x03', use_list=False) >>> msgpack.unpackb(b'\x93\x01\x02\x03', use_list=False, raw_as_bytes=False)
(1, 2, 3) (1, 2, 3)
You should always specify the ``use_list`` keyword argument for backward compatibility. You should always specify the ``use_list`` keyword argument for backward compatibility.
@ -109,7 +146,7 @@ stream (or from bytes provided through its ``feed`` method).
buf.seek(0) buf.seek(0)
unpacker = msgpack.Unpacker(buf) unpacker = msgpack.Unpacker(buf, raw_as_bytes=False)
for unpacked in unpacker: for unpacked in unpacker:
print(unpacked) print(unpacked)
@ -142,7 +179,7 @@ It is also possible to pack/unpack custom data types. Here is an example for
packed_dict = msgpack.packb(useful_dict, default=encode_datetime, use_bin_type=True) packed_dict = msgpack.packb(useful_dict, default=encode_datetime, use_bin_type=True)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime) this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime, raw_as_bytes=False)
``Unpacker``'s ``object_hook`` callback receives a dict; the ``Unpacker``'s ``object_hook`` callback receives a dict; the
``object_pairs_hook`` callback may instead be used to receive a list of ``object_pairs_hook`` callback may instead be used to receive a list of
@ -172,7 +209,7 @@ It is also possible to pack/unpack custom data types using the **ext** type.
... ...
>>> data = array.array('d', [1.2, 3.4]) >>> data = array.array('d', [1.2, 3.4])
>>> packed = msgpack.packb(data, default=default, use_bin_type=True) >>> packed = msgpack.packb(data, default=default, use_bin_type=True)
>>> unpacked = msgpack.unpackb(packed, ext_hook=ext_hook) >>> unpacked = msgpack.unpackb(packed, ext_hook=ext_hook, raw_as_bytes=False)
>>> data == unpacked >>> data == unpacked
True True
@ -217,14 +254,10 @@ Early versions of msgpack didn't distinguish string and binary types (like Pytho
The type for representing both string and binary types was named **raw**. The type for representing both string and binary types was named **raw**.
For backward compatibility reasons, msgpack-python will still default all For backward compatibility reasons, msgpack-python will still default all
strings to byte strings, unless you specify the `use_bin_type=True` option in strings to byte strings, unless you specify the ``use_bin_type=True`` option in
the packer. If you do so, it will use a non-standard type called **bin** to the packer. If you do so, it will use a non-standard type called **bin** to
serialize byte arrays, and **raw** becomes to mean **str**. If you want to serialize byte arrays, and **raw** becomes to mean **str**. If you want to
distinguish **bin** and **raw** in the unpacker, specify `encoding='utf-8'`. distinguish **bin** and **raw** in the unpacker, specify ``raw_as_bytes=False``.
**In future version, default value of ``use_bin_type`` will be changed to ``True``.
To avoid this change will break your code, you must specify it explicitly
even when you want to use old format.**
Note that Python 2 defaults to byte-arrays over Unicode strings: Note that Python 2 defaults to byte-arrays over Unicode strings:
@ -234,7 +267,7 @@ Note that Python 2 defaults to byte-arrays over Unicode strings:
>>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs'])) >>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs']))
['spam', 'eggs'] ['spam', 'eggs']
>>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs'], use_bin_type=True), >>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs'], use_bin_type=True),
encoding='utf-8') raw_as_bytes=False)
['spam', u'eggs'] ['spam', u'eggs']
This is the same code in Python 3 (same behaviour, but Python 3 has a This is the same code in Python 3 (same behaviour, but Python 3 has a
@ -246,7 +279,7 @@ different default):
>>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs'])) >>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs']))
[b'spam', b'eggs'] [b'spam', b'eggs']
>>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs'], use_bin_type=True), >>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs'], use_bin_type=True),
encoding='utf-8') raw_as_bytes=False)
[b'spam', 'eggs'] [b'spam', 'eggs']
@ -277,6 +310,7 @@ You can use ``gc.disable()`` when unpacking large message.
use_list option use_list option
^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^
List is the default sequence type of Python. List is the default sequence type of Python.
But tuple is lighter than list. But tuple is lighter than list.
You can use ``use_list=False`` while unpacking when performance is important. You can use ``use_list=False`` while unpacking when performance is important.
@ -295,7 +329,7 @@ Test
MessagePack uses `pytest` for testing. MessagePack uses `pytest` for testing.
Run test with following command: Run test with following command:
$ pytest -v test $ make test
.. ..

View file

@ -3,5 +3,7 @@
%PYTHON%\python.exe setup.py install %PYTHON%\python.exe setup.py install
%PYTHON%\python.exe -c "import sys; print(hex(sys.maxsize))" %PYTHON%\python.exe -c "import sys; print(hex(sys.maxsize))"
%PYTHON%\python.exe -c "from msgpack import _packer, _unpacker" %PYTHON%\python.exe -c "from msgpack import _packer, _unpacker"
%PYTHON%\python.exe -m pytest -v test
%PYTHON%\python.exe setup.py bdist_wheel %PYTHON%\python.exe setup.py bdist_wheel
%PYTHON%\python.exe -m pytest -v test
SET EL=%ERRORLEVEL%
exit /b %EL%

View file

@ -2,7 +2,7 @@
#cython: embedsignature=True #cython: embedsignature=True
from cpython cimport * from cpython cimport *
#from cpython.exc cimport PyErr_WarnEx from cpython.exc cimport PyErr_WarnEx
from msgpack.exceptions import PackValueError, PackOverflowError from msgpack.exceptions import PackValueError, PackOverflowError
from msgpack import ExtType from msgpack import ExtType
@ -39,7 +39,7 @@ cdef extern from "pack.h":
int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l) int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l)
cdef int DEFAULT_RECURSE_LIMIT=511 cdef int DEFAULT_RECURSE_LIMIT=511
cdef size_t ITEM_LIMIT = (2**32)-1 cdef long long ITEM_LIMIT = (2**32)-1
cdef inline int PyBytesLike_Check(object o): cdef inline int PyBytesLike_Check(object o):
@ -110,9 +110,13 @@ cdef class Packer(object):
self.pk.buf_size = buf_size self.pk.buf_size = buf_size
self.pk.length = 0 self.pk.length = 0
def __init__(self, default=None, encoding='utf-8', unicode_errors='strict', def __init__(self, default=None, encoding=None, unicode_errors=None,
bint use_single_float=False, bint autoreset=True, bint use_bin_type=False, bint use_single_float=False, bint autoreset=True, bint use_bin_type=False,
bint strict_types=False): bint strict_types=False):
if encoding is not None:
PyErr_WarnEx(PendingDeprecationWarning, "encoding is deprecated.", 1)
if unicode_errors is not None:
PyErr_WarnEx(PendingDeprecationWarning, "unicode_errors is deprecated.", 1)
self.use_float = use_single_float self.use_float = use_single_float
self.strict_types = strict_types self.strict_types = strict_types
self.autoreset = autoreset self.autoreset = autoreset
@ -122,7 +126,7 @@ cdef class Packer(object):
raise TypeError("default must be a callable.") raise TypeError("default must be a callable.")
self._default = default self._default = default
if encoding is None: if encoding is None:
self.encoding = NULL self.encoding = 'utf_8'
self.unicode_errors = NULL self.unicode_errors = NULL
else: else:
if isinstance(encoding, unicode): if isinstance(encoding, unicode):
@ -134,6 +138,7 @@ cdef class Packer(object):
self._berrors = unicode_errors.encode('ascii') self._berrors = unicode_errors.encode('ascii')
else: else:
self._berrors = unicode_errors self._berrors = unicode_errors
if self._berrors is not None:
self.unicode_errors = PyBytes_AsString(self._berrors) self.unicode_errors = PyBytes_AsString(self._berrors)
def __dealloc__(self): def __dealloc__(self):
@ -149,7 +154,7 @@ cdef class Packer(object):
cdef char* rawval cdef char* rawval
cdef int ret cdef int ret
cdef dict d cdef dict d
cdef size_t L cdef Py_ssize_t L
cdef int default_used = 0 cdef int default_used = 0
cdef bint strict_types = self.strict_types cdef bint strict_types = self.strict_types
cdef Py_buffer view cdef Py_buffer view
@ -203,6 +208,7 @@ cdef class Packer(object):
elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o): elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o):
if not self.encoding: if not self.encoding:
raise TypeError("Can't encode unicode string: no encoding is specified") raise TypeError("Can't encode unicode string: no encoding is specified")
#TODO: Use faster API for UTF-8
o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
L = len(o) L = len(o)
if L > ITEM_LIMIT: if L > ITEM_LIMIT:

View file

@ -43,8 +43,9 @@ from msgpack import ExtType
cdef extern from "unpack.h": cdef extern from "unpack.h":
ctypedef struct msgpack_user: ctypedef struct msgpack_user:
bint use_list bint use_list
PyObject* object_hook bint raw_as_bytes
bint has_pairs_hook # call object_hook with k-v pairs bint has_pairs_hook # call object_hook with k-v pairs
PyObject* object_hook
PyObject* list_hook PyObject* list_hook
PyObject* ext_hook PyObject* ext_hook
char *encoding char *encoding
@ -73,12 +74,14 @@ cdef extern from "unpack.h":
cdef inline init_ctx(unpack_context *ctx, cdef inline init_ctx(unpack_context *ctx,
object object_hook, object object_pairs_hook, object object_hook, object object_pairs_hook,
object list_hook, object ext_hook, object list_hook, object ext_hook,
bint use_list, char* encoding, char* unicode_errors, bint use_list, bint raw_as_bytes,
char* encoding, char* unicode_errors,
Py_ssize_t max_str_len, Py_ssize_t max_bin_len, Py_ssize_t max_str_len, Py_ssize_t max_bin_len,
Py_ssize_t max_array_len, Py_ssize_t max_map_len, Py_ssize_t max_array_len, Py_ssize_t max_map_len,
Py_ssize_t max_ext_len): Py_ssize_t max_ext_len):
unpack_init(ctx) unpack_init(ctx)
ctx.user.use_list = use_list ctx.user.use_list = use_list
ctx.user.raw_as_bytes = raw_as_bytes
ctx.user.object_hook = ctx.user.list_hook = <PyObject*>NULL ctx.user.object_hook = ctx.user.list_hook = <PyObject*>NULL
ctx.user.max_str_len = max_str_len ctx.user.max_str_len = max_str_len
ctx.user.max_bin_len = max_bin_len ctx.user.max_bin_len = max_bin_len
@ -155,7 +158,8 @@ cdef inline int get_data_from_buffer(object obj,
return 1 return 1
def unpackb(object packed, object object_hook=None, object list_hook=None, def unpackb(object packed, object object_hook=None, object list_hook=None,
bint use_list=1, encoding=None, unicode_errors="strict", bint use_list=True, bint raw_as_bytes=True,
encoding=None, unicode_errors="strict",
object_pairs_hook=None, ext_hook=ExtType, object_pairs_hook=None, ext_hook=ExtType,
Py_ssize_t max_str_len=2147483647, # 2**32-1 Py_ssize_t max_str_len=2147483647, # 2**32-1
Py_ssize_t max_bin_len=2147483647, Py_ssize_t max_bin_len=2147483647,
@ -180,21 +184,26 @@ def unpackb(object packed, object object_hook=None, object list_hook=None,
cdef char* cerr = NULL cdef char* cerr = NULL
cdef int new_protocol = 0 cdef int new_protocol = 0
get_data_from_buffer(packed, &view, &buf, &buf_len, &new_protocol)
try:
if encoding is not None: if encoding is not None:
PyErr_WarnEx(PendingDeprecationWarning, "encoding is deprecated, Use raw_as_bytes=False instead.", 1)
if isinstance(encoding, unicode): if isinstance(encoding, unicode):
encoding = encoding.encode('ascii') encoding = encoding.encode('ascii')
elif not isinstance(encoding, bytes):
raise TypeError("encoding should be bytes or unicode")
cenc = PyBytes_AsString(encoding) cenc = PyBytes_AsString(encoding)
if unicode_errors is not None: if unicode_errors is not None:
PyErr_WarnEx(PendingDeprecationWarning, "unicode_errors is deprecated", 1)
if isinstance(unicode_errors, unicode): if isinstance(unicode_errors, unicode):
unicode_errors = unicode_errors.encode('ascii') unicode_errors = unicode_errors.encode('ascii')
elif not isinstance(unicode_errors, bytes):
raise TypeError("unicode_errors should be bytes or unicode")
cerr = PyBytes_AsString(unicode_errors) cerr = PyBytes_AsString(unicode_errors)
get_data_from_buffer(packed, &view, &buf, &buf_len, &new_protocol)
try:
init_ctx(&ctx, object_hook, object_pairs_hook, list_hook, ext_hook, init_ctx(&ctx, object_hook, object_pairs_hook, list_hook, ext_hook,
use_list, cenc, cerr, use_list, raw_as_bytes, cenc, cerr,
max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len) max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len)
ret = unpack_construct(&ctx, buf, buf_len, &off) ret = unpack_construct(&ctx, buf, buf_len, &off)
finally: finally:
@ -252,6 +261,16 @@ cdef class Unpacker(object):
If true, unpack msgpack array to Python list. If true, unpack msgpack array to Python list.
Otherwise, unpack to Python tuple. (default: True) Otherwise, unpack to Python tuple. (default: True)
:param bool raw_as_bytes:
If true, unpack msgpack raw to Python bytes (default).
Otherwise, unpack to Python str (or unicode on Python 2) by decoding
with UTF-8 encoding (recommended).
Currently, the default is true, but it will be changed to false in the
near future. So you must specify it explicitly to keep backward
compatibility.
The deprecated *encoding* option overrides this option.
:param callable object_hook: :param callable object_hook:
When specified, it should be callable. When specified, it should be callable.
Unpacker calls it with a dict argument after unpacking msgpack map. Unpacker calls it with a dict argument after unpacking msgpack map.
@ -262,14 +281,6 @@ cdef class Unpacker(object):
Unpacker calls it with a list of key-value pairs after unpacking msgpack map. Unpacker calls it with a list of key-value pairs after unpacking msgpack map.
(See also simplejson) (See also simplejson)
:param str encoding:
Encoding used for decoding msgpack raw.
If it is None (default), msgpack raw is deserialized to Python bytes.
:param str unicode_errors:
Used for decoding msgpack raw with *encoding*.
(default: `'strict'`)
:param int max_buffer_size: :param int max_buffer_size:
Limits size of data waiting unpacked. 0 means system's INT_MAX (default). Limits size of data waiting unpacked. 0 means system's INT_MAX (default).
Raises `BufferFull` exception when it is insufficient. Raises `BufferFull` exception when it is insufficient.
@ -287,16 +298,25 @@ cdef class Unpacker(object):
:param int max_map_len: :param int max_map_len:
Limits max length of map. (default: 2**31-1) Limits max length of map. (default: 2**31-1)
:param str encoding:
Deprecated, use raw_as_bytes instead.
Encoding used for decoding msgpack raw.
If it is None (default), msgpack raw is deserialized to Python bytes.
example of streaming deserialize from file-like object:: :param str unicode_errors:
Deprecated. Used for decoding msgpack raw with *encoding*.
(default: `'strict'`)
unpacker = Unpacker(file_like)
Example of streaming deserialize from file-like object::
unpacker = Unpacker(file_like, raw_as_bytes=False)
for o in unpacker: for o in unpacker:
process(o) process(o)
example of streaming deserialize from socket:: Example of streaming deserialize from socket::
unpacker = Unpacker() unpacker = Unpacker(raw_as_bytes=False)
while True: while True:
buf = sock.recv(1024**2) buf = sock.recv(1024**2)
if not buf: if not buf:
@ -324,7 +344,8 @@ cdef class Unpacker(object):
PyMem_Free(self.buf) PyMem_Free(self.buf)
self.buf = NULL self.buf = NULL
def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, def __init__(self, file_like=None, Py_ssize_t read_size=0,
bint use_list=True, bint raw_as_bytes=True,
object object_hook=None, object object_pairs_hook=None, object list_hook=None, object object_hook=None, object object_pairs_hook=None, object list_hook=None,
encoding=None, unicode_errors='strict', int max_buffer_size=0, encoding=None, unicode_errors='strict', int max_buffer_size=0,
object ext_hook=ExtType, object ext_hook=ExtType,
@ -363,6 +384,7 @@ cdef class Unpacker(object):
self.stream_offset = 0 self.stream_offset = 0
if encoding is not None: if encoding is not None:
PyErr_WarnEx(PendingDeprecationWarning, "encoding is deprecated, Use raw_as_bytes=False instead.", 1)
if isinstance(encoding, unicode): if isinstance(encoding, unicode):
self.encoding = encoding.encode('ascii') self.encoding = encoding.encode('ascii')
elif isinstance(encoding, bytes): elif isinstance(encoding, bytes):
@ -372,6 +394,7 @@ cdef class Unpacker(object):
cenc = PyBytes_AsString(self.encoding) cenc = PyBytes_AsString(self.encoding)
if unicode_errors is not None: if unicode_errors is not None:
PyErr_WarnEx(PendingDeprecationWarning, "unicode_errors is deprecated", 1)
if isinstance(unicode_errors, unicode): if isinstance(unicode_errors, unicode):
self.unicode_errors = unicode_errors.encode('ascii') self.unicode_errors = unicode_errors.encode('ascii')
elif isinstance(unicode_errors, bytes): elif isinstance(unicode_errors, bytes):
@ -381,7 +404,7 @@ cdef class Unpacker(object):
cerr = PyBytes_AsString(self.unicode_errors) cerr = PyBytes_AsString(self.unicode_errors)
init_ctx(&self.ctx, object_hook, object_pairs_hook, list_hook, init_ctx(&self.ctx, object_hook, object_pairs_hook, list_hook,
ext_hook, use_list, cenc, cerr, ext_hook, use_list, raw_as_bytes, cenc, cerr,
max_str_len, max_bin_len, max_array_len, max_str_len, max_bin_len, max_array_len,
max_map_len, max_ext_len) max_map_len, max_ext_len)

View file

@ -145,6 +145,16 @@ class Unpacker(object):
If true, unpack msgpack array to Python list. If true, unpack msgpack array to Python list.
Otherwise, unpack to Python tuple. (default: True) Otherwise, unpack to Python tuple. (default: True)
:param bool raw_as_bytes:
If true, unpack msgpack raw to Python bytes (default).
Otherwise, unpack to Python str (or unicode on Python 2) by decoding
with UTF-8 encoding (recommended).
Currently, the default is true, but it will be changed to false in the
near future. So you must specify it explicitly to keep backward
compatibility.
The deprecated *encoding* option overrides this option.
:param callable object_hook: :param callable object_hook:
When specified, it should be callable. When specified, it should be callable.
Unpacker calls it with a dict argument after unpacking msgpack map. Unpacker calls it with a dict argument after unpacking msgpack map.
@ -183,13 +193,13 @@ class Unpacker(object):
example of streaming deserialize from file-like object:: example of streaming deserialize from file-like object::
unpacker = Unpacker(file_like) unpacker = Unpacker(file_like, raw_as_bytes=False)
for o in unpacker: for o in unpacker:
process(o) process(o)
example of streaming deserialize from socket:: example of streaming deserialize from socket::
unpacker = Unpacker() unpacker = Unpacker(raw_as_bytes=False)
while True: while True:
buf = sock.recv(1024**2) buf = sock.recv(1024**2)
if not buf: if not buf:
@ -199,15 +209,28 @@ class Unpacker(object):
process(o) process(o)
""" """
def __init__(self, file_like=None, read_size=0, use_list=True, def __init__(self, file_like=None, read_size=0, use_list=True, raw_as_bytes=True,
object_hook=None, object_pairs_hook=None, list_hook=None, object_hook=None, object_pairs_hook=None, list_hook=None,
encoding=None, unicode_errors='strict', max_buffer_size=0, encoding=None, unicode_errors=None, max_buffer_size=0,
ext_hook=ExtType, ext_hook=ExtType,
max_str_len=2147483647, # 2**32-1 max_str_len=2147483647, # 2**32-1
max_bin_len=2147483647, max_bin_len=2147483647,
max_array_len=2147483647, max_array_len=2147483647,
max_map_len=2147483647, max_map_len=2147483647,
max_ext_len=2147483647): max_ext_len=2147483647):
if encoding is not None:
warnings.warn(
"encoding is deprecated, Use raw_as_bytes=False instead.",
PendingDeprecationWarning)
if unicode_errors is not None:
warnings.warn(
"unicode_errors is deprecated.",
PendingDeprecationWarning)
else:
unicode_errors = 'strict'
if file_like is None: if file_like is None:
self._feeding = True self._feeding = True
else: else:
@ -234,6 +257,7 @@ class Unpacker(object):
if read_size > self._max_buffer_size: if read_size > self._max_buffer_size:
raise ValueError("read_size must be smaller than max_buffer_size") raise ValueError("read_size must be smaller than max_buffer_size")
self._read_size = read_size or min(self._max_buffer_size, 16*1024) self._read_size = read_size or min(self._max_buffer_size, 16*1024)
self._raw_as_bytes = bool(raw_as_bytes)
self._encoding = encoding self._encoding = encoding
self._unicode_errors = unicode_errors self._unicode_errors = unicode_errors
self._use_list = use_list self._use_list = use_list
@ -582,8 +606,10 @@ class Unpacker(object):
if typ == TYPE_RAW: if typ == TYPE_RAW:
if self._encoding is not None: if self._encoding is not None:
obj = obj.decode(self._encoding, self._unicode_errors) obj = obj.decode(self._encoding, self._unicode_errors)
else: elif self._raw_as_bytes:
obj = bytes(obj) obj = bytes(obj)
else:
obj = obj.decode('utf_8')
return obj return obj
if typ == TYPE_EXT: if typ == TYPE_EXT:
return self._ext_hook(n, bytes(obj)) return self._ext_hook(n, bytes(obj))
@ -682,9 +708,23 @@ class Packer(object):
:param str unicode_errors: :param str unicode_errors:
(deprecated) Error handler for encoding unicode. (default: 'strict') (deprecated) Error handler for encoding unicode. (default: 'strict')
""" """
def __init__(self, default=None, encoding='utf-8', unicode_errors='strict', def __init__(self, default=None, encoding=None, unicode_errors=None,
use_single_float=False, autoreset=True, use_bin_type=False, use_single_float=False, autoreset=True, use_bin_type=False,
strict_types=False): strict_types=False):
if encoding is None:
encoding = 'utf_8'
else:
warnings.warn(
"encoding is deprecated, Use raw_as_bytes=False instead.",
PendingDeprecationWarning)
if unicode_errors is None:
unicode_errors = 'strict'
else:
warnings.warn(
"unicode_errors is deprecated.",
PendingDeprecationWarning)
self._strict_types = strict_types self._strict_types = strict_types
self._use_float = use_single_float self._use_float = use_single_float
self._autoreset = autoreset self._autoreset = autoreset

View file

@ -20,9 +20,10 @@
#include "unpack_define.h" #include "unpack_define.h"
typedef struct unpack_user { typedef struct unpack_user {
int use_list; bool use_list;
PyObject *object_hook; bool raw_as_bytes;
bool has_pairs_hook; bool has_pairs_hook;
PyObject *object_hook;
PyObject *list_hook; PyObject *list_hook;
PyObject *ext_hook; PyObject *ext_hook;
const char *encoding; const char *encoding;
@ -225,10 +226,13 @@ static inline int unpack_callback_raw(unpack_user* u, const char* b, const char*
} }
PyObject *py; PyObject *py;
if(u->encoding) {
if (u->encoding) {
py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors); py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors);
} else { } else if (u->raw_as_bytes) {
py = PyBytes_FromStringAndSize(p, l); py = PyBytes_FromStringAndSize(p, l);
} else {
py = PyUnicode_DecodeUTF8(p, l, NULL);
} }
if (!py) if (!py)
return -1; return -1;

View file

@ -39,11 +39,11 @@ def test_max_str_len():
d = 'x' * 3 d = 'x' * 3
packed = packb(d) packed = packb(d)
unpacker = Unpacker(max_str_len=3, encoding='utf-8') unpacker = Unpacker(max_str_len=3, raw_as_bytes=False)
unpacker.feed(packed) unpacker.feed(packed)
assert unpacker.unpack() == d assert unpacker.unpack() == d
unpacker = Unpacker(max_str_len=2, encoding='utf-8') unpacker = Unpacker(max_str_len=2, raw_as_bytes=False)
with pytest.raises(UnpackValueError): with pytest.raises(UnpackValueError):
unpacker.feed(packed) unpacker.feed(packed)
unpacker.unpack() unpacker.unpack()

View file

@ -31,14 +31,14 @@ def testPack():
def testPackUnicode(): def testPackUnicode():
test_data = ["", "abcd", ["defgh"], "Русский текст"] test_data = ["", "abcd", ["defgh"], "Русский текст"]
for td in test_data: for td in test_data:
re = unpackb(packb(td, encoding='utf-8'), use_list=1, encoding='utf-8') re = unpackb(packb(td), use_list=1, raw_as_bytes=False)
assert re == td assert re == td
packer = Packer(encoding='utf-8') packer = Packer()
data = packer.pack(td) data = packer.pack(td)
re = Unpacker(BytesIO(data), encoding=str('utf-8'), use_list=1).unpack() re = Unpacker(BytesIO(data), raw_as_bytes=False, use_list=1).unpack()
assert re == td assert re == td
def testPackUTF32(): def testPackUTF32(): # deprecated
try: try:
test_data = [ test_data = [
"", "",
@ -66,26 +66,22 @@ def testPackByteArrays():
for td in test_data: for td in test_data:
check(td) check(td)
def testIgnoreUnicodeErrors(): def testIgnoreUnicodeErrors(): # deprecated
re = unpackb(packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore', use_list=1) re = unpackb(packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore', use_list=1)
assert re == "abcdef" assert re == "abcdef"
def testStrictUnicodeUnpack(): def testStrictUnicodeUnpack():
with raises(UnicodeDecodeError): with raises(UnicodeDecodeError):
unpackb(packb(b'abc\xeddef'), encoding='utf-8', use_list=1) unpackb(packb(b'abc\xeddef'), raw_as_bytes=False, use_list=1)
def testStrictUnicodePack(): def testStrictUnicodePack(): # deprecated
with raises(UnicodeEncodeError): with raises(UnicodeEncodeError):
packb("abc\xeddef", encoding='ascii', unicode_errors='strict') packb("abc\xeddef", encoding='ascii', unicode_errors='strict')
def testIgnoreErrorsPack(): def testIgnoreErrorsPack(): # deprecated
re = unpackb(packb("abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8', use_list=1) re = unpackb(packb("abcФФФdef", encoding='ascii', unicode_errors='ignore'), raw_as_bytes=False, use_list=1)
assert re == "abcdef" assert re == "abcdef"
def testNoEncoding():
with raises(TypeError):
packb("abc", encoding=None)
def testDecodeBinary(): def testDecodeBinary():
re = unpackb(packb(b"abc"), encoding=None, use_list=1) re = unpackb(packb(b"abc"), encoding=None, use_list=1)
assert re == b"abc" assert re == b"abc"

View file

@ -11,7 +11,7 @@ def test_namedtuple():
return dict(o._asdict()) return dict(o._asdict())
raise TypeError('Unsupported type %s' % (type(o),)) raise TypeError('Unsupported type %s' % (type(o),))
packed = packb(T(1, 42), strict_types=True, use_bin_type=True, default=default) packed = packb(T(1, 42), strict_types=True, use_bin_type=True, default=default)
unpacked = unpackb(packed, encoding='utf-8') unpacked = unpackb(packed, raw_as_bytes=False)
assert unpacked == {'foo': 1, 'bar': 42} assert unpacked == {'foo': 1, 'bar': 42}
@ -32,7 +32,7 @@ def test_tuple():
return o return o
data = packb(t, strict_types=True, use_bin_type=True, default=default) data = packb(t, strict_types=True, use_bin_type=True, default=default)
expected = unpackb(data, encoding='utf-8', object_hook=convert) expected = unpackb(data, raw_as_bytes=False, object_hook=convert)
assert expected == t assert expected == t
@ -53,10 +53,10 @@ def test_tuple_ext():
def convert(code, payload): def convert(code, payload):
if code == MSGPACK_EXT_TYPE_TUPLE: if code == MSGPACK_EXT_TYPE_TUPLE:
# Unpack and convert to tuple # Unpack and convert to tuple
return tuple(unpackb(payload, encoding='utf-8', ext_hook=convert)) return tuple(unpackb(payload, raw_as_bytes=False, ext_hook=convert))
raise ValueError('Unknown Ext code {}'.format(code)) raise ValueError('Unknown Ext code {}'.format(code))
data = packb(t, strict_types=True, use_bin_type=True, default=default) data = packb(t, strict_types=True, use_bin_type=True, default=default)
expected = unpackb(data, encoding='utf-8', ext_hook=convert) expected = unpackb(data, raw_as_bytes=False, ext_hook=convert)
assert expected == t assert expected == t

View file

@ -47,8 +47,8 @@ def test_unpacker_ext_hook():
class MyUnpacker(Unpacker): class MyUnpacker(Unpacker):
def __init__(self): def __init__(self):
super(MyUnpacker, self).__init__(ext_hook=self._hook, super(MyUnpacker, self).__init__(
encoding='utf-8') ext_hook=self._hook, raw_as_bytes=False)
def _hook(self, code, data): def _hook(self, code, data):
if code == 1: if code == 1:
@ -57,11 +57,11 @@ def test_unpacker_ext_hook():
return ExtType(code, data) return ExtType(code, data)
unpacker = MyUnpacker() unpacker = MyUnpacker()
unpacker.feed(packb({'a': 1}, encoding='utf-8')) unpacker.feed(packb({'a': 1}))
assert unpacker.unpack() == {'a': 1} assert unpacker.unpack() == {'a': 1}
unpacker.feed(packb({'a': ExtType(1, b'123')}, encoding='utf-8')) unpacker.feed(packb({'a': ExtType(1, b'123')}))
assert unpacker.unpack() == {'a': 123} assert unpacker.unpack() == {'a': 123}
unpacker.feed(packb({'a': ExtType(2, b'321')}, encoding='utf-8')) unpacker.feed(packb({'a': ExtType(2, b'321')}))
assert unpacker.unpack() == {'a': ExtType(2, b'321')} assert unpacker.unpack() == {'a': ExtType(2, b'321')}