packer: Use PyUnicode_AsUTF8AndSize() for utf-8 (#272)

INADA Naoki 2018-01-11 19:41:05 +09:00 committed by GitHub
parent 5534d0c7af
commit 60ef3879d7
3 changed files with 71 additions and 14 deletions
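
In short: on Python 3, when the Packer is configured with encoding=None and unicode_errors=None, str objects are now serialized through a new msgpack_pack_unicode() helper that calls PyUnicode_AsUTF8AndSize(), writing the string's UTF-8 form directly instead of first building a temporary bytes object with PyUnicode_AsEncodedString(). A minimal sketch of the user-visible behaviour, assuming msgpack is built from this commit; the constructor defaults are not shown in this diff, so both keyword arguments are passed explicitly:

import msgpack

# encoding=None and unicode_errors=None select the new fast path on Python 3
# (PyUnicode_AsUTF8AndSize); any other combination keeps the old
# PyUnicode_AsEncodedString() route.
packer = msgpack.Packer(encoding=None, unicode_errors=None)

data = packer.pack(u"héllo")
print(data)  # b'\xa6h\xc3\xa9llo': fixstr header 0xa6 (length 6) + UTF-8 body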


@@ -9,6 +9,6 @@ for V in cp36-cp36m cp35-cp35m cp27-cp27m cp27-cp27mu; do
     pushd test # prevent importing msgpack package in current directory.
     $PYBIN/python -c 'import sys; print(hex(sys.maxsize))'
     $PYBIN/python -c 'from msgpack import _packer, _unpacker'
-    $PYBIN/py.test -v
+    $PYBIN/pytest -v .
     popd
 done
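
The wheel-test loop above guards against importing the in-tree package instead of the installed wheel (hence the pushd into test/). A small Python sketch of the same sanity check the one-liners perform, assuming the built wheel is installed in the environment:

import msgpack
from msgpack import _packer, _unpacker  # fails if only the pure-Python fallback was installed

# Inside test/ this should point at the installed wheel, not the source checkout.
print(msgpack.__file__)
print(msgpack.version)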


@@ -13,6 +13,7 @@ cdef extern from "Python.h":
     int PyMemoryView_Check(object obj)
     int PyByteArray_Check(object obj)
     int PyByteArray_CheckExact(object obj)
+    char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t *l) except NULL
 
 cdef extern from "pack.h":
@@ -37,6 +38,7 @@ cdef extern from "pack.h":
     int msgpack_pack_bin(msgpack_packer* pk, size_t l)
     int msgpack_pack_raw_body(msgpack_packer* pk, char* body, size_t l)
     int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l)
+    int msgpack_pack_unicode(msgpack_packer* pk, object o, long long limit)
 
 cdef int DEFAULT_RECURSE_LIMIT=511
 cdef long long ITEM_LIMIT = (2**32)-1
@@ -126,8 +128,12 @@ cdef class Packer(object):
                 raise TypeError("default must be a callable.")
         self._default = default
         if encoding is None:
-            self.encoding = 'utf_8'
-            self.unicode_errors = NULL
+            if unicode_errors is None:
+                self.encoding = NULL
+                self.unicode_errors = NULL
+            else:
+                self.encoding = "utf_8"
+                self.unicode_errors = unicode_errors
         else:
             if isinstance(encoding, unicode):
                 self._bencoding = encoding.encode('ascii')
@@ -140,6 +146,8 @@ cdef class Packer(object):
                 self._berrors = unicode_errors
             if self._berrors is not None:
                 self.unicode_errors = PyBytes_AsString(self._berrors)
+            else:
+                self.unicode_errors = NULL
 
     def __dealloc__(self):
         PyMem_Free(self.pk.buf)
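
The constructor now distinguishes three configurations; the sketch below spells them out, assuming only what this diff shows (the exact keyword defaults are not asserted here):

import msgpack

# 1) No encoding and no error handler: self.encoding stays NULL, so _pack()
#    can take the PyUnicode_AsUTF8AndSize() fast path on Python 3.
fast = msgpack.Packer(encoding=None, unicode_errors=None)

# 2) No encoding but an explicit error handler: encoding falls back to "utf_8"
#    and the PyUnicode_AsEncodedString() route is kept so the handler is honoured.
lenient = msgpack.Packer(encoding=None, unicode_errors="replace")

# 3) Explicit encoding: encoded through the codec machinery as before.
latin = msgpack.Packer(encoding="latin1", unicode_errors="strict")

print(fast.pack(u"abc"))  # b'\xa3abc' from the fast path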
@@ -206,17 +214,19 @@ cdef class Packer(object):
             if ret == 0:
                 ret = msgpack_pack_raw_body(&self.pk, rawval, L)
         elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o):
-            if not self.encoding:
-                raise TypeError("Can't encode unicode string: no encoding is specified")
-            #TODO: Use faster API for UTF-8
-            o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
-            L = len(o)
-            if L > ITEM_LIMIT:
-                raise PackValueError("unicode string is too large")
-            rawval = o
-            ret = msgpack_pack_raw(&self.pk, L)
-            if ret == 0:
-                ret = msgpack_pack_raw_body(&self.pk, rawval, L)
+            if self.encoding == NULL:
+                ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT);
+                if ret == -2:
+                    raise PackValueError("unicode string is too large")
+            else:
+                o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
+                L = len(o)
+                if L > ITEM_LIMIT:
+                    raise PackValueError("unicode string is too large")
+                ret = msgpack_pack_raw(&self.pk, L)
+                if ret == 0:
+                    rawval = o
+                    ret = msgpack_pack_raw_body(&self.pk, rawval, L)
         elif PyDict_CheckExact(o):
             d = <dict>o
             L = len(d)
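
For well-formed text the rewritten branch must produce exactly the same bytes whether it goes through msgpack_pack_unicode() or through the retained PyUnicode_AsEncodedString() fallback; oversized strings raise PackValueError on either path, with the fast path mapping the helper's -2 return value onto it. A quick equivalence check, assuming msgpack is built from this commit:

import msgpack

s = u"año naïve 日本語"

fast = msgpack.Packer(encoding=None, unicode_errors=None).pack(s)
slow = msgpack.Packer(encoding="utf_8", unicode_errors="strict").pack(s)

# Both paths emit the same raw header followed by the UTF-8 body.
assert fast == slow
print(fast)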


@@ -67,6 +67,53 @@ static inline int msgpack_pack_write(msgpack_packer* pk, const char *data, size_
 
 #include "pack_template.h"
 
+// return -2 when o is too long
+static inline int
+msgpack_pack_unicode(msgpack_packer *pk, PyObject *o, long long limit)
+{
+#if PY_MAJOR_VERSION >= 3
+    assert(PyUnicode_Check(o));
+
+    Py_ssize_t len;
+    const char* buf = PyUnicode_AsUTF8AndSize(o, &len);
+    if (buf == NULL)
+        return -1;
+
+    if (len > limit) {
+        return -2;
+    }
+
+    int ret = msgpack_pack_raw(pk, len);
+    if (ret) return ret;
+
+    return msgpack_pack_raw_body(pk, buf, len);
+#else
+    PyObject *bytes;
+    Py_ssize_t len;
+    int ret;
+
+    // py2
+    bytes = PyUnicode_AsUTF8String(o);
+    if (bytes == NULL)
+        return -1;
+
+    len = PyString_GET_SIZE(bytes);
+    if (len > limit) {
+        Py_DECREF(bytes);
+        return -2;
+    }
+
+    ret = msgpack_pack_raw(pk, len);
+    if (ret) {
+        Py_DECREF(bytes);
+        return -1;
+    }
+    ret = msgpack_pack_raw_body(pk, PyString_AS_STRING(bytes), len);
+    Py_DECREF(bytes);
+    return ret;
+#endif
+}
+
 #ifdef __cplusplus
 }
 #endif
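
A side note on why PyUnicode_AsUTF8AndSize() helps beyond skipping one temporary object: on CPython 3 it caches the UTF-8 form on the str object itself, so packing the same non-ASCII string repeatedly does not re-encode it. A rough, CPython-specific sketch of observing that cache (the size growth is an implementation detail and is shown purely for illustration):

import sys
import msgpack

s = u"héllo wörld"  # non-ASCII, so the cached UTF-8 buffer is allocated separately
packer = msgpack.Packer(encoding=None, unicode_errors=None)

before = sys.getsizeof(s)
packer.pack(s)        # first pack triggers PyUnicode_AsUTF8AndSize()
after = sys.getsizeof(s)

print(before, after)  # 'after' is larger: s now carries its cached UTF-8 representation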