mirror of
https://github.com/python/cpython.git
synced 2026-01-06 15:32:22 +00:00
Patch #1359618: Speed-up charmap encoder.
This commit is contained in:
parent
67966bed72
commit
3f767795f6
51 changed files with 510 additions and 11516 deletions
|
|
@ -3057,6 +3057,219 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/* Charmap encoding: the lookup table */
|
||||
|
||||
struct encoding_map{
|
||||
PyObject_HEAD
|
||||
unsigned char level1[32];
|
||||
int count2, count3;
|
||||
unsigned char level23[1];
|
||||
};
|
||||
|
||||
/* EncodingMap.size(): report how many bytes this object occupies.

   The total is the fixed struct size (minus the one placeholder byte of
   level23) plus 16 bytes per level-2 block and 128 bytes per level-3
   block.  args is unused (the method is registered as METH_NOARGS). */
static PyObject*
encoding_map_size(PyObject *obj, PyObject* args)
{
    const struct encoding_map *em = (const struct encoding_map *)obj;
    size_t nbytes = sizeof(*em) - 1;

    nbytes += 16*em->count2;
    nbytes += 128*em->count3;
    return PyInt_FromLong(nbytes);
}
|
||||
|
||||
static PyMethodDef encoding_map_methods[] = {
|
||||
{"size", encoding_map_size, METH_NOARGS,
|
||||
PyDoc_STR("Return the size (in bytes) of this object") },
|
||||
{ 0 }
|
||||
};
|
||||
|
||||
static void
|
||||
encoding_map_dealloc(PyObject* o)
|
||||
{
|
||||
PyObject_FREE(o);
|
||||
}
|
||||
|
||||
static PyTypeObject EncodingMapType = {
|
||||
PyObject_HEAD_INIT(NULL)
|
||||
0, /*ob_size*/
|
||||
"EncodingMap", /*tp_name*/
|
||||
sizeof(struct encoding_map), /*tp_basicsize*/
|
||||
0, /*tp_itemsize*/
|
||||
/* methods */
|
||||
encoding_map_dealloc, /*tp_dealloc*/
|
||||
0, /*tp_print*/
|
||||
0, /*tp_getattr*/
|
||||
0, /*tp_setattr*/
|
||||
0, /*tp_compare*/
|
||||
0, /*tp_repr*/
|
||||
0, /*tp_as_number*/
|
||||
0, /*tp_as_sequence*/
|
||||
0, /*tp_as_mapping*/
|
||||
0, /*tp_hash*/
|
||||
0, /*tp_call*/
|
||||
0, /*tp_str*/
|
||||
0, /*tp_getattro*/
|
||||
0, /*tp_setattro*/
|
||||
0, /*tp_as_buffer*/
|
||||
Py_TPFLAGS_DEFAULT, /*tp_flags*/
|
||||
0, /*tp_doc*/
|
||||
0, /*tp_traverse*/
|
||||
0, /*tp_clear*/
|
||||
0, /*tp_richcompare*/
|
||||
0, /*tp_weaklistoffset*/
|
||||
0, /*tp_iter*/
|
||||
0, /*tp_iternext*/
|
||||
encoding_map_methods, /*tp_methods*/
|
||||
0, /*tp_members*/
|
||||
0, /*tp_getset*/
|
||||
0, /*tp_base*/
|
||||
0, /*tp_dict*/
|
||||
0, /*tp_descr_get*/
|
||||
0, /*tp_descr_set*/
|
||||
0, /*tp_dictoffset*/
|
||||
0, /*tp_init*/
|
||||
0, /*tp_alloc*/
|
||||
0, /*tp_new*/
|
||||
0, /*tp_free*/
|
||||
0, /*tp_is_gc*/
|
||||
};
|
||||
|
||||
/* Build an encoding table from a 256-character decoding table.

   string must be a unicode object of exactly 256 characters, where
   string[i] is the character byte i decodes to and 0xFFFE marks a byte
   without a mapping.  Any other argument triggers PyErr_BadArgument()
   and a NULL return.

   Returns a new reference to one of:
     - an EncodingMap object (three-level trie, see struct encoding_map);
     - a plain dict {code point: byte value}, used when the trie cannot
       represent the table: byte 0 does not map to U+0000, some other
       byte maps to U+0000, a non-BMP character occurs, or the trie would
       need 0xFF or more blocks on a level.
   Returns NULL on failure. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    Py_UNICODE *decode;
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;

    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    decode = PyUnicode_AS_UNICODE(string);
    /* 0xFF means "no block assigned yet" on both levels. */
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NUL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (decode[0] != 0)
        need_dict = 1;
    /* First pass: count how many level-2 and level-3 blocks are needed. */
    for (i = 1; i < 256; i++) {
        int l1, l2;
        if (decode[i] == 0
#ifdef Py_UNICODE_WIDE
            || decode[i] > 0xFFFF
#endif
        ) {
            need_dict = 1;
            break;
        }
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = decode[i] >> 11;
        l2 = decode[i] >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* Block numbers are stored in one byte and 0xFF is reserved. */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        PyObject *dict = PyDict_New();
        PyObject *key, *value;
        if (!dict)
            return NULL;
        for (i = 0; i < 256; i++) {
            value = NULL;
            key = PyInt_FromLong(decode[i]);
            value = PyInt_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(dict, key, value) == -1)
                goto failed1;
            /* PyDict_SetItem does not steal references: release ours,
               otherwise every entry leaks its key and value. */
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return dict;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(dict);
        return NULL;
    }

    /* Create a three-level trie */
    /* The struct already contains one byte of level23, hence the -1. */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    /* A level-3 entry of 0 means "no mapping". */
    memset(mlevel3, 0, 128*count3);
    /* Second pass: hand out level-3 blocks again and fill in the bytes. */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        int o1, o2, o3, i2, i3;
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = decode[i]>>11;
        o2 = (decode[i]>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = decode[i] & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
|
||||
|
||||
static int
|
||||
encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
|
||||
{
|
||||
struct encoding_map *map = (struct encoding_map*)mapping;
|
||||
int l1 = c>>11;
|
||||
int l2 = (c>>7) & 0xF;
|
||||
int l3 = c & 0x7F;
|
||||
int i;
|
||||
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
if (c > 0xFFFF) {
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
if (c == 0)
|
||||
return 0;
|
||||
/* level 1*/
|
||||
i = map->level1[l1];
|
||||
if (i == 0xFF) {
|
||||
return -1;
|
||||
}
|
||||
/* level 2*/
|
||||
i = map->level23[16*i+l2];
|
||||
if (i == 0xFF) {
|
||||
return -1;
|
||||
}
|
||||
/* level 3 */
|
||||
i = map->level23[16*map->count2 + 128*i + l3];
|
||||
if (i == 0) {
|
||||
return -1;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/* Lookup the character ch in the mapping. If the character
|
||||
can't be found, Py_None is returned (or NULL, if another
|
||||
error occurred). */
|
||||
|
|
@ -3102,6 +3315,22 @@ static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
|
|||
}
|
||||
}
|
||||
|
||||
static int
|
||||
charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
|
||||
{
|
||||
Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
|
||||
/* exponentially overallocate to minimize reallocations */
|
||||
if (requiredsize < 2*outsize)
|
||||
requiredsize = 2*outsize;
|
||||
if (_PyString_Resize(outobj, requiredsize)) {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Outcome of encoding one character with the charmap encoder. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
|
||||
/* lookup the character, put the result in the output string and adjust
|
||||
various state variables. Reallocate the output string if not enough
|
||||
space is available. Return a new reference to the object that
|
||||
|
|
@ -3109,51 +3338,58 @@ static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
|
|||
(in which case no character was written) or NULL, if a
|
||||
reallocation error occurred. The caller must decref the result */
|
||||
static
|
||||
PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
|
||||
charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
|
||||
PyObject **outobj, Py_ssize_t *outpos)
|
||||
{
|
||||
PyObject *rep = charmapencode_lookup(c, mapping);
|
||||
PyObject *rep;
|
||||
char *outstart;
|
||||
Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
|
||||
|
||||
if (mapping->ob_type == &EncodingMapType) {
|
||||
int res = encoding_map_lookup(c, mapping);
|
||||
Py_ssize_t requiredsize = *outpos+1;
|
||||
if (res == -1)
|
||||
return enc_FAILED;
|
||||
if (outsize<requiredsize)
|
||||
if (!charmapencode_resize(outobj, outpos, requiredsize))
|
||||
return enc_EXCEPTION;
|
||||
outstart = PyString_AS_STRING(*outobj);
|
||||
outstart[(*outpos)++] = (char)res;
|
||||
return enc_SUCCESS;
|
||||
}
|
||||
|
||||
rep = charmapencode_lookup(c, mapping);
|
||||
if (rep==NULL)
|
||||
return NULL;
|
||||
else if (rep==Py_None)
|
||||
return rep;
|
||||
else {
|
||||
char *outstart = PyString_AS_STRING(*outobj);
|
||||
Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
|
||||
return enc_EXCEPTION;
|
||||
else if (rep==Py_None) {
|
||||
Py_DECREF(rep);
|
||||
return enc_FAILED;
|
||||
} else {
|
||||
if (PyInt_Check(rep)) {
|
||||
Py_ssize_t requiredsize = *outpos+1;
|
||||
if (outsize<requiredsize) {
|
||||
/* exponentially overallocate to minimize reallocations */
|
||||
if (requiredsize < 2*outsize)
|
||||
requiredsize = 2*outsize;
|
||||
if (_PyString_Resize(outobj, requiredsize)) {
|
||||
if (outsize<requiredsize)
|
||||
if (!charmapencode_resize(outobj, outpos, requiredsize)) {
|
||||
Py_DECREF(rep);
|
||||
return NULL;
|
||||
return enc_EXCEPTION;
|
||||
}
|
||||
outstart = PyString_AS_STRING(*outobj);
|
||||
}
|
||||
outstart = PyString_AS_STRING(*outobj);
|
||||
outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
|
||||
}
|
||||
else {
|
||||
const char *repchars = PyString_AS_STRING(rep);
|
||||
Py_ssize_t repsize = PyString_GET_SIZE(rep);
|
||||
Py_ssize_t requiredsize = *outpos+repsize;
|
||||
if (outsize<requiredsize) {
|
||||
/* exponentially overallocate to minimize reallocations */
|
||||
if (requiredsize < 2*outsize)
|
||||
requiredsize = 2*outsize;
|
||||
if (_PyString_Resize(outobj, requiredsize)) {
|
||||
if (outsize<requiredsize)
|
||||
if (!charmapencode_resize(outobj, outpos, requiredsize)) {
|
||||
Py_DECREF(rep);
|
||||
return NULL;
|
||||
return enc_EXCEPTION;
|
||||
}
|
||||
outstart = PyString_AS_STRING(*outobj);
|
||||
}
|
||||
outstart = PyString_AS_STRING(*outobj);
|
||||
memcpy(outstart + *outpos, repchars, repsize);
|
||||
*outpos += repsize;
|
||||
}
|
||||
}
|
||||
return rep;
|
||||
return enc_SUCCESS;
|
||||
}
|
||||
|
||||
/* handle an error in PyUnicode_EncodeCharmap
|
||||
|
|
@ -3175,18 +3411,27 @@ int charmap_encoding_error(
|
|||
Py_ssize_t collpos;
|
||||
char *encoding = "charmap";
|
||||
char *reason = "character maps to <undefined>";
|
||||
charmapencode_result x;
|
||||
|
||||
PyObject *x;
|
||||
/* find all unencodable characters */
|
||||
while (collendpos < size) {
|
||||
x = charmapencode_lookup(p[collendpos], mapping);
|
||||
if (x==NULL)
|
||||
PyObject *rep;
|
||||
if (mapping->ob_type == &EncodingMapType) {
|
||||
int res = encoding_map_lookup(p[collendpos], mapping);
|
||||
if (res != -1)
|
||||
break;
|
||||
++collendpos;
|
||||
continue;
|
||||
}
|
||||
|
||||
rep = charmapencode_lookup(p[collendpos], mapping);
|
||||
if (rep==NULL)
|
||||
return -1;
|
||||
else if (x!=Py_None) {
|
||||
Py_DECREF(x);
|
||||
else if (rep!=Py_None) {
|
||||
Py_DECREF(rep);
|
||||
break;
|
||||
}
|
||||
Py_DECREF(x);
|
||||
Py_DECREF(rep);
|
||||
++collendpos;
|
||||
}
|
||||
/* cache callback name lookup
|
||||
|
|
@ -3210,15 +3455,13 @@ int charmap_encoding_error(
|
|||
case 2: /* replace */
|
||||
for (collpos = collstartpos; collpos<collendpos; ++collpos) {
|
||||
x = charmapencode_output('?', mapping, res, respos);
|
||||
if (x==NULL) {
|
||||
if (x==enc_EXCEPTION) {
|
||||
return -1;
|
||||
}
|
||||
else if (x==Py_None) {
|
||||
Py_DECREF(x);
|
||||
else if (x==enc_FAILED) {
|
||||
raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
|
||||
return -1;
|
||||
}
|
||||
Py_DECREF(x);
|
||||
}
|
||||
/* fall through */
|
||||
case 3: /* ignore */
|
||||
|
|
@ -3232,14 +3475,12 @@ int charmap_encoding_error(
|
|||
sprintf(buffer, "&#%d;", (int)p[collpos]);
|
||||
for (cp = buffer; *cp; ++cp) {
|
||||
x = charmapencode_output(*cp, mapping, res, respos);
|
||||
if (x==NULL)
|
||||
if (x==enc_EXCEPTION)
|
||||
return -1;
|
||||
else if (x==Py_None) {
|
||||
Py_DECREF(x);
|
||||
else if (x==enc_FAILED) {
|
||||
raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
|
||||
return -1;
|
||||
}
|
||||
Py_DECREF(x);
|
||||
}
|
||||
}
|
||||
*inpos = collendpos;
|
||||
|
|
@ -3254,17 +3495,14 @@ int charmap_encoding_error(
|
|||
repsize = PyUnicode_GET_SIZE(repunicode);
|
||||
for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
|
||||
x = charmapencode_output(*uni2, mapping, res, respos);
|
||||
if (x==NULL) {
|
||||
Py_DECREF(repunicode);
|
||||
if (x==enc_EXCEPTION) {
|
||||
return -1;
|
||||
}
|
||||
else if (x==Py_None) {
|
||||
else if (x==enc_FAILED) {
|
||||
Py_DECREF(repunicode);
|
||||
Py_DECREF(x);
|
||||
raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
|
||||
return -1;
|
||||
}
|
||||
Py_DECREF(x);
|
||||
}
|
||||
*inpos = newpos;
|
||||
Py_DECREF(repunicode);
|
||||
|
|
@ -3304,22 +3542,20 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
|
|||
|
||||
while (inpos<size) {
|
||||
/* try to encode it */
|
||||
PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
|
||||
if (x==NULL) /* error */
|
||||
charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
|
||||
if (x==enc_EXCEPTION) /* error */
|
||||
goto onError;
|
||||
if (x==Py_None) { /* unencodable character */
|
||||
if (x==enc_FAILED) { /* unencodable character */
|
||||
if (charmap_encoding_error(p, size, &inpos, mapping,
|
||||
&exc,
|
||||
&known_errorHandler, &errorHandler, errors,
|
||||
&res, &respos)) {
|
||||
Py_DECREF(x);
|
||||
goto onError;
|
||||
}
|
||||
}
|
||||
else
|
||||
/* done with this character => adjust input position */
|
||||
++inpos;
|
||||
Py_DECREF(x);
|
||||
}
|
||||
|
||||
/* Resize if we allocated too much */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue