gh-95005: Replace PyAccu with PyUnicodeWriter (gh-95006)

This commit is contained in:
Aivars Kalvāns 2022-07-27 11:43:34 +03:00 committed by GitHub
parent 565403038b
commit 8c88e360e7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 70 additions and 240 deletions

View file

@ -1,39 +0,0 @@
#ifndef Py_LIMITED_API
#ifndef Py_INTERNAL_ACCU_H
#define Py_INTERNAL_ACCU_H
#ifdef __cplusplus
extern "C" {
#endif
/*** This is a private API for use by the interpreter and the stdlib.
*** Its definition may be changed or removed at any moment.
***/
#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif
/*
* A two-level accumulator of unicode objects that avoids both the overhead
* of keeping a huge number of small separate objects, and the quadratic
* behaviour of using a naive repeated concatenation scheme.
*/
#undef small /* defined by some Windows headers */
/* Accumulator state.  Set up with _PyAccu_Init(); both members are reset to
 * NULL by _PyAccu_Destroy() and by the _PyAccu_Finish*() functions. */
typedef struct {
PyObject *large; /* A list of previously accumulated large strings */
PyObject *small; /* Pending small strings */
} _PyAccu;
/* Prepare `acc` for use by creating the empty list of pending small strings;
 * the list of large strings is allocated lazily.
 * Returns 0 on success, -1 if the list could not be created. */
PyAPI_FUNC(int) _PyAccu_Init(_PyAccu *acc);
/* Append `unicode` (which must be a str object) to the pending strings.
 * Once 100000 strings are pending they are joined into a single string that
 * is moved to the list of large strings.
 * Returns 0 on success, -1 on failure. */
PyAPI_FUNC(int) _PyAccu_Accumulate(_PyAccu *acc, PyObject *unicode);
/* Flush the pending strings and hand the list of large strings over to the
 * caller, leaving `acc` empty.  Returns NULL on failure.
 * NOTE: the result is also NULL when nothing was ever accumulated, because
 * the list of large strings is only created by a non-empty flush. */
PyAPI_FUNC(PyObject *) _PyAccu_FinishAsList(_PyAccu *acc);
/* Join everything accumulated so far into a single str object and leave
 * `acc` empty.  Returns NULL on failure. */
PyAPI_FUNC(PyObject *) _PyAccu_Finish(_PyAccu *acc);
/* Drop the references held by `acc`; both members are NULL afterwards. */
PyAPI_FUNC(void) _PyAccu_Destroy(_PyAccu *acc);
#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_ACCU_H */
#endif /* !Py_LIMITED_API */

View file

@ -435,7 +435,6 @@ PYTHON_OBJS= \
# Objects
OBJECT_OBJS= \
Objects/abstract.o \
Objects/accu.o \
Objects/boolobject.o \
Objects/bytes_methods.o \
Objects/bytearrayobject.o \
@ -1565,7 +1564,6 @@ PYTHON_HEADERS= \
$(srcdir)/Include/cpython/weakrefobject.h \
\
$(srcdir)/Include/internal/pycore_abstract.h \
$(srcdir)/Include/internal/pycore_accu.h \
$(srcdir)/Include/internal/pycore_asdl.h \
$(srcdir)/Include/internal/pycore_ast.h \
$(srcdir)/Include/internal/pycore_ast_state.h \

View file

@ -0,0 +1,2 @@
Replace :c:expr:`_PyAccu` with :c:expr:`_PyUnicodeWriter` in JSON encoder
and StringIO and remove the :c:expr:`_PyAccu` implementation.

View file

@ -1,7 +1,6 @@
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include <stddef.h> // offsetof()
#include "pycore_accu.h"
#include "pycore_object.h"
#include "_iomodule.h"
@ -27,12 +26,12 @@ typedef struct {
/* The stringio object can be in two states: accumulating or realized.
In accumulating state, the internal buffer contains nothing and
the contents are given by the embedded _PyAccu structure.
the contents are given by the embedded _PyUnicodeWriter structure.
In realized state, the internal buffer is meaningful and the
_PyAccu is destroyed.
_PyUnicodeWriter is destroyed.
*/
int state;
_PyAccu accu;
_PyUnicodeWriter writer;
char ok; /* initialized? */
char closed;
@ -126,12 +125,14 @@ resize_buffer(stringio *self, size_t size)
static PyObject *
make_intermediate(stringio *self)
{
PyObject *intermediate = _PyAccu_Finish(&self->accu);
PyObject *intermediate = _PyUnicodeWriter_Finish(&self->writer);
self->state = STATE_REALIZED;
if (intermediate == NULL)
return NULL;
if (_PyAccu_Init(&self->accu) ||
_PyAccu_Accumulate(&self->accu, intermediate)) {
_PyUnicodeWriter_Init(&self->writer);
self->writer.overallocate = 1;
if (_PyUnicodeWriter_WriteStr(&self->writer, intermediate)) {
Py_DECREF(intermediate);
return NULL;
}
@ -150,7 +151,7 @@ realize(stringio *self)
assert(self->state == STATE_ACCUMULATING);
self->state = STATE_REALIZED;
intermediate = _PyAccu_Finish(&self->accu);
intermediate = _PyUnicodeWriter_Finish(&self->writer);
if (intermediate == NULL)
return -1;
@ -218,7 +219,7 @@ write_str(stringio *self, PyObject *obj)
if (self->state == STATE_ACCUMULATING) {
if (self->string_size == self->pos) {
if (_PyAccu_Accumulate(&self->accu, decoded))
if (_PyUnicodeWriter_WriteStr(&self->writer, decoded))
goto fail;
goto success;
}
@ -572,7 +573,7 @@ _io_StringIO_close_impl(stringio *self)
/* Free up some memory */
if (resize_buffer(self, 0) < 0)
return NULL;
_PyAccu_Destroy(&self->accu);
_PyUnicodeWriter_Dealloc(&self->writer);
Py_CLEAR(self->readnl);
Py_CLEAR(self->writenl);
Py_CLEAR(self->decoder);
@ -602,7 +603,7 @@ stringio_dealloc(stringio *self)
PyMem_Free(self->buf);
self->buf = NULL;
}
_PyAccu_Destroy(&self->accu);
_PyUnicodeWriter_Dealloc(&self->writer);
Py_CLEAR(self->readnl);
Py_CLEAR(self->writenl);
Py_CLEAR(self->decoder);
@ -687,7 +688,7 @@ _io_StringIO___init___impl(stringio *self, PyObject *value,
self->ok = 0;
_PyAccu_Destroy(&self->accu);
_PyUnicodeWriter_Dealloc(&self->writer);
Py_CLEAR(self->readnl);
Py_CLEAR(self->writenl);
Py_CLEAR(self->decoder);
@ -742,8 +743,8 @@ _io_StringIO___init___impl(stringio *self, PyObject *value,
/* Empty stringio object, we can start by accumulating */
if (resize_buffer(self, 0) < 0)
return -1;
if (_PyAccu_Init(&self->accu))
return -1;
_PyUnicodeWriter_Init(&self->writer);
self->writer.overallocate = 1;
self->state = STATE_ACCUMULATING;
}
self->pos = 0;

View file

@ -12,7 +12,6 @@
#include "Python.h"
#include "pycore_ceval.h" // _Py_EnterRecursiveCall()
#include "structmember.h" // PyMemberDef
#include "pycore_accu.h"
typedef struct _PyScannerObject {
@ -85,11 +84,11 @@ encoder_dealloc(PyObject *self);
static int
encoder_clear(PyEncoderObject *self);
static int
encoder_listencode_list(PyEncoderObject *s, _PyAccu *acc, PyObject *seq, Py_ssize_t indent_level);
encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *seq, Py_ssize_t indent_level);
static int
encoder_listencode_obj(PyEncoderObject *s, _PyAccu *acc, PyObject *obj, Py_ssize_t indent_level);
encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *obj, Py_ssize_t indent_level);
static int
encoder_listencode_dict(PyEncoderObject *s, _PyAccu *acc, PyObject *dct, Py_ssize_t indent_level);
encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *dct, Py_ssize_t indent_level);
static PyObject *
_encoded_const(PyObject *obj);
static void
@ -1280,19 +1279,29 @@ encoder_call(PyEncoderObject *self, PyObject *args, PyObject *kwds)
{
/* Python callable interface to encoder_listencode_obj */
static char *kwlist[] = {"obj", "_current_indent_level", NULL};
PyObject *obj;
PyObject *obj, *result;
Py_ssize_t indent_level;
_PyAccu acc;
_PyUnicodeWriter writer;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "On:_iterencode", kwlist,
&obj, &indent_level))
return NULL;
if (_PyAccu_Init(&acc))
return NULL;
if (encoder_listencode_obj(self, &acc, obj, indent_level)) {
_PyAccu_Destroy(&acc);
_PyUnicodeWriter_Init(&writer);
writer.overallocate = 1;
if (encoder_listencode_obj(self, &writer, obj, indent_level)) {
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
}
return _PyAccu_FinishAsList(&acc);
result = PyTuple_New(1);
if (result == NULL ||
PyTuple_SetItem(result, 0, _PyUnicodeWriter_Finish(&writer)) < 0) {
Py_XDECREF(result);
return NULL;
}
return result;
}
static PyObject *
@ -1376,58 +1385,60 @@ encoder_encode_string(PyEncoderObject *s, PyObject *obj)
}
static int
_steal_accumulate(_PyAccu *acc, PyObject *stolen)
_steal_accumulate(_PyUnicodeWriter *writer, PyObject *stolen)
{
/* Append stolen and then decrement its reference count */
int rval = _PyAccu_Accumulate(acc, stolen);
int rval = _PyUnicodeWriter_WriteStr(writer, stolen);
Py_DECREF(stolen);
return rval;
}
static int
encoder_listencode_obj(PyEncoderObject *s, _PyAccu *acc,
encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer,
PyObject *obj, Py_ssize_t indent_level)
{
/* Encode Python object obj to a JSON term */
PyObject *newobj;
int rv;
if (obj == Py_None || obj == Py_True || obj == Py_False) {
PyObject *cstr = _encoded_const(obj);
if (cstr == NULL)
return -1;
return _steal_accumulate(acc, cstr);
if (obj == Py_None) {
return _PyUnicodeWriter_WriteASCIIString(writer, "null", 4);
}
else if (PyUnicode_Check(obj))
{
else if (obj == Py_True) {
return _PyUnicodeWriter_WriteASCIIString(writer, "true", 4);
}
else if (obj == Py_False) {
return _PyUnicodeWriter_WriteASCIIString(writer, "false", 5);
}
else if (PyUnicode_Check(obj)) {
PyObject *encoded = encoder_encode_string(s, obj);
if (encoded == NULL)
return -1;
return _steal_accumulate(acc, encoded);
return _steal_accumulate(writer, encoded);
}
else if (PyLong_Check(obj)) {
PyObject *encoded = PyLong_Type.tp_repr(obj);
if (encoded == NULL)
return -1;
return _steal_accumulate(acc, encoded);
return _steal_accumulate(writer, encoded);
}
else if (PyFloat_Check(obj)) {
PyObject *encoded = encoder_encode_float(s, obj);
if (encoded == NULL)
return -1;
return _steal_accumulate(acc, encoded);
return _steal_accumulate(writer, encoded);
}
else if (PyList_Check(obj) || PyTuple_Check(obj)) {
if (_Py_EnterRecursiveCall(" while encoding a JSON object"))
return -1;
rv = encoder_listencode_list(s, acc, obj, indent_level);
rv = encoder_listencode_list(s, writer, obj, indent_level);
_Py_LeaveRecursiveCall();
return rv;
}
else if (PyDict_Check(obj)) {
if (_Py_EnterRecursiveCall(" while encoding a JSON object"))
return -1;
rv = encoder_listencode_dict(s, acc, obj, indent_level);
rv = encoder_listencode_dict(s, writer, obj, indent_level);
_Py_LeaveRecursiveCall();
return rv;
}
@ -1461,7 +1472,7 @@ encoder_listencode_obj(PyEncoderObject *s, _PyAccu *acc,
Py_XDECREF(ident);
return -1;
}
rv = encoder_listencode_obj(s, acc, newobj, indent_level);
rv = encoder_listencode_obj(s, writer, newobj, indent_level);
_Py_LeaveRecursiveCall();
Py_DECREF(newobj);
@ -1481,16 +1492,10 @@ encoder_listencode_obj(PyEncoderObject *s, _PyAccu *acc,
}
static int
encoder_listencode_dict(PyEncoderObject *s, _PyAccu *acc,
encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer,
PyObject *dct, Py_ssize_t indent_level)
{
/* Encode Python dict dct to a JSON term */
_Py_static_string(PyId_open_dict, "{");
_Py_static_string(PyId_close_dict, "}");
_Py_static_string(PyId_empty_dict, "{}");
PyObject *open_dict = _PyUnicode_FromId(&PyId_open_dict); // borrowed ref
PyObject *close_dict = _PyUnicode_FromId(&PyId_close_dict); // borrowed ref
PyObject *empty_dict = _PyUnicode_FromId(&PyId_empty_dict); // borrowed ref
PyObject *kstr = NULL;
PyObject *ident = NULL;
PyObject *it = NULL;
@ -1498,11 +1503,8 @@ encoder_listencode_dict(PyEncoderObject *s, _PyAccu *acc,
PyObject *item = NULL;
Py_ssize_t idx;
if (open_dict == NULL || close_dict == NULL || empty_dict == NULL) {
return -1;
}
if (PyDict_GET_SIZE(dct) == 0) /* Fast path */
return _PyAccu_Accumulate(acc, empty_dict);
return _PyUnicodeWriter_WriteASCIIString(writer, "{}", 2);
if (s->markers != Py_None) {
int has_key;
@ -1520,7 +1522,7 @@ encoder_listencode_dict(PyEncoderObject *s, _PyAccu *acc,
}
}
if (_PyAccu_Accumulate(acc, open_dict))
if (_PyUnicodeWriter_WriteChar(writer, '{'))
goto bail;
if (s->indent != Py_None) {
@ -1586,7 +1588,7 @@ encoder_listencode_dict(PyEncoderObject *s, _PyAccu *acc,
}
if (idx) {
if (_PyAccu_Accumulate(acc, s->item_separator))
if (_PyUnicodeWriter_WriteStr(writer, s->item_separator))
goto bail;
}
@ -1594,16 +1596,16 @@ encoder_listencode_dict(PyEncoderObject *s, _PyAccu *acc,
Py_CLEAR(kstr);
if (encoded == NULL)
goto bail;
if (_PyAccu_Accumulate(acc, encoded)) {
if (_PyUnicodeWriter_WriteStr(writer, encoded)) {
Py_DECREF(encoded);
goto bail;
}
Py_DECREF(encoded);
if (_PyAccu_Accumulate(acc, s->key_separator))
if (_PyUnicodeWriter_WriteStr(writer, s->key_separator))
goto bail;
value = PyTuple_GET_ITEM(item, 1);
if (encoder_listencode_obj(s, acc, value, indent_level))
if (encoder_listencode_obj(s, writer, value, indent_level))
goto bail;
idx += 1;
Py_DECREF(item);
@ -1623,7 +1625,7 @@ encoder_listencode_dict(PyEncoderObject *s, _PyAccu *acc,
yield '\n' + (' ' * (_indent * _current_indent_level))
}*/
if (_PyAccu_Accumulate(acc, close_dict))
if (_PyUnicodeWriter_WriteChar(writer, '}'))
goto bail;
return 0;
@ -1637,30 +1639,20 @@ encoder_listencode_dict(PyEncoderObject *s, _PyAccu *acc,
static int
encoder_listencode_list(PyEncoderObject *s, _PyAccu *acc,
encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer,
PyObject *seq, Py_ssize_t indent_level)
{
/* Encode Python list seq to a JSON term */
_Py_static_string(PyId_open_array, "[");
_Py_static_string(PyId_close_array, "]");
_Py_static_string(PyId_empty_array, "[]");
PyObject *open_array = _PyUnicode_FromId(&PyId_open_array); // borrowed ref
PyObject *close_array = _PyUnicode_FromId(&PyId_close_array); // borrowed ref
PyObject *empty_array = _PyUnicode_FromId(&PyId_empty_array); // borrowed ref
PyObject *ident = NULL;
PyObject *s_fast = NULL;
Py_ssize_t i;
if (open_array == NULL || close_array == NULL || empty_array == NULL) {
return -1;
}
ident = NULL;
s_fast = PySequence_Fast(seq, "_iterencode_list needs a sequence");
if (s_fast == NULL)
return -1;
if (PySequence_Fast_GET_SIZE(s_fast) == 0) {
Py_DECREF(s_fast);
return _PyAccu_Accumulate(acc, empty_array);
return _PyUnicodeWriter_WriteASCIIString(writer, "[]", 2);
}
if (s->markers != Py_None) {
@ -1679,7 +1671,7 @@ encoder_listencode_list(PyEncoderObject *s, _PyAccu *acc,
}
}
if (_PyAccu_Accumulate(acc, open_array))
if (_PyUnicodeWriter_WriteChar(writer, '['))
goto bail;
if (s->indent != Py_None) {
/* TODO: DOES NOT RUN */
@ -1693,10 +1685,10 @@ encoder_listencode_list(PyEncoderObject *s, _PyAccu *acc,
for (i = 0; i < PySequence_Fast_GET_SIZE(s_fast); i++) {
PyObject *obj = PySequence_Fast_GET_ITEM(s_fast, i);
if (i) {
if (_PyAccu_Accumulate(acc, s->item_separator))
if (_PyUnicodeWriter_WriteStr(writer, s->item_separator))
goto bail;
}
if (encoder_listencode_obj(s, acc, obj, indent_level))
if (encoder_listencode_obj(s, writer, obj, indent_level))
goto bail;
}
if (ident != NULL) {
@ -1711,7 +1703,7 @@ encoder_listencode_list(PyEncoderObject *s, _PyAccu *acc,
yield '\n' + (' ' * (_indent * _current_indent_level))
}*/
if (_PyAccu_Accumulate(acc, close_array))
if (_PyUnicodeWriter_WriteChar(writer, ']'))
goto bail;
Py_DECREF(s_fast);
return 0;

View file

@ -1,115 +0,0 @@
/* Accumulator struct implementation */
#include "Python.h"
#include "pycore_accu.h"
static PyObject *
join_list_unicode(PyObject *lst)
{
    /* Equivalent of ''.join(lst).
     *
     * Returns the joined string as a new reference, or NULL if either the
     * separator could not be created or the join itself failed.
     */
    PyObject *sep, *ret;

    sep = PyUnicode_FromStringAndSize("", 0);
    if (sep == NULL) {
        /* Py_DECREF() must never be handed NULL, so stop before the join
         * instead of falling through to the cleanup below. */
        return NULL;
    }
    ret = PyUnicode_Join(sep, lst);
    Py_DECREF(sep);
    return ret;
}
int
_PyAccu_Init(_PyAccu *acc)
{
    /* Only the list of pending small strings is created up front; the list
     * of large strings stays NULL until the first flush needs it.
     * Returns 0 on success, -1 if the list could not be allocated. */
    acc->large = NULL;
    acc->small = PyList_New(0);
    return (acc->small != NULL) ? 0 : -1;
}
static int
flush_accumulator(_PyAccu *acc)
{
    /* Collapse the pending small strings into one string and move it onto
     * the list of large strings.
     * Returns 0 on success (including when nothing was pending), -1 on
     * failure. */
    PyObject *merged;
    int status;
    Py_ssize_t pending = PyList_GET_SIZE(acc->small);

    if (pending == 0) {
        return 0;
    }
    if (acc->large == NULL) {
        /* First non-empty flush: create the list of large strings now. */
        acc->large = PyList_New(0);
        if (acc->large == NULL) {
            return -1;
        }
    }
    merged = join_list_unicode(acc->small);
    if (merged == NULL) {
        return -1;
    }
    /* Empty the small list in place so the same list object keeps being
     * used for subsequent accumulation. */
    if (PyList_SetSlice(acc->small, 0, pending, NULL) != 0) {
        Py_DECREF(merged);
        return -1;
    }
    status = PyList_Append(acc->large, merged);
    /* Our own reference is no longer needed whether or not the append
     * succeeded. */
    Py_DECREF(merged);
    return status;
}
int
_PyAccu_Accumulate(_PyAccu *acc, PyObject *unicode)
{
    /* Queue `unicode` on the list of pending small strings, flushing that
     * list once it has grown large enough.
     * Returns 0 on success, -1 on failure. */
    assert(PyUnicode_Check(unicode));
    if (PyList_Append(acc->small, unicode) != 0) {
        return -1;
    }
    /* Every pending item costs (on 64-bit builds) 8 bytes for its list slot
     * plus 56 bytes for the unicode object header, i.e. 64 bytes.  Keeping
     * 100000 of them around wastes more than 6 MiB compared to a single
     * concatenated string, so that is where the pending list gets flushed.
     */
    if (PyList_GET_SIZE(acc->small) >= 100000) {
        return flush_accumulator(acc);
    }
    return 0;
}
PyObject *
_PyAccu_FinishAsList(_PyAccu *acc)
{
    /* Flush whatever is still pending and transfer ownership of the list of
     * large strings to the caller; `acc` holds no references afterwards.
     * Returns NULL if the flush failed.  The result is also NULL when
     * nothing was ever accumulated, since the list of large strings is only
     * created by a non-empty flush. */
    PyObject *chunks;
    int failed = flush_accumulator(acc);

    Py_CLEAR(acc->small);
    if (failed) {
        Py_CLEAR(acc->large);
        return NULL;
    }
    chunks = acc->large;
    acc->large = NULL;
    return chunks;
}
PyObject *
_PyAccu_Finish(_PyAccu *acc)
{
    /* Produce the single string made of everything accumulated so far and
     * leave `acc` without any references.  Returns NULL on failure. */
    PyObject *parts, *joined;

    if (acc->large != NULL) {
        /* Some strings were already flushed: fold the pending ones in and
         * take over the resulting list. */
        parts = _PyAccu_FinishAsList(acc);
        if (parts == NULL) {
            return NULL;
        }
    }
    else {
        /* Nothing was flushed yet, so the pending list is the whole
         * content; steal it rather than flushing first. */
        parts = acc->small;
        acc->small = NULL;
    }
    joined = join_list_unicode(parts);
    Py_DECREF(parts);
    return joined;
}
void
_PyAccu_Destroy(_PyAccu *acc)
{
/* Release both lists.  Py_CLEAR tolerates members that are already NULL and
 * resets each one to NULL, so calling this on an accumulator that was
 * already finished or destroyed does nothing. */
Py_CLEAR(acc->small);
Py_CLEAR(acc->large);
}

View file

@ -120,7 +120,6 @@
<ClCompile Include="..\Modules\_io\textio.c" />
<ClCompile Include="..\Modules\_io\winconsoleio.c" />
<ClCompile Include="..\Objects\abstract.c" />
<ClCompile Include="..\Objects\accu.c" />
<ClCompile Include="..\Objects\boolobject.c" />
<ClCompile Include="..\Objects\bytearrayobject.c" />
<ClCompile Include="..\Objects\bytes_methods.c" />

View file

@ -25,9 +25,6 @@
<ClCompile Include="..\Objects\abstract.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Objects\accu.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Python\asdl.c">
<Filter>Source Files</Filter>
</ClCompile>

View file

@ -193,7 +193,6 @@
<ClInclude Include="..\Include\frameobject.h" />
<ClInclude Include="..\Include\import.h" />
<ClInclude Include="..\Include\internal\pycore_abstract.h" />
<ClInclude Include="..\Include\internal\pycore_accu.h" />
<ClInclude Include="..\Include\internal\pycore_asdl.h" />
<ClInclude Include="..\Include\internal\pycore_ast.h" />
<ClInclude Include="..\Include\internal\pycore_ast_state.h" />
@ -422,7 +421,6 @@
<ClCompile Include="..\Modules\cjkcodecs\multibytecodec.c" />
<ClCompile Include="..\Modules\_winapi.c" />
<ClCompile Include="..\Objects\abstract.c" />
<ClCompile Include="..\Objects\accu.c" />
<ClCompile Include="..\Objects\boolobject.c" />
<ClCompile Include="..\Objects\bytearrayobject.c" />
<ClCompile Include="..\Objects\bytes_methods.c" />

View file

@ -486,9 +486,6 @@
<ClInclude Include="..\Include\internal\pycore_abstract.h">
<Filter>Include\internal</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_accu.h">
<Filter>Include\internal</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_asdl.h">
<Filter>Include\internal</Filter>
</ClInclude>