gh-139877: Use PyBytesWriter in pycore_blocks_output_buffer.h (#139976)

Previously, the _BlocksOutputBuffer code created a list of bytes objects to hold the output data from compression libraries. This was slow because the output buffer code had to copy each bytes element of the list into the final bytes object at the end of compression.

The new PyBytesWriter API introduced in PEP 782 is an ergonomic and fast method of writing data into a buffer that will later turn into a bytes object. Benchmarks show that using the PyBytesWriter API is 10-30% faster for decompression across a variety of settings. The performance gains are greatest when the decompressor is very performant, such as for Zstandard (and likely zlib-ng). Otherwise the decompressor can bottleneck decompression and the gains are more modest, but still sizable (e.g. 10% faster for zlib)!

Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
This commit is contained in:
Emma Smith 2025-10-14 10:03:55 -07:00 committed by GitHub
parent 404425575c
commit f262297d52
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 49 additions and 109 deletions

View file

@ -45,12 +45,14 @@ extern "C" {
#endif
typedef struct {
// List of bytes objects
PyObject *list;
// Bytes writer managing output buffer
PyBytesWriter *writer;
// Number of whole allocated size
Py_ssize_t allocated;
// Max length of the buffer, negative number means unlimited length.
// Max length of the buffer, negative number means unlimited length
Py_ssize_t max_length;
// Number of blocks of bytes. Used to calculate next allocation size
size_t num_blocks;
} _BlocksOutputBuffer;
static const char unable_allocate_msg[] = "Unable to allocate output buffer.";
@ -107,11 +109,10 @@ _BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer,
const Py_ssize_t max_length,
void **next_out)
{
PyObject *b;
Py_ssize_t block_size;
// ensure .list was set to NULL
assert(buffer->list == NULL);
// ensure .writer was set to NULL
assert(buffer->writer == NULL);
// get block size
if (0 <= max_length && max_length < BUFFER_BLOCK_SIZE[0]) {
@ -120,25 +121,17 @@ _BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer,
block_size = BUFFER_BLOCK_SIZE[0];
}
// the first block
b = PyBytes_FromStringAndSize(NULL, block_size);
if (b == NULL) {
buffer->writer = PyBytesWriter_Create(block_size);
if (buffer->writer == NULL) {
return -1;
}
// create the list
buffer->list = PyList_New(1);
if (buffer->list == NULL) {
Py_DECREF(b);
return -1;
}
PyList_SET_ITEM(buffer->list, 0, b);
// set variables
buffer->allocated = block_size;
buffer->max_length = max_length;
buffer->num_blocks = 1;
*next_out = PyBytes_AS_STRING(b);
*next_out = PyBytesWriter_GetData(buffer->writer);
return block_size;
}
@ -155,31 +148,21 @@ _BlocksOutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer,
const Py_ssize_t init_size,
void **next_out)
{
PyObject *b;
// ensure .list was set to NULL
assert(buffer->list == NULL);
// ensure .writer was set to NULL
assert(buffer->writer == NULL);
// the first block
b = PyBytes_FromStringAndSize(NULL, init_size);
if (b == NULL) {
PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
buffer->writer = PyBytesWriter_Create(init_size);
if (buffer->writer == NULL) {
return -1;
}
// create the list
buffer->list = PyList_New(1);
if (buffer->list == NULL) {
Py_DECREF(b);
return -1;
}
PyList_SET_ITEM(buffer->list, 0, b);
// set variables
buffer->allocated = init_size;
buffer->max_length = -1;
buffer->num_blocks = 1;
*next_out = PyBytes_AS_STRING(b);
*next_out = PyBytesWriter_GetData(buffer->writer);
return init_size;
}
@ -193,8 +176,6 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
void **next_out,
const Py_ssize_t avail_out)
{
PyObject *b;
const Py_ssize_t list_len = Py_SIZE(buffer->list);
Py_ssize_t block_size;
// ensure no gaps in the data
@ -205,11 +186,10 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
}
// get block size
if (list_len < (Py_ssize_t) Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE)) {
block_size = BUFFER_BLOCK_SIZE[list_len];
} else {
block_size = BUFFER_BLOCK_SIZE[Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE) - 1];
}
size_t maxblock = Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE);
assert(maxblock >= 1);
size_t block_index = Py_MIN(buffer->num_blocks, maxblock - 1);
block_size = BUFFER_BLOCK_SIZE[block_index];
// check max_length
if (buffer->max_length >= 0) {
@ -229,22 +209,19 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
return -1;
}
// create the block
b = PyBytes_FromStringAndSize(NULL, block_size);
if (b == NULL) {
if (PyBytesWriter_Grow(buffer->writer, block_size)) {
PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
return -1;
}
if (PyList_Append(buffer->list, b) < 0) {
Py_DECREF(b);
return -1;
}
Py_DECREF(b);
Py_ssize_t current_size = buffer->allocated;
// set variables
buffer->allocated += block_size;
buffer->num_blocks += 1;
*next_out = PyBytes_AS_STRING(b);
char *data = PyBytesWriter_GetData(buffer->writer);
*next_out = data + current_size;
return block_size;
}
@ -265,54 +242,17 @@ static inline PyObject *
_BlocksOutputBuffer_Finish(_BlocksOutputBuffer *buffer,
const Py_ssize_t avail_out)
{
PyObject *result, *block;
const Py_ssize_t list_len = Py_SIZE(buffer->list);
// fast path for single block
if ((list_len == 1 && avail_out == 0) ||
(list_len == 2 && Py_SIZE(PyList_GET_ITEM(buffer->list, 1)) == avail_out))
{
block = PyList_GET_ITEM(buffer->list, 0);
Py_INCREF(block);
Py_CLEAR(buffer->list);
return block;
}
// final bytes object
result = PyBytes_FromStringAndSize(NULL, buffer->allocated - avail_out);
if (result == NULL) {
PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
return NULL;
}
// memory copy
if (list_len > 0) {
char *posi = PyBytes_AS_STRING(result);
// blocks except the last one
Py_ssize_t i = 0;
for (; i < list_len-1; i++) {
block = PyList_GET_ITEM(buffer->list, i);
memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block));
posi += Py_SIZE(block);
}
// the last block
block = PyList_GET_ITEM(buffer->list, i);
memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block) - avail_out);
} else {
assert(Py_SIZE(result) == 0);
}
Py_CLEAR(buffer->list);
return result;
assert(buffer->writer != NULL);
return PyBytesWriter_FinishWithSize(buffer->writer,
buffer->allocated - avail_out);
}
/* Clean up the buffer when an error occurred. */
static inline void
_BlocksOutputBuffer_OnError(_BlocksOutputBuffer *buffer)
{
Py_CLEAR(buffer->list);
PyBytesWriter_Discard(buffer->writer);
buffer->writer = NULL;
}
#ifdef __cplusplus

View file

@ -190,7 +190,7 @@ static PyObject *
compress(BZ2Compressor *c, char *data, size_t len, int action)
{
PyObject *result;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
if (OutputBuffer_InitAndGrow(&buffer, -1, &c->bzs.next_out, &c->bzs.avail_out) < 0) {
goto error;
@ -429,7 +429,7 @@ decompress_buf(BZ2Decompressor *d, Py_ssize_t max_length)
compare against max_length and PyBytes_GET_SIZE we declare it as
signed */
PyObject *result;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
bz_stream *bzs = &d->bzs;
if (OutputBuffer_InitAndGrow(&buffer, max_length, &bzs->next_out, &bzs->avail_out) < 0) {

View file

@ -554,7 +554,7 @@ static PyObject *
compress(Compressor *c, uint8_t *data, size_t len, lzma_action action)
{
PyObject *result;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
_lzma_state *state = PyType_GetModuleState(Py_TYPE(c));
assert(state != NULL);
@ -940,7 +940,7 @@ decompress_buf(Decompressor *d, Py_ssize_t max_length)
{
PyObject *result;
lzma_stream *lzs = &d->lzs;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
_lzma_state *state = PyType_GetModuleState(Py_TYPE(d));
assert(state != NULL);

View file

@ -16,8 +16,8 @@ static inline int
_OutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer, ZSTD_outBuffer *ob,
Py_ssize_t max_length)
{
/* Ensure .list was set to NULL */
assert(buffer->list == NULL);
/* Ensure .writer was set to NULL */
assert(buffer->writer == NULL);
Py_ssize_t res = _BlocksOutputBuffer_InitAndGrow(buffer, max_length,
&ob->dst);
@ -39,8 +39,8 @@ _OutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer, ZSTD_outBuffer *ob,
{
Py_ssize_t block_size;
/* Ensure .list was set to NULL */
assert(buffer->list == NULL);
/* Ensure .writer was set to NULL */
assert(buffer->writer == NULL);
/* Get block size */
if (0 <= max_length && max_length < init_size) {

View file

@ -446,7 +446,7 @@ compress_lock_held(ZstdCompressor *self, Py_buffer *data,
assert(PyMutex_IsLocked(&self->lock));
ZSTD_inBuffer in;
ZSTD_outBuffer out;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
size_t zstd_ret;
PyObject *ret;
@ -527,7 +527,7 @@ compress_mt_continue_lock_held(ZstdCompressor *self, Py_buffer *data)
assert(PyMutex_IsLocked(&self->lock));
ZSTD_inBuffer in;
ZSTD_outBuffer out;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
size_t zstd_ret;
PyObject *ret;

View file

@ -216,7 +216,7 @@ decompress_lock_held(ZstdDecompressor *self, ZSTD_inBuffer *in,
{
size_t zstd_ret;
ZSTD_outBuffer out;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
PyObject *ret;
/* Initialize the output buffer */

View file

@ -344,7 +344,7 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits)
PyObject *return_value;
int flush;
z_stream zst;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
zlibstate *state = get_zlib_state(module);
@ -445,7 +445,7 @@ zlib_decompress_impl(PyObject *module, Py_buffer *data, int wbits,
Py_ssize_t ibuflen;
int err, flush;
z_stream zst;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
_Uint32Window window; // output buffer's UINT32_MAX sliding window
zlibstate *state = get_zlib_state(module);
@ -774,7 +774,7 @@ zlib_Compress_compress_impl(compobject *self, PyTypeObject *cls,
{
PyObject *return_value;
int err;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
zlibstate *state = PyType_GetModuleState(cls);
ENTER_ZLIB(self);
@ -898,7 +898,7 @@ zlib_Decompress_decompress_impl(compobject *self, PyTypeObject *cls,
int err = Z_OK;
Py_ssize_t ibuflen;
PyObject *return_value;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
PyObject *module = PyType_GetModule(cls);
if (module == NULL)
@ -1005,7 +1005,7 @@ zlib_Compress_flush_impl(compobject *self, PyTypeObject *cls, int mode)
{
int err;
PyObject *return_value;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
zlibstate *state = PyType_GetModuleState(cls);
/* Flushing with Z_NO_FLUSH is a no-op, so there's no point in
@ -1267,7 +1267,7 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject *cls,
Py_buffer data;
PyObject *return_value;
Py_ssize_t ibuflen;
_BlocksOutputBuffer buffer = {.list = NULL};
_BlocksOutputBuffer buffer = {.writer = NULL};
_Uint32Window window; // output buffer's UINT32_MAX sliding window
PyObject *module = PyType_GetModule(cls);