gh-139871: Add bytearray.take_bytes([n]) to efficiently extract bytes (GH-140128)

Update `bytearray` to contain a `bytes` object and provide a zero-copy path to
"extract" that `bytes`. This allows several code paths to be made more efficient.

This change does not move any existing code paths over to the new API. The
documentation changes include common code patterns that can be made more
efficient with this API.

---

With only the change that makes `bytearray` contain a `bytes`, I ran
pyperformance on a `--with-lto --enable-optimizations --with-static-libpython`
build and did not see any major speedups or slowdowns; everything appears to be
within the noise of my machine (generally changes under 5%, or in benchmarks
that don't touch bytes/bytearray).


Co-authored-by: Victor Stinner <vstinner@python.org>
Co-authored-by: Maurycy Pawłowski-Wieroński <5383+maurycy@users.noreply.github.com>
This commit is contained in:
Cody Maloney 2025-11-13 05:19:44 -08:00 committed by GitHub
parent 2fbd396666
commit 732224e113
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 407 additions and 96 deletions

View file

@ -1397,6 +1397,16 @@ def test_clear(self):
b.append(ord('p'))
self.assertEqual(b, b'p')
# Cleared object should be empty.
b = bytearray(b'abc')
b.clear()
self.assertEqual(b.__alloc__(), 0)
base_size = sys.getsizeof(bytearray())
self.assertEqual(sys.getsizeof(b), base_size)
c = b.copy()
self.assertEqual(c.__alloc__(), 0)
self.assertEqual(sys.getsizeof(c), base_size)
def test_copy(self):
b = bytearray(b'abc')
bb = b.copy()
@ -1458,6 +1468,61 @@ def test_resize(self):
self.assertRaises(MemoryError, bytearray().resize, sys.maxsize)
self.assertRaises(MemoryError, bytearray(1000).resize, sys.maxsize)
def test_take_bytes(self):
    # Exercise bytearray.take_bytes([n]): it returns the taken bytes and
    # removes them from the bytearray.

    # Taking everything leaves an empty bytearray with no allocation,
    # the same size as a freshly constructed one.
    ba = bytearray(b'ab')
    self.assertEqual(ba.take_bytes(), b'ab')
    self.assertEqual(len(ba), 0)
    self.assertEqual(ba, bytearray(b''))
    self.assertEqual(ba.__alloc__(), 0)
    base_size = sys.getsizeof(bytearray())
    self.assertEqual(sys.getsizeof(ba), base_size)

    # Positive and negative slicing.
    # The expectations below match ba[:n] being taken and ba[n:] remaining.
    ba = bytearray(b'abcdef')
    self.assertEqual(ba.take_bytes(1), b'a')
    self.assertEqual(ba, bytearray(b'bcdef'))
    self.assertEqual(len(ba), 5)

    # n == -len(ba) takes nothing and leaves the content untouched.
    self.assertEqual(ba.take_bytes(-5), b'')
    self.assertEqual(ba, bytearray(b'bcdef'))
    self.assertEqual(len(ba), 5)

    self.assertEqual(ba.take_bytes(-3), b'bc')
    self.assertEqual(ba, bytearray(b'def'))
    self.assertEqual(len(ba), 3)

    self.assertEqual(ba.take_bytes(3), b'def')
    self.assertEqual(ba, bytearray(b''))
    self.assertEqual(len(ba), 0)

    # Take nothing from emptiness.
    self.assertEqual(ba.take_bytes(0), b'')
    self.assertEqual(ba.take_bytes(), b'')
    self.assertEqual(ba.take_bytes(None), b'')

    # Out of bounds, bad take value.
    # ba is still empty here, so -1 is already out of range.
    self.assertRaises(IndexError, ba.take_bytes, -1)
    self.assertRaises(TypeError, ba.take_bytes, 3.14)
    ba = bytearray(b'abcdef')
    self.assertRaises(IndexError, ba.take_bytes, 7)

    # Offset between physical and logical start (ob_bytes != ob_start).
    ba = bytearray(b'abcde')
    del ba[:2]
    self.assertEqual(ba, bytearray(b'cde'))
    self.assertEqual(ba.take_bytes(), b'cde')

    # Overallocation at end.
    ba = bytearray(b'abcde')
    del ba[-2:]
    self.assertEqual(ba, bytearray(b'abc'))
    self.assertEqual(ba.take_bytes(), b'abc')
    ba = bytearray(b'abcde')
    ba.resize(4)
    self.assertEqual(ba.take_bytes(), b'abcd')

    # Take of a bytearray with references should fail.
    ba = bytearray(b'abc')
    with memoryview(ba) as mv:
        self.assertRaises(BufferError, ba.take_bytes)
    # Once the memoryview is released, the take succeeds again.
    self.assertEqual(ba.take_bytes(), b'abc')
def test_setitem(self):
def setitem_as_mapping(b, i, val):
@ -2564,6 +2629,18 @@ def zfill(b, a):
c = a.zfill(0x400000)
assert not c or c[-1] not in (0xdd, 0xcd)
def take_bytes(b, a):  # MODIFIES!
    # Worker helper: wait on `b` (anything with a wait() method), then take
    # the whole content of `a` via a.take_bytes().
    b.wait()
    c = a.take_bytes()
    # The result may be empty; otherwise its first byte must be ASCII '0'.
    assert not c or c[0] == 48  # '0'
def take_bytes_n(b, a):  # MODIFIES!
    # Worker helper: wait on `b` (anything with a wait() method), then try to
    # take exactly 10 bytes from `a`.
    b.wait()
    try:
        c = a.take_bytes(10)
    except IndexError:
        # The take could not be satisfied; tolerated on purpose.
        pass
    else:
        # A successful take must return exactly one '0123456789' chunk.
        assert c == b'0123456789'
def check(funcs, a=None, *args):
if a is None:
a = bytearray(b'0' * 0x400000)
@ -2625,6 +2702,10 @@ def check(funcs, a=None, *args):
check([clear] + [startswith] * 10)
check([clear] + [strip] * 10)
check([clear] + [take_bytes] * 10)
check([take_bytes_n] * 10, bytearray(b'0123456789' * 0x400))
check([take_bytes_n] * 10, bytearray(b'0123456789' * 5))
check([clear] + [contains] * 10)
check([clear] + [subscript] * 10)
check([clear2] + [ass_subscript2] * 10, None, bytearray(b'0' * 0x400000))