[3.14] gh-144438: Fix false sharing between QSBR and tlbc_index (gh-144554) (#144923)

Align the QSBR thread state array to a 64-byte cache line boundary
and add padding at the end of _PyThreadStateImpl. Depending on heap
layout, the QSBR array could end up sharing a cache line with a
thread's tlbc_index, causing QSBR quiescent state updates to contend
with reads of tlbc_index in RESUME_CHECK. This is sensitive to
earlier allocations during interpreter init and can appear or
disappear with seemingly unrelated changes.

Either change alone is sufficient to fix the specific issue, but both
are worthwhile to avoid similar problems in the future.

(cherry picked from commit 6577d870b0)
This commit is contained in:
Sam Gross 2026-03-31 15:20:24 -04:00 committed by GitHub
parent 25b48b84b8
commit 6ea4f842fb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 1657 additions and 1600 deletions

File diff suppressed because it is too large Load diff

View file

@@ -83,8 +83,9 @@ struct _qsbr_shared {
// Minimum observed read sequence of all QSBR thread states
uint64_t rd_seq;
// Array of QSBR thread states.
// Array of QSBR thread states (aligned to 64 bytes).
struct _qsbr_pad *array;
void *array_raw; // raw allocation pointer (for free)
Py_ssize_t size;
// Freelist of unused _qsbr_thread_states (protected by mutex)

View file

@@ -80,6 +80,11 @@ typedef struct _PyThreadStateImpl {
uintptr_t c_stack_init_base;
uintptr_t c_stack_init_top;
#ifdef Py_GIL_DISABLED
// gh-144438: Add padding to ensure that the fields above don't share a
// cache line with other allocations.
char __padding[64];
#endif
} _PyThreadStateImpl;
#ifdef __cplusplus

View file

@@ -0,0 +1,2 @@
Align the QSBR thread state array to a 64-byte cache line boundary to
avoid false sharing in the :term:`free-threaded build`.

View file

@@ -84,22 +84,29 @@ grow_thread_array(struct _qsbr_shared *shared)
new_size = MIN_ARRAY_SIZE;
}
struct _qsbr_pad *array = PyMem_RawCalloc(new_size, sizeof(*array));
if (array == NULL) {
// Overallocate by 63 bytes so we can align to a 64-byte boundary.
// This avoids potential false sharing between the first entry and other
// allocations.
size_t alignment = 64;
size_t alloc_size = (size_t)new_size * sizeof(struct _qsbr_pad) + alignment - 1;
void *raw = PyMem_RawCalloc(1, alloc_size);
if (raw == NULL) {
return -1;
}
struct _qsbr_pad *array = _Py_ALIGN_UP(raw, alignment);
struct _qsbr_pad *old = shared->array;
if (old != NULL) {
void *old_raw = shared->array_raw;
if (shared->array != NULL) {
memcpy(array, shared->array, shared->size * sizeof(*array));
}
shared->array = array;
shared->array_raw = raw;
shared->size = new_size;
shared->freelist = NULL;
initialize_new_array(shared);
PyMem_RawFree(old);
PyMem_RawFree(old_raw);
return 0;
}
@@ -256,8 +263,9 @@ void
_Py_qsbr_fini(PyInterpreterState *interp)
{
struct _qsbr_shared *shared = &interp->qsbr;
PyMem_RawFree(shared->array);
PyMem_RawFree(shared->array_raw);
shared->array = NULL;
shared->array_raw = NULL;
shared->size = 0;
shared->freelist = NULL;
}