cpython/Modules/_remote_debugging/frame_cache.c
Pablo Galindo Salgado 661df25692
gh-149584: Fix excessive overhead in the Tachyon profiler regarding the cache behavior (#149649)
Use exact remote reads for interpreter state, thread state, and
interpreter frame structs instead of pulling full remote pages into the
profiler page cache. This matches the core change from
python/cpython#149585.

The profiler clears the page cache between samples, so live entries are
always packed at the front. Track the live count and only clear/search
that prefix instead of scanning all 1024 slots on the hot path.

Use the frame cache to predict the next thread state and top frame
address, then batch interpreter/thread/frame reads with process_vm_readv
when profiling a Linux target. Reuse prefetched frame buffers in the
frame walker when the prediction is valid.

Cache the last FrameInfo tuple per code object/instruction offset, reuse
cached thread id objects, and append cached parent frames directly on
full frame-cache hits. This cuts Python allocation churn in the
steady-state profiler path.
2026-05-20 04:32:08 -07:00

292 lines
9.6 KiB
C

/******************************************************************************
* Remote Debugging Module - Frame Cache
*
* This file contains functions for caching frame information to optimize
* repeated stack unwinding for profiling.
******************************************************************************/
#include "_remote_debugging.h"
/* ============================================================================
* FRAME CACHE - stores (address, frame_info) pairs per thread
* Uses preallocated fixed-size arrays for efficiency and bounded memory.
* ============================================================================ */
int
frame_cache_init(RemoteUnwinderObject *unwinder)
{
unwinder->frame_cache = PyMem_Calloc(FRAME_CACHE_MAX_THREADS, sizeof(FrameCacheEntry));
if (!unwinder->frame_cache) {
PyErr_NoMemory();
return -1;
}
return 0;
}
void
frame_cache_cleanup(RemoteUnwinderObject *unwinder)
{
if (!unwinder->frame_cache) {
return;
}
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
}
PyMem_Free(unwinder->frame_cache);
unwinder->frame_cache = NULL;
}
// Find cache entry by thread_id
FrameCacheEntry *
frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
{
if (!unwinder->frame_cache || thread_id == 0) {
return NULL;
}
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
assert(i >= 0 && i < FRAME_CACHE_MAX_THREADS);
if (unwinder->frame_cache[i].thread_id == thread_id) {
assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
return &unwinder->frame_cache[i];
}
}
return NULL;
}
FrameCacheEntry *
frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
{
if (!unwinder->frame_cache || tstate_addr == 0) {
return NULL;
}
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) {
assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
return &unwinder->frame_cache[i];
}
}
return NULL;
}
// Allocate a cache slot for a thread
// Returns NULL if cache is full (graceful degradation)
static FrameCacheEntry *
frame_cache_alloc_slot(RemoteUnwinderObject *unwinder, uint64_t thread_id)
{
if (!unwinder->frame_cache || thread_id == 0) {
return NULL;
}
// First check if thread already has an entry
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
if (unwinder->frame_cache[i].thread_id == thread_id) {
return &unwinder->frame_cache[i];
}
}
// Find empty slot
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
if (unwinder->frame_cache[i].thread_id == 0) {
return &unwinder->frame_cache[i];
}
}
// Cache full - graceful degradation
return NULL;
}
// Remove cache entries for threads not seen in the result
// result structure: list of InterpreterInfo, where InterpreterInfo[1] is threads list,
// and ThreadInfo[0] is the thread_id
void
frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
{
if (!unwinder->frame_cache || !result || !PyList_Check(result)) {
return;
}
// Build array of seen thread IDs from result
uint64_t seen_threads[FRAME_CACHE_MAX_THREADS];
int num_seen = 0;
Py_ssize_t num_interps = PyList_GET_SIZE(result);
for (Py_ssize_t i = 0; i < num_interps && num_seen < FRAME_CACHE_MAX_THREADS; i++) {
PyObject *interp_info = PyList_GET_ITEM(result, i);
PyObject *threads = PyStructSequence_GetItem(interp_info, 1);
if (!threads || !PyList_Check(threads)) {
continue;
}
Py_ssize_t num_threads = PyList_GET_SIZE(threads);
for (Py_ssize_t j = 0; j < num_threads && num_seen < FRAME_CACHE_MAX_THREADS; j++) {
PyObject *thread_info = PyList_GET_ITEM(threads, j);
PyObject *tid_obj = PyStructSequence_GetItem(thread_info, 0);
if (tid_obj) {
uint64_t tid = PyLong_AsUnsignedLongLong(tid_obj);
if (!PyErr_Occurred()) {
seen_threads[num_seen++] = tid;
} else {
PyErr_Clear();
}
}
}
}
// Invalidate entries not in seen list
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
if (unwinder->frame_cache[i].thread_id == 0) {
continue;
}
int found = 0;
for (int j = 0; j < num_seen; j++) {
if (unwinder->frame_cache[i].thread_id == seen_threads[j]) {
found = 1;
break;
}
}
if (!found) {
// Clear this entry
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
unwinder->frame_cache[i].thread_id = 0;
unwinder->frame_cache[i].thread_state_addr = 0;
unwinder->frame_cache[i].num_addrs = 0;
STATS_INC(unwinder, stale_cache_invalidations);
}
}
}
// Find last_profiled_frame in cache and extend frame_info with cached continuation
// If frame_addrs is provided (not NULL), also extends it with cached addresses
int
frame_cache_lookup_and_extend(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
uintptr_t last_profiled_frame,
PyObject *frame_info,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs)
{
if (!unwinder->frame_cache || last_profiled_frame == 0) {
return 0;
}
FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id);
if (!entry || !entry->frame_list) {
return 0;
}
assert(entry->num_addrs >= 0 && entry->num_addrs <= FRAME_CACHE_MAX_FRAMES);
// Find the index where last_profiled_frame matches
Py_ssize_t start_idx = -1;
for (Py_ssize_t i = 0; i < entry->num_addrs; i++) {
if (entry->addrs[i] == last_profiled_frame) {
start_idx = i;
break;
}
}
if (start_idx < 0) {
return 0; // Not found
}
assert(start_idx < entry->num_addrs);
Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list);
// Extend frame_info with frames ABOVE start_idx (not including it).
// The frame at start_idx (last_profiled_frame) was the executing frame
// in the previous sample and its line number may have changed.
// Only frames above it (its callers) are frozen at their call sites.
Py_ssize_t cache_start = start_idx + 1;
if (cache_start >= num_frames) {
return 0; // Nothing above last_profiled_frame to extend with
}
PyObject *slice = PyList_GetSlice(entry->frame_list, cache_start, num_frames);
if (!slice) {
return -1;
}
Py_ssize_t cur_size = PyList_GET_SIZE(frame_info);
int result = PyList_SetSlice(frame_info, cur_size, cur_size, slice);
Py_DECREF(slice);
if (result < 0) {
return -1;
}
// Also extend frame_addrs with cached addresses (above last_profiled_frame)
if (frame_addrs) {
for (Py_ssize_t i = cache_start; i < entry->num_addrs && *num_addrs < max_addrs; i++) {
frame_addrs[(*num_addrs)++] = entry->addrs[i];
}
}
return 1;
}
// Store frame list with addresses in cache
// Only stores complete stacks that reach base_frame_addr (validation done internally)
// Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error
int
frame_cache_store(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
if (!unwinder->frame_cache || thread_id == 0) {
return 0;
}
// Validate we have a complete stack before caching.
// Only cache if last_frame_visited matches base_frame_addr (the sentinel
// at the bottom of the stack). Note: we use last_frame_visited rather than
// addrs[num_addrs-1] because the base frame is visited but not added to the
// addrs array (it returns frame==NULL from is_frame_valid due to
// owner==FRAME_OWNED_BY_INTERPRETER).
if (base_frame_addr != 0 && last_frame_visited != base_frame_addr) {
// Incomplete stack - don't cache (graceful degradation)
return 0;
}
// Clamp to max frames
if (num_addrs > FRAME_CACHE_MAX_FRAMES) {
num_addrs = FRAME_CACHE_MAX_FRAMES;
}
assert(num_addrs >= 0 && num_addrs <= FRAME_CACHE_MAX_FRAMES);
FrameCacheEntry *entry = frame_cache_alloc_slot(unwinder, thread_id);
if (!entry) {
// Cache full - graceful degradation
return 0;
}
// Clear old frame_list if replacing
Py_CLEAR(entry->frame_list);
// Store full frame list (don't truncate to num_addrs - frames beyond the
// address array limit are still valid and needed for full cache hits)
Py_ssize_t num_frames = PyList_GET_SIZE(frame_list);
entry->frame_list = PyList_GetSlice(frame_list, 0, num_frames);
if (!entry->frame_list) {
return -1;
}
entry->thread_id = thread_id;
entry->thread_state_addr = thread_state_addr;
if (entry->thread_id_obj == NULL) {
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
if (entry->thread_id_obj == NULL) {
return -1;
}
}
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
entry->num_addrs = num_addrs;
assert(entry->num_addrs == num_addrs);
assert(entry->thread_id == thread_id);
return 1;
}