mirror of
https://github.com/python/cpython.git
synced 2025-12-07 13:50:06 +00:00
This PR implements frame caching in the RemoteUnwinder class to significantly reduce memory reads when profiling remote processes with deep call stacks. When cache_frames=True, the unwinder stores the frame chain from each sample and reuses unchanged portions in subsequent samples. Since most profiling samples capture similar call stacks (especially the parent frames), this optimization avoids repeatedly reading the same frame data from the target process. The implementation adds a last_profiled_frame field to the thread state that tracks where the previous sample stopped. On the next sample, if the current frame chain reaches this marker, the cached frames from that point onward are reused instead of being re-read from remote memory. The sampling profiler now enables frame caching by default.
236 lines
7.2 KiB
C
236 lines
7.2 KiB
C
/******************************************************************************
|
|
* Remote Debugging Module - Frame Cache
|
|
*
|
|
* This file contains functions for caching frame information to optimize
|
|
* repeated stack unwinding for profiling.
|
|
******************************************************************************/
|
|
|
|
#include "_remote_debugging.h"
|
|
|
|
/* ============================================================================
|
|
* FRAME CACHE - stores (address, frame_info) pairs per thread
|
|
* Uses preallocated fixed-size arrays for efficiency and bounded memory.
|
|
* ============================================================================ */
|
|
|
|
int
|
|
frame_cache_init(RemoteUnwinderObject *unwinder)
|
|
{
|
|
unwinder->frame_cache = PyMem_Calloc(FRAME_CACHE_MAX_THREADS, sizeof(FrameCacheEntry));
|
|
if (!unwinder->frame_cache) {
|
|
PyErr_NoMemory();
|
|
return -1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void
|
|
frame_cache_cleanup(RemoteUnwinderObject *unwinder)
|
|
{
|
|
if (!unwinder->frame_cache) {
|
|
return;
|
|
}
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
Py_CLEAR(unwinder->frame_cache[i].frame_list);
|
|
}
|
|
PyMem_Free(unwinder->frame_cache);
|
|
unwinder->frame_cache = NULL;
|
|
}
|
|
|
|
// Find cache entry by thread_id
|
|
FrameCacheEntry *
|
|
frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
|
|
{
|
|
if (!unwinder->frame_cache || thread_id == 0) {
|
|
return NULL;
|
|
}
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
if (unwinder->frame_cache[i].thread_id == thread_id) {
|
|
return &unwinder->frame_cache[i];
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
// Allocate a cache slot for a thread
|
|
// Returns NULL if cache is full (graceful degradation)
|
|
static FrameCacheEntry *
|
|
frame_cache_alloc_slot(RemoteUnwinderObject *unwinder, uint64_t thread_id)
|
|
{
|
|
if (!unwinder->frame_cache || thread_id == 0) {
|
|
return NULL;
|
|
}
|
|
// First check if thread already has an entry
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
if (unwinder->frame_cache[i].thread_id == thread_id) {
|
|
return &unwinder->frame_cache[i];
|
|
}
|
|
}
|
|
// Find empty slot
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
if (unwinder->frame_cache[i].thread_id == 0) {
|
|
return &unwinder->frame_cache[i];
|
|
}
|
|
}
|
|
// Cache full - graceful degradation
|
|
return NULL;
|
|
}
|
|
|
|
// Remove cache entries for threads not seen in the result
|
|
// result structure: list of InterpreterInfo, where InterpreterInfo[1] is threads list,
|
|
// and ThreadInfo[0] is the thread_id
|
|
void
|
|
frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
|
|
{
|
|
if (!unwinder->frame_cache || !result || !PyList_Check(result)) {
|
|
return;
|
|
}
|
|
|
|
// Build array of seen thread IDs from result
|
|
uint64_t seen_threads[FRAME_CACHE_MAX_THREADS];
|
|
int num_seen = 0;
|
|
|
|
Py_ssize_t num_interps = PyList_GET_SIZE(result);
|
|
for (Py_ssize_t i = 0; i < num_interps && num_seen < FRAME_CACHE_MAX_THREADS; i++) {
|
|
PyObject *interp_info = PyList_GET_ITEM(result, i);
|
|
PyObject *threads = PyStructSequence_GetItem(interp_info, 1);
|
|
if (!threads || !PyList_Check(threads)) {
|
|
continue;
|
|
}
|
|
Py_ssize_t num_threads = PyList_GET_SIZE(threads);
|
|
for (Py_ssize_t j = 0; j < num_threads && num_seen < FRAME_CACHE_MAX_THREADS; j++) {
|
|
PyObject *thread_info = PyList_GET_ITEM(threads, j);
|
|
PyObject *tid_obj = PyStructSequence_GetItem(thread_info, 0);
|
|
if (tid_obj) {
|
|
uint64_t tid = PyLong_AsUnsignedLongLong(tid_obj);
|
|
if (!PyErr_Occurred()) {
|
|
seen_threads[num_seen++] = tid;
|
|
} else {
|
|
PyErr_Clear();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Invalidate entries not in seen list
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
if (unwinder->frame_cache[i].thread_id == 0) {
|
|
continue;
|
|
}
|
|
int found = 0;
|
|
for (int j = 0; j < num_seen; j++) {
|
|
if (unwinder->frame_cache[i].thread_id == seen_threads[j]) {
|
|
found = 1;
|
|
break;
|
|
}
|
|
}
|
|
if (!found) {
|
|
// Clear this entry
|
|
Py_CLEAR(unwinder->frame_cache[i].frame_list);
|
|
unwinder->frame_cache[i].thread_id = 0;
|
|
unwinder->frame_cache[i].num_addrs = 0;
|
|
STATS_INC(unwinder, stale_cache_invalidations);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Find last_profiled_frame in cache and extend frame_info with cached continuation
|
|
// If frame_addrs is provided (not NULL), also extends it with cached addresses
|
|
int
|
|
frame_cache_lookup_and_extend(
|
|
RemoteUnwinderObject *unwinder,
|
|
uint64_t thread_id,
|
|
uintptr_t last_profiled_frame,
|
|
PyObject *frame_info,
|
|
uintptr_t *frame_addrs,
|
|
Py_ssize_t *num_addrs,
|
|
Py_ssize_t max_addrs)
|
|
{
|
|
if (!unwinder->frame_cache || last_profiled_frame == 0) {
|
|
return 0;
|
|
}
|
|
|
|
FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id);
|
|
if (!entry || !entry->frame_list) {
|
|
return 0;
|
|
}
|
|
|
|
// Find the index where last_profiled_frame matches
|
|
Py_ssize_t start_idx = -1;
|
|
for (Py_ssize_t i = 0; i < entry->num_addrs; i++) {
|
|
if (entry->addrs[i] == last_profiled_frame) {
|
|
start_idx = i;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (start_idx < 0) {
|
|
return 0; // Not found
|
|
}
|
|
|
|
Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list);
|
|
|
|
// Extend frame_info with frames from start_idx onwards
|
|
PyObject *slice = PyList_GetSlice(entry->frame_list, start_idx, num_frames);
|
|
if (!slice) {
|
|
return -1;
|
|
}
|
|
|
|
Py_ssize_t cur_size = PyList_GET_SIZE(frame_info);
|
|
int result = PyList_SetSlice(frame_info, cur_size, cur_size, slice);
|
|
Py_DECREF(slice);
|
|
|
|
if (result < 0) {
|
|
return -1;
|
|
}
|
|
|
|
// Also extend frame_addrs with cached addresses if provided
|
|
if (frame_addrs) {
|
|
for (Py_ssize_t i = start_idx; i < entry->num_addrs && *num_addrs < max_addrs; i++) {
|
|
frame_addrs[(*num_addrs)++] = entry->addrs[i];
|
|
}
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
// Store frame list with addresses in cache
|
|
// Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error
|
|
int
|
|
frame_cache_store(
|
|
RemoteUnwinderObject *unwinder,
|
|
uint64_t thread_id,
|
|
PyObject *frame_list,
|
|
const uintptr_t *addrs,
|
|
Py_ssize_t num_addrs)
|
|
{
|
|
if (!unwinder->frame_cache || thread_id == 0) {
|
|
return 0;
|
|
}
|
|
|
|
// Clamp to max frames
|
|
if (num_addrs > FRAME_CACHE_MAX_FRAMES) {
|
|
num_addrs = FRAME_CACHE_MAX_FRAMES;
|
|
}
|
|
|
|
FrameCacheEntry *entry = frame_cache_alloc_slot(unwinder, thread_id);
|
|
if (!entry) {
|
|
// Cache full - graceful degradation
|
|
return 0;
|
|
}
|
|
|
|
// Clear old frame_list if replacing
|
|
Py_CLEAR(entry->frame_list);
|
|
|
|
// Store full frame list (don't truncate to num_addrs - frames beyond the
|
|
// address array limit are still valid and needed for full cache hits)
|
|
Py_ssize_t num_frames = PyList_GET_SIZE(frame_list);
|
|
entry->frame_list = PyList_GetSlice(frame_list, 0, num_frames);
|
|
if (!entry->frame_list) {
|
|
return -1;
|
|
}
|
|
entry->thread_id = thread_id;
|
|
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
|
|
entry->num_addrs = num_addrs;
|
|
|
|
return 1;
|
|
}
|