mirror of
https://github.com/python/cpython.git
synced 2026-06-04 16:50:51 +00:00
Use exact remote reads for interpreter state, thread state, and interpreter frame structs instead of pulling full remote pages into the profiler page cache. This matches the core change from python/cpython#149585. The profiler clears the page cache between samples, so live entries are always packed at the front. Track the live count and only clear/search that prefix instead of scanning all 1024 slots on the hot path. Use the frame cache to predict the next thread state and top frame address, then batch interpreter/thread/frame reads with process_vm_readv when profiling a Linux target. Reuse prefetched frame buffers in the frame walker when the prediction is valid. Cache the last FrameInfo tuple per code object/instruction offset, reuse cached thread id objects, and append cached parent frames directly on full frame-cache hits. This cuts Python allocation churn in the steady-state profiler path.
292 lines
9.6 KiB
C
292 lines
9.6 KiB
C
/******************************************************************************
|
|
* Remote Debugging Module - Frame Cache
|
|
*
|
|
* This file contains functions for caching frame information to optimize
|
|
* repeated stack unwinding for profiling.
|
|
******************************************************************************/
|
|
|
|
#include "_remote_debugging.h"
|
|
|
|
/* ============================================================================
|
|
* FRAME CACHE - stores (address, frame_info) pairs per thread
|
|
* Uses preallocated fixed-size arrays for efficiency and bounded memory.
|
|
* ============================================================================ */
|
|
|
|
int
|
|
frame_cache_init(RemoteUnwinderObject *unwinder)
|
|
{
|
|
unwinder->frame_cache = PyMem_Calloc(FRAME_CACHE_MAX_THREADS, sizeof(FrameCacheEntry));
|
|
if (!unwinder->frame_cache) {
|
|
PyErr_NoMemory();
|
|
return -1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void
|
|
frame_cache_cleanup(RemoteUnwinderObject *unwinder)
|
|
{
|
|
if (!unwinder->frame_cache) {
|
|
return;
|
|
}
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
|
|
Py_CLEAR(unwinder->frame_cache[i].frame_list);
|
|
}
|
|
PyMem_Free(unwinder->frame_cache);
|
|
unwinder->frame_cache = NULL;
|
|
}
|
|
|
|
// Find cache entry by thread_id
|
|
FrameCacheEntry *
|
|
frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
|
|
{
|
|
if (!unwinder->frame_cache || thread_id == 0) {
|
|
return NULL;
|
|
}
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
assert(i >= 0 && i < FRAME_CACHE_MAX_THREADS);
|
|
if (unwinder->frame_cache[i].thread_id == thread_id) {
|
|
assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
|
|
return &unwinder->frame_cache[i];
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
FrameCacheEntry *
|
|
frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
|
|
{
|
|
if (!unwinder->frame_cache || tstate_addr == 0) {
|
|
return NULL;
|
|
}
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) {
|
|
assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
|
|
return &unwinder->frame_cache[i];
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
// Allocate a cache slot for a thread
|
|
// Returns NULL if cache is full (graceful degradation)
|
|
static FrameCacheEntry *
|
|
frame_cache_alloc_slot(RemoteUnwinderObject *unwinder, uint64_t thread_id)
|
|
{
|
|
if (!unwinder->frame_cache || thread_id == 0) {
|
|
return NULL;
|
|
}
|
|
// First check if thread already has an entry
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
if (unwinder->frame_cache[i].thread_id == thread_id) {
|
|
return &unwinder->frame_cache[i];
|
|
}
|
|
}
|
|
// Find empty slot
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
if (unwinder->frame_cache[i].thread_id == 0) {
|
|
return &unwinder->frame_cache[i];
|
|
}
|
|
}
|
|
// Cache full - graceful degradation
|
|
return NULL;
|
|
}
|
|
|
|
// Remove cache entries for threads not seen in the result
|
|
// result structure: list of InterpreterInfo, where InterpreterInfo[1] is threads list,
|
|
// and ThreadInfo[0] is the thread_id
|
|
void
|
|
frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
|
|
{
|
|
if (!unwinder->frame_cache || !result || !PyList_Check(result)) {
|
|
return;
|
|
}
|
|
|
|
// Build array of seen thread IDs from result
|
|
uint64_t seen_threads[FRAME_CACHE_MAX_THREADS];
|
|
int num_seen = 0;
|
|
|
|
Py_ssize_t num_interps = PyList_GET_SIZE(result);
|
|
for (Py_ssize_t i = 0; i < num_interps && num_seen < FRAME_CACHE_MAX_THREADS; i++) {
|
|
PyObject *interp_info = PyList_GET_ITEM(result, i);
|
|
PyObject *threads = PyStructSequence_GetItem(interp_info, 1);
|
|
if (!threads || !PyList_Check(threads)) {
|
|
continue;
|
|
}
|
|
Py_ssize_t num_threads = PyList_GET_SIZE(threads);
|
|
for (Py_ssize_t j = 0; j < num_threads && num_seen < FRAME_CACHE_MAX_THREADS; j++) {
|
|
PyObject *thread_info = PyList_GET_ITEM(threads, j);
|
|
PyObject *tid_obj = PyStructSequence_GetItem(thread_info, 0);
|
|
if (tid_obj) {
|
|
uint64_t tid = PyLong_AsUnsignedLongLong(tid_obj);
|
|
if (!PyErr_Occurred()) {
|
|
seen_threads[num_seen++] = tid;
|
|
} else {
|
|
PyErr_Clear();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Invalidate entries not in seen list
|
|
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
|
if (unwinder->frame_cache[i].thread_id == 0) {
|
|
continue;
|
|
}
|
|
int found = 0;
|
|
for (int j = 0; j < num_seen; j++) {
|
|
if (unwinder->frame_cache[i].thread_id == seen_threads[j]) {
|
|
found = 1;
|
|
break;
|
|
}
|
|
}
|
|
if (!found) {
|
|
// Clear this entry
|
|
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
|
|
Py_CLEAR(unwinder->frame_cache[i].frame_list);
|
|
unwinder->frame_cache[i].thread_id = 0;
|
|
unwinder->frame_cache[i].thread_state_addr = 0;
|
|
unwinder->frame_cache[i].num_addrs = 0;
|
|
STATS_INC(unwinder, stale_cache_invalidations);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Find last_profiled_frame in cache and extend frame_info with cached continuation
|
|
// If frame_addrs is provided (not NULL), also extends it with cached addresses
|
|
int
|
|
frame_cache_lookup_and_extend(
|
|
RemoteUnwinderObject *unwinder,
|
|
uint64_t thread_id,
|
|
uintptr_t last_profiled_frame,
|
|
PyObject *frame_info,
|
|
uintptr_t *frame_addrs,
|
|
Py_ssize_t *num_addrs,
|
|
Py_ssize_t max_addrs)
|
|
{
|
|
if (!unwinder->frame_cache || last_profiled_frame == 0) {
|
|
return 0;
|
|
}
|
|
|
|
FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id);
|
|
if (!entry || !entry->frame_list) {
|
|
return 0;
|
|
}
|
|
|
|
assert(entry->num_addrs >= 0 && entry->num_addrs <= FRAME_CACHE_MAX_FRAMES);
|
|
|
|
// Find the index where last_profiled_frame matches
|
|
Py_ssize_t start_idx = -1;
|
|
for (Py_ssize_t i = 0; i < entry->num_addrs; i++) {
|
|
if (entry->addrs[i] == last_profiled_frame) {
|
|
start_idx = i;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (start_idx < 0) {
|
|
return 0; // Not found
|
|
}
|
|
assert(start_idx < entry->num_addrs);
|
|
|
|
Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list);
|
|
|
|
// Extend frame_info with frames ABOVE start_idx (not including it).
|
|
// The frame at start_idx (last_profiled_frame) was the executing frame
|
|
// in the previous sample and its line number may have changed.
|
|
// Only frames above it (its callers) are frozen at their call sites.
|
|
Py_ssize_t cache_start = start_idx + 1;
|
|
if (cache_start >= num_frames) {
|
|
return 0; // Nothing above last_profiled_frame to extend with
|
|
}
|
|
|
|
PyObject *slice = PyList_GetSlice(entry->frame_list, cache_start, num_frames);
|
|
if (!slice) {
|
|
return -1;
|
|
}
|
|
|
|
Py_ssize_t cur_size = PyList_GET_SIZE(frame_info);
|
|
int result = PyList_SetSlice(frame_info, cur_size, cur_size, slice);
|
|
Py_DECREF(slice);
|
|
|
|
if (result < 0) {
|
|
return -1;
|
|
}
|
|
|
|
// Also extend frame_addrs with cached addresses (above last_profiled_frame)
|
|
if (frame_addrs) {
|
|
for (Py_ssize_t i = cache_start; i < entry->num_addrs && *num_addrs < max_addrs; i++) {
|
|
frame_addrs[(*num_addrs)++] = entry->addrs[i];
|
|
}
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
// Store frame list with addresses in cache
|
|
// Only stores complete stacks that reach base_frame_addr (validation done internally)
|
|
// Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error
|
|
int
|
|
frame_cache_store(
|
|
RemoteUnwinderObject *unwinder,
|
|
uint64_t thread_id,
|
|
PyObject *frame_list,
|
|
const uintptr_t *addrs,
|
|
Py_ssize_t num_addrs,
|
|
uintptr_t thread_state_addr,
|
|
uintptr_t base_frame_addr,
|
|
uintptr_t last_frame_visited)
|
|
{
|
|
if (!unwinder->frame_cache || thread_id == 0) {
|
|
return 0;
|
|
}
|
|
|
|
// Validate we have a complete stack before caching.
|
|
// Only cache if last_frame_visited matches base_frame_addr (the sentinel
|
|
// at the bottom of the stack). Note: we use last_frame_visited rather than
|
|
// addrs[num_addrs-1] because the base frame is visited but not added to the
|
|
// addrs array (it returns frame==NULL from is_frame_valid due to
|
|
// owner==FRAME_OWNED_BY_INTERPRETER).
|
|
if (base_frame_addr != 0 && last_frame_visited != base_frame_addr) {
|
|
// Incomplete stack - don't cache (graceful degradation)
|
|
return 0;
|
|
}
|
|
|
|
// Clamp to max frames
|
|
if (num_addrs > FRAME_CACHE_MAX_FRAMES) {
|
|
num_addrs = FRAME_CACHE_MAX_FRAMES;
|
|
}
|
|
assert(num_addrs >= 0 && num_addrs <= FRAME_CACHE_MAX_FRAMES);
|
|
|
|
FrameCacheEntry *entry = frame_cache_alloc_slot(unwinder, thread_id);
|
|
if (!entry) {
|
|
// Cache full - graceful degradation
|
|
return 0;
|
|
}
|
|
|
|
// Clear old frame_list if replacing
|
|
Py_CLEAR(entry->frame_list);
|
|
|
|
// Store full frame list (don't truncate to num_addrs - frames beyond the
|
|
// address array limit are still valid and needed for full cache hits)
|
|
Py_ssize_t num_frames = PyList_GET_SIZE(frame_list);
|
|
entry->frame_list = PyList_GetSlice(frame_list, 0, num_frames);
|
|
if (!entry->frame_list) {
|
|
return -1;
|
|
}
|
|
entry->thread_id = thread_id;
|
|
entry->thread_state_addr = thread_state_addr;
|
|
if (entry->thread_id_obj == NULL) {
|
|
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
|
|
if (entry->thread_id_obj == NULL) {
|
|
return -1;
|
|
}
|
|
}
|
|
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
|
|
entry->num_addrs = num_addrs;
|
|
assert(entry->num_addrs == num_addrs);
|
|
assert(entry->thread_id == thread_id);
|
|
|
|
return 1;
|
|
}
|