gh-138122: Implement frame caching in RemoteUnwinder to reduce memory reads (#142137)

This PR implements frame caching in the RemoteUnwinder class to significantly reduce memory reads when profiling remote processes with deep call stacks.

When cache_frames=True, the unwinder stores the frame chain from each sample and reuses unchanged portions in subsequent samples. Since most profiling samples capture similar call stacks (especially the parent frames), this optimization avoids repeatedly reading the same frame data from the target process.

The implementation adds a last_profiled_frame field to the thread state that tracks where the previous sample stopped. On the next sample, if the current frame chain reaches this marker, the cached frames from that point onward are reused instead of being re-read from remote memory.

The sampling profiler now enables frame caching by default.
This commit is contained in:
Pablo Galindo Salgado 2025-12-06 22:37:34 +00:00 committed by GitHub
parent 332da6295f
commit 572c780aa8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 1855 additions and 142 deletions

View file

@ -235,6 +235,8 @@ _remote_debugging.RemoteUnwinder.__init__
skip_non_matching_threads: bool = True
native: bool = False
gc: bool = False
cache_frames: bool = False
stats: bool = False
Initialize a new RemoteUnwinder object for debugging a remote Python process.
@ -253,6 +255,10 @@ Initialize a new RemoteUnwinder object for debugging a remote Python process.
non-Python code.
gc: If True, include artificial "<GC>" frames to denote active garbage
collection.
cache_frames: If True, enable frame caching optimization to avoid re-reading
unchanged parent frames between samples.
stats: If True, collect statistics about cache hits, memory reads, etc.
Use get_stats() to retrieve the collected statistics.
The RemoteUnwinder provides functionality to inspect and debug a running Python
process, including examining thread states, stack frames and other runtime data.
@ -270,8 +276,9 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
int only_active_thread,
int mode, int debug,
int skip_non_matching_threads,
int native, int gc)
/*[clinic end generated code: output=e9eb6b4df119f6e0 input=606d099059207df2]*/
int native, int gc,
int cache_frames, int stats)
/*[clinic end generated code: output=b34ef8cce013c975 input=df2221ef114c3d6a]*/
{
// Validate that all_threads and only_active_thread are not both True
if (all_threads && only_active_thread) {
@ -283,18 +290,24 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
#ifdef Py_GIL_DISABLED
if (only_active_thread) {
PyErr_SetString(PyExc_ValueError,
"only_active_thread is not supported when Py_GIL_DISABLED is not defined");
"only_active_thread is not supported in free-threaded builds");
return -1;
}
#endif
self->native = native;
self->gc = gc;
self->cache_frames = cache_frames;
self->collect_stats = stats;
self->stale_invalidation_counter = 0;
self->debug = debug;
self->only_active_thread = only_active_thread;
self->mode = mode;
self->skip_non_matching_threads = skip_non_matching_threads;
self->cached_state = NULL;
self->frame_cache = NULL;
// Initialize stats to zero
memset(&self->stats, 0, sizeof(self->stats));
if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) {
set_exception_cause(self, PyExc_RuntimeError, "Failed to initialize process handle");
return -1;
@ -375,6 +388,16 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
self->win_process_buffer_size = 0;
#endif
if (cache_frames && frame_cache_init(self) < 0) {
return -1;
}
// Clear stale last_profiled_frame values from previous profilers
// This prevents us from stopping frame walking early due to stale values
if (cache_frames) {
clear_last_profiled_frames(self);
}
return 0;
}
@ -429,6 +452,8 @@ static PyObject *
_remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self)
/*[clinic end generated code: output=666192b90c69d567 input=bcff01c73cccc1c0]*/
{
STATS_INC(self, total_samples);
PyObject* result = PyList_New(0);
if (!result) {
set_exception_cause(self, PyExc_MemoryError, "Failed to create stack trace result list");
@ -591,7 +616,15 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
}
exit:
_Py_RemoteDebug_ClearCache(&self->handle);
// Invalidate cache entries for threads not seen in this sample.
// Only do this every 1024 iterations to avoid performance overhead.
if (self->cache_frames && result) {
if (++self->stale_invalidation_counter >= 1024) {
self->stale_invalidation_counter = 0;
frame_cache_invalidate_stale(self, result);
}
}
_Py_RemoteDebug_ClearCache(&self->handle);
return result;
}
@ -757,10 +790,114 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject
return NULL;
}
/*[clinic input]
@permit_long_docstring_body
@critical_section
_remote_debugging.RemoteUnwinder.get_stats
Get collected statistics about profiling performance.
Returns a dictionary containing statistics about cache performance,
memory reads, and other profiling metrics. Only available if the
RemoteUnwinder was created with stats=True.
Returns:
dict: A dictionary containing:
- total_samples: Total number of get_stack_trace calls
- frame_cache_hits: Full cache hits (entire stack unchanged)
- frame_cache_misses: Cache misses requiring full walk
- frame_cache_partial_hits: Partial hits (stopped at cached frame)
- frames_read_from_cache: Total frames retrieved from cache
- frames_read_from_memory: Total frames read from remote memory
- memory_reads: Total remote memory read operations
- memory_bytes_read: Total bytes read from remote memory
- code_object_cache_hits: Code object cache hits
- code_object_cache_misses: Code object cache misses
- stale_cache_invalidations: Times stale cache entries were cleared
- frame_cache_hit_rate: Percentage of samples that hit the cache
- code_object_cache_hit_rate: Percentage of code object lookups that hit cache
Raises:
RuntimeError: If stats collection was not enabled (stats=False)
[clinic start generated code]*/
static PyObject *
_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/
{
if (!self->collect_stats) {
PyErr_SetString(PyExc_RuntimeError,
"Statistics collection was not enabled. "
"Create RemoteUnwinder with stats=True to collect statistics.");
return NULL;
}
PyObject *result = PyDict_New();
if (!result) {
return NULL;
}
#define ADD_STAT(name) do { \
PyObject *val = PyLong_FromUnsignedLongLong(self->stats.name); \
if (!val || PyDict_SetItemString(result, #name, val) < 0) { \
Py_XDECREF(val); \
Py_DECREF(result); \
return NULL; \
} \
Py_DECREF(val); \
} while(0)
ADD_STAT(total_samples);
ADD_STAT(frame_cache_hits);
ADD_STAT(frame_cache_misses);
ADD_STAT(frame_cache_partial_hits);
ADD_STAT(frames_read_from_cache);
ADD_STAT(frames_read_from_memory);
ADD_STAT(memory_reads);
ADD_STAT(memory_bytes_read);
ADD_STAT(code_object_cache_hits);
ADD_STAT(code_object_cache_misses);
ADD_STAT(stale_cache_invalidations);
#undef ADD_STAT
// Calculate and add derived statistics
// Hit rate is calculated as (hits + partial_hits) / total_cache_lookups
double frame_cache_hit_rate = 0.0;
uint64_t total_cache_lookups = self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits + self->stats.frame_cache_misses;
if (total_cache_lookups > 0) {
frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits)
/ (double)total_cache_lookups;
}
PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate);
if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) {
Py_XDECREF(hit_rate);
Py_DECREF(result);
return NULL;
}
Py_DECREF(hit_rate);
double code_object_hit_rate = 0.0;
uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses;
if (total_code_lookups > 0) {
code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups;
}
PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate);
if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) {
Py_XDECREF(code_hit_rate);
Py_DECREF(result);
return NULL;
}
Py_DECREF(code_hit_rate);
return result;
}
static PyMethodDef RemoteUnwinder_methods[] = {
_REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STACK_TRACE_METHODDEF
_REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ALL_AWAITED_BY_METHODDEF
_REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ASYNC_STACK_TRACE_METHODDEF
_REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF
{NULL, NULL}
};
@ -787,6 +924,7 @@ RemoteUnwinder_dealloc(PyObject *op)
_Py_RemoteDebug_ClearCache(&self->handle);
_Py_RemoteDebug_CleanupProcHandle(&self->handle);
}
frame_cache_cleanup(self);
PyObject_Del(self);
Py_DECREF(tp);
}