mirror of
https://github.com/python/cpython.git
synced 2025-12-08 06:10:17 +00:00
gh-138122: Implement frame caching in RemoteUnwinder to reduce memory reads (#142137)
This PR implements frame caching in the RemoteUnwinder class to significantly reduce memory reads when profiling remote processes with deep call stacks. When cache_frames=True, the unwinder stores the frame chain from each sample and reuses unchanged portions in subsequent samples. Since most profiling samples capture similar call stacks (especially the parent frames), this optimization avoids repeatedly reading the same frame data from the target process. The implementation adds a last_profiled_frame field to the thread state that tracks where the previous sample stopped. On the next sample, if the current frame chain reaches this marker, the cached frames from that point onward are reused instead of being re-read from remote memory. The sampling profiler now enables frame caching by default.
This commit is contained in:
parent
332da6295f
commit
572c780aa8
24 changed files with 1855 additions and 142 deletions
|
|
@ -154,6 +154,39 @@ typedef struct {
|
|||
uintptr_t addr_code_adaptive;
|
||||
} CachedCodeMetadata;
|
||||
|
||||
/* Frame cache constants and types */
|
||||
/* Number of per-thread slots in the frame cache. The unwinder's frame_cache
 * field is documented as a preallocated array of this many entries. */
#define FRAME_CACHE_MAX_THREADS 32
|
||||
/* Capacity of the addrs[] array inside each FrameCacheEntry. */
#define FRAME_CACHE_MAX_FRAMES 1024
|
||||
|
||||
/* One frame-cache slot, identified by thread_id. */
typedef struct {
|
||||
uint64_t thread_id; // 0 = empty slot
|
||||
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; // NOTE(review): presumably the frame addresses that correspond to frame_list -- confirm
|
||||
Py_ssize_t num_addrs; // NOTE(review): presumably the number of valid entries in addrs -- confirm
|
||||
PyObject *frame_list; // owned reference, NULL if empty
|
||||
} FrameCacheEntry;
|
||||
|
||||
/* Statistics for profiling performance analysis */
|
||||
/* Counters stored in the unwinder's `stats` field. The STATS_INC and
 * STATS_ADD macros in this header update them, and only when the unwinder's
 * collect_stats flag is non-zero. */
typedef struct {
|
||||
uint64_t total_samples; // Total number of get_stack_trace calls
|
||||
uint64_t frame_cache_hits; // Full cache hits (entire stack unchanged)
|
||||
uint64_t frame_cache_misses; // Cache misses requiring full walk
|
||||
uint64_t frame_cache_partial_hits; // Partial hits (stopped at cached frame)
|
||||
uint64_t frames_read_from_cache; // Total frames retrieved from cache
|
||||
uint64_t frames_read_from_memory; // Total frames read from remote memory
|
||||
uint64_t memory_reads; // Total remote memory read operations
|
||||
uint64_t memory_bytes_read; // Total bytes read from remote memory
|
||||
uint64_t code_object_cache_hits; // Code object cache hits
|
||||
uint64_t code_object_cache_misses; // Code object cache misses
|
||||
uint64_t stale_cache_invalidations; // Times stale entries were cleared
|
||||
} UnwinderStats;
|
||||
|
||||
/* Stats tracking macros - no-op when stats collection is disabled */
|
||||
/* Adds one to (unwinder)->stats.field when (unwinder)->collect_stats is
 * non-zero; otherwise does nothing. `field` names a member of the stats
 * struct. The `unwinder` argument is expanded twice, so pass an expression
 * without side effects. */
#define STATS_INC(unwinder, field) \
|
||||
do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
|
||||
|
||||
/* Adds `val` to (unwinder)->stats.field when (unwinder)->collect_stats is
 * non-zero; otherwise does nothing, and `val` is not evaluated at all.
 * The `unwinder` argument is expanded twice, so pass an expression without
 * side effects. */
#define STATS_ADD(unwinder, field, val) \
|
||||
do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
|
||||
|
||||
typedef struct {
|
||||
PyTypeObject *RemoteDebugging_Type;
|
||||
PyTypeObject *TaskInfo_Type;
|
||||
|
|
@ -195,7 +228,12 @@ typedef struct {
|
|||
int skip_non_matching_threads;
|
||||
int native;
|
||||
int gc;
|
||||
int cache_frames;
|
||||
int collect_stats; // whether to collect statistics
|
||||
uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale
|
||||
RemoteDebuggingState *cached_state;
|
||||
FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries
|
||||
UnwinderStats stats; // statistics for performance analysis
|
||||
#ifdef Py_GIL_DISABLED
|
||||
uint32_t tlbc_generation;
|
||||
_Py_hashtable_t *tlbc_cache;
|
||||
|
|
@ -363,9 +401,45 @@ extern int process_frame_chain(
|
|||
uintptr_t initial_frame_addr,
|
||||
StackChunkList *chunks,
|
||||
PyObject *frame_info,
|
||||
uintptr_t gc_frame
|
||||
uintptr_t gc_frame,
|
||||
uintptr_t last_profiled_frame,
|
||||
int *stopped_at_cached_frame,
|
||||
uintptr_t *frame_addrs,
|
||||
Py_ssize_t *num_addrs,
|
||||
Py_ssize_t max_addrs
|
||||
);
|
||||
|
||||
/* Frame cache functions */
|
||||
/* NOTE(review): presumably sets up unwinder->frame_cache (documented on the
 * struct field as a preallocated array of FRAME_CACHE_MAX_THREADS entries).
 * The meaning of the int result is not visible in this header -- confirm. */
extern int frame_cache_init(RemoteUnwinderObject *unwinder);
|
||||
/* NOTE(review): presumably releases whatever frame_cache_init set up,
 * including the owned frame_list references held by entries -- confirm. */
extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
|
||||
/* Looks up the cache entry for thread_id.
 * NOTE(review): presumably returns NULL when no entry exists -- confirm. */
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
|
||||
/* NOTE(review): presumably resets the per-thread last_profiled_frame markers
 * in the target process; int result convention not visible here -- confirm. */
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
|
||||
/* Calls are throttled through the unwinder's stale_invalidation_counter field.
 * NOTE(review): how `result` is used to decide which entries are stale is not
 * visible in this header -- confirm against the implementation. */
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
|
||||
/* NOTE(review): presumably finds the cached stack for thread_id, locates
 * last_profiled_frame in it, and appends the cached frames from that point
 * onward to frame_info, recording their addresses in frame_addrs/num_addrs
 * without exceeding max_addrs. The meaning of the int result is not visible
 * in this header -- confirm all of this against the implementation. */
extern int frame_cache_lookup_and_extend(
|
||||
RemoteUnwinderObject *unwinder,
|
||||
uint64_t thread_id,
|
||||
uintptr_t last_profiled_frame,
|
||||
PyObject *frame_info,
|
||||
uintptr_t *frame_addrs,
|
||||
Py_ssize_t *num_addrs,
|
||||
Py_ssize_t max_addrs);
|
||||
// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
// NOTE(review): FrameCacheEntry.frame_list is documented as an owned
// reference, so this presumably takes its own reference to frame_list
// rather than stealing the caller's -- confirm. `addrs` is const, so the
// caller's address array is only read through this pointer.
|
||||
extern int frame_cache_store(
|
||||
RemoteUnwinderObject *unwinder,
|
||||
uint64_t thread_id,
|
||||
PyObject *frame_list,
|
||||
const uintptr_t *addrs,
|
||||
Py_ssize_t num_addrs);
|
||||
|
||||
/* NOTE(review): takes the same frame_addr/chunks/frame_info/gc_frame/
 * last_profiled_frame inputs as process_frame_chain plus a thread_id, so it
 * is presumably the cache-aware entry point that walks one thread's frames
 * and consults/updates that thread's frame cache entry. The int result
 * convention is not visible in this header -- confirm. */
extern int collect_frames_with_cache(
|
||||
RemoteUnwinderObject *unwinder,
|
||||
uintptr_t frame_addr,
|
||||
StackChunkList *chunks,
|
||||
PyObject *frame_info,
|
||||
uintptr_t gc_frame,
|
||||
uintptr_t last_profiled_frame,
|
||||
uint64_t thread_id);
|
||||
|
||||
/* ============================================================================
|
||||
* THREAD FUNCTION DECLARATIONS
|
||||
* ============================================================================ */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue