gh-138122: Implement frame caching in RemoteUnwinder to reduce memory reads (#142137)

This PR implements frame caching in the RemoteUnwinder class to significantly reduce memory reads when profiling remote processes with deep call stacks.

When cache_frames=True, the unwinder stores the frame chain from each sample and reuses unchanged portions in subsequent samples. Since most profiling samples capture similar call stacks (especially the parent frames), this optimization avoids repeatedly reading the same frame data from the target process.

The implementation adds a last_profiled_frame field to the thread state that tracks where the previous sample stopped. On the next sample, if the current frame chain reaches this marker, the cached frames from that point onward are reused instead of being re-read from remote memory.

The sampling profiler now enables frame caching by default.
This commit is contained in:
Pablo Galindo Salgado 2025-12-06 22:37:34 +00:00 committed by GitHub
parent 332da6295f
commit 572c780aa8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 1855 additions and 142 deletions

View file

@@ -296,6 +296,8 @@ unwind_stack_for_thread(
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
goto error;
}
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size);
long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);
@@ -309,6 +311,8 @@ unwind_stack_for_thread(
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read GC state");
goto error;
}
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.gc.size);
// Calculate thread status using flags (always)
int status_flags = 0;
@@ -383,14 +387,36 @@ unwind_stack_for_thread(
goto error;
}
if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks");
goto error;
// In cache mode, copying stack chunks is more expensive than direct memory reads
if (!unwinder->cache_frames) {
if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks");
goto error;
}
}
if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, gc_frame) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain");
goto error;
if (unwinder->cache_frames) {
// Use cache to avoid re-reading unchanged parent frames
uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts,
unwinder->debug_offsets.thread_state.last_profiled_frame);
if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info,
gc_frame, last_profiled_frame, tid) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames");
goto error;
}
// Update last_profiled_frame for next sample
uintptr_t lpf_addr = *current_tstate + unwinder->debug_offsets.thread_state.last_profiled_frame;
if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr,
sizeof(uintptr_t), &frame_addr) < 0) {
PyErr_Clear(); // Non-fatal
}
} else {
// No caching - process entire frame chain
if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info,
gc_frame, 0, NULL, NULL, NULL, 0) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain");
goto error;
}
}
*current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);