gh-138122: Implement frame caching in RemoteUnwinder to reduce memory reads (#142137)

This PR implements frame caching in the RemoteUnwinder class to significantly reduce memory reads when profiling remote processes with deep call stacks.

When cache_frames=True, the unwinder stores the frame chain from each sample and reuses unchanged portions in subsequent samples. Since most profiling samples capture similar call stacks (especially the parent frames), this optimization avoids repeatedly reading the same frame data from the target process.

The implementation adds a last_profiled_frame field to the thread state that tracks where the previous sample stopped. On the next sample, if the current frame chain reaches this marker, the cached frames from that point onward are reused instead of being re-read from remote memory.

The sampling profiler now enables frame caching by default.
This commit is contained in:
Pablo Galindo Salgado 2025-12-06 22:37:34 +00:00 committed by GitHub
parent 332da6295f
commit 572c780aa8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 1855 additions and 142 deletions

View file

@ -24,104 +24,11 @@ read_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, void* d
return _Py_RemoteDebug_ReadRemoteMemory(handle, remote_address, len, dst);
}
// Why is pwritev not guarded? Except on Android API level 23 (no longer
// supported), HAVE_PROCESS_VM_READV is sufficient.
#if defined(__linux__) && HAVE_PROCESS_VM_READV
/*
 * Write `len` bytes from `src` into the target process at `remote_address`
 * through the handle's memory file descriptor (handle->memfd). If the
 * descriptor is still -1 it is opened first with open_proc_mem_fd().
 *
 * A short write is retried from the position where the previous pwritev()
 * stopped, until the whole buffer has been transferred.
 *
 * Returns 0 on success. Returns -1 when open_proc_mem_fd() reports failure
 * (presumably with an exception already set by that helper -- confirm), or
 * with OSError set when pwritev() fails or stops making progress.
 */
static int
write_memory_fallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src)
{
    if (handle->memfd == -1) {
        if (open_proc_mem_fd(handle) < 0) {
            return -1;
        }
    }

    struct iovec local[1];
    Py_ssize_t result = 0;   // total bytes written so far
    Py_ssize_t written = 0;  // bytes written by the most recent pwritev()

    do {
        // iov_base is a non-const void*, hence the cast; the buffer is only read.
        local[0].iov_base = (char*)src + result;
        local[0].iov_len = len - result;
        off_t offset = remote_address + result;

        written = pwritev(handle->memfd, local, 1, offset);
        if (written < 0) {
            PyErr_SetFromErrno(PyExc_OSError);
            return -1;
        }
        if (written == 0 && local[0].iov_len != 0) {
            // Zero bytes written with data still pending: retrying would
            // issue the identical request again and could loop forever, so
            // report it as an I/O error instead.
            errno = EIO;
            PyErr_SetFromErrno(PyExc_OSError);
            return -1;
        }
        result += written;
    } while ((size_t)written != local[0].iov_len);

    return 0;
}
#endif // __linux__ && HAVE_PROCESS_VM_READV
// Use the shared write function from remote_debug.h
// Write `len` bytes from `src` to `remote_address` in the process described
// by `handle`.
//
// Every platform branch below returns 0 on success and -1 after setting a
// Python exception on failure.
//
// NOTE(review): this text looks like a diff rendering in which the removed
// per-platform body and the newly added delegating `return` at the bottom
// are interleaved -- confirm against the real file which of the two is the
// current implementation.
static int
write_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src)
{
#ifdef MS_WINDOWS
// Windows: call WriteProcessMemory repeatedly, advancing by the number of
// bytes it reports as written, until `len` bytes have been transferred.
SIZE_T written = 0;
SIZE_T result = 0;
do {
if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) {
PyErr_SetFromWindowsErr(0);
return -1;
}
result += written;
} while (result < len);
return 0;
#elif defined(__linux__) && HAVE_PROCESS_VM_READV
// Linux: when handle->memfd is already set, go straight to the
// descriptor-based fallback writer.
if (handle->memfd != -1) {
return write_memory_fallback(handle, remote_address, len, src);
}
struct iovec local[1];
struct iovec remote[1];
Py_ssize_t result = 0;
Py_ssize_t written = 0;
// Otherwise use process_vm_writev, retrying after a short write from the
// offset where the previous call stopped.
do {
local[0].iov_base = (void*)((char*)src + result);
local[0].iov_len = len - result;
remote[0].iov_base = (void*)((char*)remote_address + result);
remote[0].iov_len = len - result;
written = process_vm_writev(handle->pid, local, 1, remote, 1, 0);
if (written < 0) {
// ENOSYS: switch to the fallback writer, which restarts the whole write
// from the beginning of `src`.
if (errno == ENOSYS) {
return write_memory_fallback(handle, remote_address, len, src);
}
PyErr_SetFromErrno(PyExc_OSError);
return -1;
}
result += written;
} while ((size_t)written != local[0].iov_len);
return 0;
#elif defined(__APPLE__) && TARGET_OS_OSX
// macOS: a single mach_vm_write call on the task obtained from the pid.
kern_return_t kr = mach_vm_write(
pid_to_task(handle->pid),
(mach_vm_address_t)remote_address,
(vm_offset_t)src,
(mach_msg_type_number_t)len);
if (kr != KERN_SUCCESS) {
// Translate the kernel return code into a Python exception.
switch (kr) {
case KERN_PROTECTION_FAILURE:
PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory");
break;
case KERN_INVALID_ARGUMENT:
// NOTE(review): the message describes an invalid argument, yet the
// exception type is PermissionError -- confirm this is intended.
PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write");
break;
default:
PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr);
}
return -1;
}
return 0;
#else
// No writer is implemented for any other platform.
Py_UNREACHABLE();
#endif
// NOTE(review): each preprocessor branch above either returns or ends in
// Py_UNREACHABLE(), so as written this statement appears unreachable.
return _Py_RemoteDebug_WriteRemoteMemory(handle, remote_address, len, src);
}
static int