gh-138122: Implement frame caching in RemoteUnwinder to reduce memory reads (#142137)

This PR implements frame caching in the RemoteUnwinder class to significantly reduce memory reads when profiling remote processes with deep call stacks.

When cache_frames=True, the unwinder stores the frame chain from each sample and reuses unchanged portions in subsequent samples. Since most profiling samples capture similar call stacks (especially the parent frames), this optimization avoids repeatedly reading the same frame data from the target process.

The implementation adds a last_profiled_frame field to the thread state that tracks where the previous sample stopped. On the next sample, if the current frame chain reaches this marker, the cached frames from that point onward are reused instead of being re-read from remote memory.

The sampling profiler now enables frame caching by default.
This commit is contained in:
Pablo Galindo Salgado 2025-12-06 22:37:34 +00:00 committed by GitHub
parent 332da6295f
commit 572c780aa8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 1855 additions and 142 deletions

View file

@ -135,6 +135,8 @@ struct _ts {
/* Pointer to currently executing frame. */
struct _PyInterpreterFrame *current_frame;
struct _PyInterpreterFrame *last_profiled_frame;
Py_tracefunc c_profilefunc;
Py_tracefunc c_tracefunc;
PyObject *c_profileobj;

View file

@ -102,6 +102,7 @@ typedef struct _Py_DebugOffsets {
uint64_t next;
uint64_t interp;
uint64_t current_frame;
uint64_t last_profiled_frame;
uint64_t thread_id;
uint64_t native_thread_id;
uint64_t datastack_chunk;
@ -272,6 +273,7 @@ typedef struct _Py_DebugOffsets {
.next = offsetof(PyThreadState, next), \
.interp = offsetof(PyThreadState, interp), \
.current_frame = offsetof(PyThreadState, current_frame), \
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
.thread_id = offsetof(PyThreadState, thread_id), \
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \

View file

@ -1609,6 +1609,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_parameter_type));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_return));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_stack));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cache_frames));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_datetime_module));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_statements));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cadata));
@ -2053,6 +2054,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stats));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stderr));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stdin));

View file

@ -332,6 +332,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(c_parameter_type)
STRUCT_FOR_ID(c_return)
STRUCT_FOR_ID(c_stack)
STRUCT_FOR_ID(cache_frames)
STRUCT_FOR_ID(cached_datetime_module)
STRUCT_FOR_ID(cached_statements)
STRUCT_FOR_ID(cadata)
@ -776,6 +777,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(stacklevel)
STRUCT_FOR_ID(start)
STRUCT_FOR_ID(statement)
STRUCT_FOR_ID(stats)
STRUCT_FOR_ID(status)
STRUCT_FOR_ID(stderr)
STRUCT_FOR_ID(stdin)

View file

@ -1607,6 +1607,7 @@ extern "C" {
INIT_ID(c_parameter_type), \
INIT_ID(c_return), \
INIT_ID(c_stack), \
INIT_ID(cache_frames), \
INIT_ID(cached_datetime_module), \
INIT_ID(cached_statements), \
INIT_ID(cadata), \
@ -2051,6 +2052,7 @@ extern "C" {
INIT_ID(stacklevel), \
INIT_ID(start), \
INIT_ID(statement), \
INIT_ID(stats), \
INIT_ID(status), \
INIT_ID(stderr), \
INIT_ID(stdin), \

View file

@ -1108,6 +1108,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(cache_frames);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(cached_datetime_module);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@ -2884,6 +2888,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(stats);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(status);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));

View file

@ -111,6 +111,26 @@ ### Shim frames
instruction which cleans up the shim frame and returns.
### Remote Profiling Frame Cache
The `last_profiled_frame` field in `PyThreadState` supports an optimization for
remote profilers that sample call stacks from external processes. When a remote
profiler reads the call stack, it writes the current frame address to this field.
The eval loop then keeps this pointer valid by updating it to the parent frame
whenever the frame it points to returns (in `_PyEval_FrameClearAndPop`).
This creates a "high-water mark" that always points to a frame still on the stack.
On subsequent samples, the profiler can walk from `current_frame` until it reaches
`last_profiled_frame`, knowing that frames from that point downward are unchanged
and can be retrieved from a cache. This significantly reduces the number of remote
memory reads needed when call stacks are deep and stable at their base.
The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
`last_profiled_frame` is non-NULL AND matches the frame being popped. This
prevents transient frames (called and returned between profiler samples) from
corrupting the cache pointer, while avoiding any overhead when profiling is inactive.
### The Instruction Pointer
`_PyInterpreterFrame` has two fields which are used to maintain the instruction

View file

@ -27,21 +27,24 @@
class SampleProfiler:
def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True):
def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True, collect_stats=False):
self.pid = pid
self.sample_interval_usec = sample_interval_usec
self.all_threads = all_threads
self.mode = mode # Store mode for later use
self.collect_stats = collect_stats
if _FREE_THREADED_BUILD:
self.unwinder = _remote_debugging.RemoteUnwinder(
self.pid, all_threads=self.all_threads, mode=mode, native=native, gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
stats=collect_stats
)
else:
only_active_threads = bool(self.all_threads)
self.unwinder = _remote_debugging.RemoteUnwinder(
self.pid, only_active_thread=only_active_threads, mode=mode, native=native, gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
stats=collect_stats
)
# Track sample intervals and total sample count
self.sample_intervals = deque(maxlen=100)
@ -129,6 +132,10 @@ def sample(self, collector, duration_sec=10, *, async_aware=False):
print(f"Sample rate: {sample_rate:.2f} samples/sec")
print(f"Error rate: {error_rate:.2f}%")
# Print unwinder stats if stats collection is enabled
if self.collect_stats:
self._print_unwinder_stats()
# Pass stats to flamegraph collector if it's the right type
if hasattr(collector, 'set_stats'):
collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode)
@ -176,17 +183,100 @@ def _print_realtime_stats(self):
(1.0 / min_hz) * 1_000_000 if min_hz > 0 else 0
) # Max time = Min Hz
# Build cache stats string if stats collection is enabled
cache_stats_str = ""
if self.collect_stats:
try:
stats = self.unwinder.get_stats()
hits = stats.get('frame_cache_hits', 0)
partial = stats.get('frame_cache_partial_hits', 0)
misses = stats.get('frame_cache_misses', 0)
total = hits + partial + misses
if total > 0:
hit_pct = (hits + partial) / total * 100
cache_stats_str = f" {ANSIColors.MAGENTA}Cache: {hit_pct:.1f}% ({hits}+{partial}/{misses}){ANSIColors.RESET}"
except RuntimeError:
pass
# Clear line and print stats
print(
f"\r\033[K{ANSIColors.BOLD_BLUE}Real-time sampling stats:{ANSIColors.RESET} "
f"{ANSIColors.YELLOW}Mean: {mean_hz:.1f}Hz ({mean_us_per_sample:.2f}µs){ANSIColors.RESET} "
f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz ({max_us_per_sample:.2f}µs){ANSIColors.RESET} "
f"{ANSIColors.RED}Max: {max_hz:.1f}Hz ({min_us_per_sample:.2f}µs){ANSIColors.RESET} "
f"{ANSIColors.CYAN}Samples: {self.total_samples}{ANSIColors.RESET}",
f"\r\033[K{ANSIColors.BOLD_BLUE}Stats:{ANSIColors.RESET} "
f"{ANSIColors.YELLOW}{mean_hz:.1f}Hz ({mean_us_per_sample:.1f}µs){ANSIColors.RESET} "
f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz{ANSIColors.RESET} "
f"{ANSIColors.RED}Max: {max_hz:.1f}Hz{ANSIColors.RESET} "
f"{ANSIColors.CYAN}N={self.total_samples}{ANSIColors.RESET}"
f"{cache_stats_str}",
end="",
flush=True,
)
def _print_unwinder_stats(self):
    """Print the unwinder's statistics report to stdout.

    The report covers frame-cache outcomes, where frames were read from,
    code-object cache outcomes, remote memory reads and, when non-zero,
    the number of stale cache invalidations. Nothing is printed if
    ``self.unwinder.get_stats()`` raises ``RuntimeError``.
    """
    try:
        stats = self.unwinder.get_stats()
    except RuntimeError:
        return  # Stats not enabled

    def percent_of(part, whole):
        # Share of `whole` as a percentage; 0 when `whole` is not positive.
        return (part / whole * 100) if whole > 0 else 0

    print(f"\n{ANSIColors.BOLD_BLUE}{'='*50}{ANSIColors.RESET}")
    print(f"{ANSIColors.BOLD_BLUE}Unwinder Statistics:{ANSIColors.RESET}")

    # Frame cache outcomes: every lookup is a full hit, a partial hit or a miss.
    total_samples = stats.get('total_samples', 0)
    frame_cache_hits = stats.get('frame_cache_hits', 0)
    frame_cache_partial_hits = stats.get('frame_cache_partial_hits', 0)
    frame_cache_misses = stats.get('frame_cache_misses', 0)
    lookups = frame_cache_hits + frame_cache_partial_hits + frame_cache_misses
    hits_pct = percent_of(frame_cache_hits, lookups)
    partial_pct = percent_of(frame_cache_partial_hits, lookups)
    misses_pct = percent_of(frame_cache_misses, lookups)
    print(f" {ANSIColors.CYAN}Frame Cache:{ANSIColors.RESET}")
    print(f" Total samples: {total_samples:,}")
    print(f" Full hits: {frame_cache_hits:,} ({ANSIColors.GREEN}{hits_pct:.1f}%{ANSIColors.RESET})")
    print(f" Partial hits: {frame_cache_partial_hits:,} ({ANSIColors.YELLOW}{partial_pct:.1f}%{ANSIColors.RESET})")
    print(f" Misses: {frame_cache_misses:,} ({ANSIColors.RED}{misses_pct:.1f}%{ANSIColors.RESET})")

    # Individual frames: served from the cache versus read from the target.
    frames_from_cache = stats.get('frames_read_from_cache', 0)
    frames_from_memory = stats.get('frames_read_from_memory', 0)
    frames_seen = frames_from_cache + frames_from_memory
    cache_frame_pct = percent_of(frames_from_cache, frames_seen)
    memory_frame_pct = percent_of(frames_from_memory, frames_seen)
    print(f" {ANSIColors.CYAN}Frame Reads:{ANSIColors.RESET}")
    print(f" From cache: {frames_from_cache:,} ({ANSIColors.GREEN}{cache_frame_pct:.1f}%{ANSIColors.RESET})")
    print(f" From memory: {frames_from_memory:,} ({ANSIColors.RED}{memory_frame_pct:.1f}%{ANSIColors.RESET})")

    # Code object cache.
    code_hits = stats.get('code_object_cache_hits', 0)
    code_misses = stats.get('code_object_cache_misses', 0)
    code_lookups = code_hits + code_misses
    code_hits_pct = percent_of(code_hits, code_lookups)
    code_misses_pct = percent_of(code_misses, code_lookups)
    print(f" {ANSIColors.CYAN}Code Object Cache:{ANSIColors.RESET}")
    print(f" Hits: {code_hits:,} ({ANSIColors.GREEN}{code_hits_pct:.1f}%{ANSIColors.RESET})")
    print(f" Misses: {code_misses:,} ({ANSIColors.RED}{code_misses_pct:.1f}%{ANSIColors.RESET})")

    # Remote memory traffic, with the byte count scaled to B / KB / MB.
    memory_reads = stats.get('memory_reads', 0)
    memory_bytes = stats.get('memory_bytes_read', 0)
    if memory_bytes < 1024:
        memory_str = f"{memory_bytes} B"
    elif memory_bytes < 1024 * 1024:
        memory_str = f"{memory_bytes / 1024:.1f} KB"
    else:
        memory_str = f"{memory_bytes / (1024 * 1024):.1f} MB"
    print(f" {ANSIColors.CYAN}Memory:{ANSIColors.RESET}")
    print(f" Read operations: {memory_reads:,} ({memory_str})")

    # Only mention stale invalidations when at least one occurred.
    stale_invalidations = stats.get('stale_cache_invalidations', 0)
    if stale_invalidations > 0:
        print(f" {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}")
def sample(
pid,
@ -234,7 +324,8 @@ def sample(
mode=mode,
native=native,
gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads,
collect_stats=realtime_stats,
)
profiler.realtime_stats = realtime_stats
@ -290,7 +381,8 @@ def sample_live(
mode=mode,
native=native,
gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads,
collect_stats=realtime_stats,
)
profiler.realtime_stats = realtime_stats

View file

@ -1,3 +1,4 @@
import contextlib
import unittest
import os
import textwrap
@ -2038,5 +2039,766 @@ def busy_thread():
p.stderr.close()
class TestFrameCaching(unittest.TestCase):
"""Test that frame caching produces correct results.
Uses socket-based synchronization for deterministic testing.
Tests that exercise cache reuse verify it via object identity checks
(assertIs); the remaining tests compare the reported function names,
line numbers and stack depths.
"""
maxDiff = None
# Upper bound on sampling attempts per sync point in _sample_frames().
MAX_TRIES = 10
@contextlib.contextmanager
def _target_process(self, script_body):
    """Context manager for running a target process with socket sync.

    A script is generated that first connects back to a listening socket
    owned by the test and then runs the dedented ``script_body``; the body
    can use the connected socket through the global name ``sock``.

    Yields ``(process, client_socket, make_unwinder)``, where
    ``make_unwinder(cache_frames=True)`` builds a ``RemoteUnwinder`` for the
    child process with ``all_threads=True``.

    A ``PermissionError`` raised while starting the child or inside the
    ``with`` body is converted into a skipped test. On exit both sockets
    are closed and the child is killed and reaped.
    """
    port = find_unused_port()
    script = f"""\
import socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect(('localhost', {port}))
{textwrap.dedent(script_body)}
"""
    with os_helper.temp_dir() as work_dir:
        script_dir = os.path.join(work_dir, "script_pkg")
        os.mkdir(script_dir)
        # Own the listening socket with a `with` block so it is closed on
        # every path, including a Popen failure or an accept() timeout.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
            server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            server_socket.bind(("localhost", port))
            server_socket.settimeout(SHORT_TIMEOUT)
            server_socket.listen(1)
            script_name = _make_test_script(script_dir, "script", script)
            client_socket = None
            p = None
            try:
                p = subprocess.Popen([sys.executable, script_name])
                client_socket, _ = server_socket.accept()
                # Only one connection is expected; stop listening right away.
                # Closing again when the `with` block exits is a no-op.
                server_socket.close()

                def make_unwinder(cache_frames=True):
                    return RemoteUnwinder(p.pid, all_threads=True, cache_frames=cache_frames)

                yield p, client_socket, make_unwinder
            except PermissionError:
                self.skipTest("Insufficient permissions to read the stack trace")
            finally:
                if client_socket:
                    client_socket.close()
                if p:
                    # kill() already stops the child unconditionally, so a
                    # follow-up terminate() would add nothing.
                    p.kill()
                    p.wait(timeout=SHORT_TIMEOUT)
def _wait_for_signal(self, client_socket, signal):
"""Block until signal received from target."""
response = b""
while signal not in response:
chunk = client_socket.recv(64)
if not chunk:
break
response += chunk
return response
def _get_frames(self, unwinder, required_funcs):
"""Sample and return frame_info list for thread containing required_funcs."""
traces = unwinder.get_stack_trace()
for interp in traces:
for thread in interp.threads:
funcs = [f.funcname for f in thread.frame_info]
if required_funcs.issubset(set(funcs)):
return thread.frame_info
return None
def _sample_frames(self, client_socket, unwinder, wait_signal, send_ack, required_funcs, expected_frames=1):
"""Wait for signal, sample frames, send ack. Returns frame_info list."""
self._wait_for_signal(client_socket, wait_signal)
# Give at least MAX_TRIES tries for the process to arrive to a steady state
for _ in range(self.MAX_TRIES):
frames = self._get_frames(unwinder, required_funcs)
if frames and len(frames) >= expected_frames:
break
time.sleep(0.1)
client_socket.sendall(send_ack)
return frames
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_cache_hit_same_stack(self):
"""Test that consecutive samples reuse cached parent frame objects.
The current frame (index 0) is always re-read from memory to get
updated line numbers, so it may be a different object. Parent frames
(index 1+) should be identical objects from cache.
"""
# The target stays inside level3() for all three sync points, so every
# sample is taken with the same level1 -> level2 -> level3 call chain.
script_body = """\
def level3():
sock.sendall(b"sync1")
sock.recv(16)
sock.sendall(b"sync2")
sock.recv(16)
sock.sendall(b"sync3")
sock.recv(16)
def level2():
level3()
def level1():
level2()
level1()
"""
with self._target_process(script_body) as (p, client_socket, make_unwinder):
unwinder = make_unwinder(cache_frames=True)
expected = {"level1", "level2", "level3"}
# One unwinder is reused for all three samples so its cache carries over.
frames1 = self._sample_frames(client_socket, unwinder, b"sync1", b"ack", expected)
frames2 = self._sample_frames(client_socket, unwinder, b"sync2", b"ack", expected)
frames3 = self._sample_frames(client_socket, unwinder, b"sync3", b"done", expected)
self.assertIsNotNone(frames1)
self.assertIsNotNone(frames2)
self.assertIsNotNone(frames3)
self.assertEqual(len(frames1), len(frames2))
self.assertEqual(len(frames2), len(frames3))
# Current frame (index 0) is always re-read, so check value equality
self.assertEqual(frames1[0].funcname, frames2[0].funcname)
self.assertEqual(frames2[0].funcname, frames3[0].funcname)
# Parent frames (index 1+) must be identical objects (cache reuse)
for i in range(1, len(frames1)):
f1, f2, f3 = frames1[i], frames2[i], frames3[i]
self.assertIs(f1, f2, f"Frame {i}: samples 1-2 must be same object")
self.assertIs(f2, f3, f"Frame {i}: samples 2-3 must be same object")
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_line_number_updates_in_same_frame(self):
"""Test that line numbers are correctly updated when execution moves within a function.
When the profiler samples at different points within the same function,
it must report the correct line number for each sample, not stale cached values.
"""
# Each sync point sits on its own source line of inner(), so the four
# samples are taken at four successive lines of the same function.
script_body = """\
def outer():
inner()
def inner():
sock.sendall(b"line_a"); sock.recv(16)
sock.sendall(b"line_b"); sock.recv(16)
sock.sendall(b"line_c"); sock.recv(16)
sock.sendall(b"line_d"); sock.recv(16)
outer()
"""
with self._target_process(script_body) as (p, client_socket, make_unwinder):
unwinder = make_unwinder(cache_frames=True)
frames_a = self._sample_frames(client_socket, unwinder, b"line_a", b"ack", {"inner"})
frames_b = self._sample_frames(client_socket, unwinder, b"line_b", b"ack", {"inner"})
frames_c = self._sample_frames(client_socket, unwinder, b"line_c", b"ack", {"inner"})
frames_d = self._sample_frames(client_socket, unwinder, b"line_d", b"done", {"inner"})
self.assertIsNotNone(frames_a)
self.assertIsNotNone(frames_b)
self.assertIsNotNone(frames_c)
self.assertIsNotNone(frames_d)
# Get the 'inner' frame from each sample (should be index 0)
inner_a = frames_a[0]
inner_b = frames_b[0]
inner_c = frames_c[0]
inner_d = frames_d[0]
self.assertEqual(inner_a.funcname, "inner")
self.assertEqual(inner_b.funcname, "inner")
self.assertEqual(inner_c.funcname, "inner")
self.assertEqual(inner_d.funcname, "inner")
# Line numbers must be different and increasing (execution moves forward)
self.assertLess(inner_a.lineno, inner_b.lineno,
"Line B should be after line A")
self.assertLess(inner_b.lineno, inner_c.lineno,
"Line C should be after line B")
self.assertLess(inner_c.lineno, inner_d.lineno,
"Line D should be after line C")
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_cache_invalidation_on_return(self):
"""Test cache invalidation when stack shrinks (function returns)."""
# The first sync point is inside inner(); the second is in outer() after
# inner() has returned.
script_body = """\
def inner():
sock.sendall(b"at_inner")
sock.recv(16)
def outer():
inner()
sock.sendall(b"at_outer")
sock.recv(16)
outer()
"""
with self._target_process(script_body) as (p, client_socket, make_unwinder):
unwinder = make_unwinder(cache_frames=True)
frames_deep = self._sample_frames(
client_socket, unwinder, b"at_inner", b"ack", {"inner", "outer"})
frames_shallow = self._sample_frames(
client_socket, unwinder, b"at_outer", b"done", {"outer"})
self.assertIsNotNone(frames_deep)
self.assertIsNotNone(frames_shallow)
funcs_deep = [f.funcname for f in frames_deep]
funcs_shallow = [f.funcname for f in frames_shallow]
self.assertIn("inner", funcs_deep)
self.assertIn("outer", funcs_deep)
# The second sample must not report the frame that has already returned.
self.assertNotIn("inner", funcs_shallow)
self.assertIn("outer", funcs_shallow)
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_cache_invalidation_on_call(self):
"""Test cache invalidation when stack grows (new function called)."""
# middle() syncs once before calling deeper(), which then syncs again
# one frame further down the stack.
script_body = """\
def deeper():
sock.sendall(b"at_deeper")
sock.recv(16)
def middle():
sock.sendall(b"at_middle")
sock.recv(16)
deeper()
def top():
middle()
top()
"""
with self._target_process(script_body) as (p, client_socket, make_unwinder):
unwinder = make_unwinder(cache_frames=True)
frames_before = self._sample_frames(
client_socket, unwinder, b"at_middle", b"ack", {"middle", "top"})
frames_after = self._sample_frames(
client_socket, unwinder, b"at_deeper", b"done", {"deeper", "middle", "top"})
self.assertIsNotNone(frames_before)
self.assertIsNotNone(frames_after)
funcs_before = [f.funcname for f in frames_before]
funcs_after = [f.funcname for f in frames_after]
self.assertIn("middle", funcs_before)
self.assertIn("top", funcs_before)
self.assertNotIn("deeper", funcs_before)
# The newly called frame must show up alongside the existing ones.
self.assertIn("deeper", funcs_after)
self.assertIn("middle", funcs_after)
self.assertIn("top", funcs_after)
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_partial_stack_reuse(self):
"""Test that unchanged bottom frames are reused when top changes (A→B→C to A→B→D)."""
# func_b() calls func_c() and then func_d(); each of them hits one sync
# point, so func_a and func_b stay on the stack across both samples.
script_body = """\
def func_c():
sock.sendall(b"at_c")
sock.recv(16)
def func_d():
sock.sendall(b"at_d")
sock.recv(16)
def func_b():
func_c()
func_d()
def func_a():
func_b()
func_a()
"""
with self._target_process(script_body) as (p, client_socket, make_unwinder):
unwinder = make_unwinder(cache_frames=True)
# Sample at C: stack is A→B→C
frames_c = self._sample_frames(
client_socket, unwinder, b"at_c", b"ack", {"func_a", "func_b", "func_c"})
# Sample at D: stack is A→B→D (C returned, D called)
frames_d = self._sample_frames(
client_socket, unwinder, b"at_d", b"done", {"func_a", "func_b", "func_d"})
self.assertIsNotNone(frames_c)
self.assertIsNotNone(frames_d)
# Find func_a and func_b frames in both samples
# find_frame returns the first frame whose funcname matches, or None.
def find_frame(frames, funcname):
for f in frames:
if f.funcname == funcname:
return f
return None
frame_a_in_c = find_frame(frames_c, "func_a")
frame_b_in_c = find_frame(frames_c, "func_b")
frame_a_in_d = find_frame(frames_d, "func_a")
frame_b_in_d = find_frame(frames_d, "func_b")
self.assertIsNotNone(frame_a_in_c)
self.assertIsNotNone(frame_b_in_c)
self.assertIsNotNone(frame_a_in_d)
self.assertIsNotNone(frame_b_in_d)
# The bottom frames (A, B) should be the SAME objects (cache reuse)
self.assertIs(frame_a_in_c, frame_a_in_d, "func_a frame should be reused from cache")
self.assertIs(frame_b_in_c, frame_b_in_d, "func_b frame should be reused from cache")
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_recursive_frames(self):
"""Test caching with same function appearing multiple times (recursion)."""
# recurse(5) recurses down to recurse(0), which performs both sync points
# without returning in between.
script_body = """\
def recurse(n):
if n <= 0:
sock.sendall(b"sync1")
sock.recv(16)
sock.sendall(b"sync2")
sock.recv(16)
else:
recurse(n - 1)
recurse(5)
"""
with self._target_process(script_body) as (p, client_socket, make_unwinder):
unwinder = make_unwinder(cache_frames=True)
frames1 = self._sample_frames(
client_socket, unwinder, b"sync1", b"ack", {"recurse"})
frames2 = self._sample_frames(
client_socket, unwinder, b"sync2", b"done", {"recurse"})
self.assertIsNotNone(frames1)
self.assertIsNotNone(frames2)
# Should have multiple "recurse" frames (6 total: recurse(5) down to recurse(0))
recurse_count = sum(1 for f in frames1 if f.funcname == "recurse")
self.assertEqual(recurse_count, 6, "Should have 6 recursive frames")
self.assertEqual(len(frames1), len(frames2))
# Current frame (index 0) is re-read, check value equality
self.assertEqual(frames1[0].funcname, frames2[0].funcname)
# Parent frames (index 1+) should be identical objects (cache reuse)
for i in range(1, len(frames1)):
self.assertIs(frames1[i], frames2[i],
f"Frame {i}: recursive frames must be same object")
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_cache_vs_no_cache_equivalence(self):
"""Test that cache_frames=True and cache_frames=False produce equivalent results."""
script_body = """\
def level3():
sock.sendall(b"ready"); sock.recv(16)
def level2():
level3()
def level1():
level2()
level1()
"""
with self._target_process(script_body) as (p, client_socket, make_unwinder):
self._wait_for_signal(client_socket, b"ready")
# Both unwinders sample before b"done" is sent, i.e. while the target
# is still waiting in level3() for the reply to b"ready".
# Sample with cache
unwinder_cache = make_unwinder(cache_frames=True)
frames_cached = self._get_frames(unwinder_cache, {"level1", "level2", "level3"})
# Sample without cache
unwinder_no_cache = make_unwinder(cache_frames=False)
frames_no_cache = self._get_frames(unwinder_no_cache, {"level1", "level2", "level3"})
client_socket.sendall(b"done")
self.assertIsNotNone(frames_cached)
self.assertIsNotNone(frames_no_cache)
# Same number of frames
self.assertEqual(len(frames_cached), len(frames_no_cache))
# Same function names in same order
funcs_cached = [f.funcname for f in frames_cached]
funcs_no_cache = [f.funcname for f in frames_no_cache]
self.assertEqual(funcs_cached, funcs_no_cache)
# Same line numbers
lines_cached = [f.lineno for f in frames_cached]
lines_no_cache = [f.lineno for f in frames_no_cache]
self.assertEqual(lines_cached, lines_no_cache)
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_cache_per_thread_isolation(self):
"""Test that frame cache is per-thread and cache invalidation works independently."""
# In the target, sync() holds the lock across both the send and the
# one-byte recv, so only one thread at a time is waiting at a sync point.
script_body = """\
import threading
lock = threading.Lock()
def sync(msg):
with lock:
sock.sendall(msg + b"\\n")
sock.recv(1)
# Thread 1 functions
def baz1():
sync(b"t1:baz1")
def bar1():
baz1()
def blech1():
sync(b"t1:blech1")
def foo1():
bar1() # Goes down to baz1, syncs
blech1() # Returns up, goes down to blech1, syncs
# Thread 2 functions
def baz2():
sync(b"t2:baz2")
def bar2():
baz2()
def blech2():
sync(b"t2:blech2")
def foo2():
bar2() # Goes down to baz2, syncs
blech2() # Returns up, goes down to blech2, syncs
t1 = threading.Thread(target=foo1)
t2 = threading.Thread(target=foo2)
t1.start()
t2.start()
t1.join()
t2.join()
"""
with self._target_process(script_body) as (p, client_socket, make_unwinder):
unwinder = make_unwinder(cache_frames=True)
# Bytes received but not yet consumed as a complete newline-terminated message.
buffer = b""
def recv_msg():
"""Receive a single message from socket."""
nonlocal buffer
while b"\n" not in buffer:
chunk = client_socket.recv(256)
if not chunk:
return None
buffer += chunk
msg, buffer = buffer.split(b"\n", 1)
return msg
def get_thread_frames(target_funcs):
"""Get frames for thread matching target functions."""
# At most 5 sampling attempts are made, even if the busy_retry()
# timeout has not expired yet.
retries = 0
for _ in busy_retry(SHORT_TIMEOUT):
if retries >= 5:
break
retries += 1
# On Windows, ReadProcessMemory can fail with OSError
# (WinError 299) when frame pointers are in flux
with contextlib.suppress(RuntimeError, OSError):
traces = unwinder.get_stack_trace()
for interp in traces:
for thread in interp.threads:
funcs = [f.funcname for f in thread.frame_info]
# A thread matches as soon as ANY target function is on its stack.
if any(f in funcs for f in target_funcs):
return funcs
return None
# Track results for each sync point
results = {}
# Process 4 sync points: baz1, baz2, blech1, blech2
# With the lock, threads are serialized - handle one at a time
for _ in range(4):
msg = recv_msg()
self.assertIsNotNone(msg, "Expected message from subprocess")
# Determine which thread/function and take snapshot
if msg == b"t1:baz1":
funcs = get_thread_frames(["baz1", "bar1", "foo1"])
self.assertIsNotNone(funcs, "Thread 1 not found at baz1")
results["t1:baz1"] = funcs
elif msg == b"t2:baz2":
funcs = get_thread_frames(["baz2", "bar2", "foo2"])
self.assertIsNotNone(funcs, "Thread 2 not found at baz2")
results["t2:baz2"] = funcs
elif msg == b"t1:blech1":
funcs = get_thread_frames(["blech1", "foo1"])
self.assertIsNotNone(funcs, "Thread 1 not found at blech1")
results["t1:blech1"] = funcs
elif msg == b"t2:blech2":
funcs = get_thread_frames(["blech2", "foo2"])
self.assertIsNotNone(funcs, "Thread 2 not found at blech2")
results["t2:blech2"] = funcs
# Release thread to continue
client_socket.sendall(b"k")
# Validate Phase 1: baz snapshots
t1_baz = results.get("t1:baz1")
t2_baz = results.get("t2:baz2")
self.assertIsNotNone(t1_baz, "Missing t1:baz1 snapshot")
self.assertIsNotNone(t2_baz, "Missing t2:baz2 snapshot")
# Thread 1 at baz1: should have foo1->bar1->baz1
self.assertIn("baz1", t1_baz)
self.assertIn("bar1", t1_baz)
self.assertIn("foo1", t1_baz)
self.assertNotIn("blech1", t1_baz)
# No cross-contamination
self.assertNotIn("baz2", t1_baz)
self.assertNotIn("bar2", t1_baz)
self.assertNotIn("foo2", t1_baz)
# Thread 2 at baz2: should have foo2->bar2->baz2
self.assertIn("baz2", t2_baz)
self.assertIn("bar2", t2_baz)
self.assertIn("foo2", t2_baz)
self.assertNotIn("blech2", t2_baz)
# No cross-contamination
self.assertNotIn("baz1", t2_baz)
self.assertNotIn("bar1", t2_baz)
self.assertNotIn("foo1", t2_baz)
# Validate Phase 2: blech snapshots (cache invalidation test)
t1_blech = results.get("t1:blech1")
t2_blech = results.get("t2:blech2")
self.assertIsNotNone(t1_blech, "Missing t1:blech1 snapshot")
self.assertIsNotNone(t2_blech, "Missing t2:blech2 snapshot")
# Thread 1 at blech1: bar1/baz1 should be GONE (cache invalidated)
self.assertIn("blech1", t1_blech)
self.assertIn("foo1", t1_blech)
self.assertNotIn("bar1", t1_blech, "Cache not invalidated: bar1 still present")
self.assertNotIn("baz1", t1_blech, "Cache not invalidated: baz1 still present")
# No cross-contamination
self.assertNotIn("blech2", t1_blech)
# Thread 2 at blech2: bar2/baz2 should be GONE (cache invalidated)
self.assertIn("blech2", t2_blech)
self.assertIn("foo2", t2_blech)
self.assertNotIn("bar2", t2_blech, "Cache not invalidated: bar2 still present")
self.assertNotIn("baz2", t2_blech, "Cache not invalidated: baz2 still present")
# No cross-contamination
self.assertNotIn("blech1", t2_blech)
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_new_unwinder_with_stale_last_profiled_frame(self):
"""Test that a new unwinder returns complete stack when cache lookup misses."""
# The target does not leave level4() between the two sync points, so both
# samples are taken with the same four-level call chain on the stack.
script_body = """\
def level4():
sock.sendall(b"sync1")
sock.recv(16)
sock.sendall(b"sync2")
sock.recv(16)
def level3():
level4()
def level2():
level3()
def level1():
level2()
level1()
"""
with self._target_process(script_body) as (p, client_socket, make_unwinder):
expected = {"level1", "level2", "level3", "level4"}
# First unwinder samples - this sets last_profiled_frame in target
unwinder1 = make_unwinder(cache_frames=True)
frames1 = self._sample_frames(client_socket, unwinder1, b"sync1", b"ack", expected)
# Create NEW unwinder (empty cache) and sample
# The target still has last_profiled_frame set from unwinder1
unwinder2 = make_unwinder(cache_frames=True)
frames2 = self._sample_frames(client_socket, unwinder2, b"sync2", b"done", expected)
self.assertIsNotNone(frames1)
self.assertIsNotNone(frames2)
funcs1 = [f.funcname for f in frames1]
funcs2 = [f.funcname for f in frames2]
# Both should have all levels
for level in ["level1", "level2", "level3", "level4"]:
self.assertIn(level, funcs1, f"{level} missing from first sample")
self.assertIn(level, funcs2, f"{level} missing from second sample")
# Should have same stack depth
self.assertEqual(len(frames1), len(frames2),
"New unwinder should return complete stack despite stale last_profiled_frame")
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_cache_exhaustion(self):
"""Test cache works when frame limit (1024) is exceeded.
FRAME_CACHE_MAX_FRAMES=1024. With 1100 recursive frames,
the cache can't store all of them but should still work.
"""
# Use 1100 to exceed FRAME_CACHE_MAX_FRAMES=1024
depth = 1100
# The recursion bottoms out at n <= 0 and then blocks on the socket twice,
# so the deep stack stays alive for both the cached and uncached samples.
script_body = f"""\
import sys
sys.setrecursionlimit(2000)
def recurse(n):
if n <= 0:
sock.sendall(b"ready")
sock.recv(16) # wait for ack
sock.sendall(b"ready2")
sock.recv(16) # wait for done
return
recurse(n - 1)
recurse({depth})
"""
with self._target_process(script_body) as (p, client_socket, make_unwinder):
unwinder_cache = make_unwinder(cache_frames=True)
unwinder_no_cache = make_unwinder(cache_frames=False)
# NOTE(review): expected_frames=1102 is presumably the 1101 recurse frames
# (n = 1100 .. 0) plus the module-level frame - confirm against _sample_frames.
frames_cached = self._sample_frames(
client_socket, unwinder_cache, b"ready", b"ack", {"recurse"}, expected_frames=1102
)
# Sample again with no cache for comparison
frames_no_cache = self._sample_frames(
client_socket, unwinder_no_cache, b"ready2", b"done", {"recurse"}, expected_frames=1102
)
self.assertIsNotNone(frames_cached)
self.assertIsNotNone(frames_no_cache)
# Both should have many recurse frames (> 1024 limit)
cached_count = [f.funcname for f in frames_cached].count("recurse")
no_cache_count = [f.funcname for f in frames_no_cache].count("recurse")
self.assertGreater(cached_count, 1000, "Should have >1000 recurse frames")
self.assertGreater(no_cache_count, 1000, "Should have >1000 recurse frames")
# Both modes should produce same frame count
self.assertEqual(len(frames_cached), len(frames_no_cache),
"Cache exhaustion should not affect stack completeness")
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_get_stats(self):
"""Test that get_stats() returns statistics when stats=True."""
script_body = """\
sock.sendall(b"ready")
sock.recv(16)
"""
# The unwinder is built directly (not through the make_unwinder helper)
# so that stats=True can be passed to the constructor.
with self._target_process(script_body) as (p, client_socket, _):
unwinder = RemoteUnwinder(p.pid, all_threads=True, stats=True)
self._wait_for_signal(client_socket, b"ready")
# Take a sample
unwinder.get_stack_trace()
stats = unwinder.get_stats()
# Release the target, which is blocked in sock.recv(16).
client_socket.sendall(b"done")
# Verify expected keys exist
# (only the frame-cache related keys are checked here)
expected_keys = [
'total_samples', 'frame_cache_hits', 'frame_cache_misses',
'frame_cache_partial_hits', 'frames_read_from_cache',
'frames_read_from_memory', 'frame_cache_hit_rate'
]
for key in expected_keys:
self.assertIn(key, stats)
# Exactly one get_stack_trace() call was made above.
self.assertEqual(stats['total_samples'], 1)
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_get_stats_disabled_raises(self):
"""Test that get_stats() raises RuntimeError when stats=False."""
script_body = """\
sock.sendall(b"ready")
sock.recv(16)
"""
with self._target_process(script_body) as (p, client_socket, _):
unwinder = RemoteUnwinder(p.pid, all_threads=True) # stats=False by default
self._wait_for_signal(client_socket, b"ready")
# No sample is taken: get_stats() itself is expected to refuse.
with self.assertRaises(RuntimeError):
unwinder.get_stats()
# Release the target, which is blocked in sock.recv(16).
client_socket.sendall(b"done")
if __name__ == "__main__":
unittest.main()

View file

@ -0,0 +1,5 @@
The ``_remote_debugging`` module now implements frame caching in the
``RemoteUnwinder`` class to reduce memory reads when profiling remote
processes. When ``cache_frames=True``, unchanged portions of the call stack
are reused from previous samples, significantly improving profiling
performance for deep call stacks.

View file

@ -41,7 +41,7 @@
@MODULE__PICKLE_TRUE@_pickle _pickle.c
@MODULE__QUEUE_TRUE@_queue _queuemodule.c
@MODULE__RANDOM_TRUE@_random _randommodule.c
@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/threads.c _remote_debugging/asyncio.c
@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c
@MODULE__STRUCT_TRUE@_struct _struct.c
# build supports subinterpreters

View file

@ -154,6 +154,39 @@ typedef struct {
uintptr_t addr_code_adaptive;
} CachedCodeMetadata;
/* Frame cache constants and types */
// Number of per-thread slots in RemoteUnwinderObject.frame_cache.
#define FRAME_CACHE_MAX_THREADS 32
// Capacity of FrameCacheEntry.addrs; deeper stacks keep only the first
// FRAME_CACHE_MAX_FRAMES addresses.
#define FRAME_CACHE_MAX_FRAMES 1024
/* One cached stack sample for a single thread.
 *
 * addrs[i] is the remote address of the frame stored at frame_list[i];
 * synthetic frames (e.g. GC / native markers) are recorded with address 0.
 * frame_list may hold more items than num_addrs when the stack is deeper
 * than FRAME_CACHE_MAX_FRAMES. */
typedef struct {
uint64_t thread_id; // 0 = empty slot
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; // frame addresses, innermost first
Py_ssize_t num_addrs; // number of valid entries in addrs
PyObject *frame_list; // owned reference, NULL if empty
} FrameCacheEntry;
/* Statistics for profiling performance analysis */
/* All fields are monotonically increasing counters. They are only advanced
 * through STATS_INC / STATS_ADD, i.e. only when the unwinder was created
 * with statistics collection enabled (collect_stats != 0). */
typedef struct {
uint64_t total_samples; // Total number of get_stack_trace calls
uint64_t frame_cache_hits; // Full cache hits (entire stack unchanged)
uint64_t frame_cache_misses; // Cache misses requiring full walk
uint64_t frame_cache_partial_hits; // Partial hits (stopped at cached frame)
uint64_t frames_read_from_cache; // Total frames retrieved from cache
uint64_t frames_read_from_memory; // Total frames read from remote memory
uint64_t memory_reads; // Total remote memory read operations
uint64_t memory_bytes_read; // Total bytes read from remote memory
uint64_t code_object_cache_hits; // Code object cache hits
uint64_t code_object_cache_misses; // Code object cache misses
uint64_t stale_cache_invalidations; // Times stale entries were cleared
} UnwinderStats;
/* Stats tracking macros - no-op when stats collection is disabled */
/* `unwinder` is a RemoteUnwinderObject pointer and `field` a member name of
 * UnwinderStats. The `unwinder` argument is evaluated twice when collection
 * is enabled, so it must be free of side effects. `val` is not evaluated at
 * all when collection is disabled. */
#define STATS_INC(unwinder, field) \
do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
#define STATS_ADD(unwinder, field, val) \
do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
typedef struct {
PyTypeObject *RemoteDebugging_Type;
PyTypeObject *TaskInfo_Type;
@ -195,7 +228,12 @@ typedef struct {
int skip_non_matching_threads;
int native;
int gc;
int cache_frames;
int collect_stats; // whether to collect statistics
uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale
RemoteDebuggingState *cached_state;
FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries
UnwinderStats stats; // statistics for performance analysis
#ifdef Py_GIL_DISABLED
uint32_t tlbc_generation;
_Py_hashtable_t *tlbc_cache;
@ -363,9 +401,45 @@ extern int process_frame_chain(
uintptr_t initial_frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t gc_frame
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
int *stopped_at_cached_frame,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs
);
/* Frame cache functions */
// Allocate the per-thread cache array. Returns 0, or -1 with an exception set.
extern int frame_cache_init(RemoteUnwinderObject *unwinder);
// Release every cached frame list and the cache array itself (NULL-safe).
extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
// Return the slot belonging to thread_id, or NULL if there is none.
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
// Best-effort reset of last_profiled_frame in every thread of the target.
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
// Drop cache slots whose thread does not appear in `result`.
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
// Returns: 1 = frame_info extended from cache, 0 = no usable entry, -1 = error
extern int frame_cache_lookup_and_extend(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
uintptr_t last_profiled_frame,
PyObject *frame_info,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs);
// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
extern int frame_cache_store(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs);
// Fill frame_info for one thread, reusing cached frames where possible.
// Returns 0 on success, -1 on error.
extern int collect_frames_with_cache(
RemoteUnwinderObject *unwinder,
uintptr_t frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
uint64_t thread_id);
/* ============================================================================
* THREAD FUNCTION DECLARATIONS
* ============================================================================ */

View file

@ -12,7 +12,7 @@ preserve
PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__,
"RemoteUnwinder(pid, *, all_threads=False, only_active_thread=False,\n"
" mode=0, debug=False, skip_non_matching_threads=True,\n"
" native=False, gc=False)\n"
" native=False, gc=False, cache_frames=False, stats=False)\n"
"--\n"
"\n"
"Initialize a new RemoteUnwinder object for debugging a remote Python process.\n"
@ -32,6 +32,10 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__,
" non-Python code.\n"
" gc: If True, include artificial \"<GC>\" frames to denote active garbage\n"
" collection.\n"
" cache_frames: If True, enable frame caching optimization to avoid re-reading\n"
" unchanged parent frames between samples.\n"
" stats: If True, collect statistics about cache hits, memory reads, etc.\n"
" Use get_stats() to retrieve the collected statistics.\n"
"\n"
"The RemoteUnwinder provides functionality to inspect and debug a running Python\n"
"process, including examining thread states, stack frames and other runtime data.\n"
@ -48,7 +52,8 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
int only_active_thread,
int mode, int debug,
int skip_non_matching_threads,
int native, int gc);
int native, int gc,
int cache_frames, int stats);
static int
_remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObject *kwargs)
@ -56,7 +61,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
int return_value = -1;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
#define NUM_KEYWORDS 8
#define NUM_KEYWORDS 10
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
@ -65,7 +70,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_hash = -1,
.ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), &_Py_ID(native), &_Py_ID(gc), },
.ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), &_Py_ID(native), &_Py_ID(gc), &_Py_ID(cache_frames), &_Py_ID(stats), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
@ -74,14 +79,14 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", "native", "gc", NULL};
static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", "native", "gc", "cache_frames", "stats", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "RemoteUnwinder",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[8];
PyObject *argsbuf[10];
PyObject * const *fastargs;
Py_ssize_t nargs = PyTuple_GET_SIZE(args);
Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 1;
@ -93,6 +98,8 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
int skip_non_matching_threads = 1;
int native = 0;
int gc = 0;
int cache_frames = 0;
int stats = 0;
fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser,
/*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
@ -160,12 +167,30 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
goto skip_optional_kwonly;
}
}
if (fastargs[7]) {
gc = PyObject_IsTrue(fastargs[7]);
if (gc < 0) {
goto exit;
}
if (!--noptargs) {
goto skip_optional_kwonly;
}
}
if (fastargs[8]) {
cache_frames = PyObject_IsTrue(fastargs[8]);
if (cache_frames < 0) {
goto exit;
}
if (!--noptargs) {
goto skip_optional_kwonly;
}
}
stats = PyObject_IsTrue(fastargs[9]);
if (stats < 0) {
goto exit;
}
skip_optional_kwonly:
return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads, native, gc);
return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads, native, gc, cache_frames, stats);
exit:
return return_value;
@ -347,4 +372,51 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace(PyObject *self, PyObject
return return_value;
}
/*[clinic end generated code: output=99fed5c94cf36881 input=a9049054013a1b77]*/
PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__,
"get_stats($self, /)\n"
"--\n"
"\n"
"Get collected statistics about profiling performance.\n"
"\n"
"Returns a dictionary containing statistics about cache performance,\n"
"memory reads, and other profiling metrics. Only available if the\n"
"RemoteUnwinder was created with stats=True.\n"
"\n"
"Returns:\n"
" dict: A dictionary containing:\n"
" - total_samples: Total number of get_stack_trace calls\n"
" - frame_cache_hits: Full cache hits (entire stack unchanged)\n"
" - frame_cache_misses: Cache misses requiring full walk\n"
" - frame_cache_partial_hits: Partial hits (stopped at cached frame)\n"
" - frames_read_from_cache: Total frames retrieved from cache\n"
" - frames_read_from_memory: Total frames read from remote memory\n"
" - memory_reads: Total remote memory read operations\n"
" - memory_bytes_read: Total bytes read from remote memory\n"
" - code_object_cache_hits: Code object cache hits\n"
" - code_object_cache_misses: Code object cache misses\n"
" - stale_cache_invalidations: Times stale cache entries were cleared\n"
" - frame_cache_hit_rate: Percentage of samples that hit the cache\n"
" - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n"
"\n"
"Raises:\n"
" RuntimeError: If stats collection was not enabled (stats=False)");
#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF \
{"get_stats", (PyCFunction)_remote_debugging_RemoteUnwinder_get_stats, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_stats__doc__},
static PyObject *
_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self);
static PyObject *
_remote_debugging_RemoteUnwinder_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored))
{
PyObject *return_value = NULL;
Py_BEGIN_CRITICAL_SECTION(self);
return_value = _remote_debugging_RemoteUnwinder_get_stats_impl((RemoteUnwinderObject *)self);
Py_END_CRITICAL_SECTION();
return return_value;
}
/*[clinic end generated code: output=f1fd6c1d4c4c7254 input=a9049054013a1b77]*/

View file

@ -257,6 +257,11 @@ parse_code_object(RemoteUnwinderObject *unwinder,
if (unwinder && unwinder->code_object_cache != NULL) {
meta = _Py_hashtable_get(unwinder->code_object_cache, key);
if (meta) {
STATS_INC(unwinder, code_object_cache_hits);
} else {
STATS_INC(unwinder, code_object_cache_misses);
}
}
if (meta == NULL) {

View file

@ -0,0 +1,236 @@
/******************************************************************************
* Remote Debugging Module - Frame Cache
*
* This file contains functions for caching frame information to optimize
* repeated stack unwinding for profiling.
******************************************************************************/
#include "_remote_debugging.h"
/* ============================================================================
* FRAME CACHE - stores (address, frame_info) pairs per thread
* Uses preallocated fixed-size arrays for efficiency and bounded memory.
* ============================================================================ */
/* Allocate the zero-filled array of FRAME_CACHE_MAX_THREADS cache slots and
 * attach it to `unwinder`.
 *
 * Returns 0 on success. On allocation failure the frame_cache pointer is
 * left NULL, a MemoryError is set and -1 is returned. */
int
frame_cache_init(RemoteUnwinderObject *unwinder)
{
    FrameCacheEntry *slots = PyMem_Calloc(FRAME_CACHE_MAX_THREADS,
                                          sizeof(FrameCacheEntry));
    unwinder->frame_cache = slots;
    if (slots != NULL) {
        return 0;
    }

    PyErr_NoMemory();
    return -1;
}
/* Release everything owned by the frame cache of `unwinder`.
 *
 * Drops the reference held by each slot, frees the slot array and resets
 * the pointer so that a second call is a no-op. */
void
frame_cache_cleanup(RemoteUnwinderObject *unwinder)
{
    FrameCacheEntry *slots = unwinder->frame_cache;
    if (slots == NULL) {
        return;
    }

    FrameCacheEntry *const end = slots + FRAME_CACHE_MAX_THREADS;
    for (FrameCacheEntry *slot = slots; slot != end; slot++) {
        Py_CLEAR(slot->frame_list);
    }

    PyMem_Free(slots);
    unwinder->frame_cache = NULL;
}
// Look up the cache slot owned by thread_id.
// Returns NULL when the cache is not allocated, when thread_id is 0
// (the value that marks an empty slot) or when no slot matches.
FrameCacheEntry *
frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
{
    FrameCacheEntry *slots = unwinder->frame_cache;
    if (slots == NULL || thread_id == 0) {
        return NULL;
    }

    FrameCacheEntry *const end = slots + FRAME_CACHE_MAX_THREADS;
    for (FrameCacheEntry *slot = slots; slot != end; slot++) {
        if (slot->thread_id == thread_id) {
            return slot;
        }
    }
    return NULL;
}
// Pick the cache slot that thread_id should write to.
// A slot already owned by the thread always wins; otherwise the first empty
// slot (thread_id == 0) is handed out. The slot is not modified here.
// Returns NULL if the cache is not allocated, thread_id is 0, or every slot
// belongs to another thread (graceful degradation).
static FrameCacheEntry *
frame_cache_alloc_slot(RemoteUnwinderObject *unwinder, uint64_t thread_id)
{
    FrameCacheEntry *slots = unwinder->frame_cache;
    if (slots == NULL || thread_id == 0) {
        return NULL;
    }

    // Single pass: remember the first free slot while searching for a match.
    FrameCacheEntry *first_free = NULL;
    for (int idx = 0; idx < FRAME_CACHE_MAX_THREADS; idx++) {
        FrameCacheEntry *slot = &slots[idx];
        if (slot->thread_id == thread_id) {
            return slot;
        }
        if (first_free == NULL && slot->thread_id == 0) {
            first_free = slot;
        }
    }

    // NULL here means the cache is full.
    return first_free;
}
// Remove cache entries for threads not seen in the result
// result structure: list of InterpreterInfo, where InterpreterInfo[1] is threads list,
// and ThreadInfo[0] is the thread_id
//
// Liveness is tracked per cache slot rather than by collecting thread ids
// into a fixed-size array, so a result that lists more than
// FRAME_CACHE_MAX_THREADS threads cannot cause live entries to be evicted.
// Thread ids that cannot be converted to an unsigned integer are ignored.
// Never raises: conversion errors are cleared.
void
frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
{
    if (!unwinder->frame_cache || !result || !PyList_Check(result)) {
        return;
    }

    // slot_alive[i] != 0  <=>  the thread owning cache slot i is in `result`
    unsigned char slot_alive[FRAME_CACHE_MAX_THREADS] = {0};

    Py_ssize_t num_interps = PyList_GET_SIZE(result);
    for (Py_ssize_t i = 0; i < num_interps; i++) {
        PyObject *interp_info = PyList_GET_ITEM(result, i);
        PyObject *threads = PyStructSequence_GetItem(interp_info, 1);
        if (!threads || !PyList_Check(threads)) {
            continue;
        }

        Py_ssize_t num_threads = PyList_GET_SIZE(threads);
        for (Py_ssize_t j = 0; j < num_threads; j++) {
            PyObject *thread_info = PyList_GET_ITEM(threads, j);
            PyObject *tid_obj = PyStructSequence_GetItem(thread_info, 0);
            if (!tid_obj) {
                continue;
            }

            uint64_t tid = PyLong_AsUnsignedLongLong(tid_obj);
            if (tid == (uint64_t)-1 && PyErr_Occurred()) {
                // Not a usable thread id: skip it without leaking the error
                PyErr_Clear();
                continue;
            }

            FrameCacheEntry *entry = frame_cache_find(unwinder, tid);
            if (entry != NULL) {
                slot_alive[entry - unwinder->frame_cache] = 1;
            }
        }
    }

    // Invalidate occupied slots whose thread was not seen
    for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
        FrameCacheEntry *entry = &unwinder->frame_cache[i];
        if (entry->thread_id == 0 || slot_alive[i]) {
            continue;
        }
        Py_CLEAR(entry->frame_list);
        entry->thread_id = 0;
        entry->num_addrs = 0;
        STATS_INC(unwinder, stale_cache_invalidations);
    }
}
// Continue a partially walked stack from the cache.
//
// Searches the cached addresses of thread_id for last_profiled_frame and, if
// found, appends the cached frames from that position to the end of
// frame_info. When frame_addrs is not NULL the matching cached addresses are
// appended to it as well, never exceeding max_addrs entries in total.
//
// Returns: 1 = frame_info extended, 0 = nothing usable in the cache
//          (no cache, no marker, no entry or address not found), -1 = error
int
frame_cache_lookup_and_extend(
    RemoteUnwinderObject *unwinder,
    uint64_t thread_id,
    uintptr_t last_profiled_frame,
    PyObject *frame_info,
    uintptr_t *frame_addrs,
    Py_ssize_t *num_addrs,
    Py_ssize_t max_addrs)
{
    if (last_profiled_frame == 0 || !unwinder->frame_cache) {
        return 0;
    }

    FrameCacheEntry *cached = frame_cache_find(unwinder, thread_id);
    if (cached == NULL || cached->frame_list == NULL) {
        return 0;
    }

    // Position of the first cached address equal to the marker
    Py_ssize_t match = 0;
    while (match < cached->num_addrs
           && cached->addrs[match] != last_profiled_frame) {
        match++;
    }
    if (match >= cached->num_addrs) {
        return 0;  // marker is not part of the cached stack
    }

    // Take everything from the match to the end of the cached list, which
    // may extend past num_addrs for very deep stacks.
    PyObject *tail = PyList_GetSlice(cached->frame_list, match,
                                     PyList_GET_SIZE(cached->frame_list));
    if (tail == NULL) {
        return -1;
    }

    Py_ssize_t insert_at = PyList_GET_SIZE(frame_info);
    int rc = PyList_SetSlice(frame_info, insert_at, insert_at, tail);
    Py_DECREF(tail);
    if (rc < 0) {
        return -1;
    }

    if (frame_addrs != NULL) {
        Py_ssize_t src = match;
        while (src < cached->num_addrs && *num_addrs < max_addrs) {
            frame_addrs[*num_addrs] = cached->addrs[src];
            (*num_addrs)++;
            src++;
        }
    }
    return 1;
}
// Remember the latest stack of thread_id in the cache.
//
// A shallow copy of frame_list is stored together with up to
// FRAME_CACHE_MAX_FRAMES entries of addrs. The list copy is not truncated:
// frames beyond the address limit are still valid and needed when the
// cached stack is reused.
//
// Returns: 1 = stored successfully, 0 = not stored (no cache, thread_id 0 or
//          cache full - graceful degradation), -1 = error
int
frame_cache_store(
    RemoteUnwinderObject *unwinder,
    uint64_t thread_id,
    PyObject *frame_list,
    const uintptr_t *addrs,
    Py_ssize_t num_addrs)
{
    if (thread_id == 0 || !unwinder->frame_cache) {
        return 0;
    }

    Py_ssize_t kept = num_addrs;
    if (kept > FRAME_CACHE_MAX_FRAMES) {
        kept = FRAME_CACHE_MAX_FRAMES;
    }

    FrameCacheEntry *slot = frame_cache_alloc_slot(unwinder, thread_id);
    if (slot == NULL) {
        return 0;  // every slot is taken by another thread
    }

    // The previous list is released before the copy is attempted: if the
    // copy fails the slot is left without a frame list, which the lookup
    // code treats as a miss.
    Py_CLEAR(slot->frame_list);
    PyObject *copy = PyList_GetSlice(frame_list, 0, PyList_GET_SIZE(frame_list));
    slot->frame_list = copy;
    if (copy == NULL) {
        return -1;
    }

    slot->thread_id = thread_id;
    slot->num_addrs = kept;
    memcpy(slot->addrs, addrs, kept * sizeof(uintptr_t));
    return 1;
}

View file

@ -189,6 +189,8 @@ parse_frame_object(
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
return -1;
}
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
*previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous);
uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable);
@ -258,14 +260,39 @@ process_frame_chain(
uintptr_t initial_frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t gc_frame)
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
int *stopped_at_cached_frame,
uintptr_t *frame_addrs, // optional: C array to receive frame addresses
Py_ssize_t *num_addrs, // in/out: current count / updated count
Py_ssize_t max_addrs) // max capacity of frame_addrs array
{
uintptr_t frame_addr = initial_frame_addr;
uintptr_t prev_frame_addr = 0;
const size_t MAX_FRAMES = 1024;
const size_t MAX_FRAMES = 1024 + 512;
size_t frame_count = 0;
// Initialize output flag
if (stopped_at_cached_frame) {
*stopped_at_cached_frame = 0;
}
// Quick check: if current_frame == last_profiled_frame, entire stack is unchanged
if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) {
if (stopped_at_cached_frame) {
*stopped_at_cached_frame = 1;
}
return 0;
}
while ((void*)frame_addr != NULL) {
// Check if we've reached the cached frame - if so, stop here
if (last_profiled_frame != 0 && frame_addr == last_profiled_frame) {
if (stopped_at_cached_frame) {
*stopped_at_cached_frame = 1;
}
break;
}
PyObject *frame = NULL;
uintptr_t next_frame_addr = 0;
uintptr_t stackpointer = 0;
@ -286,7 +313,6 @@ process_frame_chain(
}
}
if (frame == NULL && PyList_GET_SIZE(frame_info) == 0) {
// If the first frame is missing, the chain is broken:
const char *e = "Failed to parse initial frame in chain";
PyErr_SetString(PyExc_RuntimeError, e);
return -1;
@ -310,36 +336,40 @@ process_frame_chain(
extra_frame = &_Py_STR(native);
}
if (extra_frame) {
// Use "~" as file and 0 as line, since that's what pstats uses:
PyObject *extra_frame_info = make_frame_info(
unwinder, _Py_LATIN1_CHR('~'), _PyLong_GetZero(), extra_frame);
if (extra_frame_info == NULL) {
return -1;
}
int error = PyList_Append(frame_info, extra_frame_info);
if (PyList_Append(frame_info, extra_frame_info) < 0) {
Py_DECREF(extra_frame_info);
if (error) {
const char *e = "Failed to append extra frame to frame info list";
set_exception_cause(unwinder, PyExc_RuntimeError, e);
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append extra frame");
return -1;
}
// Extra frames use 0 as address (they're synthetic)
if (frame_addrs && *num_addrs < max_addrs) {
frame_addrs[(*num_addrs)++] = 0;
}
Py_DECREF(extra_frame_info);
}
if (frame) {
if (prev_frame_addr && frame_addr != prev_frame_addr) {
const char *f = "Broken frame chain: expected frame at 0x%lx, got 0x%lx";
PyErr_Format(PyExc_RuntimeError, f, prev_frame_addr, frame_addr);
Py_DECREF(frame);
const char *e = "Frame chain consistency check failed";
set_exception_cause(unwinder, PyExc_RuntimeError, e);
set_exception_cause(unwinder, PyExc_RuntimeError, "Frame chain consistency check failed");
return -1;
}
if (PyList_Append(frame_info, frame) == -1) {
if (PyList_Append(frame_info, frame) < 0) {
Py_DECREF(frame);
const char *e = "Failed to append frame to frame info list";
set_exception_cause(unwinder, PyExc_RuntimeError, e);
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append frame");
return -1;
}
// Track the address for this frame
if (frame_addrs && *num_addrs < max_addrs) {
frame_addrs[(*num_addrs)++] = frame_addr;
}
Py_DECREF(frame);
}
@ -349,3 +379,208 @@ process_frame_chain(
return 0;
}
// Reset last_profiled_frame to 0 in every thread state of the target.
// Called when profiling starts so that a marker left behind by an earlier
// profiler cannot make the frame walk stop early.
//
// Entirely best-effort: remote read/write failures are cleared and either
// skipped (write), end the current list walk (next pointers) or end the
// whole operation (first thread of an interpreter). Always returns 0.
int
clear_last_profiled_frames(RemoteUnwinderObject *unwinder)
{
    uintptr_t null_marker = 0;
    uintptr_t interp_addr = unwinder->interpreter_addr;

    while (interp_addr != 0) {
        // Head of this interpreter's thread list
        uintptr_t thread_addr;
        uintptr_t head_field =
            interp_addr + unwinder->debug_offsets.interpreter_state.threads_head;
        if (_Py_RemoteDebug_PagedReadRemoteMemory(
                &unwinder->handle, head_field,
                sizeof(void*), &thread_addr) < 0)
        {
            // Non-fatal: give up on clearing altogether
            PyErr_Clear();
            return 0;
        }

        while (thread_addr != 0) {
            uintptr_t marker_field =
                thread_addr + unwinder->debug_offsets.thread_state.last_profiled_frame;
            if (_Py_RemoteDebug_WriteRemoteMemory(
                    &unwinder->handle, marker_field,
                    sizeof(uintptr_t), &null_marker) < 0)
            {
                // Non-fatal: this thread keeps its marker, try the next one
                PyErr_Clear();
            }

            uintptr_t next_thread_field =
                thread_addr + unwinder->debug_offsets.thread_state.next;
            if (_Py_RemoteDebug_PagedReadRemoteMemory(
                    &unwinder->handle, next_thread_field,
                    sizeof(void*), &thread_addr) < 0)
            {
                PyErr_Clear();
                break;
            }
        }

        uintptr_t next_interp_field =
            interp_addr + unwinder->debug_offsets.interpreter_state.next;
        if (_Py_RemoteDebug_PagedReadRemoteMemory(
                &unwinder->handle, next_interp_field,
                sizeof(void*), &interp_addr) < 0)
        {
            PyErr_Clear();
            break;
        }
    }

    return 0;
}
// Fast path for an unchanged stack.
//
// Applies when the thread's current frame is the very frame marked as
// last profiled and the cache entry for thread_id starts with that same
// address. The current frame is still parsed from remote memory, because
// the line being executed inside it can change between samples; all of its
// parents are taken from the cache.
//
// Returns: 1 = full hit, frame_info holds the current frame (if one was
//              produced) followed by the cached parents
//          0 = not applicable, frame_info untouched
//         -1 = error
static int
try_full_cache_hit(
    RemoteUnwinderObject *unwinder,
    uintptr_t frame_addr,
    uintptr_t last_profiled_frame,
    uint64_t thread_id,
    PyObject *frame_info)
{
    if (!unwinder->frame_cache
        || last_profiled_frame == 0
        || frame_addr != last_profiled_frame)
    {
        return 0;
    }

    FrameCacheEntry *cached = frame_cache_find(unwinder, thread_id);
    if (cached == NULL || cached->frame_list == NULL) {
        return 0;
    }
    // Sanity check: the cached stack must begin at this very frame
    if (cached->num_addrs == 0 || cached->addrs[0] != frame_addr) {
        return 0;
    }

    PyObject *top_frame = NULL;
    uintptr_t code_addr = 0;
    uintptr_t parent_addr = 0;
    if (parse_frame_object(unwinder, &top_frame, frame_addr,
                           &code_addr, &parent_addr) < 0) {
        return -1;
    }

    int status = -1;
    PyObject *parents = NULL;

    // Slice the cached parents before frame_info is modified
    Py_ssize_t n_cached = PyList_GET_SIZE(cached->frame_list);
    if (n_cached > 1) {
        parents = PyList_GetSlice(cached->frame_list, 1, n_cached);
        if (parents == NULL) {
            goto done;
        }
    }

    if (top_frame != NULL) {
        if (PyList_Append(frame_info, top_frame) < 0) {
            goto done;
        }
        STATS_ADD(unwinder, frames_read_from_memory, 1);
    }

    if (parents != NULL) {
        Py_ssize_t tail_pos = PyList_GET_SIZE(frame_info);
        if (PyList_SetSlice(frame_info, tail_pos, tail_pos, parents) < 0) {
            goto done;
        }
        STATS_ADD(unwinder, frames_read_from_cache, n_cached - 1);
    }

    STATS_INC(unwinder, frame_cache_hits);
    status = 1;

done:
    Py_XDECREF(top_frame);
    Py_XDECREF(parents);
    return status;
}
// High-level helper: collect frames with cache optimization
// Returns complete frame_info list, handling all cache logic internally
//
// Steps:
//   1. try_full_cache_hit(): current frame is the marked frame and is cached.
//   2. Otherwise walk the chain from frame_addr; the walk stops early if it
//      reaches last_profiled_frame.
//   3. If it stopped there, append the cached continuation; if the cache has
//      nothing for that frame, keep walking from last_profiled_frame with the
//      marker disabled.
//   4. Store the resulting stack (frames + addresses) for the next sample.
//
// Returns 0 on success and -1 on error (exception set by the failing callee).
int
collect_frames_with_cache(
RemoteUnwinderObject *unwinder,
uintptr_t frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
uint64_t thread_id)
{
// Fast path: check for full cache hit first (no allocations needed)
int full_hit = try_full_cache_hit(unwinder, frame_addr, last_profiled_frame,
thread_id, frame_info);
if (full_hit != 0) {
return full_hit < 0 ? -1 : 0; // Either error or success
}
// Addresses collected during this sample; parallel to the frames appended
// to frame_info, capped at FRAME_CACHE_MAX_FRAMES entries.
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
Py_ssize_t num_addrs = 0;
Py_ssize_t frames_before = PyList_GET_SIZE(frame_info);
int stopped_at_cached = 0;
if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, gc_frame,
last_profiled_frame, &stopped_at_cached,
addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
return -1;
}
// Track frames read from memory (frames added by process_frame_chain)
STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before);
// If stopped at cached frame, extend with cached continuation (both frames and addresses)
if (stopped_at_cached) {
Py_ssize_t frames_before_cache = PyList_GET_SIZE(frame_info);
int cache_result = frame_cache_lookup_and_extend(unwinder, thread_id, last_profiled_frame,
frame_info, addrs, &num_addrs,
FRAME_CACHE_MAX_FRAMES);
if (cache_result < 0) {
return -1;
}
if (cache_result == 0) {
// Cache miss - continue walking from last_profiled_frame to get the rest
// (marker passed as 0 so this second walk cannot stop early again)
STATS_INC(unwinder, frame_cache_misses);
Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info);
if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, gc_frame,
0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
return -1;
}
STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before_walk);
} else {
// Partial cache hit
STATS_INC(unwinder, frame_cache_partial_hits);
STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - frames_before_cache);
}
} else if (last_profiled_frame == 0) {
// No cache involvement (no last_profiled_frame or cache disabled)
// NOTE(review): a full walk with a non-zero last_profiled_frame that was
// never reached is not counted as hit, partial hit or miss - confirm
// this is intended.
STATS_INC(unwinder, frame_cache_misses);
}
// Store in cache (frame_cache_store handles truncation if num_addrs > FRAME_CACHE_MAX_FRAMES)
// A return of 0 (not stored) is not an error; only -1 is propagated.
if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) {
return -1;
}
return 0;
}

View file

@ -235,6 +235,8 @@ _remote_debugging.RemoteUnwinder.__init__
skip_non_matching_threads: bool = True
native: bool = False
gc: bool = False
cache_frames: bool = False
stats: bool = False
Initialize a new RemoteUnwinder object for debugging a remote Python process.
@ -253,6 +255,10 @@ Initialize a new RemoteUnwinder object for debugging a remote Python process.
non-Python code.
gc: If True, include artificial "<GC>" frames to denote active garbage
collection.
cache_frames: If True, enable frame caching optimization to avoid re-reading
unchanged parent frames between samples.
stats: If True, collect statistics about cache hits, memory reads, etc.
Use get_stats() to retrieve the collected statistics.
The RemoteUnwinder provides functionality to inspect and debug a running Python
process, including examining thread states, stack frames and other runtime data.
@ -270,8 +276,9 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
int only_active_thread,
int mode, int debug,
int skip_non_matching_threads,
int native, int gc)
/*[clinic end generated code: output=e9eb6b4df119f6e0 input=606d099059207df2]*/
int native, int gc,
int cache_frames, int stats)
/*[clinic end generated code: output=b34ef8cce013c975 input=df2221ef114c3d6a]*/
{
// Validate that all_threads and only_active_thread are not both True
if (all_threads && only_active_thread) {
@ -283,18 +290,24 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
#ifdef Py_GIL_DISABLED
if (only_active_thread) {
PyErr_SetString(PyExc_ValueError,
"only_active_thread is not supported when Py_GIL_DISABLED is not defined");
"only_active_thread is not supported in free-threaded builds");
return -1;
}
#endif
self->native = native;
self->gc = gc;
self->cache_frames = cache_frames;
self->collect_stats = stats;
self->stale_invalidation_counter = 0;
self->debug = debug;
self->only_active_thread = only_active_thread;
self->mode = mode;
self->skip_non_matching_threads = skip_non_matching_threads;
self->cached_state = NULL;
self->frame_cache = NULL;
// Initialize stats to zero
memset(&self->stats, 0, sizeof(self->stats));
if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) {
set_exception_cause(self, PyExc_RuntimeError, "Failed to initialize process handle");
return -1;
@ -375,6 +388,16 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
self->win_process_buffer_size = 0;
#endif
if (cache_frames && frame_cache_init(self) < 0) {
return -1;
}
// Clear stale last_profiled_frame values from previous profilers
// This prevents us from stopping frame walking early due to stale values
if (cache_frames) {
clear_last_profiled_frames(self);
}
return 0;
}
@ -429,6 +452,8 @@ static PyObject *
_remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self)
/*[clinic end generated code: output=666192b90c69d567 input=bcff01c73cccc1c0]*/
{
STATS_INC(self, total_samples);
PyObject* result = PyList_New(0);
if (!result) {
set_exception_cause(self, PyExc_MemoryError, "Failed to create stack trace result list");
@ -591,6 +616,14 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
}
exit:
// Invalidate cache entries for threads not seen in this sample.
// Only do this every 1024 iterations to avoid performance overhead.
if (self->cache_frames && result) {
if (++self->stale_invalidation_counter >= 1024) {
self->stale_invalidation_counter = 0;
frame_cache_invalidate_stale(self, result);
}
}
_Py_RemoteDebug_ClearCache(&self->handle);
return result;
}
@ -757,10 +790,114 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject
return NULL;
}
/*[clinic input]
@permit_long_docstring_body
@critical_section
_remote_debugging.RemoteUnwinder.get_stats
Get collected statistics about profiling performance.
Returns a dictionary containing statistics about cache performance,
memory reads, and other profiling metrics. Only available if the
RemoteUnwinder was created with stats=True.
Returns:
dict: A dictionary containing:
- total_samples: Total number of get_stack_trace calls
- frame_cache_hits: Full cache hits (entire stack unchanged)
- frame_cache_misses: Cache misses requiring full walk
- frame_cache_partial_hits: Partial hits (stopped at cached frame)
- frames_read_from_cache: Total frames retrieved from cache
- frames_read_from_memory: Total frames read from remote memory
- memory_reads: Total remote memory read operations
- memory_bytes_read: Total bytes read from remote memory
- code_object_cache_hits: Code object cache hits
- code_object_cache_misses: Code object cache misses
- stale_cache_invalidations: Times stale cache entries were cleared
- frame_cache_hit_rate: Percentage of samples that hit the cache
- code_object_cache_hit_rate: Percentage of code object lookups that hit cache
Raises:
RuntimeError: If stats collection was not enabled (stats=False)
[clinic start generated code]*/
static PyObject *
_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/
{
    // Refuse to report anything unless collection was switched on.
    if (!self->collect_stats) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Statistics collection was not enabled. "
                        "Create RemoteUnwinder with stats=True to collect statistics.");
        return NULL;
    }

    // Derived percentages are pure arithmetic on the raw counters, so work
    // them out first. A rate stays at 0.0 when its denominator is zero.
    const uint64_t frame_hits_any =
        self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits;
    const uint64_t frame_lookups =
        frame_hits_any + self->stats.frame_cache_misses;
    double frame_rate = 0.0;
    if (frame_lookups > 0) {
        frame_rate = 100.0 * (double)frame_hits_any / (double)frame_lookups;
    }

    const uint64_t code_lookups =
        self->stats.code_object_cache_hits + self->stats.code_object_cache_misses;
    double code_rate = 0.0;
    if (code_lookups > 0) {
        code_rate = 100.0 * (double)self->stats.code_object_cache_hits
                    / (double)code_lookups;
    }

    PyObject *stats_dict = PyDict_New();
    if (stats_dict == NULL) {
        return NULL;
    }

    // Value currently being inserted; released on the shared failure path.
    PyObject *item = NULL;

    // Store one raw counter under a key equal to its field name.
#define PUT_COUNTER(field) do { \
        item = PyLong_FromUnsignedLongLong(self->stats.field); \
        if (item == NULL) { \
            goto fail; \
        } \
        if (PyDict_SetItemString(stats_dict, #field, item) < 0) { \
            goto fail; \
        } \
        Py_DECREF(item); \
        item = NULL; \
    } while (0)

    PUT_COUNTER(total_samples);
    PUT_COUNTER(frame_cache_hits);
    PUT_COUNTER(frame_cache_misses);
    PUT_COUNTER(frame_cache_partial_hits);
    PUT_COUNTER(frames_read_from_cache);
    PUT_COUNTER(frames_read_from_memory);
    PUT_COUNTER(memory_reads);
    PUT_COUNTER(memory_bytes_read);
    PUT_COUNTER(code_object_cache_hits);
    PUT_COUNTER(code_object_cache_misses);
    PUT_COUNTER(stale_cache_invalidations);

#undef PUT_COUNTER

    // (full hits + partial hits) / (hits + partial hits + misses), as a percentage.
    item = PyFloat_FromDouble(frame_rate);
    if (item == NULL) {
        goto fail;
    }
    if (PyDict_SetItemString(stats_dict, "frame_cache_hit_rate", item) < 0) {
        goto fail;
    }
    Py_DECREF(item);
    item = NULL;

    // code object hits / (hits + misses), as a percentage.
    item = PyFloat_FromDouble(code_rate);
    if (item == NULL) {
        goto fail;
    }
    if (PyDict_SetItemString(stats_dict, "code_object_cache_hit_rate", item) < 0) {
        goto fail;
    }
    Py_DECREF(item);

    return stats_dict;

fail:
    Py_XDECREF(item);
    Py_DECREF(stats_dict);
    return NULL;
}
// Method table for the RemoteUnwinder type. Each entry is supplied by a
// *_METHODDEF macro (presumably emitted by Argument Clinic for the matching
// *_impl function - confirm in the generated clinic header). The trailing
// {NULL, NULL} entry is the end-of-table sentinel.
static PyMethodDef RemoteUnwinder_methods[] = {
_REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STACK_TRACE_METHODDEF
_REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ALL_AWAITED_BY_METHODDEF
_REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ASYNC_STACK_TRACE_METHODDEF
_REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF
{NULL, NULL}
};
@ -787,6 +924,7 @@ RemoteUnwinder_dealloc(PyObject *op)
_Py_RemoteDebug_ClearCache(&self->handle);
_Py_RemoteDebug_CleanupProcHandle(&self->handle);
}
frame_cache_cleanup(self);
PyObject_Del(self);
Py_DECREF(tp);
}

View file

@ -296,6 +296,8 @@ unwind_stack_for_thread(
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
goto error;
}
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size);
long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);
@ -309,6 +311,8 @@ unwind_stack_for_thread(
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read GC state");
goto error;
}
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.gc.size);
// Calculate thread status using flags (always)
int status_flags = 0;
@ -383,15 +387,37 @@ unwind_stack_for_thread(
goto error;
}
// In cache mode, copying stack chunks is more expensive than direct memory reads
if (!unwinder->cache_frames) {
if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks");
goto error;
}
}
if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, gc_frame) < 0) {
if (unwinder->cache_frames) {
// Use cache to avoid re-reading unchanged parent frames
uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts,
unwinder->debug_offsets.thread_state.last_profiled_frame);
if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info,
gc_frame, last_profiled_frame, tid) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames");
goto error;
}
// Update last_profiled_frame for next sample
uintptr_t lpf_addr = *current_tstate + unwinder->debug_offsets.thread_state.last_profiled_frame;
if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr,
sizeof(uintptr_t), &frame_addr) < 0) {
PyErr_Clear(); // Non-fatal
}
} else {
// No caching - process entire frame chain
if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info,
gc_frame, 0, NULL, NULL, NULL, 0) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain");
goto error;
}
}
*current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);

View file

@ -102,6 +102,7 @@
<ClCompile Include="..\Modules\_remote_debugging\object_reading.c" />
<ClCompile Include="..\Modules\_remote_debugging\code_objects.c" />
<ClCompile Include="..\Modules\_remote_debugging\frames.c" />
<ClCompile Include="..\Modules\_remote_debugging\frame_cache.c" />
<ClCompile Include="..\Modules\_remote_debugging\threads.c" />
<ClCompile Include="..\Modules\_remote_debugging\asyncio.c" />
</ItemGroup>

View file

@ -24,6 +24,9 @@
<ClCompile Include="..\Modules\_remote_debugging\frames.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Modules\_remote_debugging\frame_cache.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Modules\_remote_debugging\threads.c">
<Filter>Source Files</Filter>
</ClCompile>

View file

@ -2288,6 +2288,16 @@ clear_gen_frame(PyThreadState *tstate, _PyInterpreterFrame * frame)
void
_PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame * frame)
{
// Update last_profiled_frame for remote profiler frame caching.
// By this point, tstate->current_frame is already set to the parent frame.
// Only update if we're popping the exact frame that was last profiled.
// This avoids corrupting the cache when transient frames (called and returned
// between profiler samples) update last_profiled_frame to addresses the
// profiler never saw.
if (tstate->last_profiled_frame != NULL && tstate->last_profiled_frame == frame) {
tstate->last_profiled_frame = tstate->current_frame;
}
if (frame->owner == FRAME_OWNED_BY_THREAD) {
clear_thread_frame(tstate, frame);
}

View file

@ -1102,6 +1102,115 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address
#endif
}
#if defined(__linux__) && HAVE_PROCESS_VM_READV
// Fallback write path that goes through /proc/pid/mem.
//
// Copies `len` bytes from `src` to `remote_address` in the target process by
// calling pwritev() on handle->memfd, opening that descriptor first when it
// is still -1. Returns 0 on success and -1 on failure; a pwritev() failure
// raises OSError from errno.
static int
_Py_RemoteDebug_WriteRemoteMemoryFallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src)
{
    if (handle->memfd == -1 && open_proc_mem_fd(handle) < 0) {
        return -1;
    }

    Py_ssize_t done = 0;
    for (;;) {
        // Describe whatever is still left to write, starting where the
        // previous call stopped.
        struct iovec chunk[1];
        chunk[0].iov_base = (char*)src + done;
        chunk[0].iov_len = len - done;
        off_t where = remote_address + done;

        Py_ssize_t n = pwritev(handle->memfd, chunk, 1, where);
        if (n < 0) {
            PyErr_SetFromErrno(PyExc_OSError);
            return -1;
        }
        done += n;

        // Finished once a single call consumed the whole remaining span.
        if ((size_t)n == chunk[0].iov_len) {
            break;
        }
    }
    return 0;
}
#endif // __linux__
// Platform-independent memory write function.
//
// Writes `len` bytes from the local buffer `src` into the target process at
// `remote_address`, using the mechanism selected at compile time:
//   - Windows: WriteProcessMemory(), repeated until `len` bytes are written.
//   - Linux:   process_vm_writev(); the /proc/pid/mem fallback is used instead
//              when handle->memfd is already open or the syscall reports ENOSYS.
//   - macOS:   one mach_vm_write() call on handle->task.
//
// Returns 0 on success. On failure returns -1 with a Python exception set
// (OSError on Windows/Linux; PermissionError or RuntimeError on macOS).
UNUSED static int
_Py_RemoteDebug_WriteRemoteMemory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src)
{
#ifdef MS_WINDOWS
    SIZE_T written = 0;
    SIZE_T result = 0;
    do {
        if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) {
            // Read the error code before raising: building the exception may
            // call APIs that overwrite the thread's last-error value.
            DWORD error = GetLastError();
            PyErr_SetFromWindowsErr(error);
            // The address is cast to size_t and printed with %zx so the
            // argument matches the conversion on LLP64 as well as LP64.
            _set_debug_exception_cause(PyExc_OSError,
                "WriteProcessMemory failed for PID %d at address 0x%zx "
                "(size %zu, partial write %zu bytes): Windows error %lu",
                handle->pid, (size_t)(remote_address + result), len - result, result, error);
            return -1;
        }
        result += written;
    } while (result < len);
    return 0;
#elif defined(__linux__) && HAVE_PROCESS_VM_READV
    // Once the /proc/pid/mem descriptor has been opened, keep using it.
    if (handle->memfd != -1) {
        return _Py_RemoteDebug_WriteRemoteMemoryFallback(handle, remote_address, len, src);
    }
    struct iovec local[1];
    struct iovec remote[1];
    Py_ssize_t result = 0;
    Py_ssize_t written = 0;
    do {
        local[0].iov_base = (void*)((char*)src + result);
        local[0].iov_len = len - result;
        remote[0].iov_base = (void*)((char*)remote_address + result);
        remote[0].iov_len = len - result;
        written = process_vm_writev(handle->pid, local, 1, remote, 1, 0);
        if (written < 0) {
            // Snapshot errno before any further call can modify it, so the
            // message below reports the syscall's own failure reason.
            int saved_errno = errno;
            if (saved_errno == ENOSYS) {
                return _Py_RemoteDebug_WriteRemoteMemoryFallback(handle, remote_address, len, src);
            }
            PyErr_SetFromErrno(PyExc_OSError);
            _set_debug_exception_cause(PyExc_OSError,
                "process_vm_writev failed for PID %d at address 0x%zx "
                "(size %zu, partial write %zd bytes): %s",
                handle->pid, (size_t)(remote_address + result), len - result, result, strerror(saved_errno));
            return -1;
        }
        result += written;
    } while ((size_t)written != local[0].iov_len);
    return 0;
#elif defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX
    kern_return_t kr = mach_vm_write(
        handle->task,
        (mach_vm_address_t)remote_address,
        (vm_offset_t)src,
        (mach_msg_type_number_t)len);
    if (kr != KERN_SUCCESS) {
        switch (kr) {
        case KERN_PROTECTION_FAILURE:
            PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory");
            break;
        case KERN_INVALID_ARGUMENT:
            PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write");
            break;
        default:
            PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr);
        }
        return -1;
    }
    return 0;
#else
    Py_UNREACHABLE();
#endif
}
UNUSED static int
_Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
uintptr_t addr,

View file

@ -24,104 +24,11 @@ read_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, void* d
return _Py_RemoteDebug_ReadRemoteMemory(handle, remote_address, len, dst);
}
// Why is pwritev not guarded? Except on Android API level 23 (no longer
// supported), HAVE_PROCESS_VM_READV is sufficient.
#if defined(__linux__) && HAVE_PROCESS_VM_READV
// Writes `len` bytes from `src` to `remote_address` in the target process by
// calling pwritev() on handle->memfd (presumably the target's /proc/<pid>/mem
// descriptor - confirm in open_proc_mem_fd()).
// Returns 0 on success. Returns -1 if the descriptor cannot be opened, or if
// pwritev() fails, in which case OSError is raised from errno.
static int
write_memory_fallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src)
{
// Open the memory descriptor on first use.
if (handle->memfd == -1) {
if (open_proc_mem_fd(handle) < 0) {
return -1;
}
}
struct iovec local[1];
// `result` is the running total of bytes written; `written` is the count
// from the most recent pwritev() call.
Py_ssize_t result = 0;
Py_ssize_t written = 0;
do {
// Resume from where the previous call stopped.
local[0].iov_base = (char*)src + result;
local[0].iov_len = len - result;
off_t offset = remote_address + result;
written = pwritev(handle->memfd, local, 1, offset);
if (written < 0) {
PyErr_SetFromErrno(PyExc_OSError);
return -1;
}
result += written;
// Loop again while the last call wrote less than the span it was given.
} while ((size_t)written != local[0].iov_len);
return 0;
}
#endif // __linux__
// Use the shared write function from remote_debug.h
static int
write_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src)
{
#ifdef MS_WINDOWS
SIZE_T written = 0;
SIZE_T result = 0;
do {
if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) {
PyErr_SetFromWindowsErr(0);
return -1;
}
result += written;
} while (result < len);
return 0;
#elif defined(__linux__) && HAVE_PROCESS_VM_READV
if (handle->memfd != -1) {
return write_memory_fallback(handle, remote_address, len, src);
}
struct iovec local[1];
struct iovec remote[1];
Py_ssize_t result = 0;
Py_ssize_t written = 0;
do {
local[0].iov_base = (void*)((char*)src + result);
local[0].iov_len = len - result;
remote[0].iov_base = (void*)((char*)remote_address + result);
remote[0].iov_len = len - result;
written = process_vm_writev(handle->pid, local, 1, remote, 1, 0);
if (written < 0) {
if (errno == ENOSYS) {
return write_memory_fallback(handle, remote_address, len, src);
}
PyErr_SetFromErrno(PyExc_OSError);
return -1;
}
result += written;
} while ((size_t)written != local[0].iov_len);
return 0;
#elif defined(__APPLE__) && TARGET_OS_OSX
kern_return_t kr = mach_vm_write(
pid_to_task(handle->pid),
(mach_vm_address_t)remote_address,
(vm_offset_t)src,
(mach_msg_type_number_t)len);
if (kr != KERN_SUCCESS) {
switch (kr) {
case KERN_PROTECTION_FAILURE:
PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory");
break;
case KERN_INVALID_ARGUMENT:
PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write");
break;
default:
PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr);
}
return -1;
}
return 0;
#else
Py_UNREACHABLE();
#endif
return _Py_RemoteDebug_WriteRemoteMemory(handle, remote_address, len, src);
}
static int

View file

@ -434,7 +434,7 @@ def main():
elif args.threads == "only_active":
kwargs["only_active_thread"] = True
unwinder = _remote_debugging.RemoteUnwinder(
process.pid, **kwargs
process.pid, cache_frames=True, **kwargs
)
results = benchmark(unwinder, duration_seconds=args.duration)
finally: