diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index 1e1e46ea4c0..08d71070ddc 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -135,6 +135,8 @@ struct _ts { /* Pointer to currently executing frame. */ struct _PyInterpreterFrame *current_frame; + struct _PyInterpreterFrame *last_profiled_frame; + Py_tracefunc c_profilefunc; Py_tracefunc c_tracefunc; PyObject *c_profileobj; diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h index 0f17bf17f82..bfd86c08887 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -102,6 +102,7 @@ typedef struct _Py_DebugOffsets { uint64_t next; uint64_t interp; uint64_t current_frame; + uint64_t last_profiled_frame; uint64_t thread_id; uint64_t native_thread_id; uint64_t datastack_chunk; @@ -272,6 +273,7 @@ typedef struct _Py_DebugOffsets { .next = offsetof(PyThreadState, next), \ .interp = offsetof(PyThreadState, interp), \ .current_frame = offsetof(PyThreadState, current_frame), \ + .last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \ .thread_id = offsetof(PyThreadState, thread_id), \ .native_thread_id = offsetof(PyThreadState, native_thread_id), \ .datastack_chunk = offsetof(PyThreadState, datastack_chunk), \ diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 783747d1f01..d23d6d4f91b 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -1609,6 +1609,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_parameter_type)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_return)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_stack)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cache_frames)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_datetime_module)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_statements)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cadata)); @@ -2053,6 +2054,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stats)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stderr)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stdin)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 374617d8284..5c3ea474ad0 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -332,6 +332,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(c_parameter_type) STRUCT_FOR_ID(c_return) STRUCT_FOR_ID(c_stack) + STRUCT_FOR_ID(cache_frames) STRUCT_FOR_ID(cached_datetime_module) STRUCT_FOR_ID(cached_statements) STRUCT_FOR_ID(cadata) @@ -776,6 +777,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(stacklevel) STRUCT_FOR_ID(start) STRUCT_FOR_ID(statement) + STRUCT_FOR_ID(stats) STRUCT_FOR_ID(status) STRUCT_FOR_ID(stderr) STRUCT_FOR_ID(stdin) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index a66c97f7f13..31d88339a13 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ 
b/Include/internal/pycore_runtime_init_generated.h @@ -1607,6 +1607,7 @@ extern "C" { INIT_ID(c_parameter_type), \ INIT_ID(c_return), \ INIT_ID(c_stack), \ + INIT_ID(cache_frames), \ INIT_ID(cached_datetime_module), \ INIT_ID(cached_statements), \ INIT_ID(cadata), \ @@ -2051,6 +2052,7 @@ extern "C" { INIT_ID(stacklevel), \ INIT_ID(start), \ INIT_ID(statement), \ + INIT_ID(stats), \ INIT_ID(status), \ INIT_ID(stderr), \ INIT_ID(stdin), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index 2061b1d2049..c5b01ff9876 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1108,6 +1108,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(cache_frames); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(cached_datetime_module); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2884,6 +2888,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(stats); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(status); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/InternalDocs/frames.md b/InternalDocs/frames.md index 804d7436018..d56e5481d3c 100644 --- a/InternalDocs/frames.md +++ b/InternalDocs/frames.md @@ -111,6 +111,26 @@ ### Shim frames instruction which cleans up the shim frame and returns. +### Remote Profiling Frame Cache + +The `last_profiled_frame` field in `PyThreadState` supports an optimization for +remote profilers that sample call stacks from external processes. When a remote +profiler reads the call stack, it writes the current frame address to this field. +The eval loop then keeps this pointer valid by updating it to the parent frame +whenever a frame returns (in `_PyEval_FrameClearAndPop`). + +This creates a "high-water mark" that always points to a frame still on the stack. +On subsequent samples, the profiler can walk from `current_frame` until it reaches +`last_profiled_frame`, knowing that frames from that point downward are unchanged +and can be retrieved from a cache. This significantly reduces the amount of remote +memory reads needed when call stacks are deep and stable at their base. + +The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when +`last_profiled_frame` is non-NULL AND matches the frame being popped. This +prevents transient frames (called and returned between profiler samples) from +corrupting the cache pointer, while avoiding any overhead when profiling is inactive. 
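+
+To make the protocol concrete, here is a small, self-contained sketch of the
+profiler side. It is illustrative only: `Frame`, `ThreadState`, `sample()` and
+the `cache` dictionary are stand-ins rather than real APIs, remote memory
+reads are modeled as ordinary attribute access, and the eval-loop side (which
+moves the marker to the parent frame when the marked frame is popped) is not
+modeled at all.
+
+```python
+from dataclasses import dataclass
+
+@dataclass
+class Frame:
+    name: str
+    previous: "Frame | None" = None
+
+@dataclass
+class ThreadState:
+    current_frame: "Frame | None" = None
+    last_profiled_frame: "Frame | None" = None
+
+def sample(tstate: ThreadState, cache: dict) -> list[str]:
+    """One sample: walk from the top until the marker, then reuse the cache."""
+    frames, frame = [], tstate.current_frame
+    while frame is not None:
+        if frame is tstate.last_profiled_frame and id(frame) in cache:
+            frames.extend(cache[id(frame)])    # unchanged suffix: reuse cached result
+            break
+        frames.append(frame.name)              # models a remote read + decode
+        frame = frame.previous
+    tstate.last_profiled_frame = tstate.current_frame  # new high-water mark
+    cache[id(tstate.current_frame)] = list(frames)
+    return frames
+```
+
+On the first call nothing is cached and the whole chain is walked; a second
+call with the same `current_frame` stops immediately and reuses the cached
+list (a "full hit"), while a call after one new frame has been pushed reads
+only that frame before reaching the marker (a "partial hit").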
+ + ### The Instruction Pointer `_PyInterpreterFrame` has two fields which are used to maintain the instruction diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py index 99cac71a404..dd4ea1edbf6 100644 --- a/Lib/profiling/sampling/sample.py +++ b/Lib/profiling/sampling/sample.py @@ -27,21 +27,24 @@ class SampleProfiler: - def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True): + def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True, collect_stats=False): self.pid = pid self.sample_interval_usec = sample_interval_usec self.all_threads = all_threads self.mode = mode # Store mode for later use + self.collect_stats = collect_stats if _FREE_THREADED_BUILD: self.unwinder = _remote_debugging.RemoteUnwinder( self.pid, all_threads=self.all_threads, mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, cache_frames=True, + stats=collect_stats ) else: only_active_threads = bool(self.all_threads) self.unwinder = _remote_debugging.RemoteUnwinder( self.pid, only_active_thread=only_active_threads, mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, cache_frames=True, + stats=collect_stats ) # Track sample intervals and total sample count self.sample_intervals = deque(maxlen=100) @@ -129,6 +132,10 @@ def sample(self, collector, duration_sec=10, *, async_aware=False): print(f"Sample rate: {sample_rate:.2f} samples/sec") print(f"Error rate: {error_rate:.2f}%") + # Print unwinder stats if stats collection is enabled + if self.collect_stats: + self._print_unwinder_stats() + # Pass stats to flamegraph collector if it's the right type if hasattr(collector, 'set_stats'): collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode) @@ -176,17 +183,100 @@ def _print_realtime_stats(self): (1.0 / min_hz) * 1_000_000 if min_hz > 0 else 0 ) # Max time = Min Hz + # Build cache stats string if stats collection is enabled + cache_stats_str = "" + if self.collect_stats: + try: + stats = self.unwinder.get_stats() + hits = stats.get('frame_cache_hits', 0) + partial = stats.get('frame_cache_partial_hits', 0) + misses = stats.get('frame_cache_misses', 0) + total = hits + partial + misses + if total > 0: + hit_pct = (hits + partial) / total * 100 + cache_stats_str = f" {ANSIColors.MAGENTA}Cache: {hit_pct:.1f}% ({hits}+{partial}/{misses}){ANSIColors.RESET}" + except RuntimeError: + pass + # Clear line and print stats print( - f"\r\033[K{ANSIColors.BOLD_BLUE}Real-time sampling stats:{ANSIColors.RESET} " - f"{ANSIColors.YELLOW}Mean: {mean_hz:.1f}Hz ({mean_us_per_sample:.2f}µs){ANSIColors.RESET} " - f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz ({max_us_per_sample:.2f}µs){ANSIColors.RESET} " - f"{ANSIColors.RED}Max: {max_hz:.1f}Hz ({min_us_per_sample:.2f}µs){ANSIColors.RESET} " - f"{ANSIColors.CYAN}Samples: {self.total_samples}{ANSIColors.RESET}", + f"\r\033[K{ANSIColors.BOLD_BLUE}Stats:{ANSIColors.RESET} " + f"{ANSIColors.YELLOW}{mean_hz:.1f}Hz ({mean_us_per_sample:.1f}µs){ANSIColors.RESET} " + f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz{ANSIColors.RESET} " + f"{ANSIColors.RED}Max: {max_hz:.1f}Hz{ANSIColors.RESET} " + f"{ANSIColors.CYAN}N={self.total_samples}{ANSIColors.RESET}" + f"{cache_stats_str}", 
end="", flush=True, ) + def _print_unwinder_stats(self): + """Print unwinder statistics including cache performance.""" + try: + stats = self.unwinder.get_stats() + except RuntimeError: + return # Stats not enabled + + print(f"\n{ANSIColors.BOLD_BLUE}{'='*50}{ANSIColors.RESET}") + print(f"{ANSIColors.BOLD_BLUE}Unwinder Statistics:{ANSIColors.RESET}") + + # Frame cache stats + total_samples = stats.get('total_samples', 0) + frame_cache_hits = stats.get('frame_cache_hits', 0) + frame_cache_partial_hits = stats.get('frame_cache_partial_hits', 0) + frame_cache_misses = stats.get('frame_cache_misses', 0) + total_lookups = frame_cache_hits + frame_cache_partial_hits + frame_cache_misses + + # Calculate percentages + hits_pct = (frame_cache_hits / total_lookups * 100) if total_lookups > 0 else 0 + partial_pct = (frame_cache_partial_hits / total_lookups * 100) if total_lookups > 0 else 0 + misses_pct = (frame_cache_misses / total_lookups * 100) if total_lookups > 0 else 0 + + print(f" {ANSIColors.CYAN}Frame Cache:{ANSIColors.RESET}") + print(f" Total samples: {total_samples:,}") + print(f" Full hits: {frame_cache_hits:,} ({ANSIColors.GREEN}{hits_pct:.1f}%{ANSIColors.RESET})") + print(f" Partial hits: {frame_cache_partial_hits:,} ({ANSIColors.YELLOW}{partial_pct:.1f}%{ANSIColors.RESET})") + print(f" Misses: {frame_cache_misses:,} ({ANSIColors.RED}{misses_pct:.1f}%{ANSIColors.RESET})") + + # Frame read stats + frames_from_cache = stats.get('frames_read_from_cache', 0) + frames_from_memory = stats.get('frames_read_from_memory', 0) + total_frames = frames_from_cache + frames_from_memory + cache_frame_pct = (frames_from_cache / total_frames * 100) if total_frames > 0 else 0 + memory_frame_pct = (frames_from_memory / total_frames * 100) if total_frames > 0 else 0 + + print(f" {ANSIColors.CYAN}Frame Reads:{ANSIColors.RESET}") + print(f" From cache: {frames_from_cache:,} ({ANSIColors.GREEN}{cache_frame_pct:.1f}%{ANSIColors.RESET})") + print(f" From memory: {frames_from_memory:,} ({ANSIColors.RED}{memory_frame_pct:.1f}%{ANSIColors.RESET})") + + # Code object cache stats + code_hits = stats.get('code_object_cache_hits', 0) + code_misses = stats.get('code_object_cache_misses', 0) + total_code = code_hits + code_misses + code_hits_pct = (code_hits / total_code * 100) if total_code > 0 else 0 + code_misses_pct = (code_misses / total_code * 100) if total_code > 0 else 0 + + print(f" {ANSIColors.CYAN}Code Object Cache:{ANSIColors.RESET}") + print(f" Hits: {code_hits:,} ({ANSIColors.GREEN}{code_hits_pct:.1f}%{ANSIColors.RESET})") + print(f" Misses: {code_misses:,} ({ANSIColors.RED}{code_misses_pct:.1f}%{ANSIColors.RESET})") + + # Memory operations + memory_reads = stats.get('memory_reads', 0) + memory_bytes = stats.get('memory_bytes_read', 0) + if memory_bytes >= 1024 * 1024: + memory_str = f"{memory_bytes / (1024 * 1024):.1f} MB" + elif memory_bytes >= 1024: + memory_str = f"{memory_bytes / 1024:.1f} KB" + else: + memory_str = f"{memory_bytes} B" + print(f" {ANSIColors.CYAN}Memory:{ANSIColors.RESET}") + print(f" Read operations: {memory_reads:,} ({memory_str})") + + # Stale invalidations + stale_invalidations = stats.get('stale_cache_invalidations', 0) + if stale_invalidations > 0: + print(f" {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}") + def sample( pid, @@ -234,7 +324,8 @@ def sample( mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, + collect_stats=realtime_stats, ) 
profiler.realtime_stats = realtime_stats @@ -290,7 +381,8 @@ def sample_live( mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, + collect_stats=realtime_stats, ) profiler.realtime_stats = realtime_stats diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index 7decd8f32d5..2e6e6eaad06 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -1,3 +1,4 @@ +import contextlib import unittest import os import textwrap @@ -2038,5 +2039,766 @@ def busy_thread(): p.stderr.close() +class TestFrameCaching(unittest.TestCase): + """Test that frame caching produces correct results. + + Uses socket-based synchronization for deterministic testing. + All tests verify cache reuse via object identity checks (assertIs). + """ + + maxDiff = None + MAX_TRIES = 10 + + @contextlib.contextmanager + def _target_process(self, script_body): + """Context manager for running a target process with socket sync.""" + port = find_unused_port() + script = f"""\ +import socket +sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +sock.connect(('localhost', {port})) +{textwrap.dedent(script_body)} +""" + + with os_helper.temp_dir() as work_dir: + script_dir = os.path.join(work_dir, "script_pkg") + os.mkdir(script_dir) + + server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server_socket.bind(("localhost", port)) + server_socket.settimeout(SHORT_TIMEOUT) + server_socket.listen(1) + + script_name = _make_test_script(script_dir, "script", script) + client_socket = None + p = None + try: + p = subprocess.Popen([sys.executable, script_name]) + client_socket, _ = server_socket.accept() + server_socket.close() + + def make_unwinder(cache_frames=True): + return RemoteUnwinder(p.pid, all_threads=True, cache_frames=cache_frames) + + yield p, client_socket, make_unwinder + + except PermissionError: + self.skipTest("Insufficient permissions to read the stack trace") + finally: + if client_socket: + client_socket.close() + if p: + p.kill() + p.terminate() + p.wait(timeout=SHORT_TIMEOUT) + + def _wait_for_signal(self, client_socket, signal): + """Block until signal received from target.""" + response = b"" + while signal not in response: + chunk = client_socket.recv(64) + if not chunk: + break + response += chunk + return response + + def _get_frames(self, unwinder, required_funcs): + """Sample and return frame_info list for thread containing required_funcs.""" + traces = unwinder.get_stack_trace() + for interp in traces: + for thread in interp.threads: + funcs = [f.funcname for f in thread.frame_info] + if required_funcs.issubset(set(funcs)): + return thread.frame_info + return None + + def _sample_frames(self, client_socket, unwinder, wait_signal, send_ack, required_funcs, expected_frames=1): + """Wait for signal, sample frames, send ack. 
Returns frame_info list.""" + self._wait_for_signal(client_socket, wait_signal) + # Give at least MAX_TRIES tries for the process to arrive to a steady state + for _ in range(self.MAX_TRIES): + frames = self._get_frames(unwinder, required_funcs) + if frames and len(frames) >= expected_frames: + break + time.sleep(0.1) + client_socket.sendall(send_ack) + return frames + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_hit_same_stack(self): + """Test that consecutive samples reuse cached parent frame objects. + + The current frame (index 0) is always re-read from memory to get + updated line numbers, so it may be a different object. Parent frames + (index 1+) should be identical objects from cache. + """ + script_body = """\ + def level3(): + sock.sendall(b"sync1") + sock.recv(16) + sock.sendall(b"sync2") + sock.recv(16) + sock.sendall(b"sync3") + sock.recv(16) + + def level2(): + level3() + + def level1(): + level2() + + level1() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + expected = {"level1", "level2", "level3"} + + frames1 = self._sample_frames(client_socket, unwinder, b"sync1", b"ack", expected) + frames2 = self._sample_frames(client_socket, unwinder, b"sync2", b"ack", expected) + frames3 = self._sample_frames(client_socket, unwinder, b"sync3", b"done", expected) + + self.assertIsNotNone(frames1) + self.assertIsNotNone(frames2) + self.assertIsNotNone(frames3) + self.assertEqual(len(frames1), len(frames2)) + self.assertEqual(len(frames2), len(frames3)) + + # Current frame (index 0) is always re-read, so check value equality + self.assertEqual(frames1[0].funcname, frames2[0].funcname) + self.assertEqual(frames2[0].funcname, frames3[0].funcname) + + # Parent frames (index 1+) must be identical objects (cache reuse) + for i in range(1, len(frames1)): + f1, f2, f3 = frames1[i], frames2[i], frames3[i] + self.assertIs(f1, f2, f"Frame {i}: samples 1-2 must be same object") + self.assertIs(f2, f3, f"Frame {i}: samples 2-3 must be same object") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_line_number_updates_in_same_frame(self): + """Test that line numbers are correctly updated when execution moves within a function. + + When the profiler samples at different points within the same function, + it must report the correct line number for each sample, not stale cached values. 
+ """ + script_body = """\ + def outer(): + inner() + + def inner(): + sock.sendall(b"line_a"); sock.recv(16) + sock.sendall(b"line_b"); sock.recv(16) + sock.sendall(b"line_c"); sock.recv(16) + sock.sendall(b"line_d"); sock.recv(16) + + outer() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + + frames_a = self._sample_frames(client_socket, unwinder, b"line_a", b"ack", {"inner"}) + frames_b = self._sample_frames(client_socket, unwinder, b"line_b", b"ack", {"inner"}) + frames_c = self._sample_frames(client_socket, unwinder, b"line_c", b"ack", {"inner"}) + frames_d = self._sample_frames(client_socket, unwinder, b"line_d", b"done", {"inner"}) + + self.assertIsNotNone(frames_a) + self.assertIsNotNone(frames_b) + self.assertIsNotNone(frames_c) + self.assertIsNotNone(frames_d) + + # Get the 'inner' frame from each sample (should be index 0) + inner_a = frames_a[0] + inner_b = frames_b[0] + inner_c = frames_c[0] + inner_d = frames_d[0] + + self.assertEqual(inner_a.funcname, "inner") + self.assertEqual(inner_b.funcname, "inner") + self.assertEqual(inner_c.funcname, "inner") + self.assertEqual(inner_d.funcname, "inner") + + # Line numbers must be different and increasing (execution moves forward) + self.assertLess(inner_a.lineno, inner_b.lineno, + "Line B should be after line A") + self.assertLess(inner_b.lineno, inner_c.lineno, + "Line C should be after line B") + self.assertLess(inner_c.lineno, inner_d.lineno, + "Line D should be after line C") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_invalidation_on_return(self): + """Test cache invalidation when stack shrinks (function returns).""" + script_body = """\ + def inner(): + sock.sendall(b"at_inner") + sock.recv(16) + + def outer(): + inner() + sock.sendall(b"at_outer") + sock.recv(16) + + outer() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + + frames_deep = self._sample_frames( + client_socket, unwinder, b"at_inner", b"ack", {"inner", "outer"}) + frames_shallow = self._sample_frames( + client_socket, unwinder, b"at_outer", b"done", {"outer"}) + + self.assertIsNotNone(frames_deep) + self.assertIsNotNone(frames_shallow) + + funcs_deep = [f.funcname for f in frames_deep] + funcs_shallow = [f.funcname for f in frames_shallow] + + self.assertIn("inner", funcs_deep) + self.assertIn("outer", funcs_deep) + self.assertNotIn("inner", funcs_shallow) + self.assertIn("outer", funcs_shallow) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_invalidation_on_call(self): + """Test cache invalidation when stack grows (new function called).""" + script_body = """\ + def deeper(): + sock.sendall(b"at_deeper") + sock.recv(16) + + def middle(): + sock.sendall(b"at_middle") + sock.recv(16) + deeper() + + def top(): + middle() + + top() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + + frames_before = self._sample_frames( + client_socket, unwinder, b"at_middle", b"ack", {"middle", "top"}) + frames_after = self._sample_frames( + client_socket, unwinder, b"at_deeper", b"done", {"deeper", "middle", "top"}) + + 
self.assertIsNotNone(frames_before) + self.assertIsNotNone(frames_after) + + funcs_before = [f.funcname for f in frames_before] + funcs_after = [f.funcname for f in frames_after] + + self.assertIn("middle", funcs_before) + self.assertIn("top", funcs_before) + self.assertNotIn("deeper", funcs_before) + + self.assertIn("deeper", funcs_after) + self.assertIn("middle", funcs_after) + self.assertIn("top", funcs_after) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_partial_stack_reuse(self): + """Test that unchanged bottom frames are reused when top changes (A→B→C to A→B→D).""" + script_body = """\ + def func_c(): + sock.sendall(b"at_c") + sock.recv(16) + + def func_d(): + sock.sendall(b"at_d") + sock.recv(16) + + def func_b(): + func_c() + func_d() + + def func_a(): + func_b() + + func_a() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + + # Sample at C: stack is A→B→C + frames_c = self._sample_frames( + client_socket, unwinder, b"at_c", b"ack", {"func_a", "func_b", "func_c"}) + # Sample at D: stack is A→B→D (C returned, D called) + frames_d = self._sample_frames( + client_socket, unwinder, b"at_d", b"done", {"func_a", "func_b", "func_d"}) + + self.assertIsNotNone(frames_c) + self.assertIsNotNone(frames_d) + + # Find func_a and func_b frames in both samples + def find_frame(frames, funcname): + for f in frames: + if f.funcname == funcname: + return f + return None + + frame_a_in_c = find_frame(frames_c, "func_a") + frame_b_in_c = find_frame(frames_c, "func_b") + frame_a_in_d = find_frame(frames_d, "func_a") + frame_b_in_d = find_frame(frames_d, "func_b") + + self.assertIsNotNone(frame_a_in_c) + self.assertIsNotNone(frame_b_in_c) + self.assertIsNotNone(frame_a_in_d) + self.assertIsNotNone(frame_b_in_d) + + # The bottom frames (A, B) should be the SAME objects (cache reuse) + self.assertIs(frame_a_in_c, frame_a_in_d, "func_a frame should be reused from cache") + self.assertIs(frame_b_in_c, frame_b_in_d, "func_b frame should be reused from cache") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_recursive_frames(self): + """Test caching with same function appearing multiple times (recursion).""" + script_body = """\ + def recurse(n): + if n <= 0: + sock.sendall(b"sync1") + sock.recv(16) + sock.sendall(b"sync2") + sock.recv(16) + else: + recurse(n - 1) + + recurse(5) + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + + frames1 = self._sample_frames( + client_socket, unwinder, b"sync1", b"ack", {"recurse"}) + frames2 = self._sample_frames( + client_socket, unwinder, b"sync2", b"done", {"recurse"}) + + self.assertIsNotNone(frames1) + self.assertIsNotNone(frames2) + + # Should have multiple "recurse" frames (6 total: recurse(5) down to recurse(0)) + recurse_count = sum(1 for f in frames1 if f.funcname == "recurse") + self.assertEqual(recurse_count, 6, "Should have 6 recursive frames") + + self.assertEqual(len(frames1), len(frames2)) + + # Current frame (index 0) is re-read, check value equality + self.assertEqual(frames1[0].funcname, frames2[0].funcname) + + # Parent frames (index 1+) should be identical objects (cache reuse) + for i in range(1, len(frames1)): + 
self.assertIs(frames1[i], frames2[i], + f"Frame {i}: recursive frames must be same object") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_vs_no_cache_equivalence(self): + """Test that cache_frames=True and cache_frames=False produce equivalent results.""" + script_body = """\ + def level3(): + sock.sendall(b"ready"); sock.recv(16) + + def level2(): + level3() + + def level1(): + level2() + + level1() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + self._wait_for_signal(client_socket, b"ready") + + # Sample with cache + unwinder_cache = make_unwinder(cache_frames=True) + frames_cached = self._get_frames(unwinder_cache, {"level1", "level2", "level3"}) + + # Sample without cache + unwinder_no_cache = make_unwinder(cache_frames=False) + frames_no_cache = self._get_frames(unwinder_no_cache, {"level1", "level2", "level3"}) + + client_socket.sendall(b"done") + + self.assertIsNotNone(frames_cached) + self.assertIsNotNone(frames_no_cache) + + # Same number of frames + self.assertEqual(len(frames_cached), len(frames_no_cache)) + + # Same function names in same order + funcs_cached = [f.funcname for f in frames_cached] + funcs_no_cache = [f.funcname for f in frames_no_cache] + self.assertEqual(funcs_cached, funcs_no_cache) + + # Same line numbers + lines_cached = [f.lineno for f in frames_cached] + lines_no_cache = [f.lineno for f in frames_no_cache] + self.assertEqual(lines_cached, lines_no_cache) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_per_thread_isolation(self): + """Test that frame cache is per-thread and cache invalidation works independently.""" + script_body = """\ + import threading + + lock = threading.Lock() + + def sync(msg): + with lock: + sock.sendall(msg + b"\\n") + sock.recv(1) + + # Thread 1 functions + def baz1(): + sync(b"t1:baz1") + + def bar1(): + baz1() + + def blech1(): + sync(b"t1:blech1") + + def foo1(): + bar1() # Goes down to baz1, syncs + blech1() # Returns up, goes down to blech1, syncs + + # Thread 2 functions + def baz2(): + sync(b"t2:baz2") + + def bar2(): + baz2() + + def blech2(): + sync(b"t2:blech2") + + def foo2(): + bar2() # Goes down to baz2, syncs + blech2() # Returns up, goes down to blech2, syncs + + t1 = threading.Thread(target=foo1) + t2 = threading.Thread(target=foo2) + t1.start() + t2.start() + t1.join() + t2.join() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + buffer = b"" + + def recv_msg(): + """Receive a single message from socket.""" + nonlocal buffer + while b"\n" not in buffer: + chunk = client_socket.recv(256) + if not chunk: + return None + buffer += chunk + msg, buffer = buffer.split(b"\n", 1) + return msg + + def get_thread_frames(target_funcs): + """Get frames for thread matching target functions.""" + retries = 0 + for _ in busy_retry(SHORT_TIMEOUT): + if retries >= 5: + break + retries += 1 + # On Windows, ReadProcessMemory can fail with OSError + # (WinError 299) when frame pointers are in flux + with contextlib.suppress(RuntimeError, OSError): + traces = unwinder.get_stack_trace() + for interp in traces: + for thread in interp.threads: + funcs = [f.funcname for f in thread.frame_info] + if any(f in funcs for f in target_funcs): 
+ return funcs + return None + + # Track results for each sync point + results = {} + + # Process 4 sync points: baz1, baz2, blech1, blech2 + # With the lock, threads are serialized - handle one at a time + for _ in range(4): + msg = recv_msg() + self.assertIsNotNone(msg, "Expected message from subprocess") + + # Determine which thread/function and take snapshot + if msg == b"t1:baz1": + funcs = get_thread_frames(["baz1", "bar1", "foo1"]) + self.assertIsNotNone(funcs, "Thread 1 not found at baz1") + results["t1:baz1"] = funcs + elif msg == b"t2:baz2": + funcs = get_thread_frames(["baz2", "bar2", "foo2"]) + self.assertIsNotNone(funcs, "Thread 2 not found at baz2") + results["t2:baz2"] = funcs + elif msg == b"t1:blech1": + funcs = get_thread_frames(["blech1", "foo1"]) + self.assertIsNotNone(funcs, "Thread 1 not found at blech1") + results["t1:blech1"] = funcs + elif msg == b"t2:blech2": + funcs = get_thread_frames(["blech2", "foo2"]) + self.assertIsNotNone(funcs, "Thread 2 not found at blech2") + results["t2:blech2"] = funcs + + # Release thread to continue + client_socket.sendall(b"k") + + # Validate Phase 1: baz snapshots + t1_baz = results.get("t1:baz1") + t2_baz = results.get("t2:baz2") + self.assertIsNotNone(t1_baz, "Missing t1:baz1 snapshot") + self.assertIsNotNone(t2_baz, "Missing t2:baz2 snapshot") + + # Thread 1 at baz1: should have foo1->bar1->baz1 + self.assertIn("baz1", t1_baz) + self.assertIn("bar1", t1_baz) + self.assertIn("foo1", t1_baz) + self.assertNotIn("blech1", t1_baz) + # No cross-contamination + self.assertNotIn("baz2", t1_baz) + self.assertNotIn("bar2", t1_baz) + self.assertNotIn("foo2", t1_baz) + + # Thread 2 at baz2: should have foo2->bar2->baz2 + self.assertIn("baz2", t2_baz) + self.assertIn("bar2", t2_baz) + self.assertIn("foo2", t2_baz) + self.assertNotIn("blech2", t2_baz) + # No cross-contamination + self.assertNotIn("baz1", t2_baz) + self.assertNotIn("bar1", t2_baz) + self.assertNotIn("foo1", t2_baz) + + # Validate Phase 2: blech snapshots (cache invalidation test) + t1_blech = results.get("t1:blech1") + t2_blech = results.get("t2:blech2") + self.assertIsNotNone(t1_blech, "Missing t1:blech1 snapshot") + self.assertIsNotNone(t2_blech, "Missing t2:blech2 snapshot") + + # Thread 1 at blech1: bar1/baz1 should be GONE (cache invalidated) + self.assertIn("blech1", t1_blech) + self.assertIn("foo1", t1_blech) + self.assertNotIn("bar1", t1_blech, "Cache not invalidated: bar1 still present") + self.assertNotIn("baz1", t1_blech, "Cache not invalidated: baz1 still present") + # No cross-contamination + self.assertNotIn("blech2", t1_blech) + + # Thread 2 at blech2: bar2/baz2 should be GONE (cache invalidated) + self.assertIn("blech2", t2_blech) + self.assertIn("foo2", t2_blech) + self.assertNotIn("bar2", t2_blech, "Cache not invalidated: bar2 still present") + self.assertNotIn("baz2", t2_blech, "Cache not invalidated: baz2 still present") + # No cross-contamination + self.assertNotIn("blech1", t2_blech) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_new_unwinder_with_stale_last_profiled_frame(self): + """Test that a new unwinder returns complete stack when cache lookup misses.""" + script_body = """\ + def level4(): + sock.sendall(b"sync1") + sock.recv(16) + sock.sendall(b"sync2") + sock.recv(16) + + def level3(): + level4() + + def level2(): + level3() + + def level1(): + level2() + + level1() + """ + + with 
self._target_process(script_body) as (p, client_socket, make_unwinder): + expected = {"level1", "level2", "level3", "level4"} + + # First unwinder samples - this sets last_profiled_frame in target + unwinder1 = make_unwinder(cache_frames=True) + frames1 = self._sample_frames(client_socket, unwinder1, b"sync1", b"ack", expected) + + # Create NEW unwinder (empty cache) and sample + # The target still has last_profiled_frame set from unwinder1 + unwinder2 = make_unwinder(cache_frames=True) + frames2 = self._sample_frames(client_socket, unwinder2, b"sync2", b"done", expected) + + self.assertIsNotNone(frames1) + self.assertIsNotNone(frames2) + + funcs1 = [f.funcname for f in frames1] + funcs2 = [f.funcname for f in frames2] + + # Both should have all levels + for level in ["level1", "level2", "level3", "level4"]: + self.assertIn(level, funcs1, f"{level} missing from first sample") + self.assertIn(level, funcs2, f"{level} missing from second sample") + + # Should have same stack depth + self.assertEqual(len(frames1), len(frames2), + "New unwinder should return complete stack despite stale last_profiled_frame") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_exhaustion(self): + """Test cache works when frame limit (1024) is exceeded. + + FRAME_CACHE_MAX_FRAMES=1024. With 1100 recursive frames, + the cache can't store all of them but should still work. + """ + # Use 1100 to exceed FRAME_CACHE_MAX_FRAMES=1024 + depth = 1100 + script_body = f"""\ +import sys +sys.setrecursionlimit(2000) + +def recurse(n): + if n <= 0: + sock.sendall(b"ready") + sock.recv(16) # wait for ack + sock.sendall(b"ready2") + sock.recv(16) # wait for done + return + recurse(n - 1) + +recurse({depth}) +""" + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder_cache = make_unwinder(cache_frames=True) + unwinder_no_cache = make_unwinder(cache_frames=False) + + frames_cached = self._sample_frames( + client_socket, unwinder_cache, b"ready", b"ack", {"recurse"}, expected_frames=1102 + ) + # Sample again with no cache for comparison + frames_no_cache = self._sample_frames( + client_socket, unwinder_no_cache, b"ready2", b"done", {"recurse"}, expected_frames=1102 + ) + + self.assertIsNotNone(frames_cached) + self.assertIsNotNone(frames_no_cache) + + # Both should have many recurse frames (> 1024 limit) + cached_count = [f.funcname for f in frames_cached].count("recurse") + no_cache_count = [f.funcname for f in frames_no_cache].count("recurse") + + self.assertGreater(cached_count, 1000, "Should have >1000 recurse frames") + self.assertGreater(no_cache_count, 1000, "Should have >1000 recurse frames") + + # Both modes should produce same frame count + self.assertEqual(len(frames_cached), len(frames_no_cache), + "Cache exhaustion should not affect stack completeness") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_get_stats(self): + """Test that get_stats() returns statistics when stats=True.""" + script_body = """\ + sock.sendall(b"ready") + sock.recv(16) + """ + + with self._target_process(script_body) as (p, client_socket, _): + unwinder = RemoteUnwinder(p.pid, all_threads=True, stats=True) + self._wait_for_signal(client_socket, b"ready") + + # Take a sample + unwinder.get_stack_trace() + + stats = unwinder.get_stats() + 
client_socket.sendall(b"done") + + # Verify expected keys exist + expected_keys = [ + 'total_samples', 'frame_cache_hits', 'frame_cache_misses', + 'frame_cache_partial_hits', 'frames_read_from_cache', + 'frames_read_from_memory', 'frame_cache_hit_rate' + ] + for key in expected_keys: + self.assertIn(key, stats) + + self.assertEqual(stats['total_samples'], 1) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_get_stats_disabled_raises(self): + """Test that get_stats() raises RuntimeError when stats=False.""" + script_body = """\ + sock.sendall(b"ready") + sock.recv(16) + """ + + with self._target_process(script_body) as (p, client_socket, _): + unwinder = RemoteUnwinder(p.pid, all_threads=True) # stats=False by default + self._wait_for_signal(client_socket, b"ready") + + with self.assertRaises(RuntimeError): + unwinder.get_stats() + + client_socket.sendall(b"done") + + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst b/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst new file mode 100644 index 00000000000..e24fea416ff --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst @@ -0,0 +1,5 @@ +The ``_remote_debugging`` module now implements frame caching in the +``RemoteUnwinder`` class to reduce memory reads when profiling remote +processes. When ``cache_frames=True``, unchanged portions of the call stack +are reused from previous samples, significantly improving profiling +performance for deep call stacks. diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index b1582c75bda..1be83b45526 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -41,7 +41,7 @@ @MODULE__PICKLE_TRUE@_pickle _pickle.c @MODULE__QUEUE_TRUE@_queue _queuemodule.c @MODULE__RANDOM_TRUE@_random _randommodule.c -@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/threads.c _remote_debugging/asyncio.c +@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c @MODULE__STRUCT_TRUE@_struct _struct.c # build supports subinterpreters diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 70e362ccada..804e2c904e1 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -154,6 +154,39 @@ typedef struct { uintptr_t addr_code_adaptive; } CachedCodeMetadata; +/* Frame cache constants and types */ +#define FRAME_CACHE_MAX_THREADS 32 +#define FRAME_CACHE_MAX_FRAMES 1024 + +typedef struct { + uint64_t thread_id; // 0 = empty slot + uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; + Py_ssize_t num_addrs; + PyObject *frame_list; // owned reference, NULL if empty +} FrameCacheEntry; + +/* Statistics for profiling performance analysis */ +typedef struct { + uint64_t total_samples; // Total number of get_stack_trace calls + uint64_t frame_cache_hits; // Full cache hits (entire stack unchanged) + uint64_t frame_cache_misses; // Cache misses requiring full walk + uint64_t frame_cache_partial_hits; // Partial hits 
(stopped at cached frame) + uint64_t frames_read_from_cache; // Total frames retrieved from cache + uint64_t frames_read_from_memory; // Total frames read from remote memory + uint64_t memory_reads; // Total remote memory read operations + uint64_t memory_bytes_read; // Total bytes read from remote memory + uint64_t code_object_cache_hits; // Code object cache hits + uint64_t code_object_cache_misses; // Code object cache misses + uint64_t stale_cache_invalidations; // Times stale entries were cleared +} UnwinderStats; + +/* Stats tracking macros - no-op when stats collection is disabled */ +#define STATS_INC(unwinder, field) \ + do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0) + +#define STATS_ADD(unwinder, field, val) \ + do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0) + typedef struct { PyTypeObject *RemoteDebugging_Type; PyTypeObject *TaskInfo_Type; @@ -195,7 +228,12 @@ typedef struct { int skip_non_matching_threads; int native; int gc; + int cache_frames; + int collect_stats; // whether to collect statistics + uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale RemoteDebuggingState *cached_state; + FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries + UnwinderStats stats; // statistics for performance analysis #ifdef Py_GIL_DISABLED uint32_t tlbc_generation; _Py_hashtable_t *tlbc_cache; @@ -363,9 +401,45 @@ extern int process_frame_chain( uintptr_t initial_frame_addr, StackChunkList *chunks, PyObject *frame_info, - uintptr_t gc_frame + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + int *stopped_at_cached_frame, + uintptr_t *frame_addrs, + Py_ssize_t *num_addrs, + Py_ssize_t max_addrs ); +/* Frame cache functions */ +extern int frame_cache_init(RemoteUnwinderObject *unwinder); +extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder); +extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id); +extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder); +extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result); +extern int frame_cache_lookup_and_extend( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + uintptr_t last_profiled_frame, + PyObject *frame_info, + uintptr_t *frame_addrs, + Py_ssize_t *num_addrs, + Py_ssize_t max_addrs); +// Returns: 1 = stored, 0 = not stored (graceful), -1 = error +extern int frame_cache_store( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + PyObject *frame_list, + const uintptr_t *addrs, + Py_ssize_t num_addrs); + +extern int collect_frames_with_cache( + RemoteUnwinderObject *unwinder, + uintptr_t frame_addr, + StackChunkList *chunks, + PyObject *frame_info, + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + uint64_t thread_id); + /* ============================================================================ * THREAD FUNCTION DECLARATIONS * ============================================================================ */ diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h index 60adb357e32..03127b753cc 100644 --- a/Modules/_remote_debugging/clinic/module.c.h +++ b/Modules/_remote_debugging/clinic/module.c.h @@ -12,7 +12,7 @@ preserve PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__, "RemoteUnwinder(pid, *, all_threads=False, only_active_thread=False,\n" " mode=0, debug=False, skip_non_matching_threads=True,\n" -" native=False, gc=False)\n" +" 
native=False, gc=False, cache_frames=False, stats=False)\n" "--\n" "\n" "Initialize a new RemoteUnwinder object for debugging a remote Python process.\n" @@ -32,6 +32,10 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__, " non-Python code.\n" " gc: If True, include artificial \"\" frames to denote active garbage\n" " collection.\n" +" cache_frames: If True, enable frame caching optimization to avoid re-reading\n" +" unchanged parent frames between samples.\n" +" stats: If True, collect statistics about cache hits, memory reads, etc.\n" +" Use get_stats() to retrieve the collected statistics.\n" "\n" "The RemoteUnwinder provides functionality to inspect and debug a running Python\n" "process, including examining thread states, stack frames and other runtime data.\n" @@ -48,7 +52,8 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, int only_active_thread, int mode, int debug, int skip_non_matching_threads, - int native, int gc); + int native, int gc, + int cache_frames, int stats); static int _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObject *kwargs) @@ -56,7 +61,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje int return_value = -1; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - #define NUM_KEYWORDS 8 + #define NUM_KEYWORDS 10 static struct { PyGC_Head _this_is_not_used; PyObject_VAR_HEAD @@ -65,7 +70,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje } _kwtuple = { .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) .ob_hash = -1, - .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), &_Py_ID(native), &_Py_ID(gc), }, + .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), &_Py_ID(native), &_Py_ID(gc), &_Py_ID(cache_frames), &_Py_ID(stats), }, }; #undef NUM_KEYWORDS #define KWTUPLE (&_kwtuple.ob_base.ob_base) @@ -74,14 +79,14 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje # define KWTUPLE NULL #endif // !Py_BUILD_CORE - static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", "native", "gc", NULL}; + static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", "native", "gc", "cache_frames", "stats", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, .fname = "RemoteUnwinder", .kwtuple = KWTUPLE, }; #undef KWTUPLE - PyObject *argsbuf[8]; + PyObject *argsbuf[10]; PyObject * const *fastargs; Py_ssize_t nargs = PyTuple_GET_SIZE(args); Py_ssize_t noptargs = nargs + (kwargs ? 
PyDict_GET_SIZE(kwargs) : 0) - 1; @@ -93,6 +98,8 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje int skip_non_matching_threads = 1; int native = 0; int gc = 0; + int cache_frames = 0; + int stats = 0; fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); @@ -160,12 +167,30 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje goto skip_optional_kwonly; } } - gc = PyObject_IsTrue(fastargs[7]); - if (gc < 0) { + if (fastargs[7]) { + gc = PyObject_IsTrue(fastargs[7]); + if (gc < 0) { + goto exit; + } + if (!--noptargs) { + goto skip_optional_kwonly; + } + } + if (fastargs[8]) { + cache_frames = PyObject_IsTrue(fastargs[8]); + if (cache_frames < 0) { + goto exit; + } + if (!--noptargs) { + goto skip_optional_kwonly; + } + } + stats = PyObject_IsTrue(fastargs[9]); + if (stats < 0) { goto exit; } skip_optional_kwonly: - return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads, native, gc); + return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads, native, gc, cache_frames, stats); exit: return return_value; @@ -347,4 +372,51 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace(PyObject *self, PyObject return return_value; } -/*[clinic end generated code: output=99fed5c94cf36881 input=a9049054013a1b77]*/ + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__, +"get_stats($self, /)\n" +"--\n" +"\n" +"Get collected statistics about profiling performance.\n" +"\n" +"Returns a dictionary containing statistics about cache performance,\n" +"memory reads, and other profiling metrics. 
Only available if the\n" +"RemoteUnwinder was created with stats=True.\n" +"\n" +"Returns:\n" +" dict: A dictionary containing:\n" +" - total_samples: Total number of get_stack_trace calls\n" +" - frame_cache_hits: Full cache hits (entire stack unchanged)\n" +" - frame_cache_misses: Cache misses requiring full walk\n" +" - frame_cache_partial_hits: Partial hits (stopped at cached frame)\n" +" - frames_read_from_cache: Total frames retrieved from cache\n" +" - frames_read_from_memory: Total frames read from remote memory\n" +" - memory_reads: Total remote memory read operations\n" +" - memory_bytes_read: Total bytes read from remote memory\n" +" - code_object_cache_hits: Code object cache hits\n" +" - code_object_cache_misses: Code object cache misses\n" +" - stale_cache_invalidations: Times stale cache entries were cleared\n" +" - frame_cache_hit_rate: Percentage of samples that hit the cache\n" +" - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n" +"\n" +"Raises:\n" +" RuntimeError: If stats collection was not enabled (stats=False)"); + +#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF \ + {"get_stats", (PyCFunction)_remote_debugging_RemoteUnwinder_get_stats, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_stats__doc__}, + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self); + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + PyObject *return_value = NULL; + + Py_BEGIN_CRITICAL_SECTION(self); + return_value = _remote_debugging_RemoteUnwinder_get_stats_impl((RemoteUnwinderObject *)self); + Py_END_CRITICAL_SECTION(); + + return return_value; +} +/*[clinic end generated code: output=f1fd6c1d4c4c7254 input=a9049054013a1b77]*/ diff --git a/Modules/_remote_debugging/code_objects.c b/Modules/_remote_debugging/code_objects.c index ea3f00c802b..2cd2505d0f9 100644 --- a/Modules/_remote_debugging/code_objects.c +++ b/Modules/_remote_debugging/code_objects.c @@ -257,6 +257,11 @@ parse_code_object(RemoteUnwinderObject *unwinder, if (unwinder && unwinder->code_object_cache != NULL) { meta = _Py_hashtable_get(unwinder->code_object_cache, key); + if (meta) { + STATS_INC(unwinder, code_object_cache_hits); + } else { + STATS_INC(unwinder, code_object_cache_misses); + } } if (meta == NULL) { diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c new file mode 100644 index 00000000000..4598b9dc353 --- /dev/null +++ b/Modules/_remote_debugging/frame_cache.c @@ -0,0 +1,236 @@ +/****************************************************************************** + * Remote Debugging Module - Frame Cache + * + * This file contains functions for caching frame information to optimize + * repeated stack unwinding for profiling. + ******************************************************************************/ + +#include "_remote_debugging.h" + +/* ============================================================================ + * FRAME CACHE - stores (address, frame_info) pairs per thread + * Uses preallocated fixed-size arrays for efficiency and bounded memory. 
+ * ============================================================================ */ + +int +frame_cache_init(RemoteUnwinderObject *unwinder) +{ + unwinder->frame_cache = PyMem_Calloc(FRAME_CACHE_MAX_THREADS, sizeof(FrameCacheEntry)); + if (!unwinder->frame_cache) { + PyErr_NoMemory(); + return -1; + } + return 0; +} + +void +frame_cache_cleanup(RemoteUnwinderObject *unwinder) +{ + if (!unwinder->frame_cache) { + return; + } + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + Py_CLEAR(unwinder->frame_cache[i].frame_list); + } + PyMem_Free(unwinder->frame_cache); + unwinder->frame_cache = NULL; +} + +// Find cache entry by thread_id +FrameCacheEntry * +frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id) +{ + if (!unwinder->frame_cache || thread_id == 0) { + return NULL; + } + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == thread_id) { + return &unwinder->frame_cache[i]; + } + } + return NULL; +} + +// Allocate a cache slot for a thread +// Returns NULL if cache is full (graceful degradation) +static FrameCacheEntry * +frame_cache_alloc_slot(RemoteUnwinderObject *unwinder, uint64_t thread_id) +{ + if (!unwinder->frame_cache || thread_id == 0) { + return NULL; + } + // First check if thread already has an entry + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == thread_id) { + return &unwinder->frame_cache[i]; + } + } + // Find empty slot + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == 0) { + return &unwinder->frame_cache[i]; + } + } + // Cache full - graceful degradation + return NULL; +} + +// Remove cache entries for threads not seen in the result +// result structure: list of InterpreterInfo, where InterpreterInfo[1] is threads list, +// and ThreadInfo[0] is the thread_id +void +frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result) +{ + if (!unwinder->frame_cache || !result || !PyList_Check(result)) { + return; + } + + // Build array of seen thread IDs from result + uint64_t seen_threads[FRAME_CACHE_MAX_THREADS]; + int num_seen = 0; + + Py_ssize_t num_interps = PyList_GET_SIZE(result); + for (Py_ssize_t i = 0; i < num_interps && num_seen < FRAME_CACHE_MAX_THREADS; i++) { + PyObject *interp_info = PyList_GET_ITEM(result, i); + PyObject *threads = PyStructSequence_GetItem(interp_info, 1); + if (!threads || !PyList_Check(threads)) { + continue; + } + Py_ssize_t num_threads = PyList_GET_SIZE(threads); + for (Py_ssize_t j = 0; j < num_threads && num_seen < FRAME_CACHE_MAX_THREADS; j++) { + PyObject *thread_info = PyList_GET_ITEM(threads, j); + PyObject *tid_obj = PyStructSequence_GetItem(thread_info, 0); + if (tid_obj) { + uint64_t tid = PyLong_AsUnsignedLongLong(tid_obj); + if (!PyErr_Occurred()) { + seen_threads[num_seen++] = tid; + } else { + PyErr_Clear(); + } + } + } + } + + // Invalidate entries not in seen list + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == 0) { + continue; + } + int found = 0; + for (int j = 0; j < num_seen; j++) { + if (unwinder->frame_cache[i].thread_id == seen_threads[j]) { + found = 1; + break; + } + } + if (!found) { + // Clear this entry + Py_CLEAR(unwinder->frame_cache[i].frame_list); + unwinder->frame_cache[i].thread_id = 0; + unwinder->frame_cache[i].num_addrs = 0; + STATS_INC(unwinder, stale_cache_invalidations); + } + } +} + +// Find last_profiled_frame in cache and extend frame_info with cached continuation +// If 
frame_addrs is provided (not NULL), also extends it with cached addresses +int +frame_cache_lookup_and_extend( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + uintptr_t last_profiled_frame, + PyObject *frame_info, + uintptr_t *frame_addrs, + Py_ssize_t *num_addrs, + Py_ssize_t max_addrs) +{ + if (!unwinder->frame_cache || last_profiled_frame == 0) { + return 0; + } + + FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id); + if (!entry || !entry->frame_list) { + return 0; + } + + // Find the index where last_profiled_frame matches + Py_ssize_t start_idx = -1; + for (Py_ssize_t i = 0; i < entry->num_addrs; i++) { + if (entry->addrs[i] == last_profiled_frame) { + start_idx = i; + break; + } + } + + if (start_idx < 0) { + return 0; // Not found + } + + Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list); + + // Extend frame_info with frames from start_idx onwards + PyObject *slice = PyList_GetSlice(entry->frame_list, start_idx, num_frames); + if (!slice) { + return -1; + } + + Py_ssize_t cur_size = PyList_GET_SIZE(frame_info); + int result = PyList_SetSlice(frame_info, cur_size, cur_size, slice); + Py_DECREF(slice); + + if (result < 0) { + return -1; + } + + // Also extend frame_addrs with cached addresses if provided + if (frame_addrs) { + for (Py_ssize_t i = start_idx; i < entry->num_addrs && *num_addrs < max_addrs; i++) { + frame_addrs[(*num_addrs)++] = entry->addrs[i]; + } + } + + return 1; +} + +// Store frame list with addresses in cache +// Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error +int +frame_cache_store( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + PyObject *frame_list, + const uintptr_t *addrs, + Py_ssize_t num_addrs) +{ + if (!unwinder->frame_cache || thread_id == 0) { + return 0; + } + + // Clamp to max frames + if (num_addrs > FRAME_CACHE_MAX_FRAMES) { + num_addrs = FRAME_CACHE_MAX_FRAMES; + } + + FrameCacheEntry *entry = frame_cache_alloc_slot(unwinder, thread_id); + if (!entry) { + // Cache full - graceful degradation + return 0; + } + + // Clear old frame_list if replacing + Py_CLEAR(entry->frame_list); + + // Store full frame list (don't truncate to num_addrs - frames beyond the + // address array limit are still valid and needed for full cache hits) + Py_ssize_t num_frames = PyList_GET_SIZE(frame_list); + entry->frame_list = PyList_GetSlice(frame_list, 0, num_frames); + if (!entry->frame_list) { + return -1; + } + entry->thread_id = thread_id; + memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t)); + entry->num_addrs = num_addrs; + + return 1; +} diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index d60caadcb9a..b77c0ca556d 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -189,6 +189,8 @@ parse_frame_object( set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame"); return -1; } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable); @@ -258,14 +260,39 @@ process_frame_chain( uintptr_t initial_frame_addr, StackChunkList *chunks, PyObject *frame_info, - uintptr_t gc_frame) + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + int *stopped_at_cached_frame, + uintptr_t *frame_addrs, // optional: C array to receive frame addresses 
+ Py_ssize_t *num_addrs, // in/out: current count / updated count + Py_ssize_t max_addrs) // max capacity of frame_addrs array { uintptr_t frame_addr = initial_frame_addr; uintptr_t prev_frame_addr = 0; - const size_t MAX_FRAMES = 1024; + const size_t MAX_FRAMES = 1024 + 512; size_t frame_count = 0; + // Initialize output flag + if (stopped_at_cached_frame) { + *stopped_at_cached_frame = 0; + } + + // Quick check: if current_frame == last_profiled_frame, entire stack is unchanged + if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) { + if (stopped_at_cached_frame) { + *stopped_at_cached_frame = 1; + } + return 0; + } + while ((void*)frame_addr != NULL) { + // Check if we've reached the cached frame - if so, stop here + if (last_profiled_frame != 0 && frame_addr == last_profiled_frame) { + if (stopped_at_cached_frame) { + *stopped_at_cached_frame = 1; + } + break; + } PyObject *frame = NULL; uintptr_t next_frame_addr = 0; uintptr_t stackpointer = 0; @@ -286,7 +313,6 @@ process_frame_chain( } } if (frame == NULL && PyList_GET_SIZE(frame_info) == 0) { - // If the first frame is missing, the chain is broken: const char *e = "Failed to parse initial frame in chain"; PyErr_SetString(PyExc_RuntimeError, e); return -1; @@ -310,36 +336,40 @@ process_frame_chain( extra_frame = &_Py_STR(native); } if (extra_frame) { - // Use "~" as file and 0 as line, since that's what pstats uses: PyObject *extra_frame_info = make_frame_info( unwinder, _Py_LATIN1_CHR('~'), _PyLong_GetZero(), extra_frame); if (extra_frame_info == NULL) { return -1; } - int error = PyList_Append(frame_info, extra_frame_info); - Py_DECREF(extra_frame_info); - if (error) { - const char *e = "Failed to append extra frame to frame info list"; - set_exception_cause(unwinder, PyExc_RuntimeError, e); + if (PyList_Append(frame_info, extra_frame_info) < 0) { + Py_DECREF(extra_frame_info); + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append extra frame"); return -1; } + // Extra frames use 0 as address (they're synthetic) + if (frame_addrs && *num_addrs < max_addrs) { + frame_addrs[(*num_addrs)++] = 0; + } + Py_DECREF(extra_frame_info); } if (frame) { if (prev_frame_addr && frame_addr != prev_frame_addr) { const char *f = "Broken frame chain: expected frame at 0x%lx, got 0x%lx"; PyErr_Format(PyExc_RuntimeError, f, prev_frame_addr, frame_addr); Py_DECREF(frame); - const char *e = "Frame chain consistency check failed"; - set_exception_cause(unwinder, PyExc_RuntimeError, e); + set_exception_cause(unwinder, PyExc_RuntimeError, "Frame chain consistency check failed"); return -1; } - if (PyList_Append(frame_info, frame) == -1) { + if (PyList_Append(frame_info, frame) < 0) { Py_DECREF(frame); - const char *e = "Failed to append frame to frame info list"; - set_exception_cause(unwinder, PyExc_RuntimeError, e); + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append frame"); return -1; } + // Track the address for this frame + if (frame_addrs && *num_addrs < max_addrs) { + frame_addrs[(*num_addrs)++] = frame_addr; + } Py_DECREF(frame); } @@ -349,3 +379,208 @@ process_frame_chain( return 0; } + +// Clear last_profiled_frame for all threads in the target process. +// This must be called at the start of profiling to avoid stale values +// from previous profilers causing us to stop frame walking early. 
+int +clear_last_profiled_frames(RemoteUnwinderObject *unwinder) +{ + uintptr_t current_interp = unwinder->interpreter_addr; + uintptr_t zero = 0; + + while (current_interp != 0) { + // Get first thread in this interpreter + uintptr_t tstate_addr; + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + current_interp + unwinder->debug_offsets.interpreter_state.threads_head, + sizeof(void*), + &tstate_addr) < 0) { + // Non-fatal: just skip clearing + PyErr_Clear(); + return 0; + } + + // Iterate all threads in this interpreter + while (tstate_addr != 0) { + // Clear last_profiled_frame + uintptr_t lpf_addr = tstate_addr + unwinder->debug_offsets.thread_state.last_profiled_frame; + if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, + sizeof(uintptr_t), &zero) < 0) { + // Non-fatal: just continue + PyErr_Clear(); + } + + // Move to next thread + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + tstate_addr + unwinder->debug_offsets.thread_state.next, + sizeof(void*), + &tstate_addr) < 0) { + PyErr_Clear(); + break; + } + } + + // Move to next interpreter + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + current_interp + unwinder->debug_offsets.interpreter_state.next, + sizeof(void*), + &current_interp) < 0) { + PyErr_Clear(); + break; + } + } + + return 0; +} + +// Fast path: check if we have a full cache hit (parent stack unchanged) +// A "full hit" means current frame == last profiled frame, so we can reuse +// cached parent frames. We always read the current frame from memory to get +// updated line numbers (the line within a frame can change between samples). +// Returns: 1 if full hit (frame_info populated with current frame + cached parents), +// 0 if miss, -1 on error +static int +try_full_cache_hit( + RemoteUnwinderObject *unwinder, + uintptr_t frame_addr, + uintptr_t last_profiled_frame, + uint64_t thread_id, + PyObject *frame_info) +{ + if (!unwinder->frame_cache || last_profiled_frame == 0) { + return 0; + } + // Full hit only if current frame == last profiled frame + if (frame_addr != last_profiled_frame) { + return 0; + } + + FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id); + if (!entry || !entry->frame_list) { + return 0; + } + + // Verify first address matches (sanity check) + if (entry->num_addrs == 0 || entry->addrs[0] != frame_addr) { + return 0; + } + + // Always read the current frame from memory to get updated line number + PyObject *current_frame = NULL; + uintptr_t code_object_addr = 0; + uintptr_t previous_frame = 0; + int parse_result = parse_frame_object(unwinder, &current_frame, frame_addr, + &code_object_addr, &previous_frame); + if (parse_result < 0) { + return -1; + } + + // Get cached parent frames first (before modifying frame_info) + Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list); + PyObject *parent_slice = NULL; + if (cached_size > 1) { + parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size); + if (!parent_slice) { + Py_XDECREF(current_frame); + return -1; + } + } + + // Now safe to modify frame_info - add current frame if valid + if (current_frame != NULL) { + if (PyList_Append(frame_info, current_frame) < 0) { + Py_DECREF(current_frame); + Py_XDECREF(parent_slice); + return -1; + } + Py_DECREF(current_frame); + STATS_ADD(unwinder, frames_read_from_memory, 1); + } + + // Extend with cached parent frames + if (parent_slice) { + Py_ssize_t cur_size = PyList_GET_SIZE(frame_info); + int result = PyList_SetSlice(frame_info, cur_size, cur_size, parent_slice); +
Py_DECREF(parent_slice); + if (result < 0) { + return -1; + } + STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1); + } + + STATS_INC(unwinder, frame_cache_hits); + return 1; +} + +// High-level helper: collect frames with cache optimization +// Returns complete frame_info list, handling all cache logic internally +int +collect_frames_with_cache( + RemoteUnwinderObject *unwinder, + uintptr_t frame_addr, + StackChunkList *chunks, + PyObject *frame_info, + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + uint64_t thread_id) +{ + // Fast path: check for full cache hit first (no allocations needed) + int full_hit = try_full_cache_hit(unwinder, frame_addr, last_profiled_frame, + thread_id, frame_info); + if (full_hit != 0) { + return full_hit < 0 ? -1 : 0; // Either error or success + } + + uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; + Py_ssize_t num_addrs = 0; + Py_ssize_t frames_before = PyList_GET_SIZE(frame_info); + + int stopped_at_cached = 0; + if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, gc_frame, + last_profiled_frame, &stopped_at_cached, + addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { + return -1; + } + + // Track frames read from memory (frames added by process_frame_chain) + STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before); + + // If stopped at cached frame, extend with cached continuation (both frames and addresses) + if (stopped_at_cached) { + Py_ssize_t frames_before_cache = PyList_GET_SIZE(frame_info); + int cache_result = frame_cache_lookup_and_extend(unwinder, thread_id, last_profiled_frame, + frame_info, addrs, &num_addrs, + FRAME_CACHE_MAX_FRAMES); + if (cache_result < 0) { + return -1; + } + if (cache_result == 0) { + // Cache miss - continue walking from last_profiled_frame to get the rest + STATS_INC(unwinder, frame_cache_misses); + Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info); + if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, gc_frame, + 0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { + return -1; + } + STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before_walk); + } else { + // Partial cache hit + STATS_INC(unwinder, frame_cache_partial_hits); + STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - frames_before_cache); + } + } else if (last_profiled_frame == 0) { + // No cache involvement (no last_profiled_frame or cache disabled) + STATS_INC(unwinder, frame_cache_misses); + } + + // Store in cache (frame_cache_store handles truncation if num_addrs > FRAME_CACHE_MAX_FRAMES) + if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) { + return -1; + } + + return 0; +} diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 6cd9fad37de..123e4f5c4d7 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -235,6 +235,8 @@ _remote_debugging.RemoteUnwinder.__init__ skip_non_matching_threads: bool = True native: bool = False gc: bool = False + cache_frames: bool = False + stats: bool = False Initialize a new RemoteUnwinder object for debugging a remote Python process. @@ -253,6 +255,10 @@ Initialize a new RemoteUnwinder object for debugging a remote Python process. non-Python code. gc: If True, include artificial "" frames to denote active garbage collection. + cache_frames: If True, enable frame caching optimization to avoid re-reading + unchanged parent frames between samples. 
+ stats: If True, collect statistics about cache hits, memory reads, etc. + Use get_stats() to retrieve the collected statistics. The RemoteUnwinder provides functionality to inspect and debug a running Python process, including examining thread states, stack frames and other runtime data. @@ -270,8 +276,9 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, int only_active_thread, int mode, int debug, int skip_non_matching_threads, - int native, int gc) -/*[clinic end generated code: output=e9eb6b4df119f6e0 input=606d099059207df2]*/ + int native, int gc, + int cache_frames, int stats) +/*[clinic end generated code: output=b34ef8cce013c975 input=df2221ef114c3d6a]*/ { // Validate that all_threads and only_active_thread are not both True if (all_threads && only_active_thread) { @@ -283,18 +290,24 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, #ifdef Py_GIL_DISABLED if (only_active_thread) { PyErr_SetString(PyExc_ValueError, - "only_active_thread is not supported when Py_GIL_DISABLED is not defined"); + "only_active_thread is not supported in free-threaded builds"); return -1; } #endif self->native = native; self->gc = gc; + self->cache_frames = cache_frames; + self->collect_stats = stats; + self->stale_invalidation_counter = 0; self->debug = debug; self->only_active_thread = only_active_thread; self->mode = mode; self->skip_non_matching_threads = skip_non_matching_threads; self->cached_state = NULL; + self->frame_cache = NULL; + // Initialize stats to zero + memset(&self->stats, 0, sizeof(self->stats)); if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) { set_exception_cause(self, PyExc_RuntimeError, "Failed to initialize process handle"); return -1; @@ -375,6 +388,16 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, self->win_process_buffer_size = 0; #endif + if (cache_frames && frame_cache_init(self) < 0) { + return -1; + } + + // Clear stale last_profiled_frame values from previous profilers + // This prevents us from stopping frame walking early due to stale values + if (cache_frames) { + clear_last_profiled_frames(self); + } + return 0; } @@ -429,6 +452,8 @@ static PyObject * _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self) /*[clinic end generated code: output=666192b90c69d567 input=bcff01c73cccc1c0]*/ { + STATS_INC(self, total_samples); + PyObject* result = PyList_New(0); if (!result) { set_exception_cause(self, PyExc_MemoryError, "Failed to create stack trace result list"); @@ -591,7 +616,15 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self } exit: - _Py_RemoteDebug_ClearCache(&self->handle); + // Invalidate cache entries for threads not seen in this sample. + // Only do this every 1024 iterations to avoid performance overhead. + if (self->cache_frames && result) { + if (++self->stale_invalidation_counter >= 1024) { + self->stale_invalidation_counter = 0; + frame_cache_invalidate_stale(self, result); + } + } + _Py_RemoteDebug_ClearCache(&self->handle); return result; } @@ -757,10 +790,114 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject return NULL; } +/*[clinic input] +@permit_long_docstring_body +@critical_section +_remote_debugging.RemoteUnwinder.get_stats + +Get collected statistics about profiling performance. + +Returns a dictionary containing statistics about cache performance, +memory reads, and other profiling metrics. Only available if the +RemoteUnwinder was created with stats=True. 
+ +Returns: + dict: A dictionary containing: + - total_samples: Total number of get_stack_trace calls + - frame_cache_hits: Full cache hits (entire stack unchanged) + - frame_cache_misses: Cache misses requiring full walk + - frame_cache_partial_hits: Partial hits (stopped at cached frame) + - frames_read_from_cache: Total frames retrieved from cache + - frames_read_from_memory: Total frames read from remote memory + - memory_reads: Total remote memory read operations + - memory_bytes_read: Total bytes read from remote memory + - code_object_cache_hits: Code object cache hits + - code_object_cache_misses: Code object cache misses + - stale_cache_invalidations: Times stale cache entries were cleared + - frame_cache_hit_rate: Percentage of samples that hit the cache + - code_object_cache_hit_rate: Percentage of code object lookups that hit cache + +Raises: + RuntimeError: If stats collection was not enabled (stats=False) +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) +/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/ +{ + if (!self->collect_stats) { + PyErr_SetString(PyExc_RuntimeError, + "Statistics collection was not enabled. " + "Create RemoteUnwinder with stats=True to collect statistics."); + return NULL; + } + + PyObject *result = PyDict_New(); + if (!result) { + return NULL; + } + +#define ADD_STAT(name) do { \ + PyObject *val = PyLong_FromUnsignedLongLong(self->stats.name); \ + if (!val || PyDict_SetItemString(result, #name, val) < 0) { \ + Py_XDECREF(val); \ + Py_DECREF(result); \ + return NULL; \ + } \ + Py_DECREF(val); \ +} while(0) + + ADD_STAT(total_samples); + ADD_STAT(frame_cache_hits); + ADD_STAT(frame_cache_misses); + ADD_STAT(frame_cache_partial_hits); + ADD_STAT(frames_read_from_cache); + ADD_STAT(frames_read_from_memory); + ADD_STAT(memory_reads); + ADD_STAT(memory_bytes_read); + ADD_STAT(code_object_cache_hits); + ADD_STAT(code_object_cache_misses); + ADD_STAT(stale_cache_invalidations); + +#undef ADD_STAT + + // Calculate and add derived statistics + // Hit rate is calculated as (hits + partial_hits) / total_cache_lookups + double frame_cache_hit_rate = 0.0; + uint64_t total_cache_lookups = self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits + self->stats.frame_cache_misses; + if (total_cache_lookups > 0) { + frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits) + / (double)total_cache_lookups; + } + PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate); + if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) { + Py_XDECREF(hit_rate); + Py_DECREF(result); + return NULL; + } + Py_DECREF(hit_rate); + + double code_object_hit_rate = 0.0; + uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses; + if (total_code_lookups > 0) { + code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups; + } + PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate); + if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) { + Py_XDECREF(code_hit_rate); + Py_DECREF(result); + return NULL; + } + Py_DECREF(code_hit_rate); + + return result; +} + static PyMethodDef RemoteUnwinder_methods[] = { _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STACK_TRACE_METHODDEF _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ALL_AWAITED_BY_METHODDEF 
_REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ASYNC_STACK_TRACE_METHODDEF + _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF {NULL, NULL} }; @@ -787,6 +924,7 @@ RemoteUnwinder_dealloc(PyObject *op) _Py_RemoteDebug_ClearCache(&self->handle); _Py_RemoteDebug_CleanupProcHandle(&self->handle); } + frame_cache_cleanup(self); PyObject_Del(self); Py_DECREF(tp); } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 99147b01a1b..ce013f902d1 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -296,6 +296,8 @@ unwind_stack_for_thread( set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); goto error; } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size); long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id); @@ -309,6 +311,8 @@ unwind_stack_for_thread( set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read GC state"); goto error; } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.gc.size); // Calculate thread status using flags (always) int status_flags = 0; @@ -383,14 +387,36 @@ unwind_stack_for_thread( goto error; } - if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks"); - goto error; + // In cache mode, copying stack chunks is more expensive than direct memory reads + if (!unwinder->cache_frames) { + if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks"); + goto error; + } } - if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, gc_frame) < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain"); - goto error; + if (unwinder->cache_frames) { + // Use cache to avoid re-reading unchanged parent frames + uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts, + unwinder->debug_offsets.thread_state.last_profiled_frame); + if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info, + gc_frame, last_profiled_frame, tid) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames"); + goto error; + } + // Update last_profiled_frame for next sample + uintptr_t lpf_addr = *current_tstate + unwinder->debug_offsets.thread_state.last_profiled_frame; + if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, + sizeof(uintptr_t), &frame_addr) < 0) { + PyErr_Clear(); // Non-fatal + } + } else { + // No caching - process entire frame chain + if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, + gc_frame, 0, NULL, NULL, NULL, 0) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain"); + goto error; + } } *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next); diff --git a/PCbuild/_remote_debugging.vcxproj b/PCbuild/_remote_debugging.vcxproj index 3ef34ef0563..c91c9cf3652 100644 --- a/PCbuild/_remote_debugging.vcxproj +++ b/PCbuild/_remote_debugging.vcxproj @@ -102,6 +102,7 @@ + diff --git a/PCbuild/_remote_debugging.vcxproj.filters b/PCbuild/_remote_debugging.vcxproj.filters index 5c117a79f3b..b37a2c5575c 100644 --- a/PCbuild/_remote_debugging.vcxproj.filters +++ b/PCbuild/_remote_debugging.vcxproj.filters @@ -24,6 +24,9 @@ Source Files + + Source Files + Source Files diff --git 
a/Python/ceval.c b/Python/ceval.c index aadc6369cbe..382ae210ebb 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -2288,6 +2288,16 @@ clear_gen_frame(PyThreadState *tstate, _PyInterpreterFrame * frame) void _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame * frame) { + // Update last_profiled_frame for remote profiler frame caching. + // By this point, tstate->current_frame is already set to the parent frame. + // Only update if we're popping the exact frame that was last profiled. + // This avoids corrupting the cache when transient frames (called and returned + // between profiler samples) update last_profiled_frame to addresses the + // profiler never saw. + if (tstate->last_profiled_frame != NULL && tstate->last_profiled_frame == frame) { + tstate->last_profiled_frame = tstate->current_frame; + } + if (frame->owner == FRAME_OWNED_BY_THREAD) { clear_thread_frame(tstate, frame); } diff --git a/Python/remote_debug.h b/Python/remote_debug.h index 517568358a0..1c02870d3af 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -1102,6 +1102,115 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address #endif } +#if defined(__linux__) && HAVE_PROCESS_VM_READV +// Fallback write using /proc/pid/mem +static int +_Py_RemoteDebug_WriteRemoteMemoryFallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) +{ + if (handle->memfd == -1) { + if (open_proc_mem_fd(handle) < 0) { + return -1; + } + } + + struct iovec local[1]; + Py_ssize_t result = 0; + Py_ssize_t written = 0; + + do { + local[0].iov_base = (char*)src + result; + local[0].iov_len = len - result; + off_t offset = remote_address + result; + + written = pwritev(handle->memfd, local, 1, offset); + if (written < 0) { + PyErr_SetFromErrno(PyExc_OSError); + return -1; + } + + result += written; + } while ((size_t)written != local[0].iov_len); + return 0; +} +#endif // __linux__ + +// Platform-independent memory write function +UNUSED static int +_Py_RemoteDebug_WriteRemoteMemory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) +{ +#ifdef MS_WINDOWS + SIZE_T written = 0; + SIZE_T result = 0; + do { + if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) { + PyErr_SetFromWindowsErr(0); + DWORD error = GetLastError(); + _set_debug_exception_cause(PyExc_OSError, + "WriteProcessMemory failed for PID %d at address 0x%lx " + "(size %zu, partial write %zu bytes): Windows error %lu", + handle->pid, remote_address + result, len - result, result, error); + return -1; + } + result += written; + } while (result < len); + return 0; +#elif defined(__linux__) && HAVE_PROCESS_VM_READV + if (handle->memfd != -1) { + return _Py_RemoteDebug_WriteRemoteMemoryFallback(handle, remote_address, len, src); + } + struct iovec local[1]; + struct iovec remote[1]; + Py_ssize_t result = 0; + Py_ssize_t written = 0; + + do { + local[0].iov_base = (void*)((char*)src + result); + local[0].iov_len = len - result; + remote[0].iov_base = (void*)((char*)remote_address + result); + remote[0].iov_len = len - result; + + written = process_vm_writev(handle->pid, local, 1, remote, 1, 0); + if (written < 0) { + if (errno == ENOSYS) { + return _Py_RemoteDebug_WriteRemoteMemoryFallback(handle, remote_address, len, src); + } + PyErr_SetFromErrno(PyExc_OSError); + _set_debug_exception_cause(PyExc_OSError, + "process_vm_writev failed for PID %d at address 0x%lx " + "(size %zu, partial write %zd bytes): 
%s", + handle->pid, remote_address + result, len - result, result, strerror(errno)); + return -1; + } + + result += written; + } while ((size_t)written != local[0].iov_len); + return 0; +#elif defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX + kern_return_t kr = mach_vm_write( + handle->task, + (mach_vm_address_t)remote_address, + (vm_offset_t)src, + (mach_msg_type_number_t)len); + + if (kr != KERN_SUCCESS) { + switch (kr) { + case KERN_PROTECTION_FAILURE: + PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory"); + break; + case KERN_INVALID_ARGUMENT: + PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write"); + break; + default: + PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr); + } + return -1; + } + return 0; +#else + Py_UNREACHABLE(); +#endif +} + UNUSED static int _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, uintptr_t addr, diff --git a/Python/remote_debugging.c b/Python/remote_debugging.c index 71ffb17ed68..5b50b95db94 100644 --- a/Python/remote_debugging.c +++ b/Python/remote_debugging.c @@ -24,104 +24,11 @@ read_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, void* d return _Py_RemoteDebug_ReadRemoteMemory(handle, remote_address, len, dst); } -// Why is pwritev not guarded? Except on Android API level 23 (no longer -// supported), HAVE_PROCESS_VM_READV is sufficient. -#if defined(__linux__) && HAVE_PROCESS_VM_READV -static int -write_memory_fallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) -{ - if (handle->memfd == -1) { - if (open_proc_mem_fd(handle) < 0) { - return -1; - } - } - - struct iovec local[1]; - Py_ssize_t result = 0; - Py_ssize_t written = 0; - - do { - local[0].iov_base = (char*)src + result; - local[0].iov_len = len - result; - off_t offset = remote_address + result; - - written = pwritev(handle->memfd, local, 1, offset); - if (written < 0) { - PyErr_SetFromErrno(PyExc_OSError); - return -1; - } - - result += written; - } while ((size_t)written != local[0].iov_len); - return 0; -} -#endif // __linux__ - +// Use the shared write function from remote_debug.h static int write_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) { -#ifdef MS_WINDOWS - SIZE_T written = 0; - SIZE_T result = 0; - do { - if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) { - PyErr_SetFromWindowsErr(0); - return -1; - } - result += written; - } while (result < len); - return 0; -#elif defined(__linux__) && HAVE_PROCESS_VM_READV - if (handle->memfd != -1) { - return write_memory_fallback(handle, remote_address, len, src); - } - struct iovec local[1]; - struct iovec remote[1]; - Py_ssize_t result = 0; - Py_ssize_t written = 0; - - do { - local[0].iov_base = (void*)((char*)src + result); - local[0].iov_len = len - result; - remote[0].iov_base = (void*)((char*)remote_address + result); - remote[0].iov_len = len - result; - - written = process_vm_writev(handle->pid, local, 1, remote, 1, 0); - if (written < 0) { - if (errno == ENOSYS) { - return write_memory_fallback(handle, remote_address, len, src); - } - PyErr_SetFromErrno(PyExc_OSError); - return -1; - } - - result += written; - } while ((size_t)written != local[0].iov_len); - return 0; -#elif defined(__APPLE__) && TARGET_OS_OSX - kern_return_t kr = mach_vm_write( - pid_to_task(handle->pid), - (mach_vm_address_t)remote_address, - (vm_offset_t)src, - (mach_msg_type_number_t)len); 
- - if (kr != KERN_SUCCESS) { - switch (kr) { - case KERN_PROTECTION_FAILURE: - PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory"); - break; - case KERN_INVALID_ARGUMENT: - PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write"); - break; - default: - PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr); - } - return -1; - } - return 0; -#else - Py_UNREACHABLE(); -#endif + return _Py_RemoteDebug_WriteRemoteMemory(handle, remote_address, len, src); } static int diff --git a/Tools/inspection/benchmark_external_inspection.py b/Tools/inspection/benchmark_external_inspection.py index 0ac7ac4d385..9c40c2f4492 100644 --- a/Tools/inspection/benchmark_external_inspection.py +++ b/Tools/inspection/benchmark_external_inspection.py @@ -434,7 +434,7 @@ def main(): elif args.threads == "only_active": kwargs["only_active_thread"] = True unwinder = _remote_debugging.RemoteUnwinder( - process.pid, **kwargs + process.pid, cache_frames=True, **kwargs ) results = benchmark(unwinder, duration_seconds=args.duration) finally:
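
The frame-cache helpers in the patch (frame_cache_lookup_and_extend and frame_cache_store) are easier to read against a small model of the state they manage. The following is a minimal Python sketch of that model only; FrameCacheEntry, lookup_and_extend and store are illustrative names, not part of the module, and the 128 limit stands in for FRAME_CACHE_MAX_FRAMES. Note that, like the C code, store clamps the address array but keeps the full frame list.

    from dataclasses import dataclass

    FRAME_CACHE_MAX_FRAMES = 128  # illustrative stand-in for the C constant

    @dataclass
    class FrameCacheEntry:
        thread_id: int
        addrs: list        # remote frame addresses, most recent call first
        frame_list: list   # rendered frame-info entries, same order as addrs

    def lookup_and_extend(entry, last_profiled_frame, frame_info, frame_addrs):
        """Append the cached continuation starting at last_profiled_frame.

        Returns 1 on a hit, 0 when the address is not cached (miss)."""
        if entry is None or not last_profiled_frame:
            return 0
        try:
            start = entry.addrs.index(last_profiled_frame)
        except ValueError:
            return 0
        frame_info.extend(entry.frame_list[start:])
        frame_addrs.extend(entry.addrs[start:])
        return 1

    def store(cache, thread_id, frame_list, addrs):
        """Remember this sample's frames; addresses are clamped, frames are not."""
        cache[thread_id] = FrameCacheEntry(thread_id,
                                           addrs[:FRAME_CACHE_MAX_FRAMES],
                                           list(frame_list))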
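
Putting the pieces together, collect_frames_with_cache combines a full-hit fast path, a partial walk that stops at last_profiled_frame, and a fallback walk when the cache entry turns out to be stale. A rough Python sketch of that control flow follows; read_frame(addr) is a hypothetical stand-in for parse_frame_object that returns (frame_info, parent_address), and entry is the per-thread cache entry (or None) with .addrs and .frame_list as above.

    def collect_frames(current_frame, last_profiled_frame, entry, read_frame):
        """Sketch of the cache-aware stack collection for one thread."""
        # Full hit: the stack top is exactly the frame the profiler saw last time.
        # Only the top frame is re-read, so its line number stays fresh.
        if (entry is not None and last_profiled_frame
                and current_frame == last_profiled_frame
                and entry.addrs and entry.addrs[0] == current_frame):
            info, _ = read_frame(current_frame)
            return [info] + entry.frame_list[1:], list(entry.addrs)

        frames, addrs, addr = [], [], current_frame
        while addr and addr != last_profiled_frame:     # walk only the new frames
            info, parent = read_frame(addr)
            frames.append(info)
            addrs.append(addr)
            addr = parent

        if addr:                                        # stopped at the cached frame
            if entry is not None and addr in entry.addrs:
                i = entry.addrs.index(addr)             # splice the cached tail
                frames += entry.frame_list[i:]
                addrs += entry.addrs[i:]
            else:
                while addr:                             # stale cache: finish the walk
                    info, parent = read_frame(addr)
                    frames.append(info)
                    addrs.append(addr)
                    addr = parent
        return frames, addrs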
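
From the Python side the new behaviour is opt-in. A minimal usage sketch follows, assuming a running target process; the PID, sample count and sleep interval are illustrative, while the keyword arguments and stat keys come from the clinic signature and get_stats() docstring above. frame_cache_hit_rate is derived as 100 * (full hits + partial hits) / (full hits + partial hits + misses).

    import time
    import _remote_debugging

    pid = 12345  # illustrative: PID of the Python process being profiled

    # cache_frames enables the last_profiled_frame optimization;
    # stats enables the counters returned by get_stats().
    unwinder = _remote_debugging.RemoteUnwinder(pid, all_threads=True,
                                                cache_frames=True, stats=True)

    for _ in range(100):                     # take 100 samples
        traces = unwinder.get_stack_trace()
        time.sleep(0.01)

    stats = unwinder.get_stats()             # RuntimeError if created with stats=False
    print(f"cache hit rate:     {stats['frame_cache_hit_rate']:.1f}%")
    print(f"frames from cache:  {stats['frames_read_from_cache']}")
    print(f"frames from memory: {stats['frames_read_from_memory']}")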