gh-138122: Implement frame caching in RemoteUnwinder to reduce memory reads (#142137)

This PR implements frame caching in the RemoteUnwinder class to significantly reduce memory reads when profiling remote processes with deep call stacks. When cache_frames=True, the unwinder stores the frame chain from each sample and reuses unchanged portions in subsequent samples. Since most profiling samples capture similar call stacks (especially the parent frames), this optimization avoids repeatedly reading the same frame data from the target process. The implementation adds a last_profiled_frame field to the thread state that tracks where the previous sample stopped. On the next sample, if the current frame chain reaches this marker, the cached frames from that point onward are reused instead of being re-read from remote memory. The sampling profiler now enables frame caching by default.
2025-12-07 13:50:06 +00:00 · 2025-12-06 22:37:34 +00:00 · 2025-12-06 22:37:34 +00:00 · 572c780aa8
commit 572c780aa8
parent 332da6295f
24 changed files with 1855 additions and 142 deletions
--- a/Include/cpython/pystate.h
+++ b/Include/cpython/pystate.h
@ -135,6 +135,8 @@ struct _ts {
    /* Pointer to currently executing frame. */
    struct _PyInterpreterFrame *current_frame;

+    struct _PyInterpreterFrame *last_profiled_frame;
+
    Py_tracefunc c_profilefunc;
    Py_tracefunc c_tracefunc;
    PyObject *c_profileobj;
--- a/Include/internal/pycore_debug_offsets.h
+++ b/Include/internal/pycore_debug_offsets.h
@ -102,6 +102,7 @@ typedef struct _Py_DebugOffsets {
        uint64_t next;
        uint64_t interp;
        uint64_t current_frame;
+        uint64_t last_profiled_frame;
        uint64_t thread_id;
        uint64_t native_thread_id;
        uint64_t datastack_chunk;
@ -272,6 +273,7 @@ typedef struct _Py_DebugOffsets {
        .next = offsetof(PyThreadState, next), \
        .interp = offsetof(PyThreadState, interp), \
        .current_frame = offsetof(PyThreadState, current_frame), \
+        .last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
        .thread_id = offsetof(PyThreadState, thread_id), \
        .native_thread_id = offsetof(PyThreadState, native_thread_id), \
        .datastack_chunk = offsetof(PyThreadState, datastack_chunk), \
--- a/Include/internal/pycore_global_objects_fini_generated.h
+++ b/Include/internal/pycore_global_objects_fini_generated.h
@ -1609,6 +1609,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_parameter_type));
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_return));
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_stack));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cache_frames));
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_datetime_module));
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_statements));
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cadata));
@ -2053,6 +2054,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel));
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start));
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stats));
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status));
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stderr));
    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stdin));
--- a/Include/internal/pycore_global_strings.h
+++ b/Include/internal/pycore_global_strings.h
@ -332,6 +332,7 @@ struct _Py_global_strings {
        STRUCT_FOR_ID(c_parameter_type)
        STRUCT_FOR_ID(c_return)
        STRUCT_FOR_ID(c_stack)
+        STRUCT_FOR_ID(cache_frames)
        STRUCT_FOR_ID(cached_datetime_module)
        STRUCT_FOR_ID(cached_statements)
        STRUCT_FOR_ID(cadata)
@ -776,6 +777,7 @@ struct _Py_global_strings {
        STRUCT_FOR_ID(stacklevel)
        STRUCT_FOR_ID(start)
        STRUCT_FOR_ID(statement)
+        STRUCT_FOR_ID(stats)
        STRUCT_FOR_ID(status)
        STRUCT_FOR_ID(stderr)
        STRUCT_FOR_ID(stdin)
--- a/Include/internal/pycore_runtime_init_generated.h
+++ b/Include/internal/pycore_runtime_init_generated.h
@ -1607,6 +1607,7 @@ extern "C" {
    INIT_ID(c_parameter_type), \
    INIT_ID(c_return), \
    INIT_ID(c_stack), \
+    INIT_ID(cache_frames), \
    INIT_ID(cached_datetime_module), \
    INIT_ID(cached_statements), \
    INIT_ID(cadata), \
@ -2051,6 +2052,7 @@ extern "C" {
    INIT_ID(stacklevel), \
    INIT_ID(start), \
    INIT_ID(statement), \
+    INIT_ID(stats), \
    INIT_ID(status), \
    INIT_ID(stderr), \
    INIT_ID(stdin), \
--- a/Include/internal/pycore_unicodeobject_generated.h
+++ b/Include/internal/pycore_unicodeobject_generated.h
@ -1108,6 +1108,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
    _PyUnicode_InternStatic(interp, &string);
    assert(_PyUnicode_CheckConsistency(string, 1));
    assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(cache_frames);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
    string = &_Py_ID(cached_datetime_module);
    _PyUnicode_InternStatic(interp, &string);
    assert(_PyUnicode_CheckConsistency(string, 1));
@ -2884,6 +2888,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
    _PyUnicode_InternStatic(interp, &string);
    assert(_PyUnicode_CheckConsistency(string, 1));
    assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(stats);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
    string = &_Py_ID(status);
    _PyUnicode_InternStatic(interp, &string);
    assert(_PyUnicode_CheckConsistency(string, 1));
--- a/InternalDocs/frames.md
+++ b/InternalDocs/frames.md
@ -111,6 +111,26 @@ ### Shim frames
 instruction which cleans up the shim frame and returns.


+### Remote Profiling Frame Cache
+
+The `last_profiled_frame` field in `PyThreadState` supports an optimization for
+remote profilers that sample call stacks from external processes. When a remote
+profiler reads the call stack, it writes the current frame address to this field.
+The eval loop then keeps this pointer valid: whenever the frame it points to
+returns, it is updated to the parent frame (in `_PyEval_FrameClearAndPop`).
+
+This creates a "high-water mark" that always points to a frame still on the stack.
+On subsequent samples, the profiler can walk from `current_frame` until it reaches
+`last_profiled_frame`, knowing that frames from that point downward are unchanged
+and can be retrieved from a cache. This significantly reduces the number of remote
+memory reads needed when call stacks are deep and stable at their base.
+
+The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
+`last_profiled_frame` is non-NULL AND matches the frame being popped. This
+prevents transient frames (called and returned between profiler samples) from
+corrupting the cache pointer, while avoiding any overhead when profiling is inactive.
+
+
 ### The Instruction Pointer

 `_PyInterpreterFrame` has two fields which are used to maintain the instruction
--- a/Lib/profiling/sampling/sample.py
+++ b/Lib/profiling/sampling/sample.py
@ -27,21 +27,24 @@


 class SampleProfiler:
-    def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True):
+    def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True, collect_stats=False):
        self.pid = pid
        self.sample_interval_usec = sample_interval_usec
        self.all_threads = all_threads
        self.mode = mode  # Store mode for later use
+        self.collect_stats = collect_stats
        if _FREE_THREADED_BUILD:
            self.unwinder = _remote_debugging.RemoteUnwinder(
                self.pid, all_threads=self.all_threads, mode=mode, native=native, gc=gc,
-                skip_non_matching_threads=skip_non_matching_threads
+                skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
+                stats=collect_stats
            )
        else:
            only_active_threads = bool(self.all_threads)
            self.unwinder = _remote_debugging.RemoteUnwinder(
                self.pid, only_active_thread=only_active_threads, mode=mode, native=native, gc=gc,
-                skip_non_matching_threads=skip_non_matching_threads
+                skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
+                stats=collect_stats
            )
        # Track sample intervals and total sample count
        self.sample_intervals = deque(maxlen=100)
@ -129,6 +132,10 @@ def sample(self, collector, duration_sec=10, *, async_aware=False):
            print(f"Sample rate: {sample_rate:.2f} samples/sec")
            print(f"Error rate: {error_rate:.2f}%")

+            # Print unwinder stats if stats collection is enabled
+            if self.collect_stats:
+                self._print_unwinder_stats()
+
        # Pass stats to flamegraph collector if it's the right type
        if hasattr(collector, 'set_stats'):
            collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode)
@ -176,17 +183,100 @@ def _print_realtime_stats(self):
            (1.0 / min_hz) * 1_000_000 if min_hz > 0 else 0
        )  # Max time = Min Hz

+        # Build cache stats string if stats collection is enabled
+        cache_stats_str = ""
+        if self.collect_stats:
+            try:
+                stats = self.unwinder.get_stats()
+                hits = stats.get('frame_cache_hits', 0)
+                partial = stats.get('frame_cache_partial_hits', 0)
+                misses = stats.get('frame_cache_misses', 0)
+                total = hits + partial + misses
+                if total > 0:
+                    hit_pct = (hits + partial) / total * 100
+                    cache_stats_str = f" {ANSIColors.MAGENTA}Cache: {hit_pct:.1f}% ({hits}+{partial}/{misses}){ANSIColors.RESET}"
+            except RuntimeError:
+                pass
+
        # Clear line and print stats
        print(
-            f"\r\033[K{ANSIColors.BOLD_BLUE}Real-time sampling stats:{ANSIColors.RESET} "
-            f"{ANSIColors.YELLOW}Mean: {mean_hz:.1f}Hz ({mean_us_per_sample:.2f}µs){ANSIColors.RESET} "
-            f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz ({max_us_per_sample:.2f}µs){ANSIColors.RESET} "
-            f"{ANSIColors.RED}Max: {max_hz:.1f}Hz ({min_us_per_sample:.2f}µs){ANSIColors.RESET} "
-            f"{ANSIColors.CYAN}Samples: {self.total_samples}{ANSIColors.RESET}",
+            f"\r\033[K{ANSIColors.BOLD_BLUE}Stats:{ANSIColors.RESET} "
+            f"{ANSIColors.YELLOW}{mean_hz:.1f}Hz ({mean_us_per_sample:.1f}µs){ANSIColors.RESET} "
+            f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz{ANSIColors.RESET} "
+            f"{ANSIColors.RED}Max: {max_hz:.1f}Hz{ANSIColors.RESET} "
+            f"{ANSIColors.CYAN}N={self.total_samples}{ANSIColors.RESET}"
+            f"{cache_stats_str}",
            end="",
            flush=True,
        )

+    def _print_unwinder_stats(self):
+        """Print unwinder statistics including cache performance."""
+        try:
+            stats = self.unwinder.get_stats()
+        except RuntimeError:
+            return  # Stats not enabled
+
+        print(f"\n{ANSIColors.BOLD_BLUE}{'='*50}{ANSIColors.RESET}")
+        print(f"{ANSIColors.BOLD_BLUE}Unwinder Statistics:{ANSIColors.RESET}")
+
+        # Frame cache stats
+        total_samples = stats.get('total_samples', 0)
+        frame_cache_hits = stats.get('frame_cache_hits', 0)
+        frame_cache_partial_hits = stats.get('frame_cache_partial_hits', 0)
+        frame_cache_misses = stats.get('frame_cache_misses', 0)
+        total_lookups = frame_cache_hits + frame_cache_partial_hits + frame_cache_misses
+
+        # Calculate percentages
+        hits_pct = (frame_cache_hits / total_lookups * 100) if total_lookups > 0 else 0
+        partial_pct = (frame_cache_partial_hits / total_lookups * 100) if total_lookups > 0 else 0
+        misses_pct = (frame_cache_misses / total_lookups * 100) if total_lookups > 0 else 0
+
+        print(f"  {ANSIColors.CYAN}Frame Cache:{ANSIColors.RESET}")
+        print(f"    Total samples:    {total_samples:,}")
+        print(f"    Full hits:        {frame_cache_hits:,} ({ANSIColors.GREEN}{hits_pct:.1f}%{ANSIColors.RESET})")
+        print(f"    Partial hits:     {frame_cache_partial_hits:,} ({ANSIColors.YELLOW}{partial_pct:.1f}%{ANSIColors.RESET})")
+        print(f"    Misses:           {frame_cache_misses:,} ({ANSIColors.RED}{misses_pct:.1f}%{ANSIColors.RESET})")
+
+        # Frame read stats
+        frames_from_cache = stats.get('frames_read_from_cache', 0)
+        frames_from_memory = stats.get('frames_read_from_memory', 0)
+        total_frames = frames_from_cache + frames_from_memory
+        cache_frame_pct = (frames_from_cache / total_frames * 100) if total_frames > 0 else 0
+        memory_frame_pct = (frames_from_memory / total_frames * 100) if total_frames > 0 else 0
+
+        print(f"  {ANSIColors.CYAN}Frame Reads:{ANSIColors.RESET}")
+        print(f"    From cache:       {frames_from_cache:,} ({ANSIColors.GREEN}{cache_frame_pct:.1f}%{ANSIColors.RESET})")
+        print(f"    From memory:      {frames_from_memory:,} ({ANSIColors.RED}{memory_frame_pct:.1f}%{ANSIColors.RESET})")
+
+        # Code object cache stats
+        code_hits = stats.get('code_object_cache_hits', 0)
+        code_misses = stats.get('code_object_cache_misses', 0)
+        total_code = code_hits + code_misses
+        code_hits_pct = (code_hits / total_code * 100) if total_code > 0 else 0
+        code_misses_pct = (code_misses / total_code * 100) if total_code > 0 else 0
+
+        print(f"  {ANSIColors.CYAN}Code Object Cache:{ANSIColors.RESET}")
+        print(f"    Hits:             {code_hits:,} ({ANSIColors.GREEN}{code_hits_pct:.1f}%{ANSIColors.RESET})")
+        print(f"    Misses:           {code_misses:,} ({ANSIColors.RED}{code_misses_pct:.1f}%{ANSIColors.RESET})")
+
+        # Memory operations
+        memory_reads = stats.get('memory_reads', 0)
+        memory_bytes = stats.get('memory_bytes_read', 0)
+        if memory_bytes >= 1024 * 1024:
+            memory_str = f"{memory_bytes / (1024 * 1024):.1f} MB"
+        elif memory_bytes >= 1024:
+            memory_str = f"{memory_bytes / 1024:.1f} KB"
+        else:
+            memory_str = f"{memory_bytes} B"
+        print(f"  {ANSIColors.CYAN}Memory:{ANSIColors.RESET}")
+        print(f"    Read operations:  {memory_reads:,} ({memory_str})")
+
+        # Stale invalidations
+        stale_invalidations = stats.get('stale_cache_invalidations', 0)
+        if stale_invalidations > 0:
+            print(f"  {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}")
+

 def sample(
    pid,
@ -234,7 +324,8 @@ def sample(
        mode=mode,
        native=native,
        gc=gc,
-        skip_non_matching_threads=skip_non_matching_threads
+        skip_non_matching_threads=skip_non_matching_threads,
+        collect_stats=realtime_stats,
    )
    profiler.realtime_stats = realtime_stats

@ -290,7 +381,8 @@ def sample_live(
        mode=mode,
        native=native,
        gc=gc,
-        skip_non_matching_threads=skip_non_matching_threads
+        skip_non_matching_threads=skip_non_matching_threads,
+        collect_stats=realtime_stats,
    )
    profiler.realtime_stats = realtime_stats

--- a/Lib/test/test_external_inspection.py
+++ b/Lib/test/test_external_inspection.py
@ -1,3 +1,4 @@
+import contextlib
 import unittest
 import os
 import textwrap
@ -2038,5 +2039,766 @@ def busy_thread():
                p.stderr.close()


+class TestFrameCaching(unittest.TestCase):
+    """Test that frame caching produces correct results.
+
+    Uses socket-based synchronization for deterministic testing.
+    Tests that check cache reuse do so via object identity (assertIs).
+    """
+
+    maxDiff = None
+    MAX_TRIES = 10
+
+    @contextlib.contextmanager
+    def _target_process(self, script_body):
+        """Context manager for running a target process with socket sync."""
+        port = find_unused_port()
+        script = f"""\
+import socket
+sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+sock.connect(('localhost', {port}))
+{textwrap.dedent(script_body)}
+"""
+
+        with os_helper.temp_dir() as work_dir:
+            script_dir = os.path.join(work_dir, "script_pkg")
+            os.mkdir(script_dir)
+
+            server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            server_socket.bind(("localhost", port))
+            server_socket.settimeout(SHORT_TIMEOUT)
+            server_socket.listen(1)
+
+            script_name = _make_test_script(script_dir, "script", script)
+            client_socket = None
+            p = None
+            try:
+                p = subprocess.Popen([sys.executable, script_name])
+                client_socket, _ = server_socket.accept()
+                server_socket.close()
+
+                def make_unwinder(cache_frames=True):
+                    return RemoteUnwinder(p.pid, all_threads=True, cache_frames=cache_frames)
+
+                yield p, client_socket, make_unwinder
+
+            except PermissionError:
+                self.skipTest("Insufficient permissions to read the stack trace")
+            finally:
+                if client_socket:
+                    client_socket.close()
+                if p:
+                    p.kill()
+                    p.terminate()
+                    p.wait(timeout=SHORT_TIMEOUT)
+
+    def _wait_for_signal(self, client_socket, signal):
+        """Block until signal received from target."""
+        response = b""
+        while signal not in response:
+            chunk = client_socket.recv(64)
+            if not chunk:
+                break
+            response += chunk
+        return response
+
+    def _get_frames(self, unwinder, required_funcs):
+        """Sample and return frame_info list for thread containing required_funcs."""
+        traces = unwinder.get_stack_trace()
+        for interp in traces:
+            for thread in interp.threads:
+                funcs = [f.funcname for f in thread.frame_info]
+                if required_funcs.issubset(set(funcs)):
+                    return thread.frame_info
+        return None
+
+    def _sample_frames(self, client_socket, unwinder, wait_signal, send_ack, required_funcs, expected_frames=1):
+        """Wait for signal, sample frames, send ack. Returns frame_info list."""
+        self._wait_for_signal(client_socket, wait_signal)
+        # Retry up to MAX_TRIES times to let the process reach a steady state
+        for _ in range(self.MAX_TRIES):
+            frames = self._get_frames(unwinder, required_funcs)
+            if frames and len(frames) >= expected_frames:
+                break
+            time.sleep(0.1)
+        client_socket.sendall(send_ack)
+        return frames
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_cache_hit_same_stack(self):
+        """Test that consecutive samples reuse cached parent frame objects.
+
+        The current frame (index 0) is always re-read from memory to get
+        updated line numbers, so it may be a different object. Parent frames
+        (index 1+) should be identical objects from cache.
+        """
+        script_body = """\
+            def level3():
+                sock.sendall(b"sync1")
+                sock.recv(16)
+                sock.sendall(b"sync2")
+                sock.recv(16)
+                sock.sendall(b"sync3")
+                sock.recv(16)
+
+            def level2():
+                level3()
+
+            def level1():
+                level2()
+
+            level1()
+            """
+
+        with self._target_process(script_body) as (p, client_socket, make_unwinder):
+            unwinder = make_unwinder(cache_frames=True)
+            expected = {"level1", "level2", "level3"}
+
+            frames1 = self._sample_frames(client_socket, unwinder, b"sync1", b"ack", expected)
+            frames2 = self._sample_frames(client_socket, unwinder, b"sync2", b"ack", expected)
+            frames3 = self._sample_frames(client_socket, unwinder, b"sync3", b"done", expected)
+
+        self.assertIsNotNone(frames1)
+        self.assertIsNotNone(frames2)
+        self.assertIsNotNone(frames3)
+        self.assertEqual(len(frames1), len(frames2))
+        self.assertEqual(len(frames2), len(frames3))
+
+        # Current frame (index 0) is always re-read, so check value equality
+        self.assertEqual(frames1[0].funcname, frames2[0].funcname)
+        self.assertEqual(frames2[0].funcname, frames3[0].funcname)
+
+        # Parent frames (index 1+) must be identical objects (cache reuse)
+        for i in range(1, len(frames1)):
+            f1, f2, f3 = frames1[i], frames2[i], frames3[i]
+            self.assertIs(f1, f2, f"Frame {i}: samples 1-2 must be same object")
+            self.assertIs(f2, f3, f"Frame {i}: samples 2-3 must be same object")
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_line_number_updates_in_same_frame(self):
+        """Test that line numbers are correctly updated when execution moves within a function.
+
+        When the profiler samples at different points within the same function,
+        it must report the correct line number for each sample, not stale cached values.
+        """
+        script_body = """\
+            def outer():
+                inner()
+
+            def inner():
+                sock.sendall(b"line_a"); sock.recv(16)
+                sock.sendall(b"line_b"); sock.recv(16)
+                sock.sendall(b"line_c"); sock.recv(16)
+                sock.sendall(b"line_d"); sock.recv(16)
+
+            outer()
+            """
+
+        with self._target_process(script_body) as (p, client_socket, make_unwinder):
+            unwinder = make_unwinder(cache_frames=True)
+
+            frames_a = self._sample_frames(client_socket, unwinder, b"line_a", b"ack", {"inner"})
+            frames_b = self._sample_frames(client_socket, unwinder, b"line_b", b"ack", {"inner"})
+            frames_c = self._sample_frames(client_socket, unwinder, b"line_c", b"ack", {"inner"})
+            frames_d = self._sample_frames(client_socket, unwinder, b"line_d", b"done", {"inner"})
+
+        self.assertIsNotNone(frames_a)
+        self.assertIsNotNone(frames_b)
+        self.assertIsNotNone(frames_c)
+        self.assertIsNotNone(frames_d)
+
+        # Get the 'inner' frame from each sample (should be index 0)
+        inner_a = frames_a[0]
+        inner_b = frames_b[0]
+        inner_c = frames_c[0]
+        inner_d = frames_d[0]
+
+        self.assertEqual(inner_a.funcname, "inner")
+        self.assertEqual(inner_b.funcname, "inner")
+        self.assertEqual(inner_c.funcname, "inner")
+        self.assertEqual(inner_d.funcname, "inner")
+
+        # Line numbers must be different and increasing (execution moves forward)
+        self.assertLess(inner_a.lineno, inner_b.lineno,
+                        "Line B should be after line A")
+        self.assertLess(inner_b.lineno, inner_c.lineno,
+                        "Line C should be after line B")
+        self.assertLess(inner_c.lineno, inner_d.lineno,
+                        "Line D should be after line C")
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_cache_invalidation_on_return(self):
+        """Test cache invalidation when stack shrinks (function returns)."""
+        script_body = """\
+            def inner():
+                sock.sendall(b"at_inner")
+                sock.recv(16)
+
+            def outer():
+                inner()
+                sock.sendall(b"at_outer")
+                sock.recv(16)
+
+            outer()
+            """
+
+        with self._target_process(script_body) as (p, client_socket, make_unwinder):
+            unwinder = make_unwinder(cache_frames=True)
+
+            frames_deep = self._sample_frames(
+                client_socket, unwinder, b"at_inner", b"ack", {"inner", "outer"})
+            frames_shallow = self._sample_frames(
+                client_socket, unwinder, b"at_outer", b"done", {"outer"})
+
+        self.assertIsNotNone(frames_deep)
+        self.assertIsNotNone(frames_shallow)
+
+        funcs_deep = [f.funcname for f in frames_deep]
+        funcs_shallow = [f.funcname for f in frames_shallow]
+
+        self.assertIn("inner", funcs_deep)
+        self.assertIn("outer", funcs_deep)
+        self.assertNotIn("inner", funcs_shallow)
+        self.assertIn("outer", funcs_shallow)
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_cache_invalidation_on_call(self):
+        """Test cache invalidation when stack grows (new function called)."""
+        script_body = """\
+            def deeper():
+                sock.sendall(b"at_deeper")
+                sock.recv(16)
+
+            def middle():
+                sock.sendall(b"at_middle")
+                sock.recv(16)
+                deeper()
+
+            def top():
+                middle()
+
+            top()
+            """
+
+        with self._target_process(script_body) as (p, client_socket, make_unwinder):
+            unwinder = make_unwinder(cache_frames=True)
+
+            frames_before = self._sample_frames(
+                client_socket, unwinder, b"at_middle", b"ack", {"middle", "top"})
+            frames_after = self._sample_frames(
+                client_socket, unwinder, b"at_deeper", b"done", {"deeper", "middle", "top"})
+
+        self.assertIsNotNone(frames_before)
+        self.assertIsNotNone(frames_after)
+
+        funcs_before = [f.funcname for f in frames_before]
+        funcs_after = [f.funcname for f in frames_after]
+
+        self.assertIn("middle", funcs_before)
+        self.assertIn("top", funcs_before)
+        self.assertNotIn("deeper", funcs_before)
+
+        self.assertIn("deeper", funcs_after)
+        self.assertIn("middle", funcs_after)
+        self.assertIn("top", funcs_after)
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_partial_stack_reuse(self):
+        """Test that unchanged bottom frames are reused when top changes (A→B→C to A→B→D)."""
+        script_body = """\
+            def func_c():
+                sock.sendall(b"at_c")
+                sock.recv(16)
+
+            def func_d():
+                sock.sendall(b"at_d")
+                sock.recv(16)
+
+            def func_b():
+                func_c()
+                func_d()
+
+            def func_a():
+                func_b()
+
+            func_a()
+            """
+
+        with self._target_process(script_body) as (p, client_socket, make_unwinder):
+            unwinder = make_unwinder(cache_frames=True)
+
+            # Sample at C: stack is A→B→C
+            frames_c = self._sample_frames(
+                client_socket, unwinder, b"at_c", b"ack", {"func_a", "func_b", "func_c"})
+            # Sample at D: stack is A→B→D (C returned, D called)
+            frames_d = self._sample_frames(
+                client_socket, unwinder, b"at_d", b"done", {"func_a", "func_b", "func_d"})
+
+        self.assertIsNotNone(frames_c)
+        self.assertIsNotNone(frames_d)
+
+        # Find func_a and func_b frames in both samples
+        def find_frame(frames, funcname):
+            for f in frames:
+                if f.funcname == funcname:
+                    return f
+            return None
+
+        frame_a_in_c = find_frame(frames_c, "func_a")
+        frame_b_in_c = find_frame(frames_c, "func_b")
+        frame_a_in_d = find_frame(frames_d, "func_a")
+        frame_b_in_d = find_frame(frames_d, "func_b")
+
+        self.assertIsNotNone(frame_a_in_c)
+        self.assertIsNotNone(frame_b_in_c)
+        self.assertIsNotNone(frame_a_in_d)
+        self.assertIsNotNone(frame_b_in_d)
+
+        # The bottom frames (A, B) should be the SAME objects (cache reuse)
+        self.assertIs(frame_a_in_c, frame_a_in_d, "func_a frame should be reused from cache")
+        self.assertIs(frame_b_in_c, frame_b_in_d, "func_b frame should be reused from cache")
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_recursive_frames(self):
+        """Test caching with same function appearing multiple times (recursion)."""
+        script_body = """\
+            def recurse(n):
+                if n <= 0:
+                    sock.sendall(b"sync1")
+                    sock.recv(16)
+                    sock.sendall(b"sync2")
+                    sock.recv(16)
+                else:
+                    recurse(n - 1)
+
+            recurse(5)
+            """
+
+        with self._target_process(script_body) as (p, client_socket, make_unwinder):
+            unwinder = make_unwinder(cache_frames=True)
+
+            frames1 = self._sample_frames(
+                client_socket, unwinder, b"sync1", b"ack", {"recurse"})
+            frames2 = self._sample_frames(
+                client_socket, unwinder, b"sync2", b"done", {"recurse"})
+
+        self.assertIsNotNone(frames1)
+        self.assertIsNotNone(frames2)
+
+        # Should have multiple "recurse" frames (6 total: recurse(5) down to recurse(0))
+        recurse_count = sum(1 for f in frames1 if f.funcname == "recurse")
+        self.assertEqual(recurse_count, 6, "Should have 6 recursive frames")
+
+        self.assertEqual(len(frames1), len(frames2))
+
+        # Current frame (index 0) is re-read, check value equality
+        self.assertEqual(frames1[0].funcname, frames2[0].funcname)
+
+        # Parent frames (index 1+) should be identical objects (cache reuse)
+        for i in range(1, len(frames1)):
+            self.assertIs(frames1[i], frames2[i],
+                          f"Frame {i}: recursive frames must be same object")
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_cache_vs_no_cache_equivalence(self):
+        """Test that cache_frames=True and cache_frames=False produce equivalent results.
+
+        The target script sends b"ready" from inside level3() and then waits
+        in sock.recv(16), so it stays inside level3() until this test replies.
+        Two independent unwinders (one caching, one not) sample that same
+        paused stack; the samples must agree on frame count, function names
+        and line numbers.
+        """
+        script_body = """\
+            def level3():
+                sock.sendall(b"ready"); sock.recv(16)
+
+            def level2():
+                level3()
+
+            def level1():
+                level2()
+
+            level1()
+            """
+
+        with self._target_process(script_body) as (p, client_socket, make_unwinder):
+            # The target's next step after sending b"ready" is sock.recv(16),
+            # so it cannot leave level3() before b"done" is sent below.
+            self._wait_for_signal(client_socket, b"ready")
+
+            # Sample with cache
+            unwinder_cache = make_unwinder(cache_frames=True)
+            frames_cached = self._get_frames(unwinder_cache, {"level1", "level2", "level3"})
+
+            # Sample without cache
+            unwinder_no_cache = make_unwinder(cache_frames=False)
+            frames_no_cache = self._get_frames(unwinder_no_cache, {"level1", "level2", "level3"})
+
+            # Satisfy the pending sock.recv(16) so the target script can finish.
+            client_socket.sendall(b"done")
+
+        # NOTE(review): _get_frames is defined outside this view; the checks
+        # below imply it may return None when no matching stack is found.
+        self.assertIsNotNone(frames_cached)
+        self.assertIsNotNone(frames_no_cache)
+
+        # Same number of frames
+        self.assertEqual(len(frames_cached), len(frames_no_cache))
+
+        # Same function names in same order
+        funcs_cached = [f.funcname for f in frames_cached]
+        funcs_no_cache = [f.funcname for f in frames_no_cache]
+        self.assertEqual(funcs_cached, funcs_no_cache)
+
+        # Same line numbers
+        lines_cached = [f.lineno for f in frames_cached]
+        lines_no_cache = [f.lineno for f in frames_no_cache]
+        self.assertEqual(lines_cached, lines_no_cache)
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_cache_per_thread_isolation(self):
+        """Test that frame cache is per-thread and cache invalidation works independently.
+
+        The target starts two threads. Thread N runs fooN() -> barN() ->
+        bazN() and pauses, then returns to fooN() and runs blechN() and
+        pauses again. A single caching unwinder takes a snapshot at each of
+        the four pauses. Each snapshot must contain only functions of the
+        paused thread, and the blech snapshots must no longer contain the
+        bar/baz frames seen in the earlier baz snapshots.
+        """
+        script_body = """\
+            import threading
+
+            lock = threading.Lock()
+
+            def sync(msg):
+                with lock:
+                    sock.sendall(msg + b"\\n")
+                    sock.recv(1)
+
+            # Thread 1 functions
+            def baz1():
+                sync(b"t1:baz1")
+
+            def bar1():
+                baz1()
+
+            def blech1():
+                sync(b"t1:blech1")
+
+            def foo1():
+                bar1()  # Goes down to baz1, syncs
+                blech1()  # Returns up, goes down to blech1, syncs
+
+            # Thread 2 functions
+            def baz2():
+                sync(b"t2:baz2")
+
+            def bar2():
+                baz2()
+
+            def blech2():
+                sync(b"t2:blech2")
+
+            def foo2():
+                bar2()  # Goes down to baz2, syncs
+                blech2()  # Returns up, goes down to blech2, syncs
+
+            t1 = threading.Thread(target=foo1)
+            t2 = threading.Thread(target=foo2)
+            t1.start()
+            t2.start()
+            t1.join()
+            t2.join()
+            """
+
+        with self._target_process(script_body) as (p, client_socket, make_unwinder):
+            unwinder = make_unwinder(cache_frames=True)
+            # Bytes received from the socket but not yet consumed as a message.
+            buffer = b""
+
+            def recv_msg():
+                """Receive a single message from socket."""
+                nonlocal buffer
+                # Messages are newline-terminated (sync() appends b"\n"), so
+                # keep reading until at least one full message is buffered.
+                while b"\n" not in buffer:
+                    chunk = client_socket.recv(256)
+                    if not chunk:
+                        # Empty read: the peer closed the connection.
+                        return None
+                    buffer += chunk
+                # Keep any bytes after the first newline for the next call.
+                msg, buffer = buffer.split(b"\n", 1)
+                return msg
+
+            def get_thread_frames(target_funcs):
+                """Get frames for thread matching target functions."""
+                # Returns the list of function names of the first sampled
+                # thread containing any name in target_funcs, or None if no
+                # attempt produced such a thread. At most 5 sampling attempts
+                # are made, regardless of how long busy_retry would go on.
+                retries = 0
+                for _ in busy_retry(SHORT_TIMEOUT):
+                    if retries >= 5:
+                        break
+                    retries += 1
+                    # On Windows, ReadProcessMemory can fail with OSError
+                    # (WinError 299) when frame pointers are in flux
+                    with contextlib.suppress(RuntimeError, OSError):
+                        traces = unwinder.get_stack_trace()
+                        for interp in traces:
+                            for thread in interp.threads:
+                                funcs = [f.funcname for f in thread.frame_info]
+                                if any(f in funcs for f in target_funcs):
+                                    return funcs
+                return None
+
+            # Track results for each sync point
+            results = {}
+
+            # Process 4 sync points: baz1, baz2, blech1, blech2
+            # With the lock, threads are serialized - handle one at a time
+            for _ in range(4):
+                msg = recv_msg()
+                self.assertIsNotNone(msg, "Expected message from subprocess")
+
+                # Determine which thread/function and take snapshot
+                # The announcing thread holds the lock and waits in
+                # sock.recv(1) until b"k" is sent at the end of this iteration.
+                # A message matching none of the branches records nothing and
+                # is caught by the "Missing ... snapshot" checks further down.
+                if msg == b"t1:baz1":
+                    funcs = get_thread_frames(["baz1", "bar1", "foo1"])
+                    self.assertIsNotNone(funcs, "Thread 1 not found at baz1")
+                    results["t1:baz1"] = funcs
+                elif msg == b"t2:baz2":
+                    funcs = get_thread_frames(["baz2", "bar2", "foo2"])
+                    self.assertIsNotNone(funcs, "Thread 2 not found at baz2")
+                    results["t2:baz2"] = funcs
+                elif msg == b"t1:blech1":
+                    funcs = get_thread_frames(["blech1", "foo1"])
+                    self.assertIsNotNone(funcs, "Thread 1 not found at blech1")
+                    results["t1:blech1"] = funcs
+                elif msg == b"t2:blech2":
+                    funcs = get_thread_frames(["blech2", "foo2"])
+                    self.assertIsNotNone(funcs, "Thread 2 not found at blech2")
+                    results["t2:blech2"] = funcs
+
+                # Release thread to continue
+                client_socket.sendall(b"k")
+
+            # Validate Phase 1: baz snapshots
+            t1_baz = results.get("t1:baz1")
+            t2_baz = results.get("t2:baz2")
+            self.assertIsNotNone(t1_baz, "Missing t1:baz1 snapshot")
+            self.assertIsNotNone(t2_baz, "Missing t2:baz2 snapshot")
+
+            # Thread 1 at baz1: should have foo1->bar1->baz1
+            self.assertIn("baz1", t1_baz)
+            self.assertIn("bar1", t1_baz)
+            self.assertIn("foo1", t1_baz)
+            self.assertNotIn("blech1", t1_baz)
+            # No cross-contamination
+            self.assertNotIn("baz2", t1_baz)
+            self.assertNotIn("bar2", t1_baz)
+            self.assertNotIn("foo2", t1_baz)
+
+            # Thread 2 at baz2: should have foo2->bar2->baz2
+            self.assertIn("baz2", t2_baz)
+            self.assertIn("bar2", t2_baz)
+            self.assertIn("foo2", t2_baz)
+            self.assertNotIn("blech2", t2_baz)
+            # No cross-contamination
+            self.assertNotIn("baz1", t2_baz)
+            self.assertNotIn("bar1", t2_baz)
+            self.assertNotIn("foo1", t2_baz)
+
+            # Validate Phase 2: blech snapshots (cache invalidation test)
+            t1_blech = results.get("t1:blech1")
+            t2_blech = results.get("t2:blech2")
+            self.assertIsNotNone(t1_blech, "Missing t1:blech1 snapshot")
+            self.assertIsNotNone(t2_blech, "Missing t2:blech2 snapshot")
+
+            # Thread 1 at blech1: bar1/baz1 should be GONE (cache invalidated)
+            self.assertIn("blech1", t1_blech)
+            self.assertIn("foo1", t1_blech)
+            self.assertNotIn("bar1", t1_blech, "Cache not invalidated: bar1 still present")
+            self.assertNotIn("baz1", t1_blech, "Cache not invalidated: baz1 still present")
+            # No cross-contamination
+            self.assertNotIn("blech2", t1_blech)
+
+            # Thread 2 at blech2: bar2/baz2 should be GONE (cache invalidated)
+            self.assertIn("blech2", t2_blech)
+            self.assertIn("foo2", t2_blech)
+            self.assertNotIn("bar2", t2_blech, "Cache not invalidated: bar2 still present")
+            self.assertNotIn("baz2", t2_blech, "Cache not invalidated: baz2 still present")
+            # No cross-contamination
+            self.assertNotIn("blech1", t2_blech)
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_new_unwinder_with_stale_last_profiled_frame(self):
+        """Test that a new unwinder returns complete stack when cache lookup misses.
+
+        The target pauses twice inside level4() without leaving it. A first
+        caching unwinder samples at the first pause; a second, freshly
+        created caching unwinder samples at the second pause. The second
+        sample must still contain the whole level1..level4 chain and have the
+        same depth as the first.
+        """
+        script_body = """\
+            def level4():
+                sock.sendall(b"sync1")
+                sock.recv(16)
+                sock.sendall(b"sync2")
+                sock.recv(16)
+
+            def level3():
+                level4()
+
+            def level2():
+                level3()
+
+            def level1():
+                level2()
+
+            level1()
+            """
+
+        with self._target_process(script_body) as (p, client_socket, make_unwinder):
+            expected = {"level1", "level2", "level3", "level4"}
+
+            # NOTE(review): _sample_frames is defined outside this view. From
+            # the script's handshake it appears to wait for the first message
+            # (b"sync1" / b"sync2"), sample, then reply with the second
+            # (b"ack" / b"done") - confirm against the helper.
+
+            # First unwinder samples - this sets last_profiled_frame in target
+            unwinder1 = make_unwinder(cache_frames=True)
+            frames1 = self._sample_frames(client_socket, unwinder1, b"sync1", b"ack", expected)
+
+            # Create NEW unwinder (empty cache) and sample
+            # The target still has last_profiled_frame set from unwinder1
+            unwinder2 = make_unwinder(cache_frames=True)
+            frames2 = self._sample_frames(client_socket, unwinder2, b"sync2", b"done", expected)
+
+        self.assertIsNotNone(frames1)
+        self.assertIsNotNone(frames2)
+
+        funcs1 = [f.funcname for f in frames1]
+        funcs2 = [f.funcname for f in frames2]
+
+        # Both should have all levels
+        for level in ["level1", "level2", "level3", "level4"]:
+            self.assertIn(level, funcs1, f"{level} missing from first sample")
+            self.assertIn(level, funcs2, f"{level} missing from second sample")
+
+        # Should have same stack depth
+        # (a truncated second sample would be shorter than the first)
+        self.assertEqual(len(frames1), len(frames2),
+                         "New unwinder should return complete stack despite stale last_profiled_frame")
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_cache_exhaustion(self):
+        """Test cache works when frame limit (1024) is exceeded.
+
+        FRAME_CACHE_MAX_FRAMES=1024. With 1100 recursive frames,
+        the cache can't store all of them but should still work.
+
+        The target pauses twice at the bottom of the recursion. A caching
+        unwinder samples at the first pause and a non-caching one at the
+        second; both samples must have the same length.
+        """
+        # Use 1100 to exceed FRAME_CACHE_MAX_FRAMES=1024
+        depth = 1100
+        # The script raises the recursion limit to 2000 so that recursing
+        # `depth` levels deep is permitted in the target.
+        script_body = f"""\
+import sys
+sys.setrecursionlimit(2000)
+
+def recurse(n):
+    if n <= 0:
+        sock.sendall(b"ready")
+        sock.recv(16)  # wait for ack
+        sock.sendall(b"ready2")
+        sock.recv(16)  # wait for done
+        return
+    recurse(n - 1)
+
+recurse({depth})
+"""
+
+        with self._target_process(script_body) as (p, client_socket, make_unwinder):
+            unwinder_cache = make_unwinder(cache_frames=True)
+            unwinder_no_cache = make_unwinder(cache_frames=False)
+
+            # recurse(1100) down to recurse(0) is 1101 "recurse" frames.
+            # NOTE(review): expected_frames=1102 presumably also counts the
+            # module-level caller - confirm against _sample_frames, which is
+            # defined outside this view.
+            frames_cached = self._sample_frames(
+                client_socket, unwinder_cache, b"ready", b"ack", {"recurse"}, expected_frames=1102
+            )
+            # Sample again with no cache for comparison
+            frames_no_cache = self._sample_frames(
+                client_socket, unwinder_no_cache, b"ready2", b"done", {"recurse"}, expected_frames=1102
+            )
+
+        self.assertIsNotNone(frames_cached)
+        self.assertIsNotNone(frames_no_cache)
+
+        # Both should have many recurse frames (> 1024 limit)
+        cached_count = [f.funcname for f in frames_cached].count("recurse")
+        no_cache_count = [f.funcname for f in frames_no_cache].count("recurse")
+
+        self.assertGreater(cached_count, 1000, "Should have >1000 recurse frames")
+        self.assertGreater(no_cache_count, 1000, "Should have >1000 recurse frames")
+
+        # Both modes should produce same frame count
+        self.assertEqual(len(frames_cached), len(frames_no_cache),
+                        "Cache exhaustion should not affect stack completeness")
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_get_stats(self):
+        """Test that get_stats() returns statistics when stats=True.
+
+        Takes exactly one sample with a stats-enabled unwinder and checks
+        that the returned mapping has the expected keys and reports one
+        sample.
+        """
+        script_body = """\
+            sock.sendall(b"ready")
+            sock.recv(16)
+            """
+
+        # The unwinder is built directly (the factory from _target_process is
+        # ignored as `_`) so that stats=True can be passed.
+        with self._target_process(script_body) as (p, client_socket, _):
+            unwinder = RemoteUnwinder(p.pid, all_threads=True, stats=True)
+            self._wait_for_signal(client_socket, b"ready")
+
+            # Take a sample
+            unwinder.get_stack_trace()
+
+            stats = unwinder.get_stats()
+            # Satisfy the target's pending sock.recv(16) so it can exit.
+            client_socket.sendall(b"done")
+
+        # Verify expected keys exist
+        # (only presence is checked; values are not inspected, except
+        # total_samples below)
+        expected_keys = [
+            'total_samples', 'frame_cache_hits', 'frame_cache_misses',
+            'frame_cache_partial_hits', 'frames_read_from_cache',
+            'frames_read_from_memory', 'frame_cache_hit_rate'
+        ]
+        for key in expected_keys:
+            self.assertIn(key, stats)
+
+        # Exactly one get_stack_trace() call was made above.
+        self.assertEqual(stats['total_samples'], 1)
+
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_get_stats_disabled_raises(self):
+        """Test that get_stats() raises RuntimeError when stats=False.
+
+        No sample is taken; get_stats() is called on an unwinder created
+        without the stats argument.
+        """
+        script_body = """\
+            sock.sendall(b"ready")
+            sock.recv(16)
+            """
+
+        with self._target_process(script_body) as (p, client_socket, _):
+            unwinder = RemoteUnwinder(p.pid, all_threads=True)  # stats=False by default
+            self._wait_for_signal(client_socket, b"ready")
+
+            with self.assertRaises(RuntimeError):
+                unwinder.get_stats()
+
+            # Satisfy the target's pending sock.recv(16) so it can exit.
+            client_socket.sendall(b"done")
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst
+++ b/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst
@ -0,0 +1,5 @@
+The ``_remote_debugging`` module now implements frame caching in the
+``RemoteUnwinder`` class to reduce memory reads when profiling remote
+processes. When ``cache_frames=True``, unchanged portions of the call stack
+are reused from previous samples, significantly improving profiling
+performance for deep call stacks.
--- a/Modules/Setup.stdlib.in
+++ b/Modules/Setup.stdlib.in
@ -41,7 +41,7 @@
@MODULE__PICKLE_TRUE@_pickle _pickle.c
@MODULE__QUEUE_TRUE@_queue _queuemodule.c
@MODULE__RANDOM_TRUE@_random _randommodule.c
-@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/threads.c _remote_debugging/asyncio.c
+@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c
@MODULE__STRUCT_TRUE@_struct _struct.c

 # build supports subinterpreters
--- a/Modules/_remote_debugging/_remote_debugging.h
+++ b/Modules/_remote_debugging/_remote_debugging.h
@ -154,6 +154,39 @@ typedef struct {
    uintptr_t addr_code_adaptive;
 } CachedCodeMetadata;

+/* Frame cache constants and types */
+// Number of FrameCacheEntry slots preallocated per unwinder; slots are
+// looked up by thread_id.
+#define FRAME_CACHE_MAX_THREADS 32
+// Capacity of FrameCacheEntry.addrs, i.e. the most addresses one slot can hold.
+#define FRAME_CACHE_MAX_FRAMES 1024
+
+/* One frame-cache slot. Note that addrs is embedded in the struct, so every
+ * slot carries FRAME_CACHE_MAX_FRAMES address cells whether used or not. */
+typedef struct {
+    uint64_t thread_id;                      // 0 = empty slot
+    uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; // NOTE(review): presumably remote frame addresses paired with frame_list entries - confirm
+    Py_ssize_t num_addrs;                    // NOTE(review): presumably the count of valid entries in addrs - confirm
+    PyObject *frame_list;                    // owned reference, NULL if empty
+} FrameCacheEntry;
+
+/* Statistics for profiling performance analysis.
+ *
+ * All fields are monotonically usable 64-bit counters. The STATS_INC and
+ * STATS_ADD macros update a field only when the unwinder's collect_stats
+ * flag is set. */
+typedef struct {
+    uint64_t total_samples;                  // Total number of get_stack_trace calls
+    uint64_t frame_cache_hits;               // Full cache hits (entire stack unchanged)
+    uint64_t frame_cache_misses;             // Cache misses requiring full walk
+    uint64_t frame_cache_partial_hits;       // Partial hits (stopped at cached frame)
+    uint64_t frames_read_from_cache;         // Total frames retrieved from cache
+    uint64_t frames_read_from_memory;        // Total frames read from remote memory
+    uint64_t memory_reads;                   // Total remote memory read operations
+    uint64_t memory_bytes_read;              // Total bytes read from remote memory
+    uint64_t code_object_cache_hits;         // Code object cache hits
+    uint64_t code_object_cache_misses;       // Code object cache misses
+    uint64_t stale_cache_invalidations;      // Times stale entries were cleared
+} UnwinderStats;
+
+/* Stats tracking macros - no-op when stats collection is disabled
+ *
+ * `unwinder` must be a pointer to an object with `collect_stats` and `stats`
+ * members; `field` is the bare name of a member of `stats`. The `unwinder`
+ * argument is expanded twice, so it must not have side effects. The
+ * do { } while(0) wrapper makes each macro behave as a single statement. */
+#define STATS_INC(unwinder, field) \
+    do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
+
+#define STATS_ADD(unwinder, field, val) \
+    do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
+
 typedef struct {
    PyTypeObject *RemoteDebugging_Type;
    PyTypeObject *TaskInfo_Type;
@ -195,7 +228,12 @@ typedef struct {
    int skip_non_matching_threads;
    int native;
    int gc;
+    int cache_frames;
+    int collect_stats;  // whether to collect statistics
+    uint32_t stale_invalidation_counter;  // counter for throttling frame_cache_invalidate_stale
    RemoteDebuggingState *cached_state;
+    FrameCacheEntry *frame_cache;  // preallocated array of FRAME_CACHE_MAX_THREADS entries
+    UnwinderStats stats;  // statistics for performance analysis
 #ifdef Py_GIL_DISABLED
    uint32_t tlbc_generation;
    _Py_hashtable_t *tlbc_cache;
@ -363,9 +401,45 @@ extern int process_frame_chain(
    uintptr_t initial_frame_addr,
    StackChunkList *chunks,
    PyObject *frame_info,
-    uintptr_t gc_frame
+    uintptr_t gc_frame,
+    uintptr_t last_profiled_frame,
+    int *stopped_at_cached_frame,
+    uintptr_t *frame_addrs,
+    Py_ssize_t *num_addrs,
+    Py_ssize_t max_addrs
 );

+/* Frame cache functions */
+extern int frame_cache_init(RemoteUnwinderObject *unwinder);
+extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
+extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
+extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
+extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
+extern int frame_cache_lookup_and_extend(
+    RemoteUnwinderObject *unwinder,
+    uint64_t thread_id,
+    uintptr_t last_profiled_frame,
+    PyObject *frame_info,
+    uintptr_t *frame_addrs,
+    Py_ssize_t *num_addrs,
+    Py_ssize_t max_addrs);
+// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
+extern int frame_cache_store(
+    RemoteUnwinderObject *unwinder,
+    uint64_t thread_id,
+    PyObject *frame_list,
+    const uintptr_t *addrs,
+    Py_ssize_t num_addrs);
+
+extern int collect_frames_with_cache(
+    RemoteUnwinderObject *unwinder,
+    uintptr_t frame_addr,
+    StackChunkList *chunks,
+    PyObject *frame_info,
+    uintptr_t gc_frame,
+    uintptr_t last_profiled_frame,
+    uint64_t thread_id);
+
 /* ============================================================================
 * THREAD FUNCTION DECLARATIONS
 * ============================================================================ */
--- a/Modules/_remote_debugging/clinic/module.c.h
+++ b/Modules/_remote_debugging/clinic/module.c.h
@ -12,7 +12,7 @@ preserve
 PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__,
 "RemoteUnwinder(pid, *, all_threads=False, only_active_thread=False,\n"
 "               mode=0, debug=False, skip_non_matching_threads=True,\n"
-"               native=False, gc=False)\n"
+"               native=False, gc=False, cache_frames=False, stats=False)\n"
 "--\n"
 "\n"
 "Initialize a new RemoteUnwinder object for debugging a remote Python process.\n"
@ -32,6 +32,10 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__,
 "            non-Python code.\n"
 "    gc: If True, include artificial \"<GC>\" frames to denote active garbage\n"
 "        collection.\n"
+"    cache_frames: If True, enable frame caching optimization to avoid re-reading\n"
+"                 unchanged parent frames between samples.\n"
+"    stats: If True, collect statistics about cache hits, memory reads, etc.\n"
+"           Use get_stats() to retrieve the collected statistics.\n"
 "\n"
 "The RemoteUnwinder provides functionality to inspect and debug a running Python\n"
 "process, including examining thread states, stack frames and other runtime data.\n"
@ -48,7 +52,8 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
                                               int only_active_thread,
                                               int mode, int debug,
                                               int skip_non_matching_threads,
-                                               int native, int gc);
+                                               int native, int gc,
+                                               int cache_frames, int stats);

 static int
 _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObject *kwargs)
@ -56,7 +61,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
    int return_value = -1;
    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)

-    #define NUM_KEYWORDS 8
+    #define NUM_KEYWORDS 10
    static struct {
        PyGC_Head _this_is_not_used;
        PyObject_VAR_HEAD
@ -65,7 +70,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
    } _kwtuple = {
        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
        .ob_hash = -1,
-        .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), &_Py_ID(native), &_Py_ID(gc), },
+        .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), &_Py_ID(native), &_Py_ID(gc), &_Py_ID(cache_frames), &_Py_ID(stats), },
    };
    #undef NUM_KEYWORDS
    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
@ -74,14 +79,14 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
    #  define KWTUPLE NULL
    #endif  // !Py_BUILD_CORE

-    static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", "native", "gc", NULL};
+    static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", "native", "gc", "cache_frames", "stats", NULL};
    static _PyArg_Parser _parser = {
        .keywords = _keywords,
        .fname = "RemoteUnwinder",
        .kwtuple = KWTUPLE,
    };
    #undef KWTUPLE
-    PyObject *argsbuf[8];
+    PyObject *argsbuf[10];
    PyObject * const *fastargs;
    Py_ssize_t nargs = PyTuple_GET_SIZE(args);
    Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 1;
@ -93,6 +98,8 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
    int skip_non_matching_threads = 1;
    int native = 0;
    int gc = 0;
+    int cache_frames = 0;
+    int stats = 0;

    fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser,
            /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
@ -160,12 +167,30 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
            goto skip_optional_kwonly;
        }
    }
+    if (fastargs[7]) {
        gc = PyObject_IsTrue(fastargs[7]);
        if (gc < 0) {
            goto exit;
        }
+        if (!--noptargs) {
+            goto skip_optional_kwonly;
+        }
+    }
+    if (fastargs[8]) {
+        cache_frames = PyObject_IsTrue(fastargs[8]);
+        if (cache_frames < 0) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_kwonly;
+        }
+    }
+    stats = PyObject_IsTrue(fastargs[9]);
+    if (stats < 0) {
+        goto exit;
+    }
 skip_optional_kwonly:
-    return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads, native, gc);
+    return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads, native, gc, cache_frames, stats);

 exit:
    return return_value;
@ -347,4 +372,51 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace(PyObject *self, PyObject

    return return_value;
 }
-/*[clinic end generated code: output=99fed5c94cf36881 input=a9049054013a1b77]*/
+
+PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__,
+"get_stats($self, /)\n"
+"--\n"
+"\n"
+"Get collected statistics about profiling performance.\n"
+"\n"
+"Returns a dictionary containing statistics about cache performance,\n"
+"memory reads, and other profiling metrics. Only available if the\n"
+"RemoteUnwinder was created with stats=True.\n"
+"\n"
+"Returns:\n"
+"    dict: A dictionary containing:\n"
+"        - total_samples: Total number of get_stack_trace calls\n"
+"        - frame_cache_hits: Full cache hits (entire stack unchanged)\n"
+"        - frame_cache_misses: Cache misses requiring full walk\n"
+"        - frame_cache_partial_hits: Partial hits (stopped at cached frame)\n"
+"        - frames_read_from_cache: Total frames retrieved from cache\n"
+"        - frames_read_from_memory: Total frames read from remote memory\n"
+"        - memory_reads: Total remote memory read operations\n"
+"        - memory_bytes_read: Total bytes read from remote memory\n"
+"        - code_object_cache_hits: Code object cache hits\n"
+"        - code_object_cache_misses: Code object cache misses\n"
+"        - stale_cache_invalidations: Times stale cache entries were cleared\n"
+"        - frame_cache_hit_rate: Percentage of samples that hit the cache\n"
+"        - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n"
+"\n"
+"Raises:\n"
+"    RuntimeError: If stats collection was not enabled (stats=False)");
+
+#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF    \
+    {"get_stats", (PyCFunction)_remote_debugging_RemoteUnwinder_get_stats, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_stats__doc__},
+
+static PyObject *
+_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self);
+
+static PyObject *
+_remote_debugging_RemoteUnwinder_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    PyObject *return_value = NULL;
+
+    Py_BEGIN_CRITICAL_SECTION(self);
+    return_value = _remote_debugging_RemoteUnwinder_get_stats_impl((RemoteUnwinderObject *)self);
+    Py_END_CRITICAL_SECTION();
+
+    return return_value;
+}
+/*[clinic end generated code: output=f1fd6c1d4c4c7254 input=a9049054013a1b77]*/
--- a/Modules/_remote_debugging/code_objects.c
+++ b/Modules/_remote_debugging/code_objects.c
@ -257,6 +257,11 @@ parse_code_object(RemoteUnwinderObject *unwinder,

    if (unwinder && unwinder->code_object_cache != NULL) {
        meta = _Py_hashtable_get(unwinder->code_object_cache, key);
+        if (meta) {
+            STATS_INC(unwinder, code_object_cache_hits);
+        } else {
+            STATS_INC(unwinder, code_object_cache_misses);
+        }
    }

    if (meta == NULL) {
--- a/Modules/_remote_debugging/frame_cache.c
+++ b/Modules/_remote_debugging/frame_cache.c
@ -0,0 +1,236 @@
+/******************************************************************************
+ * Remote Debugging Module - Frame Cache
+ *
+ * This file contains functions for caching frame information to optimize
+ * repeated stack unwinding for profiling.
+ ******************************************************************************/
+
+#include "_remote_debugging.h"
+
+/* ============================================================================
+ * FRAME CACHE - stores (address, frame_info) pairs per thread
+ * Uses preallocated fixed-size arrays for efficiency and bounded memory.
+ * ============================================================================ */
+
+// Allocate the zero-filled table of per-thread frame cache entries.
+// Returns 0 on success; on allocation failure sets MemoryError and returns -1.
+int
+frame_cache_init(RemoteUnwinderObject *unwinder)
+{
+    FrameCacheEntry *table = PyMem_Calloc(FRAME_CACHE_MAX_THREADS, sizeof(FrameCacheEntry));
+    unwinder->frame_cache = table;
+    if (table == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    return 0;
+}
+
+// Drop every cached frame list and free the cache table.
+// Does nothing when the table was never allocated.
+void
+frame_cache_cleanup(RemoteUnwinderObject *unwinder)
+{
+    if (unwinder->frame_cache == NULL) {
+        return;
+    }
+    FrameCacheEntry *slot = unwinder->frame_cache;
+    FrameCacheEntry *table_end = slot + FRAME_CACHE_MAX_THREADS;
+    while (slot < table_end) {
+        Py_CLEAR(slot->frame_list);
+        slot++;
+    }
+    PyMem_Free(unwinder->frame_cache);
+    unwinder->frame_cache = NULL;
+}
+
+// Look up the cache entry that belongs to thread_id.
+// Returns NULL when there is no cache table, when thread_id is 0 (the value
+// that marks an unused slot), or when no slot is owned by that thread.
+FrameCacheEntry *
+frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
+{
+    FrameCacheEntry *table = unwinder->frame_cache;
+    if (table == NULL || thread_id == 0) {
+        return NULL;
+    }
+    FrameCacheEntry *table_end = table + FRAME_CACHE_MAX_THREADS;
+    for (FrameCacheEntry *slot = table; slot < table_end; slot++) {
+        if (slot->thread_id == thread_id) {
+            return slot;
+        }
+    }
+    return NULL;
+}
+
+// Pick the cache slot to use for thread_id.
+// A slot already owned by the thread wins; otherwise the lowest-indexed
+// unused slot (thread_id == 0) is returned. The slot is not modified here.
+// Returns NULL if there is no cache table, thread_id is 0, or every slot is
+// taken by another thread (graceful degradation).
+static FrameCacheEntry *
+frame_cache_alloc_slot(RemoteUnwinderObject *unwinder, uint64_t thread_id)
+{
+    FrameCacheEntry *table = unwinder->frame_cache;
+    if (table == NULL || thread_id == 0) {
+        return NULL;
+    }
+
+    // Single pass: remember the first free slot while searching for an
+    // existing entry, which always takes precedence.
+    FrameCacheEntry *first_free = NULL;
+    for (int idx = 0; idx < FRAME_CACHE_MAX_THREADS; idx++) {
+        FrameCacheEntry *slot = &table[idx];
+        if (slot->thread_id == thread_id) {
+            return slot;
+        }
+        if (first_free == NULL && slot->thread_id == 0) {
+            first_free = slot;
+        }
+    }
+
+    // NULL here means the cache is full.
+    return first_free;
+}
+
+// Remove cache entries for threads not seen in the result
+// result structure: list of InterpreterInfo, where InterpreterInfo[1] is threads list,
+// and ThreadInfo[0] is the thread_id
+//
+// Liveness is tracked per cache slot rather than by collecting thread ids
+// into a bounded array, so a sample that contains more threads than
+// FRAME_CACHE_MAX_THREADS cannot cause entries of still-running threads to be
+// evicted. Thread ids that cannot be converted to an unsigned integer are
+// ignored (the conversion error is cleared). Each evicted entry bumps the
+// stale_cache_invalidations statistic.
+void
+frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
+{
+    if (!unwinder->frame_cache || !result || !PyList_Check(result)) {
+        return;
+    }
+
+    // slot_alive[i] != 0  <=>  the thread owning cache slot i is in `result`
+    unsigned char slot_alive[FRAME_CACHE_MAX_THREADS];
+    memset(slot_alive, 0, sizeof(slot_alive));
+
+    Py_ssize_t num_interps = PyList_GET_SIZE(result);
+    for (Py_ssize_t i = 0; i < num_interps; i++) {
+        PyObject *interp_info = PyList_GET_ITEM(result, i);
+        PyObject *threads = PyStructSequence_GetItem(interp_info, 1);
+        if (!threads || !PyList_Check(threads)) {
+            continue;
+        }
+        Py_ssize_t num_threads = PyList_GET_SIZE(threads);
+        for (Py_ssize_t j = 0; j < num_threads; j++) {
+            PyObject *thread_info = PyList_GET_ITEM(threads, j);
+            PyObject *tid_obj = PyStructSequence_GetItem(thread_info, 0);
+            if (!tid_obj) {
+                continue;
+            }
+            unsigned long long tid = PyLong_AsUnsignedLongLong(tid_obj);
+            if (tid == (unsigned long long)-1 && PyErr_Occurred()) {
+                // Not a usable thread id: best effort, skip it
+                PyErr_Clear();
+                continue;
+            }
+            FrameCacheEntry *entry = frame_cache_find(unwinder, (uint64_t)tid);
+            if (entry) {
+                slot_alive[entry - unwinder->frame_cache] = 1;
+            }
+        }
+    }
+
+    // Invalidate occupied entries whose thread was not seen
+    for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
+        if (unwinder->frame_cache[i].thread_id == 0 || slot_alive[i]) {
+            continue;
+        }
+        Py_CLEAR(unwinder->frame_cache[i].frame_list);
+        unwinder->frame_cache[i].thread_id = 0;
+        unwinder->frame_cache[i].num_addrs = 0;
+        STATS_INC(unwinder, stale_cache_invalidations);
+    }
+}
+
+// Append the cached tail of a thread's stack to frame_info.
+//
+// Searches the thread's cached address array for last_profiled_frame; when
+// found at index k, the cached frame list from index k to its end is appended
+// to frame_info. If frame_addrs is non-NULL, the cached addresses from index k
+// onwards are appended to it too, stopping once *num_addrs reaches max_addrs.
+//
+// Returns 1 when the cached continuation was appended, 0 when nothing usable
+// is cached (no table, last_profiled_frame == 0, no entry for the thread, or
+// the address is not in the entry), and -1 if a list operation failed.
+int
+frame_cache_lookup_and_extend(
+    RemoteUnwinderObject *unwinder,
+    uint64_t thread_id,
+    uintptr_t last_profiled_frame,
+    PyObject *frame_info,
+    uintptr_t *frame_addrs,
+    Py_ssize_t *num_addrs,
+    Py_ssize_t max_addrs)
+{
+    if (unwinder->frame_cache == NULL || last_profiled_frame == 0) {
+        return 0;
+    }
+
+    FrameCacheEntry *cached = frame_cache_find(unwinder, thread_id);
+    if (cached == NULL || cached->frame_list == NULL) {
+        return 0;
+    }
+
+    // Locate the first cached address equal to last_profiled_frame
+    const Py_ssize_t cached_addr_count = cached->num_addrs;
+    Py_ssize_t match = 0;
+    while (match < cached_addr_count && cached->addrs[match] != last_profiled_frame) {
+        match++;
+    }
+    if (match >= cached_addr_count) {
+        return 0;  // Address is not part of the cached stack
+    }
+
+    // Copy the cached frames from the match to the end of the list ...
+    PyObject *tail = PyList_GetSlice(cached->frame_list, match,
+                                     PyList_GET_SIZE(cached->frame_list));
+    if (tail == NULL) {
+        return -1;
+    }
+
+    // ... and insert them at the end of frame_info
+    Py_ssize_t insert_at = PyList_GET_SIZE(frame_info);
+    int rc = PyList_SetSlice(frame_info, insert_at, insert_at, tail);
+    Py_DECREF(tail);
+    if (rc < 0) {
+        return -1;
+    }
+
+    // Mirror the extension in the caller's address array, if one was given
+    if (frame_addrs != NULL) {
+        Py_ssize_t src = match;
+        while (src < cached_addr_count && *num_addrs < max_addrs) {
+            frame_addrs[*num_addrs] = cached->addrs[src];
+            (*num_addrs)++;
+            src++;
+        }
+    }
+
+    return 1;
+}
+
+// Record a thread's frame list and the matching frame addresses in the cache.
+//
+// A shallow copy of the whole frame_list is stored; at most
+// FRAME_CACHE_MAX_FRAMES addresses are copied from addrs. Any list previously
+// cached for the thread is released first.
+//
+// Returns: 1 = stored, 0 = not stored (no cache table, thread_id == 0, or no
+// slot available), -1 = copying the list failed.
+int
+frame_cache_store(
+    RemoteUnwinderObject *unwinder,
+    uint64_t thread_id,
+    PyObject *frame_list,
+    const uintptr_t *addrs,
+    Py_ssize_t num_addrs)
+{
+    if (unwinder->frame_cache == NULL || thread_id == 0) {
+        return 0;
+    }
+
+    // The address array has fixed capacity; extra addresses are dropped
+    const Py_ssize_t addrs_to_keep =
+        (num_addrs > FRAME_CACHE_MAX_FRAMES) ? FRAME_CACHE_MAX_FRAMES : num_addrs;
+
+    FrameCacheEntry *slot = frame_cache_alloc_slot(unwinder, thread_id);
+    if (slot == NULL) {
+        // No slot left: caching is skipped for this thread
+        return 0;
+    }
+
+    // Release whatever list the slot held before
+    Py_CLEAR(slot->frame_list);
+
+    // Keep the complete frame list even when the addresses were clamped:
+    // frames past the address limit are still valid and are needed when the
+    // whole cached stack is reused.
+    PyObject *list_copy = PyList_GetSlice(frame_list, 0, PyList_GET_SIZE(frame_list));
+    slot->frame_list = list_copy;
+    if (list_copy == NULL) {
+        return -1;
+    }
+
+    slot->thread_id = thread_id;
+    memcpy(slot->addrs, addrs, addrs_to_keep * sizeof(uintptr_t));
+    slot->num_addrs = addrs_to_keep;
+    return 1;
+}
--- a/Modules/_remote_debugging/frames.c
+++ b/Modules/_remote_debugging/frames.c
@ -189,6 +189,8 @@ parse_frame_object(
        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
        return -1;
    }
+    STATS_INC(unwinder, memory_reads);
+    STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);

    *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous);
    uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable);
@ -258,14 +260,39 @@ process_frame_chain(
    uintptr_t initial_frame_addr,
    StackChunkList *chunks,
    PyObject *frame_info,
-    uintptr_t gc_frame)
+    uintptr_t gc_frame,
+    uintptr_t last_profiled_frame,
+    int *stopped_at_cached_frame,
+    uintptr_t *frame_addrs,      // optional: C array to receive frame addresses
+    Py_ssize_t *num_addrs,       // in/out: current count / updated count
+    Py_ssize_t max_addrs)        // max capacity of frame_addrs array
 {
    uintptr_t frame_addr = initial_frame_addr;
    uintptr_t prev_frame_addr = 0;
-    const size_t MAX_FRAMES = 1024;
+    const size_t MAX_FRAMES = 1024 + 512;
    size_t frame_count = 0;

+    // Initialize output flag
+    if (stopped_at_cached_frame) {
+        *stopped_at_cached_frame = 0;
+    }
+
+    // Quick check: if current_frame == last_profiled_frame, entire stack is unchanged
+    if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) {
+        if (stopped_at_cached_frame) {
+            *stopped_at_cached_frame = 1;
+        }
+        return 0;
+    }
+
    while ((void*)frame_addr != NULL) {
+        // Check if we've reached the cached frame - if so, stop here
+        if (last_profiled_frame != 0 && frame_addr == last_profiled_frame) {
+            if (stopped_at_cached_frame) {
+                *stopped_at_cached_frame = 1;
+            }
+            break;
+        }
        PyObject *frame = NULL;
        uintptr_t next_frame_addr = 0;
        uintptr_t stackpointer = 0;
@ -286,7 +313,6 @@ process_frame_chain(
            }
        }
        if (frame == NULL && PyList_GET_SIZE(frame_info) == 0) {
-            // If the first frame is missing, the chain is broken:
            const char *e = "Failed to parse initial frame in chain";
            PyErr_SetString(PyExc_RuntimeError, e);
            return -1;
@ -310,36 +336,40 @@ process_frame_chain(
            extra_frame = &_Py_STR(native);
        }
        if (extra_frame) {
-            // Use "~" as file and 0 as line, since that's what pstats uses:
            PyObject *extra_frame_info = make_frame_info(
                unwinder, _Py_LATIN1_CHR('~'), _PyLong_GetZero(), extra_frame);
            if (extra_frame_info == NULL) {
                return -1;
            }
-            int error = PyList_Append(frame_info, extra_frame_info);
+            if (PyList_Append(frame_info, extra_frame_info) < 0) {
                Py_DECREF(extra_frame_info);
-            if (error) {
-                const char *e = "Failed to append extra frame to frame info list";
-                set_exception_cause(unwinder, PyExc_RuntimeError, e);
+                set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append extra frame");
                return -1;
            }
+            // Extra frames use 0 as address (they're synthetic)
+            if (frame_addrs && *num_addrs < max_addrs) {
+                frame_addrs[(*num_addrs)++] = 0;
+            }
+            Py_DECREF(extra_frame_info);
        }
        if (frame) {
            if (prev_frame_addr && frame_addr != prev_frame_addr) {
                const char *f = "Broken frame chain: expected frame at 0x%lx, got 0x%lx";
                PyErr_Format(PyExc_RuntimeError, f, prev_frame_addr, frame_addr);
                Py_DECREF(frame);
-                const char *e = "Frame chain consistency check failed";
-                set_exception_cause(unwinder, PyExc_RuntimeError, e);
+                set_exception_cause(unwinder, PyExc_RuntimeError, "Frame chain consistency check failed");
                return -1;
            }

-            if (PyList_Append(frame_info, frame) == -1) {
+            if (PyList_Append(frame_info, frame) < 0) {
                Py_DECREF(frame);
-                const char *e = "Failed to append frame to frame info list";
-                set_exception_cause(unwinder, PyExc_RuntimeError, e);
+                set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append frame");
                return -1;
            }
+            // Track the address for this frame
+            if (frame_addrs && *num_addrs < max_addrs) {
+                frame_addrs[(*num_addrs)++] = frame_addr;
+            }
            Py_DECREF(frame);
        }

@ -349,3 +379,208 @@ process_frame_chain(

    return 0;
 }
+
+// Reset last_profiled_frame to 0 in every thread state of the target process.
+// Call this when profiling starts: a value left behind by an earlier profiler
+// would otherwise make frame walking stop too early.
+//
+// Best effort: every remote read/write error is cleared and never reported.
+// A failed write is skipped, a failure to advance to the next thread moves on
+// to the next interpreter, and a failure to read an interpreter's thread list
+// or its successor ends the walk. Always returns 0.
+int
+clear_last_profiled_frames(RemoteUnwinderObject *unwinder)
+{
+    uintptr_t null_frame = 0;
+    uintptr_t interp_addr = unwinder->interpreter_addr;
+
+    while (interp_addr != 0) {
+        // Head of this interpreter's thread list
+        uintptr_t thread_addr;
+        int rc = _Py_RemoteDebug_PagedReadRemoteMemory(
+            &unwinder->handle,
+            interp_addr + unwinder->debug_offsets.interpreter_state.threads_head,
+            sizeof(void*),
+            &thread_addr);
+        if (rc < 0) {
+            // Non-fatal: give up on clearing altogether
+            PyErr_Clear();
+            return 0;
+        }
+
+        while (thread_addr != 0) {
+            // Overwrite this thread's last_profiled_frame with 0
+            uintptr_t field_addr =
+                thread_addr + unwinder->debug_offsets.thread_state.last_profiled_frame;
+            rc = _Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, field_addr,
+                                                   sizeof(uintptr_t), &null_frame);
+            if (rc < 0) {
+                // Non-fatal: carry on with the remaining threads
+                PyErr_Clear();
+            }
+
+            // Advance to the next thread of this interpreter
+            rc = _Py_RemoteDebug_PagedReadRemoteMemory(
+                &unwinder->handle,
+                thread_addr + unwinder->debug_offsets.thread_state.next,
+                sizeof(void*),
+                &thread_addr);
+            if (rc < 0) {
+                PyErr_Clear();
+                break;
+            }
+        }
+
+        // Advance to the next interpreter
+        rc = _Py_RemoteDebug_PagedReadRemoteMemory(
+            &unwinder->handle,
+            interp_addr + unwinder->debug_offsets.interpreter_state.next,
+            sizeof(void*),
+            &interp_addr);
+        if (rc < 0) {
+            PyErr_Clear();
+            break;
+        }
+    }
+
+    return 0;
+}
+
+// Fast path for an unchanged stack.
+// A "full hit" requires that the thread's current frame is the frame recorded
+// as last profiled AND that the cache entry for the thread starts with that
+// same address. The current frame is still parsed from remote memory so its
+// line number is fresh (the line within a frame can change between samples);
+// only the parent frames (cache index 1 onwards) are taken from the cache.
+// Returns: 1 if full hit (frame_info extended with current frame + cached parents),
+//          0 if miss (frame_info untouched), -1 on error
+static int
+try_full_cache_hit(
+    RemoteUnwinderObject *unwinder,
+    uintptr_t frame_addr,
+    uintptr_t last_profiled_frame,
+    uint64_t thread_id,
+    PyObject *frame_info)
+{
+    if (unwinder->frame_cache == NULL || last_profiled_frame == 0
+        || frame_addr != last_profiled_frame) {
+        return 0;
+    }
+
+    FrameCacheEntry *cached = frame_cache_find(unwinder, thread_id);
+    if (cached == NULL || cached->frame_list == NULL) {
+        return 0;
+    }
+    // Sanity check: the cached stack must begin at the current frame
+    if (cached->num_addrs == 0 || cached->addrs[0] != frame_addr) {
+        return 0;
+    }
+
+    int status = -1;
+    PyObject *top_frame = NULL;
+    PyObject *cached_parents = NULL;
+    uintptr_t code_addr = 0;
+    uintptr_t prev_addr = 0;
+
+    // Re-read the top frame for an up-to-date line number
+    if (parse_frame_object(unwinder, &top_frame, frame_addr,
+                           &code_addr, &prev_addr) < 0) {
+        return -1;
+    }
+
+    // Take the cached parents before frame_info is modified
+    const Py_ssize_t cached_len = PyList_GET_SIZE(cached->frame_list);
+    if (cached_len > 1) {
+        cached_parents = PyList_GetSlice(cached->frame_list, 1, cached_len);
+        if (cached_parents == NULL) {
+            goto done;
+        }
+    }
+
+    // The parse may have produced no frame object; append only a real one
+    if (top_frame != NULL) {
+        if (PyList_Append(frame_info, top_frame) < 0) {
+            goto done;
+        }
+        STATS_ADD(unwinder, frames_read_from_memory, 1);
+    }
+
+    // Append the cached parent frames after it
+    if (cached_parents != NULL) {
+        Py_ssize_t insert_at = PyList_GET_SIZE(frame_info);
+        if (PyList_SetSlice(frame_info, insert_at, insert_at, cached_parents) < 0) {
+            goto done;
+        }
+        STATS_ADD(unwinder, frames_read_from_cache, cached_len - 1);
+    }
+
+    STATS_INC(unwinder, frame_cache_hits);
+    status = 1;
+
+done:
+    Py_XDECREF(top_frame);
+    Py_XDECREF(cached_parents);
+    return status;
+}
+
+// High-level helper: fill frame_info for one thread, using the frame cache.
+//
+// Steps:
+//   1. Try the full-hit fast path (current frame == last profiled frame).
+//   2. Otherwise walk the chain from frame_addr, stopping early if
+//      last_profiled_frame is reached.
+//   3. If the walk stopped there, append the cached continuation; when the
+//      cache has none, keep walking from last_profiled_frame instead.
+//   4. Store the resulting frames and addresses back into the cache.
+//
+// Returns 0 on success and -1 on error.
+int
+collect_frames_with_cache(
+    RemoteUnwinderObject *unwinder,
+    uintptr_t frame_addr,
+    StackChunkList *chunks,
+    PyObject *frame_info,
+    uintptr_t gc_frame,
+    uintptr_t last_profiled_frame,
+    uint64_t thread_id)
+{
+    // Step 1: fast path, tried before the address buffer is set up
+    int fast_path = try_full_cache_hit(unwinder, frame_addr, last_profiled_frame,
+                                       thread_id, frame_info);
+    if (fast_path < 0) {
+        return -1;
+    }
+    if (fast_path > 0) {
+        return 0;
+    }
+
+    uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
+    Py_ssize_t num_addrs = 0;
+    int reached_cached_frame = 0;
+
+    // Step 2: walk from the current frame
+    Py_ssize_t size_before = PyList_GET_SIZE(frame_info);
+    if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, gc_frame,
+                            last_profiled_frame, &reached_cached_frame,
+                            addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
+        return -1;
+    }
+    // Everything appended by the walk came from remote memory
+    STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - size_before);
+
+    if (!reached_cached_frame) {
+        if (last_profiled_frame == 0) {
+            // No cache involvement (no last_profiled_frame or cache disabled)
+            STATS_INC(unwinder, frame_cache_misses);
+        }
+    }
+    else {
+        // Step 3: continue from the cache (frames and addresses)
+        size_before = PyList_GET_SIZE(frame_info);
+        int extended = frame_cache_lookup_and_extend(unwinder, thread_id, last_profiled_frame,
+                                                     frame_info, addrs, &num_addrs,
+                                                     FRAME_CACHE_MAX_FRAMES);
+        if (extended < 0) {
+            return -1;
+        }
+        if (extended > 0) {
+            // Partial cache hit
+            STATS_INC(unwinder, frame_cache_partial_hits);
+            STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - size_before);
+        }
+        else {
+            // Cache miss - read the rest of the chain, starting at last_profiled_frame
+            STATS_INC(unwinder, frame_cache_misses);
+            size_before = PyList_GET_SIZE(frame_info);
+            if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, gc_frame,
+                                    0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
+                return -1;
+            }
+            STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - size_before);
+        }
+    }
+
+    // Step 4: remember this stack for the next sample; only a hard error
+    // from the store is propagated
+    if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) {
+        return -1;
+    }
+    return 0;
+}
--- a/Modules/_remote_debugging/module.c
+++ b/Modules/_remote_debugging/module.c
@ -235,6 +235,8 @@ _remote_debugging.RemoteUnwinder.__init__
    skip_non_matching_threads: bool = True
    native: bool = False
    gc: bool = False
+    cache_frames: bool = False
+    stats: bool = False

 Initialize a new RemoteUnwinder object for debugging a remote Python process.

@ -253,6 +255,10 @@ Initialize a new RemoteUnwinder object for debugging a remote Python process.
            non-Python code.
    gc: If True, include artificial "<GC>" frames to denote active garbage
        collection.
+    cache_frames: If True, enable frame caching optimization to avoid re-reading
+                 unchanged parent frames between samples.
+    stats: If True, collect statistics about cache hits, memory reads, etc.
+           Use get_stats() to retrieve the collected statistics.

 The RemoteUnwinder provides functionality to inspect and debug a running Python
 process, including examining thread states, stack frames and other runtime data.
@ -270,8 +276,9 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
                                               int only_active_thread,
                                               int mode, int debug,
                                               int skip_non_matching_threads,
-                                               int native, int gc)
-/*[clinic end generated code: output=e9eb6b4df119f6e0 input=606d099059207df2]*/
+                                               int native, int gc,
+                                               int cache_frames, int stats)
+/*[clinic end generated code: output=b34ef8cce013c975 input=df2221ef114c3d6a]*/
 {
    // Validate that all_threads and only_active_thread are not both True
    if (all_threads && only_active_thread) {
@ -283,18 +290,24 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
 #ifdef Py_GIL_DISABLED
    if (only_active_thread) {
        PyErr_SetString(PyExc_ValueError,
-                       "only_active_thread is not supported when Py_GIL_DISABLED is not defined");
+                       "only_active_thread is not supported in free-threaded builds");
        return -1;
    }
 #endif

    self->native = native;
    self->gc = gc;
+    self->cache_frames = cache_frames;
+    self->collect_stats = stats;
+    self->stale_invalidation_counter = 0;
    self->debug = debug;
    self->only_active_thread = only_active_thread;
    self->mode = mode;
    self->skip_non_matching_threads = skip_non_matching_threads;
    self->cached_state = NULL;
+    self->frame_cache = NULL;
+    // Initialize stats to zero
+    memset(&self->stats, 0, sizeof(self->stats));
    if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) {
        set_exception_cause(self, PyExc_RuntimeError, "Failed to initialize process handle");
        return -1;
@ -375,6 +388,16 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
    self->win_process_buffer_size = 0;
 #endif

+    if (cache_frames && frame_cache_init(self) < 0) {
+        return -1;
+    }
+
+    // Clear stale last_profiled_frame values from previous profilers
+    // This prevents us from stopping frame walking early due to stale values
+    if (cache_frames) {
+        clear_last_profiled_frames(self);
+    }
+
    return 0;
 }

@ -429,6 +452,8 @@ static PyObject *
 _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self)
 /*[clinic end generated code: output=666192b90c69d567 input=bcff01c73cccc1c0]*/
 {
+    STATS_INC(self, total_samples);
+
    PyObject* result = PyList_New(0);
    if (!result) {
        set_exception_cause(self, PyExc_MemoryError, "Failed to create stack trace result list");
@ -591,6 +616,14 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
    }

 exit:
+    // Invalidate cache entries for threads not seen in this sample.
+    // Only do this every 1024 iterations to avoid performance overhead.
+    if (self->cache_frames && result) {
+        if (++self->stale_invalidation_counter >= 1024) {
+            self->stale_invalidation_counter = 0;
+            frame_cache_invalidate_stale(self, result);
+        }
+    }
    _Py_RemoteDebug_ClearCache(&self->handle);
    return result;
 }
@ -757,10 +790,114 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject
    return NULL;
 }

+/*[clinic input]
+@permit_long_docstring_body
+@critical_section
+_remote_debugging.RemoteUnwinder.get_stats
+
+Get collected statistics about profiling performance.
+
+Returns a dictionary containing statistics about cache performance,
+memory reads, and other profiling metrics. Only available if the
+RemoteUnwinder was created with stats=True.
+
+Returns:
+    dict: A dictionary containing:
+        - total_samples: Total number of get_stack_trace calls
+        - frame_cache_hits: Full cache hits (entire stack unchanged)
+        - frame_cache_misses: Cache misses requiring full walk
+        - frame_cache_partial_hits: Partial hits (stopped at cached frame)
+        - frames_read_from_cache: Total frames retrieved from cache
+        - frames_read_from_memory: Total frames read from remote memory
+        - memory_reads: Total remote memory read operations
+        - memory_bytes_read: Total bytes read from remote memory
+        - code_object_cache_hits: Code object cache hits
+        - code_object_cache_misses: Code object cache misses
+        - stale_cache_invalidations: Times stale cache entries were cleared
+        - frame_cache_hit_rate: Percentage of samples that hit the cache
+        - code_object_cache_hit_rate: Percentage of code object lookups that hit cache
+
+Raises:
+    RuntimeError: If stats collection was not enabled (stats=False)
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
+/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/
+{
+    // Build a dict with one integer entry per raw counter in self->stats
+    // plus two derived float entries (hit rates, in percent).
+    // Raises RuntimeError when the unwinder was created without stats=True.
+    if (!self->collect_stats) {
+        PyErr_SetString(PyExc_RuntimeError,
+                       "Statistics collection was not enabled. "
+                       "Create RemoteUnwinder with stats=True to collect statistics.");
+        return NULL;
+    }
+
+    double frame_rate = 0.0;
+    double code_rate = 0.0;
+    uint64_t frame_lookups = 0;
+    uint64_t code_lookups = 0;
+
+    PyObject *stats_dict = PyDict_New();
+    if (stats_dict == NULL) {
+        return NULL;
+    }
+
+// Insert `value` under `key`, consuming the reference; bail out on failure.
+#define STORE_VALUE(key, value) do { \
+    PyObject *item_ = (value); \
+    if (item_ == NULL) { \
+        goto error; \
+    } \
+    int rc_ = PyDict_SetItemString(stats_dict, (key), item_); \
+    Py_DECREF(item_); \
+    if (rc_ < 0) { \
+        goto error; \
+    } \
+} while (0)
+
+// The dict key of a raw counter is the name of its field in self->stats.
+#define STORE_COUNTER(name) \
+    STORE_VALUE(#name, PyLong_FromUnsignedLongLong(self->stats.name))
+
+    STORE_COUNTER(total_samples);
+    STORE_COUNTER(frame_cache_hits);
+    STORE_COUNTER(frame_cache_misses);
+    STORE_COUNTER(frame_cache_partial_hits);
+    STORE_COUNTER(frames_read_from_cache);
+    STORE_COUNTER(frames_read_from_memory);
+    STORE_COUNTER(memory_reads);
+    STORE_COUNTER(memory_bytes_read);
+    STORE_COUNTER(code_object_cache_hits);
+    STORE_COUNTER(code_object_cache_misses);
+    STORE_COUNTER(stale_cache_invalidations);
+
+#undef STORE_COUNTER
+
+    // Frame cache hit rate: (full hits + partial hits) / all lookups.
+    // Stays 0.0 when no lookup has been recorded.
+    frame_lookups = self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits + self->stats.frame_cache_misses;
+    if (frame_lookups > 0) {
+        frame_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits)
+                     / (double)frame_lookups;
+    }
+    STORE_VALUE("frame_cache_hit_rate", PyFloat_FromDouble(frame_rate));
+
+    // Code object cache hit rate: hits / (hits + misses), same convention
+    code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses;
+    if (code_lookups > 0) {
+        code_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)code_lookups;
+    }
+    STORE_VALUE("code_object_cache_hit_rate", PyFloat_FromDouble(code_rate));
+
+#undef STORE_VALUE
+
+    return stats_dict;
+
+error:
+    Py_DECREF(stats_dict);
+    return NULL;
+}
+
 static PyMethodDef RemoteUnwinder_methods[] = {
    _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STACK_TRACE_METHODDEF
    _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ALL_AWAITED_BY_METHODDEF
    _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ASYNC_STACK_TRACE_METHODDEF
+    _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF
    {NULL, NULL}
 };

@ -787,6 +924,7 @@ RemoteUnwinder_dealloc(PyObject *op)
        _Py_RemoteDebug_ClearCache(&self->handle);
        _Py_RemoteDebug_CleanupProcHandle(&self->handle);
    }
+    frame_cache_cleanup(self);
    PyObject_Del(self);
    Py_DECREF(tp);
 }
--- a/Modules/_remote_debugging/threads.c
+++ b/Modules/_remote_debugging/threads.c
@ -296,6 +296,8 @@ unwind_stack_for_thread(
        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
        goto error;
    }
+    STATS_INC(unwinder, memory_reads);
+    STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size);

    long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);

@ -309,6 +311,8 @@ unwind_stack_for_thread(
        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read GC state");
        goto error;
    }
+    STATS_INC(unwinder, memory_reads);
+    STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.gc.size);

    // Calculate thread status using flags (always)
    int status_flags = 0;
@ -383,15 +387,37 @@ unwind_stack_for_thread(
        goto error;
    }

+    // In cache mode, copying stack chunks is more expensive than direct memory reads
+    if (!unwinder->cache_frames) {
        if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) {
            set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks");
            goto error;
        }
+    }

-    if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, gc_frame) < 0) {
+    if (unwinder->cache_frames) {
+        // Use cache to avoid re-reading unchanged parent frames
+        uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts,
+            unwinder->debug_offsets.thread_state.last_profiled_frame);
+        if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info,
+                                      gc_frame, last_profiled_frame, tid) < 0) {
+            set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames");
+            goto error;
+        }
+        // Update last_profiled_frame for next sample
+        uintptr_t lpf_addr = *current_tstate + unwinder->debug_offsets.thread_state.last_profiled_frame;
+        if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr,
+                                              sizeof(uintptr_t), &frame_addr) < 0) {
+            PyErr_Clear();  // Non-fatal
+        }
+    } else {
+        // No caching - process entire frame chain
+        if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info,
+                                gc_frame, 0, NULL, NULL, NULL, 0) < 0) {
            set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain");
            goto error;
        }
+    }

    *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);

--- a/PCbuild/_remote_debugging.vcxproj
+++ b/PCbuild/_remote_debugging.vcxproj
@ -102,6 +102,7 @@
    <ClCompile Include="..\Modules\_remote_debugging\object_reading.c" />
    <ClCompile Include="..\Modules\_remote_debugging\code_objects.c" />
    <ClCompile Include="..\Modules\_remote_debugging\frames.c" />
+    <ClCompile Include="..\Modules\_remote_debugging\frame_cache.c" />
    <ClCompile Include="..\Modules\_remote_debugging\threads.c" />
    <ClCompile Include="..\Modules\_remote_debugging\asyncio.c" />
  </ItemGroup>
--- a/PCbuild/_remote_debugging.vcxproj.filters
+++ b/PCbuild/_remote_debugging.vcxproj.filters
@ -24,6 +24,9 @@
    <ClCompile Include="..\Modules\_remote_debugging\frames.c">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="..\Modules\_remote_debugging\frame_cache.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
    <ClCompile Include="..\Modules\_remote_debugging\threads.c">
      <Filter>Source Files</Filter>
    </ClCompile>
--- a/Python/ceval.c
+++ b/Python/ceval.c
@ -2288,6 +2288,16 @@ clear_gen_frame(PyThreadState *tstate, _PyInterpreterFrame * frame)
 void
 _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame * frame)
 {
+    // Update last_profiled_frame for remote profiler frame caching.
+    // By this point, tstate->current_frame is already set to the parent frame.
+    // Only update if we're popping the exact frame that was last profiled.
+    // This avoids corrupting the cache when transient frames (called and returned
+    // between profiler samples) update last_profiled_frame to addresses the
+    // profiler never saw.
+    if (tstate->last_profiled_frame != NULL && tstate->last_profiled_frame == frame) {
+        tstate->last_profiled_frame = tstate->current_frame;
+    }
+
    if (frame->owner == FRAME_OWNED_BY_THREAD) {
        clear_thread_frame(tstate, frame);
    }
--- a/Python/remote_debug.h
+++ b/Python/remote_debug.h
@ -1102,6 +1102,115 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address
 #endif
 }

+#if defined(__linux__) && HAVE_PROCESS_VM_READV
+// Fallback write through handle->memfd (the /proc/pid/mem descriptor): returns 0 on success, -1 on failure.
+static int
+_Py_RemoteDebug_WriteRemoteMemoryFallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src)
+{
+    if (handle->memfd == -1) {  // descriptor not opened yet: open it on first use
+        if (open_proc_mem_fd(handle) < 0) {
+            return -1;  // open failed; presumably open_proc_mem_fd set the exception -- TODO confirm
+        }
+    }
+
+    struct iovec local[1];
+    Py_ssize_t result = 0;  // total bytes written so far
+    Py_ssize_t written = 0;  // return value of the most recent pwritev() call
+    // Each pass writes whatever is still outstanding, resuming after the bytes already written.
+    do {
+        local[0].iov_base = (char*)src + result;
+        local[0].iov_len = len - result;
+        off_t offset = remote_address + result;  // the target address doubles as the file offset
+
+        written = pwritev(handle->memfd, local, 1, offset);
+        if (written < 0) {
+            PyErr_SetFromErrno(PyExc_OSError);  // raise OSError built from errno
+            return -1;
+        }
+
+        result += written;
+    } while ((size_t)written != local[0].iov_len);  // stop once a single call wrote all it was asked to
+    return 0;
+}
+#endif // defined(__linux__) && HAVE_PROCESS_VM_READV
+
+// Write `len` bytes from `src` to `remote_address` in the target process; returns 0 on success, -1 on failure.
+UNUSED static int
+_Py_RemoteDebug_WriteRemoteMemory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src)
+{
+#ifdef MS_WINDOWS
+    SIZE_T written = 0;  // bytes written by the latest WriteProcessMemory() call
+    SIZE_T result = 0;  // total bytes written so far
+    do {
+        if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) {
+            DWORD error = GetLastError();  // read first: raising the exception may overwrite the last-error value
+            PyErr_SetFromWindowsErr((int)error);
+            _set_debug_exception_cause(PyExc_OSError,
+                "WriteProcessMemory failed for PID %d at address 0x%lx "
+                "(size %zu, partial write %zu bytes): Windows error %lu",
+                handle->pid, remote_address + result, len - result, result, error);  // NOTE(review): 0x%lx is given a uintptr_t, which is wider than long on 64-bit Windows -- confirm how the formatter handles it
+            return -1;
+        }
+        result += written;
+    } while (result < len);
+    return 0;
+#elif defined(__linux__) && HAVE_PROCESS_VM_READV
+    if (handle->memfd != -1) {  // descriptor already open: always go through the fallback writer
+        return _Py_RemoteDebug_WriteRemoteMemoryFallback(handle, remote_address, len, src);
+    }
+    struct iovec local[1];
+    struct iovec remote[1];
+    Py_ssize_t result = 0;  // total bytes written so far
+    Py_ssize_t written = 0;  // return value of the most recent process_vm_writev() call
+
+    do {
+        local[0].iov_base = (void*)((char*)src + result);
+        local[0].iov_len = len - result;
+        remote[0].iov_base = (void*)((char*)remote_address + result);
+        remote[0].iov_len = len - result;
+
+        written = process_vm_writev(handle->pid, local, 1, remote, 1, 0);
+        if (written < 0) {
+            int saved_errno = errno;  // keep the syscall's errno: the calls below are not guaranteed to preserve it
+            if (saved_errno == ENOSYS) {  // syscall unavailable: switch to the fallback writer
+                return _Py_RemoteDebug_WriteRemoteMemoryFallback(handle, remote_address, len, src);
+            }
+            PyErr_SetFromErrno(PyExc_OSError);
+            _set_debug_exception_cause(PyExc_OSError,
+                "process_vm_writev failed for PID %d at address 0x%lx "
+                "(size %zu, partial write %zd bytes): %s",
+                handle->pid, remote_address + result, len - result, result, strerror(saved_errno));
+            return -1;
+        }
+        result += written;
+    } while ((size_t)written != local[0].iov_len);  // stop once a single call wrote all it was asked to
+    return 0;
+#elif defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX
+    kern_return_t kr = mach_vm_write(
+        handle->task,
+        (mach_vm_address_t)remote_address,
+        (vm_offset_t)src,
+        (mach_msg_type_number_t)len);
+    // Translate a failing kernel status into a Python exception.
+    if (kr != KERN_SUCCESS) {
+        switch (kr) {
+        case KERN_PROTECTION_FAILURE:
+            PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory");
+            break;
+        case KERN_INVALID_ARGUMENT:
+            PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write");
+            break;
+        default:
+            PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr);
+        }
+        return -1;
+    }
+    return 0;
+#else
+    Py_UNREACHABLE();
+#endif
+}
+
 UNUSED static int
 _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
                                      uintptr_t addr,
--- a/Python/remote_debugging.c
+++ b/Python/remote_debugging.c
@ -24,104 +24,11 @@ read_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, void* d
    return _Py_RemoteDebug_ReadRemoteMemory(handle, remote_address, len, dst);
 }

-// Why is pwritev not guarded? Except on Android API level 23 (no longer
-// supported), HAVE_PROCESS_VM_READV is sufficient.
-#if defined(__linux__) && HAVE_PROCESS_VM_READV
-static int
-write_memory_fallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src)
-{
-    if (handle->memfd == -1) {
-        if (open_proc_mem_fd(handle) < 0) {
-            return -1;
-        }
-    }
-
-    struct iovec local[1];
-    Py_ssize_t result = 0;
-    Py_ssize_t written = 0;
-
-    do {
-        local[0].iov_base = (char*)src + result;
-        local[0].iov_len = len - result;
-        off_t offset = remote_address + result;
-
-        written = pwritev(handle->memfd, local, 1, offset);
-        if (written < 0) {
-            PyErr_SetFromErrno(PyExc_OSError);
-            return -1;
-        }
-
-        result += written;
-    } while ((size_t)written != local[0].iov_len);
-    return 0;
-}
-#endif // __linux__
-
+// Thin wrapper: forwards its arguments unchanged to _Py_RemoteDebug_WriteRemoteMemory() from remote_debug.h.
 static int
 write_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src)
 {
-#ifdef MS_WINDOWS
-    SIZE_T written = 0;
-    SIZE_T result = 0;
-    do {
-        if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) {
-            PyErr_SetFromWindowsErr(0);
-            return -1;
-        }
-        result += written;
-    } while (result < len);
-    return 0;
-#elif defined(__linux__) && HAVE_PROCESS_VM_READV
-    if (handle->memfd != -1) {
-        return write_memory_fallback(handle, remote_address, len, src);
-    }
-    struct iovec local[1];
-    struct iovec remote[1];
-    Py_ssize_t result = 0;
-    Py_ssize_t written = 0;
-
-    do {
-        local[0].iov_base = (void*)((char*)src + result);
-        local[0].iov_len = len - result;
-        remote[0].iov_base = (void*)((char*)remote_address + result);
-        remote[0].iov_len = len - result;
-
-        written = process_vm_writev(handle->pid, local, 1, remote, 1, 0);
-        if (written < 0) {
-            if (errno == ENOSYS) {
-                return write_memory_fallback(handle, remote_address, len, src);
-            }
-            PyErr_SetFromErrno(PyExc_OSError);
-            return -1;
-        }
-
-        result += written;
-    } while ((size_t)written != local[0].iov_len);
-    return 0;
-#elif defined(__APPLE__) && TARGET_OS_OSX
-    kern_return_t kr = mach_vm_write(
-        pid_to_task(handle->pid),
-        (mach_vm_address_t)remote_address,
-        (vm_offset_t)src,
-        (mach_msg_type_number_t)len);
-
-    if (kr != KERN_SUCCESS) {
-        switch (kr) {
-        case KERN_PROTECTION_FAILURE:
-            PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory");
-            break;
-        case KERN_INVALID_ARGUMENT:
-            PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write");
-            break;
-        default:
-            PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr);
-        }
-        return -1;
-    }
-    return 0;
-#else
-    Py_UNREACHABLE();
-#endif
+    return _Py_RemoteDebug_WriteRemoteMemory(handle, remote_address, len, src);
 }

 static int
--- a/Tools/inspection/benchmark_external_inspection.py
+++ b/Tools/inspection/benchmark_external_inspection.py
@ -434,7 +434,7 @@ def main():
                    elif args.threads == "only_active":
                        kwargs["only_active_thread"] = True
                    unwinder = _remote_debugging.RemoteUnwinder(
-                        process.pid, **kwargs
+                        process.pid, cache_frames=True, **kwargs
                    )
                    results = benchmark(unwinder, duration_seconds=args.duration)
                finally: