diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index 1e1e46ea4c0..08d71070ddc 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -135,6 +135,8 @@ struct _ts { /* Pointer to currently executing frame. */ struct _PyInterpreterFrame *current_frame; + struct _PyInterpreterFrame *last_profiled_frame; + Py_tracefunc c_profilefunc; Py_tracefunc c_tracefunc; PyObject *c_profileobj; diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h index 0f17bf17f82..bfd86c08887 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -102,6 +102,7 @@ typedef struct _Py_DebugOffsets { uint64_t next; uint64_t interp; uint64_t current_frame; + uint64_t last_profiled_frame; uint64_t thread_id; uint64_t native_thread_id; uint64_t datastack_chunk; @@ -272,6 +273,7 @@ typedef struct _Py_DebugOffsets { .next = offsetof(PyThreadState, next), \ .interp = offsetof(PyThreadState, interp), \ .current_frame = offsetof(PyThreadState, current_frame), \ + .last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \ .thread_id = offsetof(PyThreadState, thread_id), \ .native_thread_id = offsetof(PyThreadState, native_thread_id), \ .datastack_chunk = offsetof(PyThreadState, datastack_chunk), \ diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 783747d1f01..d23d6d4f91b 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -1609,6 +1609,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_parameter_type)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_return)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_stack)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cache_frames)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_datetime_module)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_statements)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cadata)); @@ -2053,6 +2054,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stats)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stderr)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stdin)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 374617d8284..5c3ea474ad0 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -332,6 +332,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(c_parameter_type) STRUCT_FOR_ID(c_return) STRUCT_FOR_ID(c_stack) + STRUCT_FOR_ID(cache_frames) STRUCT_FOR_ID(cached_datetime_module) STRUCT_FOR_ID(cached_statements) STRUCT_FOR_ID(cadata) @@ -776,6 +777,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(stacklevel) STRUCT_FOR_ID(start) STRUCT_FOR_ID(statement) + STRUCT_FOR_ID(stats) STRUCT_FOR_ID(status) STRUCT_FOR_ID(stderr) STRUCT_FOR_ID(stdin) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index a66c97f7f13..31d88339a13 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ 
b/Include/internal/pycore_runtime_init_generated.h @@ -1607,6 +1607,7 @@ extern "C" { INIT_ID(c_parameter_type), \ INIT_ID(c_return), \ INIT_ID(c_stack), \ + INIT_ID(cache_frames), \ INIT_ID(cached_datetime_module), \ INIT_ID(cached_statements), \ INIT_ID(cadata), \ @@ -2051,6 +2052,7 @@ extern "C" { INIT_ID(stacklevel), \ INIT_ID(start), \ INIT_ID(statement), \ + INIT_ID(stats), \ INIT_ID(status), \ INIT_ID(stderr), \ INIT_ID(stdin), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index 2061b1d2049..c5b01ff9876 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1108,6 +1108,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(cache_frames); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(cached_datetime_module); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2884,6 +2888,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(stats); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(status); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/InternalDocs/frames.md b/InternalDocs/frames.md index 804d7436018..d56e5481d3c 100644 --- a/InternalDocs/frames.md +++ b/InternalDocs/frames.md @@ -111,6 +111,26 @@ ### Shim frames instruction which cleans up the shim frame and returns. +### Remote Profiling Frame Cache + +The `last_profiled_frame` field in `PyThreadState` supports an optimization for +remote profilers that sample call stacks from external processes. When a remote +profiler reads the call stack, it writes the current frame address to this field. +The eval loop then keeps this pointer valid by updating it to the parent frame +whenever a frame returns (in `_PyEval_FrameClearAndPop`). + +This creates a "high-water mark" that always points to a frame still on the stack. +On subsequent samples, the profiler can walk from `current_frame` until it reaches +`last_profiled_frame`, knowing that frames from that point downward are unchanged +and can be retrieved from a cache. This significantly reduces the amount of remote +memory reads needed when call stacks are deep and stable at their base. + +The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when +`last_profiled_frame` is non-NULL AND matches the frame being popped. This +prevents transient frames (called and returned between profiler samples) from +corrupting the cache pointer, while avoiding any overhead when profiling is inactive. 
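+
+To make the protocol concrete, here is a small, self-contained sketch of the
+profiler side. It is illustrative only: `Frame`, `ThreadState`, `sample()` and
+the `cache` dictionary are stand-ins rather than real APIs, remote memory
+reads are modeled as ordinary attribute access, and the eval-loop side (which
+moves the marker to the parent frame when the marked frame is popped) is not
+modeled at all.
+
+```python
+from dataclasses import dataclass
+
+@dataclass
+class Frame:
+    name: str
+    previous: "Frame | None" = None
+
+@dataclass
+class ThreadState:
+    current_frame: "Frame | None" = None
+    last_profiled_frame: "Frame | None" = None
+
+def sample(tstate: ThreadState, cache: dict) -> list[str]:
+    """One sample: walk from the top until the marker, then reuse the cache."""
+    frames, frame = [], tstate.current_frame
+    while frame is not None:
+        if frame is tstate.last_profiled_frame and id(frame) in cache:
+            frames.extend(cache[id(frame)])    # unchanged suffix: reuse cached result
+            break
+        frames.append(frame.name)              # models a remote read + decode
+        frame = frame.previous
+    tstate.last_profiled_frame = tstate.current_frame  # new high-water mark
+    cache[id(tstate.current_frame)] = list(frames)
+    return frames
+```
+
+On the first call nothing is cached and the whole chain is walked; a second
+call with the same `current_frame` stops immediately and reuses the cached
+list (a "full hit"), while a call after one new frame has been pushed reads
+only that frame before reaching the marker (a "partial hit").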
+ + ### The Instruction Pointer `_PyInterpreterFrame` has two fields which are used to maintain the instruction diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py index 99cac71a404..dd4ea1edbf6 100644 --- a/Lib/profiling/sampling/sample.py +++ b/Lib/profiling/sampling/sample.py @@ -27,21 +27,24 @@ class SampleProfiler: - def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True): + def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True, collect_stats=False): self.pid = pid self.sample_interval_usec = sample_interval_usec self.all_threads = all_threads self.mode = mode # Store mode for later use + self.collect_stats = collect_stats if _FREE_THREADED_BUILD: self.unwinder = _remote_debugging.RemoteUnwinder( self.pid, all_threads=self.all_threads, mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, cache_frames=True, + stats=collect_stats ) else: only_active_threads = bool(self.all_threads) self.unwinder = _remote_debugging.RemoteUnwinder( self.pid, only_active_thread=only_active_threads, mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, cache_frames=True, + stats=collect_stats ) # Track sample intervals and total sample count self.sample_intervals = deque(maxlen=100) @@ -129,6 +132,10 @@ def sample(self, collector, duration_sec=10, *, async_aware=False): print(f"Sample rate: {sample_rate:.2f} samples/sec") print(f"Error rate: {error_rate:.2f}%") + # Print unwinder stats if stats collection is enabled + if self.collect_stats: + self._print_unwinder_stats() + # Pass stats to flamegraph collector if it's the right type if hasattr(collector, 'set_stats'): collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode) @@ -176,17 +183,100 @@ def _print_realtime_stats(self): (1.0 / min_hz) * 1_000_000 if min_hz > 0 else 0 ) # Max time = Min Hz + # Build cache stats string if stats collection is enabled + cache_stats_str = "" + if self.collect_stats: + try: + stats = self.unwinder.get_stats() + hits = stats.get('frame_cache_hits', 0) + partial = stats.get('frame_cache_partial_hits', 0) + misses = stats.get('frame_cache_misses', 0) + total = hits + partial + misses + if total > 0: + hit_pct = (hits + partial) / total * 100 + cache_stats_str = f" {ANSIColors.MAGENTA}Cache: {hit_pct:.1f}% ({hits}+{partial}/{misses}){ANSIColors.RESET}" + except RuntimeError: + pass + # Clear line and print stats print( - f"\r\033[K{ANSIColors.BOLD_BLUE}Real-time sampling stats:{ANSIColors.RESET} " - f"{ANSIColors.YELLOW}Mean: {mean_hz:.1f}Hz ({mean_us_per_sample:.2f}µs){ANSIColors.RESET} " - f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz ({max_us_per_sample:.2f}µs){ANSIColors.RESET} " - f"{ANSIColors.RED}Max: {max_hz:.1f}Hz ({min_us_per_sample:.2f}µs){ANSIColors.RESET} " - f"{ANSIColors.CYAN}Samples: {self.total_samples}{ANSIColors.RESET}", + f"\r\033[K{ANSIColors.BOLD_BLUE}Stats:{ANSIColors.RESET} " + f"{ANSIColors.YELLOW}{mean_hz:.1f}Hz ({mean_us_per_sample:.1f}µs){ANSIColors.RESET} " + f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz{ANSIColors.RESET} " + f"{ANSIColors.RED}Max: {max_hz:.1f}Hz{ANSIColors.RESET} " + f"{ANSIColors.CYAN}N={self.total_samples}{ANSIColors.RESET}" + f"{cache_stats_str}", 
end="", flush=True, ) + def _print_unwinder_stats(self): + """Print unwinder statistics including cache performance.""" + try: + stats = self.unwinder.get_stats() + except RuntimeError: + return # Stats not enabled + + print(f"\n{ANSIColors.BOLD_BLUE}{'='*50}{ANSIColors.RESET}") + print(f"{ANSIColors.BOLD_BLUE}Unwinder Statistics:{ANSIColors.RESET}") + + # Frame cache stats + total_samples = stats.get('total_samples', 0) + frame_cache_hits = stats.get('frame_cache_hits', 0) + frame_cache_partial_hits = stats.get('frame_cache_partial_hits', 0) + frame_cache_misses = stats.get('frame_cache_misses', 0) + total_lookups = frame_cache_hits + frame_cache_partial_hits + frame_cache_misses + + # Calculate percentages + hits_pct = (frame_cache_hits / total_lookups * 100) if total_lookups > 0 else 0 + partial_pct = (frame_cache_partial_hits / total_lookups * 100) if total_lookups > 0 else 0 + misses_pct = (frame_cache_misses / total_lookups * 100) if total_lookups > 0 else 0 + + print(f" {ANSIColors.CYAN}Frame Cache:{ANSIColors.RESET}") + print(f" Total samples: {total_samples:,}") + print(f" Full hits: {frame_cache_hits:,} ({ANSIColors.GREEN}{hits_pct:.1f}%{ANSIColors.RESET})") + print(f" Partial hits: {frame_cache_partial_hits:,} ({ANSIColors.YELLOW}{partial_pct:.1f}%{ANSIColors.RESET})") + print(f" Misses: {frame_cache_misses:,} ({ANSIColors.RED}{misses_pct:.1f}%{ANSIColors.RESET})") + + # Frame read stats + frames_from_cache = stats.get('frames_read_from_cache', 0) + frames_from_memory = stats.get('frames_read_from_memory', 0) + total_frames = frames_from_cache + frames_from_memory + cache_frame_pct = (frames_from_cache / total_frames * 100) if total_frames > 0 else 0 + memory_frame_pct = (frames_from_memory / total_frames * 100) if total_frames > 0 else 0 + + print(f" {ANSIColors.CYAN}Frame Reads:{ANSIColors.RESET}") + print(f" From cache: {frames_from_cache:,} ({ANSIColors.GREEN}{cache_frame_pct:.1f}%{ANSIColors.RESET})") + print(f" From memory: {frames_from_memory:,} ({ANSIColors.RED}{memory_frame_pct:.1f}%{ANSIColors.RESET})") + + # Code object cache stats + code_hits = stats.get('code_object_cache_hits', 0) + code_misses = stats.get('code_object_cache_misses', 0) + total_code = code_hits + code_misses + code_hits_pct = (code_hits / total_code * 100) if total_code > 0 else 0 + code_misses_pct = (code_misses / total_code * 100) if total_code > 0 else 0 + + print(f" {ANSIColors.CYAN}Code Object Cache:{ANSIColors.RESET}") + print(f" Hits: {code_hits:,} ({ANSIColors.GREEN}{code_hits_pct:.1f}%{ANSIColors.RESET})") + print(f" Misses: {code_misses:,} ({ANSIColors.RED}{code_misses_pct:.1f}%{ANSIColors.RESET})") + + # Memory operations + memory_reads = stats.get('memory_reads', 0) + memory_bytes = stats.get('memory_bytes_read', 0) + if memory_bytes >= 1024 * 1024: + memory_str = f"{memory_bytes / (1024 * 1024):.1f} MB" + elif memory_bytes >= 1024: + memory_str = f"{memory_bytes / 1024:.1f} KB" + else: + memory_str = f"{memory_bytes} B" + print(f" {ANSIColors.CYAN}Memory:{ANSIColors.RESET}") + print(f" Read operations: {memory_reads:,} ({memory_str})") + + # Stale invalidations + stale_invalidations = stats.get('stale_cache_invalidations', 0) + if stale_invalidations > 0: + print(f" {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}") + def sample( pid, @@ -234,7 +324,8 @@ def sample( mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, + collect_stats=realtime_stats, ) 
profiler.realtime_stats = realtime_stats @@ -290,7 +381,8 @@ def sample_live( mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, + collect_stats=realtime_stats, ) profiler.realtime_stats = realtime_stats diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index 7decd8f32d5..2e6e6eaad06 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -1,3 +1,4 @@ +import contextlib import unittest import os import textwrap @@ -2038,5 +2039,766 @@ def busy_thread(): p.stderr.close() +class TestFrameCaching(unittest.TestCase): + """Test that frame caching produces correct results. + + Uses socket-based synchronization for deterministic testing. + All tests verify cache reuse via object identity checks (assertIs). + """ + + maxDiff = None + MAX_TRIES = 10 + + @contextlib.contextmanager + def _target_process(self, script_body): + """Context manager for running a target process with socket sync.""" + port = find_unused_port() + script = f"""\ +import socket +sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +sock.connect(('localhost', {port})) +{textwrap.dedent(script_body)} +""" + + with os_helper.temp_dir() as work_dir: + script_dir = os.path.join(work_dir, "script_pkg") + os.mkdir(script_dir) + + server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server_socket.bind(("localhost", port)) + server_socket.settimeout(SHORT_TIMEOUT) + server_socket.listen(1) + + script_name = _make_test_script(script_dir, "script", script) + client_socket = None + p = None + try: + p = subprocess.Popen([sys.executable, script_name]) + client_socket, _ = server_socket.accept() + server_socket.close() + + def make_unwinder(cache_frames=True): + return RemoteUnwinder(p.pid, all_threads=True, cache_frames=cache_frames) + + yield p, client_socket, make_unwinder + + except PermissionError: + self.skipTest("Insufficient permissions to read the stack trace") + finally: + if client_socket: + client_socket.close() + if p: + p.kill() + p.terminate() + p.wait(timeout=SHORT_TIMEOUT) + + def _wait_for_signal(self, client_socket, signal): + """Block until signal received from target.""" + response = b"" + while signal not in response: + chunk = client_socket.recv(64) + if not chunk: + break + response += chunk + return response + + def _get_frames(self, unwinder, required_funcs): + """Sample and return frame_info list for thread containing required_funcs.""" + traces = unwinder.get_stack_trace() + for interp in traces: + for thread in interp.threads: + funcs = [f.funcname for f in thread.frame_info] + if required_funcs.issubset(set(funcs)): + return thread.frame_info + return None + + def _sample_frames(self, client_socket, unwinder, wait_signal, send_ack, required_funcs, expected_frames=1): + """Wait for signal, sample frames, send ack. 
Returns frame_info list.""" + self._wait_for_signal(client_socket, wait_signal) + # Give at least MAX_TRIES tries for the process to arrive to a steady state + for _ in range(self.MAX_TRIES): + frames = self._get_frames(unwinder, required_funcs) + if frames and len(frames) >= expected_frames: + break + time.sleep(0.1) + client_socket.sendall(send_ack) + return frames + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_hit_same_stack(self): + """Test that consecutive samples reuse cached parent frame objects. + + The current frame (index 0) is always re-read from memory to get + updated line numbers, so it may be a different object. Parent frames + (index 1+) should be identical objects from cache. + """ + script_body = """\ + def level3(): + sock.sendall(b"sync1") + sock.recv(16) + sock.sendall(b"sync2") + sock.recv(16) + sock.sendall(b"sync3") + sock.recv(16) + + def level2(): + level3() + + def level1(): + level2() + + level1() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + expected = {"level1", "level2", "level3"} + + frames1 = self._sample_frames(client_socket, unwinder, b"sync1", b"ack", expected) + frames2 = self._sample_frames(client_socket, unwinder, b"sync2", b"ack", expected) + frames3 = self._sample_frames(client_socket, unwinder, b"sync3", b"done", expected) + + self.assertIsNotNone(frames1) + self.assertIsNotNone(frames2) + self.assertIsNotNone(frames3) + self.assertEqual(len(frames1), len(frames2)) + self.assertEqual(len(frames2), len(frames3)) + + # Current frame (index 0) is always re-read, so check value equality + self.assertEqual(frames1[0].funcname, frames2[0].funcname) + self.assertEqual(frames2[0].funcname, frames3[0].funcname) + + # Parent frames (index 1+) must be identical objects (cache reuse) + for i in range(1, len(frames1)): + f1, f2, f3 = frames1[i], frames2[i], frames3[i] + self.assertIs(f1, f2, f"Frame {i}: samples 1-2 must be same object") + self.assertIs(f2, f3, f"Frame {i}: samples 2-3 must be same object") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_line_number_updates_in_same_frame(self): + """Test that line numbers are correctly updated when execution moves within a function. + + When the profiler samples at different points within the same function, + it must report the correct line number for each sample, not stale cached values. 
+ """ + script_body = """\ + def outer(): + inner() + + def inner(): + sock.sendall(b"line_a"); sock.recv(16) + sock.sendall(b"line_b"); sock.recv(16) + sock.sendall(b"line_c"); sock.recv(16) + sock.sendall(b"line_d"); sock.recv(16) + + outer() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + + frames_a = self._sample_frames(client_socket, unwinder, b"line_a", b"ack", {"inner"}) + frames_b = self._sample_frames(client_socket, unwinder, b"line_b", b"ack", {"inner"}) + frames_c = self._sample_frames(client_socket, unwinder, b"line_c", b"ack", {"inner"}) + frames_d = self._sample_frames(client_socket, unwinder, b"line_d", b"done", {"inner"}) + + self.assertIsNotNone(frames_a) + self.assertIsNotNone(frames_b) + self.assertIsNotNone(frames_c) + self.assertIsNotNone(frames_d) + + # Get the 'inner' frame from each sample (should be index 0) + inner_a = frames_a[0] + inner_b = frames_b[0] + inner_c = frames_c[0] + inner_d = frames_d[0] + + self.assertEqual(inner_a.funcname, "inner") + self.assertEqual(inner_b.funcname, "inner") + self.assertEqual(inner_c.funcname, "inner") + self.assertEqual(inner_d.funcname, "inner") + + # Line numbers must be different and increasing (execution moves forward) + self.assertLess(inner_a.lineno, inner_b.lineno, + "Line B should be after line A") + self.assertLess(inner_b.lineno, inner_c.lineno, + "Line C should be after line B") + self.assertLess(inner_c.lineno, inner_d.lineno, + "Line D should be after line C") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_invalidation_on_return(self): + """Test cache invalidation when stack shrinks (function returns).""" + script_body = """\ + def inner(): + sock.sendall(b"at_inner") + sock.recv(16) + + def outer(): + inner() + sock.sendall(b"at_outer") + sock.recv(16) + + outer() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + + frames_deep = self._sample_frames( + client_socket, unwinder, b"at_inner", b"ack", {"inner", "outer"}) + frames_shallow = self._sample_frames( + client_socket, unwinder, b"at_outer", b"done", {"outer"}) + + self.assertIsNotNone(frames_deep) + self.assertIsNotNone(frames_shallow) + + funcs_deep = [f.funcname for f in frames_deep] + funcs_shallow = [f.funcname for f in frames_shallow] + + self.assertIn("inner", funcs_deep) + self.assertIn("outer", funcs_deep) + self.assertNotIn("inner", funcs_shallow) + self.assertIn("outer", funcs_shallow) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_invalidation_on_call(self): + """Test cache invalidation when stack grows (new function called).""" + script_body = """\ + def deeper(): + sock.sendall(b"at_deeper") + sock.recv(16) + + def middle(): + sock.sendall(b"at_middle") + sock.recv(16) + deeper() + + def top(): + middle() + + top() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + + frames_before = self._sample_frames( + client_socket, unwinder, b"at_middle", b"ack", {"middle", "top"}) + frames_after = self._sample_frames( + client_socket, unwinder, b"at_deeper", b"done", {"deeper", "middle", "top"}) + + 
self.assertIsNotNone(frames_before) + self.assertIsNotNone(frames_after) + + funcs_before = [f.funcname for f in frames_before] + funcs_after = [f.funcname for f in frames_after] + + self.assertIn("middle", funcs_before) + self.assertIn("top", funcs_before) + self.assertNotIn("deeper", funcs_before) + + self.assertIn("deeper", funcs_after) + self.assertIn("middle", funcs_after) + self.assertIn("top", funcs_after) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_partial_stack_reuse(self): + """Test that unchanged bottom frames are reused when top changes (A→B→C to A→B→D).""" + script_body = """\ + def func_c(): + sock.sendall(b"at_c") + sock.recv(16) + + def func_d(): + sock.sendall(b"at_d") + sock.recv(16) + + def func_b(): + func_c() + func_d() + + def func_a(): + func_b() + + func_a() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + + # Sample at C: stack is A→B→C + frames_c = self._sample_frames( + client_socket, unwinder, b"at_c", b"ack", {"func_a", "func_b", "func_c"}) + # Sample at D: stack is A→B→D (C returned, D called) + frames_d = self._sample_frames( + client_socket, unwinder, b"at_d", b"done", {"func_a", "func_b", "func_d"}) + + self.assertIsNotNone(frames_c) + self.assertIsNotNone(frames_d) + + # Find func_a and func_b frames in both samples + def find_frame(frames, funcname): + for f in frames: + if f.funcname == funcname: + return f + return None + + frame_a_in_c = find_frame(frames_c, "func_a") + frame_b_in_c = find_frame(frames_c, "func_b") + frame_a_in_d = find_frame(frames_d, "func_a") + frame_b_in_d = find_frame(frames_d, "func_b") + + self.assertIsNotNone(frame_a_in_c) + self.assertIsNotNone(frame_b_in_c) + self.assertIsNotNone(frame_a_in_d) + self.assertIsNotNone(frame_b_in_d) + + # The bottom frames (A, B) should be the SAME objects (cache reuse) + self.assertIs(frame_a_in_c, frame_a_in_d, "func_a frame should be reused from cache") + self.assertIs(frame_b_in_c, frame_b_in_d, "func_b frame should be reused from cache") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_recursive_frames(self): + """Test caching with same function appearing multiple times (recursion).""" + script_body = """\ + def recurse(n): + if n <= 0: + sock.sendall(b"sync1") + sock.recv(16) + sock.sendall(b"sync2") + sock.recv(16) + else: + recurse(n - 1) + + recurse(5) + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + + frames1 = self._sample_frames( + client_socket, unwinder, b"sync1", b"ack", {"recurse"}) + frames2 = self._sample_frames( + client_socket, unwinder, b"sync2", b"done", {"recurse"}) + + self.assertIsNotNone(frames1) + self.assertIsNotNone(frames2) + + # Should have multiple "recurse" frames (6 total: recurse(5) down to recurse(0)) + recurse_count = sum(1 for f in frames1 if f.funcname == "recurse") + self.assertEqual(recurse_count, 6, "Should have 6 recursive frames") + + self.assertEqual(len(frames1), len(frames2)) + + # Current frame (index 0) is re-read, check value equality + self.assertEqual(frames1[0].funcname, frames2[0].funcname) + + # Parent frames (index 1+) should be identical objects (cache reuse) + for i in range(1, len(frames1)): + 
self.assertIs(frames1[i], frames2[i], + f"Frame {i}: recursive frames must be same object") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_vs_no_cache_equivalence(self): + """Test that cache_frames=True and cache_frames=False produce equivalent results.""" + script_body = """\ + def level3(): + sock.sendall(b"ready"); sock.recv(16) + + def level2(): + level3() + + def level1(): + level2() + + level1() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + self._wait_for_signal(client_socket, b"ready") + + # Sample with cache + unwinder_cache = make_unwinder(cache_frames=True) + frames_cached = self._get_frames(unwinder_cache, {"level1", "level2", "level3"}) + + # Sample without cache + unwinder_no_cache = make_unwinder(cache_frames=False) + frames_no_cache = self._get_frames(unwinder_no_cache, {"level1", "level2", "level3"}) + + client_socket.sendall(b"done") + + self.assertIsNotNone(frames_cached) + self.assertIsNotNone(frames_no_cache) + + # Same number of frames + self.assertEqual(len(frames_cached), len(frames_no_cache)) + + # Same function names in same order + funcs_cached = [f.funcname for f in frames_cached] + funcs_no_cache = [f.funcname for f in frames_no_cache] + self.assertEqual(funcs_cached, funcs_no_cache) + + # Same line numbers + lines_cached = [f.lineno for f in frames_cached] + lines_no_cache = [f.lineno for f in frames_no_cache] + self.assertEqual(lines_cached, lines_no_cache) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_per_thread_isolation(self): + """Test that frame cache is per-thread and cache invalidation works independently.""" + script_body = """\ + import threading + + lock = threading.Lock() + + def sync(msg): + with lock: + sock.sendall(msg + b"\\n") + sock.recv(1) + + # Thread 1 functions + def baz1(): + sync(b"t1:baz1") + + def bar1(): + baz1() + + def blech1(): + sync(b"t1:blech1") + + def foo1(): + bar1() # Goes down to baz1, syncs + blech1() # Returns up, goes down to blech1, syncs + + # Thread 2 functions + def baz2(): + sync(b"t2:baz2") + + def bar2(): + baz2() + + def blech2(): + sync(b"t2:blech2") + + def foo2(): + bar2() # Goes down to baz2, syncs + blech2() # Returns up, goes down to blech2, syncs + + t1 = threading.Thread(target=foo1) + t2 = threading.Thread(target=foo2) + t1.start() + t2.start() + t1.join() + t2.join() + """ + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder = make_unwinder(cache_frames=True) + buffer = b"" + + def recv_msg(): + """Receive a single message from socket.""" + nonlocal buffer + while b"\n" not in buffer: + chunk = client_socket.recv(256) + if not chunk: + return None + buffer += chunk + msg, buffer = buffer.split(b"\n", 1) + return msg + + def get_thread_frames(target_funcs): + """Get frames for thread matching target functions.""" + retries = 0 + for _ in busy_retry(SHORT_TIMEOUT): + if retries >= 5: + break + retries += 1 + # On Windows, ReadProcessMemory can fail with OSError + # (WinError 299) when frame pointers are in flux + with contextlib.suppress(RuntimeError, OSError): + traces = unwinder.get_stack_trace() + for interp in traces: + for thread in interp.threads: + funcs = [f.funcname for f in thread.frame_info] + if any(f in funcs for f in target_funcs): 
+ return funcs + return None + + # Track results for each sync point + results = {} + + # Process 4 sync points: baz1, baz2, blech1, blech2 + # With the lock, threads are serialized - handle one at a time + for _ in range(4): + msg = recv_msg() + self.assertIsNotNone(msg, "Expected message from subprocess") + + # Determine which thread/function and take snapshot + if msg == b"t1:baz1": + funcs = get_thread_frames(["baz1", "bar1", "foo1"]) + self.assertIsNotNone(funcs, "Thread 1 not found at baz1") + results["t1:baz1"] = funcs + elif msg == b"t2:baz2": + funcs = get_thread_frames(["baz2", "bar2", "foo2"]) + self.assertIsNotNone(funcs, "Thread 2 not found at baz2") + results["t2:baz2"] = funcs + elif msg == b"t1:blech1": + funcs = get_thread_frames(["blech1", "foo1"]) + self.assertIsNotNone(funcs, "Thread 1 not found at blech1") + results["t1:blech1"] = funcs + elif msg == b"t2:blech2": + funcs = get_thread_frames(["blech2", "foo2"]) + self.assertIsNotNone(funcs, "Thread 2 not found at blech2") + results["t2:blech2"] = funcs + + # Release thread to continue + client_socket.sendall(b"k") + + # Validate Phase 1: baz snapshots + t1_baz = results.get("t1:baz1") + t2_baz = results.get("t2:baz2") + self.assertIsNotNone(t1_baz, "Missing t1:baz1 snapshot") + self.assertIsNotNone(t2_baz, "Missing t2:baz2 snapshot") + + # Thread 1 at baz1: should have foo1->bar1->baz1 + self.assertIn("baz1", t1_baz) + self.assertIn("bar1", t1_baz) + self.assertIn("foo1", t1_baz) + self.assertNotIn("blech1", t1_baz) + # No cross-contamination + self.assertNotIn("baz2", t1_baz) + self.assertNotIn("bar2", t1_baz) + self.assertNotIn("foo2", t1_baz) + + # Thread 2 at baz2: should have foo2->bar2->baz2 + self.assertIn("baz2", t2_baz) + self.assertIn("bar2", t2_baz) + self.assertIn("foo2", t2_baz) + self.assertNotIn("blech2", t2_baz) + # No cross-contamination + self.assertNotIn("baz1", t2_baz) + self.assertNotIn("bar1", t2_baz) + self.assertNotIn("foo1", t2_baz) + + # Validate Phase 2: blech snapshots (cache invalidation test) + t1_blech = results.get("t1:blech1") + t2_blech = results.get("t2:blech2") + self.assertIsNotNone(t1_blech, "Missing t1:blech1 snapshot") + self.assertIsNotNone(t2_blech, "Missing t2:blech2 snapshot") + + # Thread 1 at blech1: bar1/baz1 should be GONE (cache invalidated) + self.assertIn("blech1", t1_blech) + self.assertIn("foo1", t1_blech) + self.assertNotIn("bar1", t1_blech, "Cache not invalidated: bar1 still present") + self.assertNotIn("baz1", t1_blech, "Cache not invalidated: baz1 still present") + # No cross-contamination + self.assertNotIn("blech2", t1_blech) + + # Thread 2 at blech2: bar2/baz2 should be GONE (cache invalidated) + self.assertIn("blech2", t2_blech) + self.assertIn("foo2", t2_blech) + self.assertNotIn("bar2", t2_blech, "Cache not invalidated: bar2 still present") + self.assertNotIn("baz2", t2_blech, "Cache not invalidated: baz2 still present") + # No cross-contamination + self.assertNotIn("blech1", t2_blech) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_new_unwinder_with_stale_last_profiled_frame(self): + """Test that a new unwinder returns complete stack when cache lookup misses.""" + script_body = """\ + def level4(): + sock.sendall(b"sync1") + sock.recv(16) + sock.sendall(b"sync2") + sock.recv(16) + + def level3(): + level4() + + def level2(): + level3() + + def level1(): + level2() + + level1() + """ + + with 
self._target_process(script_body) as (p, client_socket, make_unwinder): + expected = {"level1", "level2", "level3", "level4"} + + # First unwinder samples - this sets last_profiled_frame in target + unwinder1 = make_unwinder(cache_frames=True) + frames1 = self._sample_frames(client_socket, unwinder1, b"sync1", b"ack", expected) + + # Create NEW unwinder (empty cache) and sample + # The target still has last_profiled_frame set from unwinder1 + unwinder2 = make_unwinder(cache_frames=True) + frames2 = self._sample_frames(client_socket, unwinder2, b"sync2", b"done", expected) + + self.assertIsNotNone(frames1) + self.assertIsNotNone(frames2) + + funcs1 = [f.funcname for f in frames1] + funcs2 = [f.funcname for f in frames2] + + # Both should have all levels + for level in ["level1", "level2", "level3", "level4"]: + self.assertIn(level, funcs1, f"{level} missing from first sample") + self.assertIn(level, funcs2, f"{level} missing from second sample") + + # Should have same stack depth + self.assertEqual(len(frames1), len(frames2), + "New unwinder should return complete stack despite stale last_profiled_frame") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_exhaustion(self): + """Test cache works when frame limit (1024) is exceeded. + + FRAME_CACHE_MAX_FRAMES=1024. With 1100 recursive frames, + the cache can't store all of them but should still work. + """ + # Use 1100 to exceed FRAME_CACHE_MAX_FRAMES=1024 + depth = 1100 + script_body = f"""\ +import sys +sys.setrecursionlimit(2000) + +def recurse(n): + if n <= 0: + sock.sendall(b"ready") + sock.recv(16) # wait for ack + sock.sendall(b"ready2") + sock.recv(16) # wait for done + return + recurse(n - 1) + +recurse({depth}) +""" + + with self._target_process(script_body) as (p, client_socket, make_unwinder): + unwinder_cache = make_unwinder(cache_frames=True) + unwinder_no_cache = make_unwinder(cache_frames=False) + + frames_cached = self._sample_frames( + client_socket, unwinder_cache, b"ready", b"ack", {"recurse"}, expected_frames=1102 + ) + # Sample again with no cache for comparison + frames_no_cache = self._sample_frames( + client_socket, unwinder_no_cache, b"ready2", b"done", {"recurse"}, expected_frames=1102 + ) + + self.assertIsNotNone(frames_cached) + self.assertIsNotNone(frames_no_cache) + + # Both should have many recurse frames (> 1024 limit) + cached_count = [f.funcname for f in frames_cached].count("recurse") + no_cache_count = [f.funcname for f in frames_no_cache].count("recurse") + + self.assertGreater(cached_count, 1000, "Should have >1000 recurse frames") + self.assertGreater(no_cache_count, 1000, "Should have >1000 recurse frames") + + # Both modes should produce same frame count + self.assertEqual(len(frames_cached), len(frames_no_cache), + "Cache exhaustion should not affect stack completeness") + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_get_stats(self): + """Test that get_stats() returns statistics when stats=True.""" + script_body = """\ + sock.sendall(b"ready") + sock.recv(16) + """ + + with self._target_process(script_body) as (p, client_socket, _): + unwinder = RemoteUnwinder(p.pid, all_threads=True, stats=True) + self._wait_for_signal(client_socket, b"ready") + + # Take a sample + unwinder.get_stack_trace() + + stats = unwinder.get_stats() + 
client_socket.sendall(b"done") + + # Verify expected keys exist + expected_keys = [ + 'total_samples', 'frame_cache_hits', 'frame_cache_misses', + 'frame_cache_partial_hits', 'frames_read_from_cache', + 'frames_read_from_memory', 'frame_cache_hit_rate' + ] + for key in expected_keys: + self.assertIn(key, stats) + + self.assertEqual(stats['total_samples'], 1) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_get_stats_disabled_raises(self): + """Test that get_stats() raises RuntimeError when stats=False.""" + script_body = """\ + sock.sendall(b"ready") + sock.recv(16) + """ + + with self._target_process(script_body) as (p, client_socket, _): + unwinder = RemoteUnwinder(p.pid, all_threads=True) # stats=False by default + self._wait_for_signal(client_socket, b"ready") + + with self.assertRaises(RuntimeError): + unwinder.get_stats() + + client_socket.sendall(b"done") + + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst b/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst new file mode 100644 index 00000000000..e24fea416ff --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst @@ -0,0 +1,5 @@ +The ``_remote_debugging`` module now implements frame caching in the +``RemoteUnwinder`` class to reduce memory reads when profiling remote +processes. When ``cache_frames=True``, unchanged portions of the call stack +are reused from previous samples, significantly improving profiling +performance for deep call stacks. diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index b1582c75bda..1be83b45526 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -41,7 +41,7 @@ @MODULE__PICKLE_TRUE@_pickle _pickle.c @MODULE__QUEUE_TRUE@_queue _queuemodule.c @MODULE__RANDOM_TRUE@_random _randommodule.c -@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/threads.c _remote_debugging/asyncio.c +@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c @MODULE__STRUCT_TRUE@_struct _struct.c # build supports subinterpreters diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 70e362ccada..804e2c904e1 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -154,6 +154,39 @@ typedef struct { uintptr_t addr_code_adaptive; } CachedCodeMetadata; +/* Frame cache constants and types */ +#define FRAME_CACHE_MAX_THREADS 32 +#define FRAME_CACHE_MAX_FRAMES 1024 + +typedef struct { + uint64_t thread_id; // 0 = empty slot + uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; + Py_ssize_t num_addrs; + PyObject *frame_list; // owned reference, NULL if empty +} FrameCacheEntry; + +/* Statistics for profiling performance analysis */ +typedef struct { + uint64_t total_samples; // Total number of get_stack_trace calls + uint64_t frame_cache_hits; // Full cache hits (entire stack unchanged) + uint64_t frame_cache_misses; // Cache misses requiring full walk + uint64_t frame_cache_partial_hits; // Partial hits 
(stopped at cached frame) + uint64_t frames_read_from_cache; // Total frames retrieved from cache + uint64_t frames_read_from_memory; // Total frames read from remote memory + uint64_t memory_reads; // Total remote memory read operations + uint64_t memory_bytes_read; // Total bytes read from remote memory + uint64_t code_object_cache_hits; // Code object cache hits + uint64_t code_object_cache_misses; // Code object cache misses + uint64_t stale_cache_invalidations; // Times stale entries were cleared +} UnwinderStats; + +/* Stats tracking macros - no-op when stats collection is disabled */ +#define STATS_INC(unwinder, field) \ + do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0) + +#define STATS_ADD(unwinder, field, val) \ + do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0) + typedef struct { PyTypeObject *RemoteDebugging_Type; PyTypeObject *TaskInfo_Type; @@ -195,7 +228,12 @@ typedef struct { int skip_non_matching_threads; int native; int gc; + int cache_frames; + int collect_stats; // whether to collect statistics + uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale RemoteDebuggingState *cached_state; + FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries + UnwinderStats stats; // statistics for performance analysis #ifdef Py_GIL_DISABLED uint32_t tlbc_generation; _Py_hashtable_t *tlbc_cache; @@ -363,9 +401,45 @@ extern int process_frame_chain( uintptr_t initial_frame_addr, StackChunkList *chunks, PyObject *frame_info, - uintptr_t gc_frame + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + int *stopped_at_cached_frame, + uintptr_t *frame_addrs, + Py_ssize_t *num_addrs, + Py_ssize_t max_addrs ); +/* Frame cache functions */ +extern int frame_cache_init(RemoteUnwinderObject *unwinder); +extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder); +extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id); +extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder); +extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result); +extern int frame_cache_lookup_and_extend( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + uintptr_t last_profiled_frame, + PyObject *frame_info, + uintptr_t *frame_addrs, + Py_ssize_t *num_addrs, + Py_ssize_t max_addrs); +// Returns: 1 = stored, 0 = not stored (graceful), -1 = error +extern int frame_cache_store( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + PyObject *frame_list, + const uintptr_t *addrs, + Py_ssize_t num_addrs); + +extern int collect_frames_with_cache( + RemoteUnwinderObject *unwinder, + uintptr_t frame_addr, + StackChunkList *chunks, + PyObject *frame_info, + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + uint64_t thread_id); + /* ============================================================================ * THREAD FUNCTION DECLARATIONS * ============================================================================ */ diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h index 60adb357e32..03127b753cc 100644 --- a/Modules/_remote_debugging/clinic/module.c.h +++ b/Modules/_remote_debugging/clinic/module.c.h @@ -12,7 +12,7 @@ preserve PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__, "RemoteUnwinder(pid, *, all_threads=False, only_active_thread=False,\n" " mode=0, debug=False, skip_non_matching_threads=True,\n" -" native=False, gc=False)\n" +" 
native=False, gc=False, cache_frames=False, stats=False)\n" "--\n" "\n" "Initialize a new RemoteUnwinder object for debugging a remote Python process.\n" @@ -32,6 +32,10 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__, " non-Python code.\n" " gc: If True, include artificial \"\" frames to denote active garbage\n" " collection.\n" +" cache_frames: If True, enable frame caching optimization to avoid re-reading\n" +" unchanged parent frames between samples.\n" +" stats: If True, collect statistics about cache hits, memory reads, etc.\n" +" Use get_stats() to retrieve the collected statistics.\n" "\n" "The RemoteUnwinder provides functionality to inspect and debug a running Python\n" "process, including examining thread states, stack frames and other runtime data.\n" @@ -48,7 +52,8 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, int only_active_thread, int mode, int debug, int skip_non_matching_threads, - int native, int gc); + int native, int gc, + int cache_frames, int stats); static int _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObject *kwargs) @@ -56,7 +61,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje int return_value = -1; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - #define NUM_KEYWORDS 8 + #define NUM_KEYWORDS 10 static struct { PyGC_Head _this_is_not_used; PyObject_VAR_HEAD @@ -65,7 +70,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje } _kwtuple = { .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) .ob_hash = -1, - .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), &_Py_ID(native), &_Py_ID(gc), }, + .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), &_Py_ID(native), &_Py_ID(gc), &_Py_ID(cache_frames), &_Py_ID(stats), }, }; #undef NUM_KEYWORDS #define KWTUPLE (&_kwtuple.ob_base.ob_base) @@ -74,14 +79,14 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje # define KWTUPLE NULL #endif // !Py_BUILD_CORE - static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", "native", "gc", NULL}; + static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", "native", "gc", "cache_frames", "stats", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, .fname = "RemoteUnwinder", .kwtuple = KWTUPLE, }; #undef KWTUPLE - PyObject *argsbuf[8]; + PyObject *argsbuf[10]; PyObject * const *fastargs; Py_ssize_t nargs = PyTuple_GET_SIZE(args); Py_ssize_t noptargs = nargs + (kwargs ? 
PyDict_GET_SIZE(kwargs) : 0) - 1; @@ -93,6 +98,8 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje int skip_non_matching_threads = 1; int native = 0; int gc = 0; + int cache_frames = 0; + int stats = 0; fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); @@ -160,12 +167,30 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje goto skip_optional_kwonly; } } - gc = PyObject_IsTrue(fastargs[7]); - if (gc < 0) { + if (fastargs[7]) { + gc = PyObject_IsTrue(fastargs[7]); + if (gc < 0) { + goto exit; + } + if (!--noptargs) { + goto skip_optional_kwonly; + } + } + if (fastargs[8]) { + cache_frames = PyObject_IsTrue(fastargs[8]); + if (cache_frames < 0) { + goto exit; + } + if (!--noptargs) { + goto skip_optional_kwonly; + } + } + stats = PyObject_IsTrue(fastargs[9]); + if (stats < 0) { goto exit; } skip_optional_kwonly: - return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads, native, gc); + return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads, native, gc, cache_frames, stats); exit: return return_value; @@ -347,4 +372,51 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace(PyObject *self, PyObject return return_value; } -/*[clinic end generated code: output=99fed5c94cf36881 input=a9049054013a1b77]*/ + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__, +"get_stats($self, /)\n" +"--\n" +"\n" +"Get collected statistics about profiling performance.\n" +"\n" +"Returns a dictionary containing statistics about cache performance,\n" +"memory reads, and other profiling metrics. 
Only available if the\n" +"RemoteUnwinder was created with stats=True.\n" +"\n" +"Returns:\n" +" dict: A dictionary containing:\n" +" - total_samples: Total number of get_stack_trace calls\n" +" - frame_cache_hits: Full cache hits (entire stack unchanged)\n" +" - frame_cache_misses: Cache misses requiring full walk\n" +" - frame_cache_partial_hits: Partial hits (stopped at cached frame)\n" +" - frames_read_from_cache: Total frames retrieved from cache\n" +" - frames_read_from_memory: Total frames read from remote memory\n" +" - memory_reads: Total remote memory read operations\n" +" - memory_bytes_read: Total bytes read from remote memory\n" +" - code_object_cache_hits: Code object cache hits\n" +" - code_object_cache_misses: Code object cache misses\n" +" - stale_cache_invalidations: Times stale cache entries were cleared\n" +" - frame_cache_hit_rate: Percentage of samples that hit the cache\n" +" - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n" +"\n" +"Raises:\n" +" RuntimeError: If stats collection was not enabled (stats=False)"); + +#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF \ + {"get_stats", (PyCFunction)_remote_debugging_RemoteUnwinder_get_stats, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_stats__doc__}, + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self); + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + PyObject *return_value = NULL; + + Py_BEGIN_CRITICAL_SECTION(self); + return_value = _remote_debugging_RemoteUnwinder_get_stats_impl((RemoteUnwinderObject *)self); + Py_END_CRITICAL_SECTION(); + + return return_value; +} +/*[clinic end generated code: output=f1fd6c1d4c4c7254 input=a9049054013a1b77]*/ diff --git a/Modules/_remote_debugging/code_objects.c b/Modules/_remote_debugging/code_objects.c index ea3f00c802b..2cd2505d0f9 100644 --- a/Modules/_remote_debugging/code_objects.c +++ b/Modules/_remote_debugging/code_objects.c @@ -257,6 +257,11 @@ parse_code_object(RemoteUnwinderObject *unwinder, if (unwinder && unwinder->code_object_cache != NULL) { meta = _Py_hashtable_get(unwinder->code_object_cache, key); + if (meta) { + STATS_INC(unwinder, code_object_cache_hits); + } else { + STATS_INC(unwinder, code_object_cache_misses); + } } if (meta == NULL) { diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c new file mode 100644 index 00000000000..4598b9dc353 --- /dev/null +++ b/Modules/_remote_debugging/frame_cache.c @@ -0,0 +1,236 @@ +/****************************************************************************** + * Remote Debugging Module - Frame Cache + * + * This file contains functions for caching frame information to optimize + * repeated stack unwinding for profiling. + ******************************************************************************/ + +#include "_remote_debugging.h" + +/* ============================================================================ + * FRAME CACHE - stores (address, frame_info) pairs per thread + * Uses preallocated fixed-size arrays for efficiency and bounded memory. 
+ * ============================================================================ */ + +int +frame_cache_init(RemoteUnwinderObject *unwinder) +{ + unwinder->frame_cache = PyMem_Calloc(FRAME_CACHE_MAX_THREADS, sizeof(FrameCacheEntry)); + if (!unwinder->frame_cache) { + PyErr_NoMemory(); + return -1; + } + return 0; +} + +void +frame_cache_cleanup(RemoteUnwinderObject *unwinder) +{ + if (!unwinder->frame_cache) { + return; + } + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + Py_CLEAR(unwinder->frame_cache[i].frame_list); + } + PyMem_Free(unwinder->frame_cache); + unwinder->frame_cache = NULL; +} + +// Find cache entry by thread_id +FrameCacheEntry * +frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id) +{ + if (!unwinder->frame_cache || thread_id == 0) { + return NULL; + } + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == thread_id) { + return &unwinder->frame_cache[i]; + } + } + return NULL; +} + +// Allocate a cache slot for a thread +// Returns NULL if cache is full (graceful degradation) +static FrameCacheEntry * +frame_cache_alloc_slot(RemoteUnwinderObject *unwinder, uint64_t thread_id) +{ + if (!unwinder->frame_cache || thread_id == 0) { + return NULL; + } + // First check if thread already has an entry + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == thread_id) { + return &unwinder->frame_cache[i]; + } + } + // Find empty slot + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == 0) { + return &unwinder->frame_cache[i]; + } + } + // Cache full - graceful degradation + return NULL; +} + +// Remove cache entries for threads not seen in the result +// result structure: list of InterpreterInfo, where InterpreterInfo[1] is threads list, +// and ThreadInfo[0] is the thread_id +void +frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result) +{ + if (!unwinder->frame_cache || !result || !PyList_Check(result)) { + return; + } + + // Build array of seen thread IDs from result + uint64_t seen_threads[FRAME_CACHE_MAX_THREADS]; + int num_seen = 0; + + Py_ssize_t num_interps = PyList_GET_SIZE(result); + for (Py_ssize_t i = 0; i < num_interps && num_seen < FRAME_CACHE_MAX_THREADS; i++) { + PyObject *interp_info = PyList_GET_ITEM(result, i); + PyObject *threads = PyStructSequence_GetItem(interp_info, 1); + if (!threads || !PyList_Check(threads)) { + continue; + } + Py_ssize_t num_threads = PyList_GET_SIZE(threads); + for (Py_ssize_t j = 0; j < num_threads && num_seen < FRAME_CACHE_MAX_THREADS; j++) { + PyObject *thread_info = PyList_GET_ITEM(threads, j); + PyObject *tid_obj = PyStructSequence_GetItem(thread_info, 0); + if (tid_obj) { + uint64_t tid = PyLong_AsUnsignedLongLong(tid_obj); + if (!PyErr_Occurred()) { + seen_threads[num_seen++] = tid; + } else { + PyErr_Clear(); + } + } + } + } + + // Invalidate entries not in seen list + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == 0) { + continue; + } + int found = 0; + for (int j = 0; j < num_seen; j++) { + if (unwinder->frame_cache[i].thread_id == seen_threads[j]) { + found = 1; + break; + } + } + if (!found) { + // Clear this entry + Py_CLEAR(unwinder->frame_cache[i].frame_list); + unwinder->frame_cache[i].thread_id = 0; + unwinder->frame_cache[i].num_addrs = 0; + STATS_INC(unwinder, stale_cache_invalidations); + } + } +} + +// Find last_profiled_frame in cache and extend frame_info with cached continuation +// If 
frame_addrs is provided (not NULL), also extends it with cached addresses +int +frame_cache_lookup_and_extend( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + uintptr_t last_profiled_frame, + PyObject *frame_info, + uintptr_t *frame_addrs, + Py_ssize_t *num_addrs, + Py_ssize_t max_addrs) +{ + if (!unwinder->frame_cache || last_profiled_frame == 0) { + return 0; + } + + FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id); + if (!entry || !entry->frame_list) { + return 0; + } + + // Find the index where last_profiled_frame matches + Py_ssize_t start_idx = -1; + for (Py_ssize_t i = 0; i < entry->num_addrs; i++) { + if (entry->addrs[i] == last_profiled_frame) { + start_idx = i; + break; + } + } + + if (start_idx < 0) { + return 0; // Not found + } + + Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list); + + // Extend frame_info with frames from start_idx onwards + PyObject *slice = PyList_GetSlice(entry->frame_list, start_idx, num_frames); + if (!slice) { + return -1; + } + + Py_ssize_t cur_size = PyList_GET_SIZE(frame_info); + int result = PyList_SetSlice(frame_info, cur_size, cur_size, slice); + Py_DECREF(slice); + + if (result < 0) { + return -1; + } + + // Also extend frame_addrs with cached addresses if provided + if (frame_addrs) { + for (Py_ssize_t i = start_idx; i < entry->num_addrs && *num_addrs < max_addrs; i++) { + frame_addrs[(*num_addrs)++] = entry->addrs[i]; + } + } + + return 1; +} + +// Store frame list with addresses in cache +// Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error +int +frame_cache_store( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + PyObject *frame_list, + const uintptr_t *addrs, + Py_ssize_t num_addrs) +{ + if (!unwinder->frame_cache || thread_id == 0) { + return 0; + } + + // Clamp to max frames + if (num_addrs > FRAME_CACHE_MAX_FRAMES) { + num_addrs = FRAME_CACHE_MAX_FRAMES; + } + + FrameCacheEntry *entry = frame_cache_alloc_slot(unwinder, thread_id); + if (!entry) { + // Cache full - graceful degradation + return 0; + } + + // Clear old frame_list if replacing + Py_CLEAR(entry->frame_list); + + // Store full frame list (don't truncate to num_addrs - frames beyond the + // address array limit are still valid and needed for full cache hits) + Py_ssize_t num_frames = PyList_GET_SIZE(frame_list); + entry->frame_list = PyList_GetSlice(frame_list, 0, num_frames); + if (!entry->frame_list) { + return -1; + } + entry->thread_id = thread_id; + memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t)); + entry->num_addrs = num_addrs; + + return 1; +} diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index d60caadcb9a..b77c0ca556d 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -189,6 +189,8 @@ parse_frame_object( set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame"); return -1; } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable); @@ -258,14 +260,39 @@ process_frame_chain( uintptr_t initial_frame_addr, StackChunkList *chunks, PyObject *frame_info, - uintptr_t gc_frame) + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + int *stopped_at_cached_frame, + uintptr_t *frame_addrs, // optional: C array to receive frame addresses 
+ Py_ssize_t *num_addrs, // in/out: current count / updated count + Py_ssize_t max_addrs) // max capacity of frame_addrs array { uintptr_t frame_addr = initial_frame_addr; uintptr_t prev_frame_addr = 0; - const size_t MAX_FRAMES = 1024; + const size_t MAX_FRAMES = 1024 + 512; size_t frame_count = 0; + // Initialize output flag + if (stopped_at_cached_frame) { + *stopped_at_cached_frame = 0; + } + + // Quick check: if current_frame == last_profiled_frame, entire stack is unchanged + if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) { + if (stopped_at_cached_frame) { + *stopped_at_cached_frame = 1; + } + return 0; + } + while ((void*)frame_addr != NULL) { + // Check if we've reached the cached frame - if so, stop here + if (last_profiled_frame != 0 && frame_addr == last_profiled_frame) { + if (stopped_at_cached_frame) { + *stopped_at_cached_frame = 1; + } + break; + } PyObject *frame = NULL; uintptr_t next_frame_addr = 0; uintptr_t stackpointer = 0; @@ -286,7 +313,6 @@ process_frame_chain( } } if (frame == NULL && PyList_GET_SIZE(frame_info) == 0) { - // If the first frame is missing, the chain is broken: const char *e = "Failed to parse initial frame in chain"; PyErr_SetString(PyExc_RuntimeError, e); return -1; @@ -310,36 +336,40 @@ process_frame_chain( extra_frame = &_Py_STR(native); } if (extra_frame) { - // Use "~" as file and 0 as line, since that's what pstats uses: PyObject *extra_frame_info = make_frame_info( unwinder, _Py_LATIN1_CHR('~'), _PyLong_GetZero(), extra_frame); if (extra_frame_info == NULL) { return -1; } - int error = PyList_Append(frame_info, extra_frame_info); - Py_DECREF(extra_frame_info); - if (error) { - const char *e = "Failed to append extra frame to frame info list"; - set_exception_cause(unwinder, PyExc_RuntimeError, e); + if (PyList_Append(frame_info, extra_frame_info) < 0) { + Py_DECREF(extra_frame_info); + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append extra frame"); return -1; } + // Extra frames use 0 as address (they're synthetic) + if (frame_addrs && *num_addrs < max_addrs) { + frame_addrs[(*num_addrs)++] = 0; + } + Py_DECREF(extra_frame_info); } if (frame) { if (prev_frame_addr && frame_addr != prev_frame_addr) { const char *f = "Broken frame chain: expected frame at 0x%lx, got 0x%lx"; PyErr_Format(PyExc_RuntimeError, f, prev_frame_addr, frame_addr); Py_DECREF(frame); - const char *e = "Frame chain consistency check failed"; - set_exception_cause(unwinder, PyExc_RuntimeError, e); + set_exception_cause(unwinder, PyExc_RuntimeError, "Frame chain consistency check failed"); return -1; } - if (PyList_Append(frame_info, frame) == -1) { + if (PyList_Append(frame_info, frame) < 0) { Py_DECREF(frame); - const char *e = "Failed to append frame to frame info list"; - set_exception_cause(unwinder, PyExc_RuntimeError, e); + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append frame"); return -1; } + // Track the address for this frame + if (frame_addrs && *num_addrs < max_addrs) { + frame_addrs[(*num_addrs)++] = frame_addr; + } Py_DECREF(frame); } @@ -349,3 +379,208 @@ process_frame_chain( return 0; } + +// Clear last_profiled_frame for all threads in the target process. +// This must be called at the start of profiling to avoid stale values +// from previous profilers causing us to stop frame walking early. 
+int +clear_last_profiled_frames(RemoteUnwinderObject *unwinder) +{ + uintptr_t current_interp = unwinder->interpreter_addr; + uintptr_t zero = 0; + + while (current_interp != 0) { + // Get first thread in this interpreter + uintptr_t tstate_addr; + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + current_interp + unwinder->debug_offsets.interpreter_state.threads_head, + sizeof(void*), + &tstate_addr) < 0) { + // Non-fatal: just skip clearing + PyErr_Clear(); + return 0; + } + + // Iterate all threads in this interpreter + while (tstate_addr != 0) { + // Clear last_profiled_frame + uintptr_t lpf_addr = tstate_addr + unwinder->debug_offsets.thread_state.last_profiled_frame; + if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, + sizeof(uintptr_t), &zero) < 0) { + // Non-fatal: just continue + PyErr_Clear(); + } + + // Move to next thread + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + tstate_addr + unwinder->debug_offsets.thread_state.next, + sizeof(void*), + &tstate_addr) < 0) { + PyErr_Clear(); + break; + } + } + + // Move to next interpreter + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + current_interp + unwinder->debug_offsets.interpreter_state.next, + sizeof(void*), + &current_interp) < 0) { + PyErr_Clear(); + break; + } + } + + return 0; +} + +// Fast path: check if we have a full cache hit (parent stack unchanged) +// A "full hit" means current frame == last profiled frame, so we can reuse +// cached parent frames. We always read the current frame from memory to get +// updated line numbers (the line within a frame can change between samples). +// Returns: 1 if full hit (frame_info populated with current frame + cached parents), +// 0 if miss, -1 on error +static int +try_full_cache_hit( + RemoteUnwinderObject *unwinder, + uintptr_t frame_addr, + uintptr_t last_profiled_frame, + uint64_t thread_id, + PyObject *frame_info) +{ + if (!unwinder->frame_cache || last_profiled_frame == 0) { + return 0; + } + // Full hit only if current frame == last profiled frame + if (frame_addr != last_profiled_frame) { + return 0; + } + + FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id); + if (!entry || !entry->frame_list) { + return 0; + } + + // Verify first address matches (sanity check) + if (entry->num_addrs == 0 || entry->addrs[0] != frame_addr) { + return 0; + } + + // Always read the current frame from memory to get updated line number + PyObject *current_frame = NULL; + uintptr_t code_object_addr = 0; + uintptr_t previous_frame = 0; + int parse_result = parse_frame_object(unwinder, &current_frame, frame_addr, + &code_object_addr, &previous_frame); + if (parse_result < 0) { + return -1; + } + + // Get cached parent frames first (before modifying frame_info) + Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list); + PyObject *parent_slice = NULL; + if (cached_size > 1) { + parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size); + if (!parent_slice) { + Py_XDECREF(current_frame); + return -1; + } + } + + // Now safe to modify frame_info - add current frame if valid + if (current_frame != NULL) { + if (PyList_Append(frame_info, current_frame) < 0) { + Py_DECREF(current_frame); + Py_XDECREF(parent_slice); + return -1; + } + Py_DECREF(current_frame); + STATS_ADD(unwinder, frames_read_from_memory, 1); + } + + // Extend with cached parent frames + if (parent_slice) { + Py_ssize_t cur_size = PyList_GET_SIZE(frame_info); + int result = PyList_SetSlice(frame_info, cur_size, cur_size, parent_slice); +
Py_DECREF(parent_slice); + if (result < 0) { + return -1; + } + STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1); + } + + STATS_INC(unwinder, frame_cache_hits); + return 1; +} + +// High-level helper: collect frames with cache optimization +// Returns complete frame_info list, handling all cache logic internally +int +collect_frames_with_cache( + RemoteUnwinderObject *unwinder, + uintptr_t frame_addr, + StackChunkList *chunks, + PyObject *frame_info, + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + uint64_t thread_id) +{ + // Fast path: check for full cache hit first (no allocations needed) + int full_hit = try_full_cache_hit(unwinder, frame_addr, last_profiled_frame, + thread_id, frame_info); + if (full_hit != 0) { + return full_hit < 0 ? -1 : 0; // Either error or success + } + + uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; + Py_ssize_t num_addrs = 0; + Py_ssize_t frames_before = PyList_GET_SIZE(frame_info); + + int stopped_at_cached = 0; + if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, gc_frame, + last_profiled_frame, &stopped_at_cached, + addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { + return -1; + } + + // Track frames read from memory (frames added by process_frame_chain) + STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before); + + // If stopped at cached frame, extend with cached continuation (both frames and addresses) + if (stopped_at_cached) { + Py_ssize_t frames_before_cache = PyList_GET_SIZE(frame_info); + int cache_result = frame_cache_lookup_and_extend(unwinder, thread_id, last_profiled_frame, + frame_info, addrs, &num_addrs, + FRAME_CACHE_MAX_FRAMES); + if (cache_result < 0) { + return -1; + } + if (cache_result == 0) { + // Cache miss - continue walking from last_profiled_frame to get the rest + STATS_INC(unwinder, frame_cache_misses); + Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info); + if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, gc_frame, + 0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { + return -1; + } + STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before_walk); + } else { + // Partial cache hit + STATS_INC(unwinder, frame_cache_partial_hits); + STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - frames_before_cache); + } + } else if (last_profiled_frame == 0) { + // No cache involvement (no last_profiled_frame or cache disabled) + STATS_INC(unwinder, frame_cache_misses); + } + + // Store in cache (frame_cache_store handles truncation if num_addrs > FRAME_CACHE_MAX_FRAMES) + if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) { + return -1; + } + + return 0; +} diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 6cd9fad37de..123e4f5c4d7 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -235,6 +235,8 @@ _remote_debugging.RemoteUnwinder.__init__ skip_non_matching_threads: bool = True native: bool = False gc: bool = False + cache_frames: bool = False + stats: bool = False Initialize a new RemoteUnwinder object for debugging a remote Python process. @@ -253,6 +255,10 @@ Initialize a new RemoteUnwinder object for debugging a remote Python process. non-Python code. gc: If True, include artificial "" frames to denote active garbage collection. + cache_frames: If True, enable frame caching optimization to avoid re-reading + unchanged parent frames between samples. 
+ stats: If True, collect statistics about cache hits, memory reads, etc. + Use get_stats() to retrieve the collected statistics. The RemoteUnwinder provides functionality to inspect and debug a running Python process, including examining thread states, stack frames and other runtime data. @@ -270,8 +276,9 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, int only_active_thread, int mode, int debug, int skip_non_matching_threads, - int native, int gc) -/*[clinic end generated code: output=e9eb6b4df119f6e0 input=606d099059207df2]*/ + int native, int gc, + int cache_frames, int stats) +/*[clinic end generated code: output=b34ef8cce013c975 input=df2221ef114c3d6a]*/ { // Validate that all_threads and only_active_thread are not both True if (all_threads && only_active_thread) { @@ -283,18 +290,24 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, #ifdef Py_GIL_DISABLED if (only_active_thread) { PyErr_SetString(PyExc_ValueError, - "only_active_thread is not supported when Py_GIL_DISABLED is not defined"); + "only_active_thread is not supported in free-threaded builds"); return -1; } #endif self->native = native; self->gc = gc; + self->cache_frames = cache_frames; + self->collect_stats = stats; + self->stale_invalidation_counter = 0; self->debug = debug; self->only_active_thread = only_active_thread; self->mode = mode; self->skip_non_matching_threads = skip_non_matching_threads; self->cached_state = NULL; + self->frame_cache = NULL; + // Initialize stats to zero + memset(&self->stats, 0, sizeof(self->stats)); if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) { set_exception_cause(self, PyExc_RuntimeError, "Failed to initialize process handle"); return -1; @@ -375,6 +388,16 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, self->win_process_buffer_size = 0; #endif + if (cache_frames && frame_cache_init(self) < 0) { + return -1; + } + + // Clear stale last_profiled_frame values from previous profilers + // This prevents us from stopping frame walking early due to stale values + if (cache_frames) { + clear_last_profiled_frames(self); + } + return 0; } @@ -429,6 +452,8 @@ static PyObject * _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self) /*[clinic end generated code: output=666192b90c69d567 input=bcff01c73cccc1c0]*/ { + STATS_INC(self, total_samples); + PyObject* result = PyList_New(0); if (!result) { set_exception_cause(self, PyExc_MemoryError, "Failed to create stack trace result list"); @@ -591,7 +616,15 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self } exit: - _Py_RemoteDebug_ClearCache(&self->handle); + // Invalidate cache entries for threads not seen in this sample. + // Only do this every 1024 iterations to avoid performance overhead. + if (self->cache_frames && result) { + if (++self->stale_invalidation_counter >= 1024) { + self->stale_invalidation_counter = 0; + frame_cache_invalidate_stale(self, result); + } + } + _Py_RemoteDebug_ClearCache(&self->handle); return result; } @@ -757,10 +790,114 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject return NULL; } +/*[clinic input] +@permit_long_docstring_body +@critical_section +_remote_debugging.RemoteUnwinder.get_stats + +Get collected statistics about profiling performance. + +Returns a dictionary containing statistics about cache performance, +memory reads, and other profiling metrics. Only available if the +RemoteUnwinder was created with stats=True. 
+ +Returns: + dict: A dictionary containing: + - total_samples: Total number of get_stack_trace calls + - frame_cache_hits: Full cache hits (entire stack unchanged) + - frame_cache_misses: Cache misses requiring full walk + - frame_cache_partial_hits: Partial hits (stopped at cached frame) + - frames_read_from_cache: Total frames retrieved from cache + - frames_read_from_memory: Total frames read from remote memory + - memory_reads: Total remote memory read operations + - memory_bytes_read: Total bytes read from remote memory + - code_object_cache_hits: Code object cache hits + - code_object_cache_misses: Code object cache misses + - stale_cache_invalidations: Times stale cache entries were cleared + - frame_cache_hit_rate: Percentage of samples that hit the cache + - code_object_cache_hit_rate: Percentage of code object lookups that hit cache + +Raises: + RuntimeError: If stats collection was not enabled (stats=False) +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) +/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/ +{ + if (!self->collect_stats) { + PyErr_SetString(PyExc_RuntimeError, + "Statistics collection was not enabled. " + "Create RemoteUnwinder with stats=True to collect statistics."); + return NULL; + } + + PyObject *result = PyDict_New(); + if (!result) { + return NULL; + } + +#define ADD_STAT(name) do { \ + PyObject *val = PyLong_FromUnsignedLongLong(self->stats.name); \ + if (!val || PyDict_SetItemString(result, #name, val) < 0) { \ + Py_XDECREF(val); \ + Py_DECREF(result); \ + return NULL; \ + } \ + Py_DECREF(val); \ +} while(0) + + ADD_STAT(total_samples); + ADD_STAT(frame_cache_hits); + ADD_STAT(frame_cache_misses); + ADD_STAT(frame_cache_partial_hits); + ADD_STAT(frames_read_from_cache); + ADD_STAT(frames_read_from_memory); + ADD_STAT(memory_reads); + ADD_STAT(memory_bytes_read); + ADD_STAT(code_object_cache_hits); + ADD_STAT(code_object_cache_misses); + ADD_STAT(stale_cache_invalidations); + +#undef ADD_STAT + + // Calculate and add derived statistics + // Hit rate is calculated as (hits + partial_hits) / total_cache_lookups + double frame_cache_hit_rate = 0.0; + uint64_t total_cache_lookups = self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits + self->stats.frame_cache_misses; + if (total_cache_lookups > 0) { + frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits) + / (double)total_cache_lookups; + } + PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate); + if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) { + Py_XDECREF(hit_rate); + Py_DECREF(result); + return NULL; + } + Py_DECREF(hit_rate); + + double code_object_hit_rate = 0.0; + uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses; + if (total_code_lookups > 0) { + code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups; + } + PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate); + if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) { + Py_XDECREF(code_hit_rate); + Py_DECREF(result); + return NULL; + } + Py_DECREF(code_hit_rate); + + return result; +} + static PyMethodDef RemoteUnwinder_methods[] = { _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STACK_TRACE_METHODDEF _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ALL_AWAITED_BY_METHODDEF 
_REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ASYNC_STACK_TRACE_METHODDEF + _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF {NULL, NULL} }; @@ -787,6 +924,7 @@ RemoteUnwinder_dealloc(PyObject *op) _Py_RemoteDebug_ClearCache(&self->handle); _Py_RemoteDebug_CleanupProcHandle(&self->handle); } + frame_cache_cleanup(self); PyObject_Del(self); Py_DECREF(tp); } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 99147b01a1b..ce013f902d1 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -296,6 +296,8 @@ unwind_stack_for_thread( set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); goto error; } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size); long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id); @@ -309,6 +311,8 @@ unwind_stack_for_thread( set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read GC state"); goto error; } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.gc.size); // Calculate thread status using flags (always) int status_flags = 0; @@ -383,14 +387,36 @@ unwind_stack_for_thread( goto error; } - if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks"); - goto error; + // In cache mode, copying stack chunks is more expensive than direct memory reads + if (!unwinder->cache_frames) { + if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks"); + goto error; + } } - if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, gc_frame) < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain"); - goto error; + if (unwinder->cache_frames) { + // Use cache to avoid re-reading unchanged parent frames + uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts, + unwinder->debug_offsets.thread_state.last_profiled_frame); + if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info, + gc_frame, last_profiled_frame, tid) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames"); + goto error; + } + // Update last_profiled_frame for next sample + uintptr_t lpf_addr = *current_tstate + unwinder->debug_offsets.thread_state.last_profiled_frame; + if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, + sizeof(uintptr_t), &frame_addr) < 0) { + PyErr_Clear(); // Non-fatal + } + } else { + // No caching - process entire frame chain + if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, + gc_frame, 0, NULL, NULL, NULL, 0) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain"); + goto error; + } } *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next); diff --git a/PCbuild/_remote_debugging.vcxproj b/PCbuild/_remote_debugging.vcxproj index 3ef34ef0563..c91c9cf3652 100644 --- a/PCbuild/_remote_debugging.vcxproj +++ b/PCbuild/_remote_debugging.vcxproj @@ -102,6 +102,7 @@ + diff --git a/PCbuild/_remote_debugging.vcxproj.filters b/PCbuild/_remote_debugging.vcxproj.filters index 5c117a79f3b..b37a2c5575c 100644 --- a/PCbuild/_remote_debugging.vcxproj.filters +++ b/PCbuild/_remote_debugging.vcxproj.filters @@ -24,6 +24,9 @@ Source Files + + Source Files + Source Files diff --git 
a/Python/ceval.c b/Python/ceval.c index aadc6369cbe..382ae210ebb 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -2288,6 +2288,16 @@ clear_gen_frame(PyThreadState *tstate, _PyInterpreterFrame * frame) void _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame * frame) { + // Update last_profiled_frame for remote profiler frame caching. + // By this point, tstate->current_frame is already set to the parent frame. + // Only update if we're popping the exact frame that was last profiled. + // This avoids corrupting the cache when transient frames (called and returned + // between profiler samples) update last_profiled_frame to addresses the + // profiler never saw. + if (tstate->last_profiled_frame != NULL && tstate->last_profiled_frame == frame) { + tstate->last_profiled_frame = tstate->current_frame; + } + if (frame->owner == FRAME_OWNED_BY_THREAD) { clear_thread_frame(tstate, frame); } diff --git a/Python/remote_debug.h b/Python/remote_debug.h index 517568358a0..1c02870d3af 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -1102,6 +1102,115 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address #endif } +#if defined(__linux__) && HAVE_PROCESS_VM_READV +// Fallback write using /proc/pid/mem +static int +_Py_RemoteDebug_WriteRemoteMemoryFallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) +{ + if (handle->memfd == -1) { + if (open_proc_mem_fd(handle) < 0) { + return -1; + } + } + + struct iovec local[1]; + Py_ssize_t result = 0; + Py_ssize_t written = 0; + + do { + local[0].iov_base = (char*)src + result; + local[0].iov_len = len - result; + off_t offset = remote_address + result; + + written = pwritev(handle->memfd, local, 1, offset); + if (written < 0) { + PyErr_SetFromErrno(PyExc_OSError); + return -1; + } + + result += written; + } while ((size_t)written != local[0].iov_len); + return 0; +} +#endif // __linux__ + +// Platform-independent memory write function +UNUSED static int +_Py_RemoteDebug_WriteRemoteMemory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) +{ +#ifdef MS_WINDOWS + SIZE_T written = 0; + SIZE_T result = 0; + do { + if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) { + PyErr_SetFromWindowsErr(0); + DWORD error = GetLastError(); + _set_debug_exception_cause(PyExc_OSError, + "WriteProcessMemory failed for PID %d at address 0x%lx " + "(size %zu, partial write %zu bytes): Windows error %lu", + handle->pid, remote_address + result, len - result, result, error); + return -1; + } + result += written; + } while (result < len); + return 0; +#elif defined(__linux__) && HAVE_PROCESS_VM_READV + if (handle->memfd != -1) { + return _Py_RemoteDebug_WriteRemoteMemoryFallback(handle, remote_address, len, src); + } + struct iovec local[1]; + struct iovec remote[1]; + Py_ssize_t result = 0; + Py_ssize_t written = 0; + + do { + local[0].iov_base = (void*)((char*)src + result); + local[0].iov_len = len - result; + remote[0].iov_base = (void*)((char*)remote_address + result); + remote[0].iov_len = len - result; + + written = process_vm_writev(handle->pid, local, 1, remote, 1, 0); + if (written < 0) { + if (errno == ENOSYS) { + return _Py_RemoteDebug_WriteRemoteMemoryFallback(handle, remote_address, len, src); + } + PyErr_SetFromErrno(PyExc_OSError); + _set_debug_exception_cause(PyExc_OSError, + "process_vm_writev failed for PID %d at address 0x%lx " + "(size %zu, partial write %zd bytes): 
%s", + handle->pid, remote_address + result, len - result, result, strerror(errno)); + return -1; + } + + result += written; + } while ((size_t)written != local[0].iov_len); + return 0; +#elif defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX + kern_return_t kr = mach_vm_write( + handle->task, + (mach_vm_address_t)remote_address, + (vm_offset_t)src, + (mach_msg_type_number_t)len); + + if (kr != KERN_SUCCESS) { + switch (kr) { + case KERN_PROTECTION_FAILURE: + PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory"); + break; + case KERN_INVALID_ARGUMENT: + PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write"); + break; + default: + PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr); + } + return -1; + } + return 0; +#else + Py_UNREACHABLE(); +#endif +} + UNUSED static int _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, uintptr_t addr, diff --git a/Python/remote_debugging.c b/Python/remote_debugging.c index 71ffb17ed68..5b50b95db94 100644 --- a/Python/remote_debugging.c +++ b/Python/remote_debugging.c @@ -24,104 +24,11 @@ read_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, void* d return _Py_RemoteDebug_ReadRemoteMemory(handle, remote_address, len, dst); } -// Why is pwritev not guarded? Except on Android API level 23 (no longer -// supported), HAVE_PROCESS_VM_READV is sufficient. -#if defined(__linux__) && HAVE_PROCESS_VM_READV -static int -write_memory_fallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) -{ - if (handle->memfd == -1) { - if (open_proc_mem_fd(handle) < 0) { - return -1; - } - } - - struct iovec local[1]; - Py_ssize_t result = 0; - Py_ssize_t written = 0; - - do { - local[0].iov_base = (char*)src + result; - local[0].iov_len = len - result; - off_t offset = remote_address + result; - - written = pwritev(handle->memfd, local, 1, offset); - if (written < 0) { - PyErr_SetFromErrno(PyExc_OSError); - return -1; - } - - result += written; - } while ((size_t)written != local[0].iov_len); - return 0; -} -#endif // __linux__ - +// Use the shared write function from remote_debug.h static int write_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) { -#ifdef MS_WINDOWS - SIZE_T written = 0; - SIZE_T result = 0; - do { - if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) { - PyErr_SetFromWindowsErr(0); - return -1; - } - result += written; - } while (result < len); - return 0; -#elif defined(__linux__) && HAVE_PROCESS_VM_READV - if (handle->memfd != -1) { - return write_memory_fallback(handle, remote_address, len, src); - } - struct iovec local[1]; - struct iovec remote[1]; - Py_ssize_t result = 0; - Py_ssize_t written = 0; - - do { - local[0].iov_base = (void*)((char*)src + result); - local[0].iov_len = len - result; - remote[0].iov_base = (void*)((char*)remote_address + result); - remote[0].iov_len = len - result; - - written = process_vm_writev(handle->pid, local, 1, remote, 1, 0); - if (written < 0) { - if (errno == ENOSYS) { - return write_memory_fallback(handle, remote_address, len, src); - } - PyErr_SetFromErrno(PyExc_OSError); - return -1; - } - - result += written; - } while ((size_t)written != local[0].iov_len); - return 0; -#elif defined(__APPLE__) && TARGET_OS_OSX - kern_return_t kr = mach_vm_write( - pid_to_task(handle->pid), - (mach_vm_address_t)remote_address, - (vm_offset_t)src, - (mach_msg_type_number_t)len); 
- - if (kr != KERN_SUCCESS) { - switch (kr) { - case KERN_PROTECTION_FAILURE: - PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory"); - break; - case KERN_INVALID_ARGUMENT: - PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write"); - break; - default: - PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr); - } - return -1; - } - return 0; -#else - Py_UNREACHABLE(); -#endif + return _Py_RemoteDebug_WriteRemoteMemory(handle, remote_address, len, src); } static int diff --git a/Tools/inspection/benchmark_external_inspection.py b/Tools/inspection/benchmark_external_inspection.py index 0ac7ac4d385..9c40c2f4492 100644 --- a/Tools/inspection/benchmark_external_inspection.py +++ b/Tools/inspection/benchmark_external_inspection.py @@ -434,7 +434,7 @@ def main(): elif args.threads == "only_active": kwargs["only_active_thread"] = True unwinder = _remote_debugging.RemoteUnwinder( - process.pid, **kwargs + process.pid, cache_frames=True, **kwargs ) results = benchmark(unwinder, duration_seconds=args.duration) finally:
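
The frame-cache helpers in the patch (frame_cache_lookup_and_extend and frame_cache_store) are easier to read against a small model of the state they manage. The following is a minimal Python sketch of that model only; FrameCacheEntry, lookup_and_extend and store are illustrative names, not part of the module, and the 128 limit stands in for FRAME_CACHE_MAX_FRAMES. Note that, like the C code, store clamps the address array but keeps the full frame list.

    from dataclasses import dataclass

    FRAME_CACHE_MAX_FRAMES = 128  # illustrative stand-in for the C constant

    @dataclass
    class FrameCacheEntry:
        thread_id: int
        addrs: list        # remote frame addresses, most recent call first
        frame_list: list   # rendered frame-info entries, same order as addrs

    def lookup_and_extend(entry, last_profiled_frame, frame_info, frame_addrs):
        """Append the cached continuation starting at last_profiled_frame.

        Returns 1 on a hit, 0 when the address is not cached (miss)."""
        if entry is None or not last_profiled_frame:
            return 0
        try:
            start = entry.addrs.index(last_profiled_frame)
        except ValueError:
            return 0
        frame_info.extend(entry.frame_list[start:])
        frame_addrs.extend(entry.addrs[start:])
        return 1

    def store(cache, thread_id, frame_list, addrs):
        """Remember this sample's frames; addresses are clamped, frames are not."""
        cache[thread_id] = FrameCacheEntry(thread_id,
                                           addrs[:FRAME_CACHE_MAX_FRAMES],
                                           list(frame_list))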
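
Putting the pieces together, collect_frames_with_cache combines a full-hit fast path, a partial walk that stops at last_profiled_frame, and a fallback walk when the cache entry turns out to be stale. A rough Python sketch of that control flow follows; read_frame(addr) is a hypothetical stand-in for parse_frame_object that returns (frame_info, parent_address), and entry is the per-thread cache entry (or None) with .addrs and .frame_list as above.

    def collect_frames(current_frame, last_profiled_frame, entry, read_frame):
        """Sketch of the cache-aware stack collection for one thread."""
        # Full hit: the stack top is exactly the frame the profiler saw last time.
        # Only the top frame is re-read, so its line number stays fresh.
        if (entry is not None and last_profiled_frame
                and current_frame == last_profiled_frame
                and entry.addrs and entry.addrs[0] == current_frame):
            info, _ = read_frame(current_frame)
            return [info] + entry.frame_list[1:], list(entry.addrs)

        frames, addrs, addr = [], [], current_frame
        while addr and addr != last_profiled_frame:     # walk only the new frames
            info, parent = read_frame(addr)
            frames.append(info)
            addrs.append(addr)
            addr = parent

        if addr:                                        # stopped at the cached frame
            if entry is not None and addr in entry.addrs:
                i = entry.addrs.index(addr)             # splice the cached tail
                frames += entry.frame_list[i:]
                addrs += entry.addrs[i:]
            else:
                while addr:                             # stale cache: finish the walk
                    info, parent = read_frame(addr)
                    frames.append(info)
                    addrs.append(addr)
                    addr = parent
        return frames, addrs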
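
From the Python side the new behaviour is opt-in. A minimal usage sketch follows, assuming a running target process; the PID, sample count and sleep interval are illustrative, while the keyword arguments and stat keys come from the clinic signature and get_stats() docstring above. frame_cache_hit_rate is derived as 100 * (full hits + partial hits) / (full hits + partial hits + misses).

    import time
    import _remote_debugging

    pid = 12345  # illustrative: PID of the Python process being profiled

    # cache_frames enables the last_profiled_frame optimization;
    # stats enables the counters returned by get_stats().
    unwinder = _remote_debugging.RemoteUnwinder(pid, all_threads=True,
                                                cache_frames=True, stats=True)

    for _ in range(100):                     # take 100 samples
        traces = unwinder.get_stack_trace()
        time.sleep(0.01)

    stats = unwinder.get_stats()             # RuntimeError if created with stats=False
    print(f"cache hit rate:     {stats['frame_cache_hit_rate']:.1f}%")
    print(f"frames from cache:  {stats['frames_read_from_cache']}")
    print(f"frames from memory: {stats['frames_read_from_memory']}")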