gh-138122: Implement frame caching in RemoteUnwinder to reduce memory reads (#142137)

This PR implements frame caching in the RemoteUnwinder class to significantly reduce memory reads when profiling remote processes with deep call stacks. When cache_frames=True, the unwinder stores the frame chain from each sample and reuses unchanged portions in subsequent samples. Since most profiling samples capture similar call stacks (especially the parent frames), this optimization avoids repeatedly reading the same frame data from the target process. The implementation adds a last_profiled_frame field to the thread state that tracks where the previous sample stopped. On the next sample, if the current frame chain reaches this marker, the cached frames from that point onward are reused instead of being re-read from remote memory. The sampling profiler now enables frame caching by default.
2025-12-08 06:10:17 +00:00 · 2025-12-06 22:37:34 +00:00 · 2025-12-06 22:37:34 +00:00 · 572c780aa8
commit 572c780aa8
parent 332da6295f
24 changed files with 1855 additions and 142 deletions
--- a/Lib/profiling/sampling/sample.py
+++ b/Lib/profiling/sampling/sample.py
@ -27,21 +27,24 @@


 class SampleProfiler:
-    def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True):
+    def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True, collect_stats=False):
        self.pid = pid
        self.sample_interval_usec = sample_interval_usec
        self.all_threads = all_threads
        self.mode = mode  # Store mode for later use
+        self.collect_stats = collect_stats
        if _FREE_THREADED_BUILD:
            self.unwinder = _remote_debugging.RemoteUnwinder(
                self.pid, all_threads=self.all_threads, mode=mode, native=native, gc=gc,
-                skip_non_matching_threads=skip_non_matching_threads
+                skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
+                stats=collect_stats
            )
        else:
            only_active_threads = bool(self.all_threads)
            self.unwinder = _remote_debugging.RemoteUnwinder(
                self.pid, only_active_thread=only_active_threads, mode=mode, native=native, gc=gc,
-                skip_non_matching_threads=skip_non_matching_threads
+                skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
+                stats=collect_stats
            )
        # Track sample intervals and total sample count
        self.sample_intervals = deque(maxlen=100)
@ -129,6 +132,10 @@ def sample(self, collector, duration_sec=10, *, async_aware=False):
            print(f"Sample rate: {sample_rate:.2f} samples/sec")
            print(f"Error rate: {error_rate:.2f}%")

+            # Print unwinder stats if stats collection is enabled
+            if self.collect_stats:
+                self._print_unwinder_stats()
+
        # Pass stats to flamegraph collector if it's the right type
        if hasattr(collector, 'set_stats'):
            collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode)
@ -176,17 +183,100 @@ def _print_realtime_stats(self):
            (1.0 / min_hz) * 1_000_000 if min_hz > 0 else 0
        )  # Max time = Min Hz

+        # Build cache stats string if stats collection is enabled
+        cache_stats_str = ""
+        if self.collect_stats:
+            try:
+                stats = self.unwinder.get_stats()
+                hits = stats.get('frame_cache_hits', 0)
+                partial = stats.get('frame_cache_partial_hits', 0)
+                misses = stats.get('frame_cache_misses', 0)
+                total = hits + partial + misses
+                if total > 0:
+                    hit_pct = (hits + partial) / total * 100
+                    cache_stats_str = f" {ANSIColors.MAGENTA}Cache: {hit_pct:.1f}% ({hits}+{partial}/{misses}){ANSIColors.RESET}"
+            except RuntimeError:
+                pass
+
        # Clear line and print stats
        print(
-            f"\r\033[K{ANSIColors.BOLD_BLUE}Real-time sampling stats:{ANSIColors.RESET} "
-            f"{ANSIColors.YELLOW}Mean: {mean_hz:.1f}Hz ({mean_us_per_sample:.2f}µs){ANSIColors.RESET} "
-            f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz ({max_us_per_sample:.2f}µs){ANSIColors.RESET} "
-            f"{ANSIColors.RED}Max: {max_hz:.1f}Hz ({min_us_per_sample:.2f}µs){ANSIColors.RESET} "
-            f"{ANSIColors.CYAN}Samples: {self.total_samples}{ANSIColors.RESET}",
+            f"\r\033[K{ANSIColors.BOLD_BLUE}Stats:{ANSIColors.RESET} "
+            f"{ANSIColors.YELLOW}{mean_hz:.1f}Hz ({mean_us_per_sample:.1f}µs){ANSIColors.RESET} "
+            f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz{ANSIColors.RESET} "
+            f"{ANSIColors.RED}Max: {max_hz:.1f}Hz{ANSIColors.RESET} "
+            f"{ANSIColors.CYAN}N={self.total_samples}{ANSIColors.RESET}"
+            f"{cache_stats_str}",
            end="",
            flush=True,
        )

+    def _print_unwinder_stats(self):
+        """Print unwinder statistics including cache performance."""
+        try:
+            stats = self.unwinder.get_stats()
+        except RuntimeError:
+            return  # Stats not enabled
+
+        print(f"\n{ANSIColors.BOLD_BLUE}{'='*50}{ANSIColors.RESET}")
+        print(f"{ANSIColors.BOLD_BLUE}Unwinder Statistics:{ANSIColors.RESET}")
+
+        # Frame cache stats
+        total_samples = stats.get('total_samples', 0)
+        frame_cache_hits = stats.get('frame_cache_hits', 0)
+        frame_cache_partial_hits = stats.get('frame_cache_partial_hits', 0)
+        frame_cache_misses = stats.get('frame_cache_misses', 0)
+        total_lookups = frame_cache_hits + frame_cache_partial_hits + frame_cache_misses
+
+        # Calculate percentages
+        hits_pct = (frame_cache_hits / total_lookups * 100) if total_lookups > 0 else 0
+        partial_pct = (frame_cache_partial_hits / total_lookups * 100) if total_lookups > 0 else 0
+        misses_pct = (frame_cache_misses / total_lookups * 100) if total_lookups > 0 else 0
+
+        print(f"  {ANSIColors.CYAN}Frame Cache:{ANSIColors.RESET}")
+        print(f"    Total samples:    {total_samples:,}")
+        print(f"    Full hits:        {frame_cache_hits:,} ({ANSIColors.GREEN}{hits_pct:.1f}%{ANSIColors.RESET})")
+        print(f"    Partial hits:     {frame_cache_partial_hits:,} ({ANSIColors.YELLOW}{partial_pct:.1f}%{ANSIColors.RESET})")
+        print(f"    Misses:           {frame_cache_misses:,} ({ANSIColors.RED}{misses_pct:.1f}%{ANSIColors.RESET})")
+
+        # Frame read stats
+        frames_from_cache = stats.get('frames_read_from_cache', 0)
+        frames_from_memory = stats.get('frames_read_from_memory', 0)
+        total_frames = frames_from_cache + frames_from_memory
+        cache_frame_pct = (frames_from_cache / total_frames * 100) if total_frames > 0 else 0
+        memory_frame_pct = (frames_from_memory / total_frames * 100) if total_frames > 0 else 0
+
+        print(f"  {ANSIColors.CYAN}Frame Reads:{ANSIColors.RESET}")
+        print(f"    From cache:       {frames_from_cache:,} ({ANSIColors.GREEN}{cache_frame_pct:.1f}%{ANSIColors.RESET})")
+        print(f"    From memory:      {frames_from_memory:,} ({ANSIColors.RED}{memory_frame_pct:.1f}%{ANSIColors.RESET})")
+
+        # Code object cache stats
+        code_hits = stats.get('code_object_cache_hits', 0)
+        code_misses = stats.get('code_object_cache_misses', 0)
+        total_code = code_hits + code_misses
+        code_hits_pct = (code_hits / total_code * 100) if total_code > 0 else 0
+        code_misses_pct = (code_misses / total_code * 100) if total_code > 0 else 0
+
+        print(f"  {ANSIColors.CYAN}Code Object Cache:{ANSIColors.RESET}")
+        print(f"    Hits:             {code_hits:,} ({ANSIColors.GREEN}{code_hits_pct:.1f}%{ANSIColors.RESET})")
+        print(f"    Misses:           {code_misses:,} ({ANSIColors.RED}{code_misses_pct:.1f}%{ANSIColors.RESET})")
+
+        # Memory operations
+        memory_reads = stats.get('memory_reads', 0)
+        memory_bytes = stats.get('memory_bytes_read', 0)
+        if memory_bytes >= 1024 * 1024:
+            memory_str = f"{memory_bytes / (1024 * 1024):.1f} MB"
+        elif memory_bytes >= 1024:
+            memory_str = f"{memory_bytes / 1024:.1f} KB"
+        else:
+            memory_str = f"{memory_bytes} B"
+        print(f"  {ANSIColors.CYAN}Memory:{ANSIColors.RESET}")
+        print(f"    Read operations:  {memory_reads:,} ({memory_str})")
+
+        # Stale invalidations
+        stale_invalidations = stats.get('stale_cache_invalidations', 0)
+        if stale_invalidations > 0:
+            print(f"  {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}")
+

 def sample(
    pid,
@ -234,7 +324,8 @@ def sample(
        mode=mode,
        native=native,
        gc=gc,
-        skip_non_matching_threads=skip_non_matching_threads
+        skip_non_matching_threads=skip_non_matching_threads,
+        collect_stats=realtime_stats,
    )
    profiler.realtime_stats = realtime_stats

@ -290,7 +381,8 @@ def sample_live(
        mode=mode,
        native=native,
        gc=gc,
-        skip_non_matching_threads=skip_non_matching_threads
+        skip_non_matching_threads=skip_non_matching_threads,
+        collect_stats=realtime_stats,
    )
    profiler.realtime_stats = realtime_stats