gh-138122: Implement frame caching in RemoteUnwinder to reduce memory reads (#142137)

This PR implements frame caching in the RemoteUnwinder class to significantly reduce memory reads when profiling remote processes with deep call stacks.

When cache_frames=True, the unwinder stores the frame chain from each sample and reuses unchanged portions in subsequent samples. Since most profiling samples capture similar call stacks (especially the parent frames), this optimization avoids repeatedly reading the same frame data from the target process.

The implementation adds a last_profiled_frame field to the thread state that tracks where the previous sample stopped. On the next sample, if the current frame chain reaches this marker, the cached frames from that point onward are reused instead of being re-read from remote memory.

The sampling profiler now enables frame caching by default.
This commit is contained in:
Pablo Galindo Salgado 2025-12-06 22:37:34 +00:00 committed by GitHub
parent 332da6295f
commit 572c780aa8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 1855 additions and 142 deletions

View file

@ -27,21 +27,24 @@
class SampleProfiler:
def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True):
def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True, collect_stats=False):
self.pid = pid
self.sample_interval_usec = sample_interval_usec
self.all_threads = all_threads
self.mode = mode # Store mode for later use
self.collect_stats = collect_stats
if _FREE_THREADED_BUILD:
self.unwinder = _remote_debugging.RemoteUnwinder(
self.pid, all_threads=self.all_threads, mode=mode, native=native, gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
stats=collect_stats
)
else:
only_active_threads = bool(self.all_threads)
self.unwinder = _remote_debugging.RemoteUnwinder(
self.pid, only_active_thread=only_active_threads, mode=mode, native=native, gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
stats=collect_stats
)
# Track sample intervals and total sample count
self.sample_intervals = deque(maxlen=100)
@ -129,6 +132,10 @@ def sample(self, collector, duration_sec=10, *, async_aware=False):
print(f"Sample rate: {sample_rate:.2f} samples/sec")
print(f"Error rate: {error_rate:.2f}%")
# Print unwinder stats if stats collection is enabled
if self.collect_stats:
self._print_unwinder_stats()
# Pass stats to flamegraph collector if it's the right type
if hasattr(collector, 'set_stats'):
collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode)
@ -176,17 +183,100 @@ def _print_realtime_stats(self):
(1.0 / min_hz) * 1_000_000 if min_hz > 0 else 0
) # Max time = Min Hz
# Build cache stats string if stats collection is enabled
cache_stats_str = ""
if self.collect_stats:
try:
stats = self.unwinder.get_stats()
hits = stats.get('frame_cache_hits', 0)
partial = stats.get('frame_cache_partial_hits', 0)
misses = stats.get('frame_cache_misses', 0)
total = hits + partial + misses
if total > 0:
hit_pct = (hits + partial) / total * 100
cache_stats_str = f" {ANSIColors.MAGENTA}Cache: {hit_pct:.1f}% ({hits}+{partial}/{misses}){ANSIColors.RESET}"
except RuntimeError:
pass
# Clear line and print stats
print(
f"\r\033[K{ANSIColors.BOLD_BLUE}Real-time sampling stats:{ANSIColors.RESET} "
f"{ANSIColors.YELLOW}Mean: {mean_hz:.1f}Hz ({mean_us_per_sample:.2f}µs){ANSIColors.RESET} "
f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz ({max_us_per_sample:.2f}µs){ANSIColors.RESET} "
f"{ANSIColors.RED}Max: {max_hz:.1f}Hz ({min_us_per_sample:.2f}µs){ANSIColors.RESET} "
f"{ANSIColors.CYAN}Samples: {self.total_samples}{ANSIColors.RESET}",
f"\r\033[K{ANSIColors.BOLD_BLUE}Stats:{ANSIColors.RESET} "
f"{ANSIColors.YELLOW}{mean_hz:.1f}Hz ({mean_us_per_sample:.1f}µs){ANSIColors.RESET} "
f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz{ANSIColors.RESET} "
f"{ANSIColors.RED}Max: {max_hz:.1f}Hz{ANSIColors.RESET} "
f"{ANSIColors.CYAN}N={self.total_samples}{ANSIColors.RESET}"
f"{cache_stats_str}",
end="",
flush=True,
)
def _print_unwinder_stats(self):
"""Print unwinder statistics including cache performance."""
try:
stats = self.unwinder.get_stats()
except RuntimeError:
return # Stats not enabled
print(f"\n{ANSIColors.BOLD_BLUE}{'='*50}{ANSIColors.RESET}")
print(f"{ANSIColors.BOLD_BLUE}Unwinder Statistics:{ANSIColors.RESET}")
# Frame cache stats
total_samples = stats.get('total_samples', 0)
frame_cache_hits = stats.get('frame_cache_hits', 0)
frame_cache_partial_hits = stats.get('frame_cache_partial_hits', 0)
frame_cache_misses = stats.get('frame_cache_misses', 0)
total_lookups = frame_cache_hits + frame_cache_partial_hits + frame_cache_misses
# Calculate percentages
hits_pct = (frame_cache_hits / total_lookups * 100) if total_lookups > 0 else 0
partial_pct = (frame_cache_partial_hits / total_lookups * 100) if total_lookups > 0 else 0
misses_pct = (frame_cache_misses / total_lookups * 100) if total_lookups > 0 else 0
print(f" {ANSIColors.CYAN}Frame Cache:{ANSIColors.RESET}")
print(f" Total samples: {total_samples:,}")
print(f" Full hits: {frame_cache_hits:,} ({ANSIColors.GREEN}{hits_pct:.1f}%{ANSIColors.RESET})")
print(f" Partial hits: {frame_cache_partial_hits:,} ({ANSIColors.YELLOW}{partial_pct:.1f}%{ANSIColors.RESET})")
print(f" Misses: {frame_cache_misses:,} ({ANSIColors.RED}{misses_pct:.1f}%{ANSIColors.RESET})")
# Frame read stats
frames_from_cache = stats.get('frames_read_from_cache', 0)
frames_from_memory = stats.get('frames_read_from_memory', 0)
total_frames = frames_from_cache + frames_from_memory
cache_frame_pct = (frames_from_cache / total_frames * 100) if total_frames > 0 else 0
memory_frame_pct = (frames_from_memory / total_frames * 100) if total_frames > 0 else 0
print(f" {ANSIColors.CYAN}Frame Reads:{ANSIColors.RESET}")
print(f" From cache: {frames_from_cache:,} ({ANSIColors.GREEN}{cache_frame_pct:.1f}%{ANSIColors.RESET})")
print(f" From memory: {frames_from_memory:,} ({ANSIColors.RED}{memory_frame_pct:.1f}%{ANSIColors.RESET})")
# Code object cache stats
code_hits = stats.get('code_object_cache_hits', 0)
code_misses = stats.get('code_object_cache_misses', 0)
total_code = code_hits + code_misses
code_hits_pct = (code_hits / total_code * 100) if total_code > 0 else 0
code_misses_pct = (code_misses / total_code * 100) if total_code > 0 else 0
print(f" {ANSIColors.CYAN}Code Object Cache:{ANSIColors.RESET}")
print(f" Hits: {code_hits:,} ({ANSIColors.GREEN}{code_hits_pct:.1f}%{ANSIColors.RESET})")
print(f" Misses: {code_misses:,} ({ANSIColors.RED}{code_misses_pct:.1f}%{ANSIColors.RESET})")
# Memory operations
memory_reads = stats.get('memory_reads', 0)
memory_bytes = stats.get('memory_bytes_read', 0)
if memory_bytes >= 1024 * 1024:
memory_str = f"{memory_bytes / (1024 * 1024):.1f} MB"
elif memory_bytes >= 1024:
memory_str = f"{memory_bytes / 1024:.1f} KB"
else:
memory_str = f"{memory_bytes} B"
print(f" {ANSIColors.CYAN}Memory:{ANSIColors.RESET}")
print(f" Read operations: {memory_reads:,} ({memory_str})")
# Stale invalidations
stale_invalidations = stats.get('stale_cache_invalidations', 0)
if stale_invalidations > 0:
print(f" {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}")
def sample(
pid,
@ -234,7 +324,8 @@ def sample(
mode=mode,
native=native,
gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads,
collect_stats=realtime_stats,
)
profiler.realtime_stats = realtime_stats
@ -290,7 +381,8 @@ def sample_live(
mode=mode,
native=native,
gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads,
collect_stats=realtime_stats,
)
profiler.realtime_stats = realtime_stats