From d6d850df89c97fbced893b7914682efedfa6ebc4 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 7 Dec 2025 15:53:48 +0000 Subject: [PATCH] gh-138122: Don't sample partial frame chains (#141912) --- Include/cpython/pystate.h | 7 + Include/internal/pycore_debug_offsets.h | 2 + Include/internal/pycore_tstate.h | 5 + InternalDocs/frames.md | 29 +++++ Lib/test/test_external_inspection.py | 120 +++++------------- ...-11-24-16-07-57.gh-issue-138122.m3EF9E.rst | 6 + Modules/_remote_debugging/_remote_debugging.h | 1 + Modules/_remote_debugging/frames.c | 30 +++-- Modules/_remote_debugging/threads.c | 5 +- Python/ceval.c | 3 + Python/pylifecycle.c | 2 +- Python/pystate.c | 28 +++- Python/traceback.c | 2 +- 13 files changed, 135 insertions(+), 105 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-11-24-16-07-57.gh-issue-138122.m3EF9E.rst diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index 08d71070ddc..22df26bd37a 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -135,6 +135,13 @@ struct _ts { /* Pointer to currently executing frame. */ struct _PyInterpreterFrame *current_frame; + /* Pointer to the base frame (bottommost sentinel frame). + Used by profilers to validate complete stack unwinding. + Points to the embedded base_frame in _PyThreadStateImpl. + The frame is embedded there rather than here because _PyInterpreterFrame + is defined in internal headers that cannot be exposed in the public API. */ + struct _PyInterpreterFrame *base_frame; + struct _PyInterpreterFrame *last_profiled_frame; Py_tracefunc c_profilefunc; diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h index bfd86c08887..1cdc4449b17 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -102,6 +102,7 @@ typedef struct _Py_DebugOffsets { uint64_t next; uint64_t interp; uint64_t current_frame; + uint64_t base_frame; uint64_t last_profiled_frame; uint64_t thread_id; uint64_t native_thread_id; @@ -273,6 +274,7 @@ typedef struct _Py_DebugOffsets { .next = offsetof(PyThreadState, next), \ .interp = offsetof(PyThreadState, interp), \ .current_frame = offsetof(PyThreadState, current_frame), \ + .base_frame = offsetof(PyThreadState, base_frame), \ .last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \ .thread_id = offsetof(PyThreadState, thread_id), \ .native_thread_id = offsetof(PyThreadState, native_thread_id), \ diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 50048801b2e..c4f723ac8ab 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -10,6 +10,7 @@ extern "C" { #include "pycore_brc.h" // struct _brc_thread_state #include "pycore_freelist_state.h" // struct _Py_freelists +#include "pycore_interpframe_structs.h" // _PyInterpreterFrame #include "pycore_mimalloc.h" // struct _mimalloc_thread_state #include "pycore_qsbr.h" // struct qsbr #include "pycore_uop.h" // struct _PyUOpInstruction @@ -61,6 +62,10 @@ typedef struct _PyThreadStateImpl { // semi-public fields are in PyThreadState. PyThreadState base; + // Embedded base frame - sentinel at the bottom of the frame stack. + // Used by profiling/sampling to detect incomplete stack traces. + _PyInterpreterFrame base_frame; + // The reference count field is used to synchronize deallocation of the // thread state during runtime finalization. Py_ssize_t refcount; diff --git a/InternalDocs/frames.md b/InternalDocs/frames.md index d56e5481d3c..60ab2055afa 100644 --- a/InternalDocs/frames.md +++ b/InternalDocs/frames.md @@ -111,6 +111,35 @@ ### Shim frames instruction which cleans up the shim frame and returns. +### Base frame + +Each thread state contains an embedded `_PyInterpreterFrame` called the "base frame" +that serves as a sentinel at the bottom of the frame stack. This frame is allocated +in `_PyThreadStateImpl` (the internal extension of `PyThreadState`) and initialized +when the thread state is created. The `owner` field is set to `FRAME_OWNED_BY_INTERPRETER`. + +External profilers and sampling tools can validate that they have successfully unwound +the complete call stack by checking that the frame chain terminates at the base frame. +The `PyThreadState.base_frame` pointer provides the expected address to compare against. +If a stack walk doesn't reach this frame, the sample is incomplete (possibly due to a +race condition) and should be discarded. + +The base frame is embedded in `_PyThreadStateImpl` rather than `PyThreadState` because +`_PyInterpreterFrame` is defined in internal headers that cannot be exposed in the +public API. A pointer (`PyThreadState.base_frame`) is provided for profilers to access +the address without needing internal headers. + +See the initialization in `new_threadstate()` in [Python/pystate.c](../Python/pystate.c). + +#### How profilers should use the base frame + +External profilers should read `tstate->base_frame` before walking the stack, then +walk from `tstate->current_frame` following `frame->previous` pointers until reaching +a frame with `owner == FRAME_OWNED_BY_INTERPRETER`. After the walk, verify that the +last frame address matches `base_frame`. If not, discard the sample as incomplete +since the frame chain may have been in an inconsistent state due to concurrent updates. + + ### Remote Profiling Frame Cache The `last_profiled_frame` field in `PyThreadState` supports an optimization for diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index f664e8ac53f..a97242483a8 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -1,7 +1,7 @@ -import contextlib import unittest import os import textwrap +import contextlib import importlib import sys import socket @@ -216,33 +216,13 @@ def requires_subinterpreters(meth): # Simple wrapper functions for RemoteUnwinder # ============================================================================ -# Errors that can occur transiently when reading process memory without synchronization -RETRIABLE_ERRORS = ( - "Task list appears corrupted", - "Invalid linked list structure reading remote memory", - "Unknown error reading memory", - "Unhandled frame owner", - "Failed to parse initial frame", - "Failed to process frame chain", - "Failed to unwind stack", -) - - -def _is_retriable_error(exc): - """Check if an exception is a transient error that should be retried.""" - msg = str(exc) - return any(msg.startswith(err) or err in msg for err in RETRIABLE_ERRORS) - - def get_stack_trace(pid): for _ in busy_retry(SHORT_TIMEOUT): try: unwinder = RemoteUnwinder(pid, all_threads=True, debug=True) return unwinder.get_stack_trace() except RuntimeError as e: - if _is_retriable_error(e): - continue - raise + continue raise RuntimeError("Failed to get stack trace after retries") @@ -252,9 +232,7 @@ def get_async_stack_trace(pid): unwinder = RemoteUnwinder(pid, debug=True) return unwinder.get_async_stack_trace() except RuntimeError as e: - if _is_retriable_error(e): - continue - raise + continue raise RuntimeError("Failed to get async stack trace after retries") @@ -264,9 +242,7 @@ def get_all_awaited_by(pid): unwinder = RemoteUnwinder(pid, debug=True) return unwinder.get_all_awaited_by() except RuntimeError as e: - if _is_retriable_error(e): - continue - raise + continue raise RuntimeError("Failed to get all awaited_by after retries") @@ -2268,18 +2244,13 @@ def make_unwinder(cache_frames=True): def _get_frames_with_retry(self, unwinder, required_funcs): """Get frames containing required_funcs, with retry for transient errors.""" for _ in range(MAX_TRIES): - try: + with contextlib.suppress(OSError, RuntimeError): traces = unwinder.get_stack_trace() for interp in traces: for thread in interp.threads: funcs = {f.funcname for f in thread.frame_info} if required_funcs.issubset(funcs): return thread.frame_info - except RuntimeError as e: - if _is_retriable_error(e): - pass - else: - raise time.sleep(0.1) return None @@ -2802,70 +2773,39 @@ def foo2(): make_unwinder, ): unwinder = make_unwinder(cache_frames=True) - buffer = b"" - def recv_msg(): - """Receive a single message from socket.""" - nonlocal buffer - while b"\n" not in buffer: - chunk = client_socket.recv(256) - if not chunk: - return None - buffer += chunk - msg, buffer = buffer.split(b"\n", 1) - return msg - - def get_thread_frames(target_funcs): - """Get frames for thread matching target functions.""" - retries = 0 - for _ in busy_retry(SHORT_TIMEOUT): - if retries >= 5: - break - retries += 1 - # On Windows, ReadProcessMemory can fail with OSError - # (WinError 299) when frame pointers are in flux - with contextlib.suppress(RuntimeError, OSError): - traces = unwinder.get_stack_trace() - for interp in traces: - for thread in interp.threads: - funcs = [f.funcname for f in thread.frame_info] - if any(f in funcs for f in target_funcs): - return funcs - return None + # Message dispatch table: signal -> required functions for that thread + dispatch = { + b"t1:baz1": {"baz1", "bar1", "foo1"}, + b"t2:baz2": {"baz2", "bar2", "foo2"}, + b"t1:blech1": {"blech1", "foo1"}, + b"t2:blech2": {"blech2", "foo2"}, + } # Track results for each sync point results = {} - # Process 4 sync points: baz1, baz2, blech1, blech2 - # With the lock, threads are serialized - handle one at a time - for _ in range(4): - msg = recv_msg() - self.assertIsNotNone(msg, "Expected message from subprocess") + # Process 4 sync points (order depends on thread scheduling) + buffer = _wait_for_signal(client_socket, b"\n") + for i in range(4): + # Extract first message from buffer + msg, sep, buffer = buffer.partition(b"\n") + self.assertIn(msg, dispatch, f"Unexpected message: {msg!r}") - # Determine which thread/function and take snapshot - if msg == b"t1:baz1": - funcs = get_thread_frames(["baz1", "bar1", "foo1"]) - self.assertIsNotNone(funcs, "Thread 1 not found at baz1") - results["t1:baz1"] = funcs - elif msg == b"t2:baz2": - funcs = get_thread_frames(["baz2", "bar2", "foo2"]) - self.assertIsNotNone(funcs, "Thread 2 not found at baz2") - results["t2:baz2"] = funcs - elif msg == b"t1:blech1": - funcs = get_thread_frames(["blech1", "foo1"]) - self.assertIsNotNone(funcs, "Thread 1 not found at blech1") - results["t1:blech1"] = funcs - elif msg == b"t2:blech2": - funcs = get_thread_frames(["blech2", "foo2"]) - self.assertIsNotNone(funcs, "Thread 2 not found at blech2") - results["t2:blech2"] = funcs + # Sample frames for the thread at this sync point + required_funcs = dispatch[msg] + frames = self._get_frames_with_retry(unwinder, required_funcs) + self.assertIsNotNone(frames, f"Thread not found for {msg!r}") + results[msg] = [f.funcname for f in frames] - # Release thread to continue + # Release thread and wait for next message (if not last) client_socket.sendall(b"k") + if i < 3: + buffer += _wait_for_signal(client_socket, b"\n") # Validate Phase 1: baz snapshots - t1_baz = results.get("t1:baz1") - t2_baz = results.get("t2:baz2") + t1_baz = results.get(b"t1:baz1") + t2_baz = results.get(b"t2:baz2") self.assertIsNotNone(t1_baz, "Missing t1:baz1 snapshot") self.assertIsNotNone(t2_baz, "Missing t2:baz2 snapshot") @@ -2890,8 +2830,8 @@ def get_thread_frames(target_funcs): self.assertNotIn("foo1", t2_baz) # Validate Phase 2: blech snapshots (cache invalidation test) - t1_blech = results.get("t1:blech1") - t2_blech = results.get("t2:blech2") + t1_blech = results.get(b"t1:blech1") + t2_blech = results.get(b"t2:blech2") self.assertIsNotNone(t1_blech, "Missing t1:blech1 snapshot") self.assertIsNotNone(t2_blech, "Missing t2:blech2 snapshot") diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-11-24-16-07-57.gh-issue-138122.m3EF9E.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-24-16-07-57.gh-issue-138122.m3EF9E.rst new file mode 100644 index 00000000000..a4a29e40027 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-24-16-07-57.gh-issue-138122.m3EF9E.rst @@ -0,0 +1,6 @@ +Add incomplete sample detection to prevent corrupted profiling data. Each +thread state now contains an embedded base frame (sentinel at the bottom of +the frame stack) with owner type ``FRAME_OWNED_BY_INTERPRETER``. The profiler +validates that stack unwinding terminates at this sentinel frame. Samples that +fail to reach the base frame (due to race conditions, memory corruption, or +other errors) are now rejected rather than being included as spurious data. diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 804e2c904e1..7f3c0d363f5 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -401,6 +401,7 @@ extern int process_frame_chain( uintptr_t initial_frame_addr, StackChunkList *chunks, PyObject *frame_info, + uintptr_t base_frame_addr, uintptr_t gc_frame, uintptr_t last_profiled_frame, int *stopped_at_cached_frame, diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index b77c0ca556d..eaf3287c6fe 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -154,14 +154,13 @@ is_frame_valid( void* frame = (void*)frame_addr; - if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) == FRAME_OWNED_BY_INTERPRETER) { - return 0; // C frame + char owner = GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner); + if (owner == FRAME_OWNED_BY_INTERPRETER) { + return 0; // C frame or sentinel base frame } - if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) != FRAME_OWNED_BY_GENERATOR - && GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) != FRAME_OWNED_BY_THREAD) { - PyErr_Format(PyExc_RuntimeError, "Unhandled frame owner %d.\n", - GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner)); + if (owner != FRAME_OWNED_BY_GENERATOR && owner != FRAME_OWNED_BY_THREAD) { + PyErr_Format(PyExc_RuntimeError, "Unhandled frame owner %d.\n", owner); set_exception_cause(unwinder, PyExc_RuntimeError, "Unhandled frame owner type in async frame"); return -1; } @@ -260,6 +259,7 @@ process_frame_chain( uintptr_t initial_frame_addr, StackChunkList *chunks, PyObject *frame_info, + uintptr_t base_frame_addr, uintptr_t gc_frame, uintptr_t last_profiled_frame, int *stopped_at_cached_frame, @@ -269,6 +269,7 @@ process_frame_chain( { uintptr_t frame_addr = initial_frame_addr; uintptr_t prev_frame_addr = 0; + uintptr_t last_frame_addr = 0; // Track last frame visited for validation const size_t MAX_FRAMES = 1024 + 512; size_t frame_count = 0; @@ -296,6 +297,7 @@ process_frame_chain( PyObject *frame = NULL; uintptr_t next_frame_addr = 0; uintptr_t stackpointer = 0; + last_frame_addr = frame_addr; // Remember this frame address if (++frame_count > MAX_FRAMES) { PyErr_SetString(PyExc_RuntimeError, "Too many stack frames (possible infinite loop)"); @@ -303,7 +305,6 @@ process_frame_chain( return -1; } - // Try chunks first, fallback to direct memory read if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, chunks) < 0) { PyErr_Clear(); uintptr_t address_of_code_object = 0; @@ -377,6 +378,17 @@ process_frame_chain( frame_addr = next_frame_addr; } + // Validate we reached the base frame (sentinel at bottom of stack) + // Only validate if we walked the full chain (didn't stop at cached frame) + // and base_frame_addr is provided (non-zero) + int stopped_early = stopped_at_cached_frame && *stopped_at_cached_frame; + if (!stopped_early && base_frame_addr != 0 && last_frame_addr != base_frame_addr) { + PyErr_Format(PyExc_RuntimeError, + "Incomplete sample: did not reach base frame (expected 0x%lx, got 0x%lx)", + base_frame_addr, last_frame_addr); + return -1; + } + return 0; } @@ -540,7 +552,7 @@ collect_frames_with_cache( Py_ssize_t frames_before = PyList_GET_SIZE(frame_info); int stopped_at_cached = 0; - if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, gc_frame, + if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, 0, gc_frame, last_profiled_frame, &stopped_at_cached, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { return -1; @@ -562,7 +574,7 @@ collect_frames_with_cache( // Cache miss - continue walking from last_profiled_frame to get the rest STATS_INC(unwinder, frame_cache_misses); Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info); - if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, gc_frame, + if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, 0, gc_frame, 0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { return -1; } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index ce013f902d1..69819eb8dcd 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -380,6 +380,7 @@ unwind_stack_for_thread( } uintptr_t frame_addr = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.current_frame); + uintptr_t base_frame_addr = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.base_frame); frame_info = PyList_New(0); if (!frame_info) { @@ -411,9 +412,9 @@ unwind_stack_for_thread( PyErr_Clear(); // Non-fatal } } else { - // No caching - process entire frame chain + // No caching - process entire frame chain with base_frame validation if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, - gc_frame, 0, NULL, NULL, NULL, 0) < 0) { + base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0) < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain"); goto error; } diff --git a/Python/ceval.c b/Python/ceval.c index 382ae210ebb..a1d54bd058b 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -3352,6 +3352,9 @@ PyEval_MergeCompilerFlags(PyCompilerFlags *cf) { PyThreadState *tstate = _PyThreadState_GET(); _PyInterpreterFrame *current_frame = tstate->current_frame; + if (current_frame == tstate->base_frame) { + current_frame = NULL; + } int result = cf->cf_flags != 0; if (current_frame != NULL) { diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 67368b5ce07..2527dca71d7 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -2571,7 +2571,7 @@ Py_EndInterpreter(PyThreadState *tstate) if (tstate != _PyThreadState_GET()) { Py_FatalError("thread is not current"); } - if (tstate->current_frame != NULL) { + if (tstate->current_frame != tstate->base_frame) { Py_FatalError("thread still has a frame"); } interp->finalizing = 1; diff --git a/Python/pystate.c b/Python/pystate.c index c12a1418e74..2956e785405 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1482,7 +1482,31 @@ init_threadstate(_PyThreadStateImpl *_tstate, // This is cleared when PyGILState_Ensure() creates the thread state. tstate->gilstate_counter = 1; - tstate->current_frame = NULL; + // Initialize the embedded base frame - sentinel at the bottom of the frame stack + _tstate->base_frame.previous = NULL; + _tstate->base_frame.f_executable = PyStackRef_None; + _tstate->base_frame.f_funcobj = PyStackRef_NULL; + _tstate->base_frame.f_globals = NULL; + _tstate->base_frame.f_builtins = NULL; + _tstate->base_frame.f_locals = NULL; + _tstate->base_frame.frame_obj = NULL; + _tstate->base_frame.instr_ptr = NULL; + _tstate->base_frame.stackpointer = _tstate->base_frame.localsplus; + _tstate->base_frame.return_offset = 0; + _tstate->base_frame.owner = FRAME_OWNED_BY_INTERPRETER; + _tstate->base_frame.visited = 0; +#ifdef Py_DEBUG + _tstate->base_frame.lltrace = 0; +#endif +#ifdef Py_GIL_DISABLED + _tstate->base_frame.tlbc_index = 0; +#endif + _tstate->base_frame.localsplus[0] = PyStackRef_NULL; + + // current_frame starts pointing to the base frame + tstate->current_frame = &_tstate->base_frame; + // base_frame pointer for profilers to validate stack unwinding + tstate->base_frame = &_tstate->base_frame; tstate->datastack_chunk = NULL; tstate->datastack_top = NULL; tstate->datastack_limit = NULL; @@ -1660,7 +1684,7 @@ PyThreadState_Clear(PyThreadState *tstate) int verbose = _PyInterpreterState_GetConfig(tstate->interp)->verbose; - if (verbose && tstate->current_frame != NULL) { + if (verbose && tstate->current_frame != tstate->base_frame) { /* bpo-20526: After the main thread calls _PyInterpreterState_SetFinalizing() in Py_FinalizeEx() (or in Py_EndInterpreter() for subinterpreters), diff --git a/Python/traceback.c b/Python/traceback.c index 48f9b4d04c6..8af63c22a9f 100644 --- a/Python/traceback.c +++ b/Python/traceback.c @@ -1036,7 +1036,7 @@ static int dump_frame(int fd, _PyInterpreterFrame *frame) { if (frame->owner == FRAME_OWNED_BY_INTERPRETER) { - /* Ignore trampoline frame */ + /* Ignore trampoline frames and base frame sentinel */ return 0; }