Merge remote-tracking branch 'upstream/main' into tachyon-opcodes

This commit is contained in:
Pablo Galindo Salgado 2025-12-07 04:38:30 +00:00
commit 8129e3d7f4
154 changed files with 10330 additions and 4926 deletions

View file

@@ -189,6 +189,8 @@ parse_frame_object(
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
return -1;
}
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
*previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous);
uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable);
@@ -258,14 +260,39 @@ process_frame_chain(
uintptr_t initial_frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t gc_frame)
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
int *stopped_at_cached_frame,
uintptr_t *frame_addrs, // optional: C array to receive frame addresses
Py_ssize_t *num_addrs, // in/out: current count / updated count
Py_ssize_t max_addrs) // max capacity of frame_addrs array
{
uintptr_t frame_addr = initial_frame_addr;
uintptr_t prev_frame_addr = 0;
const size_t MAX_FRAMES = 1024;
const size_t MAX_FRAMES = 1024 + 512;
size_t frame_count = 0;
// Initialize output flag
if (stopped_at_cached_frame) {
*stopped_at_cached_frame = 0;
}
// Quick check: if current_frame == last_profiled_frame, entire stack is unchanged
if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) {
if (stopped_at_cached_frame) {
*stopped_at_cached_frame = 1;
}
return 0;
}
while ((void*)frame_addr != NULL) {
// Check if we've reached the cached frame - if so, stop here
if (last_profiled_frame != 0 && frame_addr == last_profiled_frame) {
if (stopped_at_cached_frame) {
*stopped_at_cached_frame = 1;
}
break;
}
PyObject *frame = NULL;
uintptr_t next_frame_addr = 0;
uintptr_t stackpointer = 0;
@@ -286,7 +313,6 @@ process_frame_chain(
}
}
if (frame == NULL && PyList_GET_SIZE(frame_info) == 0) {
// If the first frame is missing, the chain is broken:
const char *e = "Failed to parse initial frame in chain";
PyErr_SetString(PyExc_RuntimeError, e);
return -1;
@@ -316,30 +342,35 @@ process_frame_chain(
if (extra_frame_info == NULL) {
return -1;
}
int error = PyList_Append(frame_info, extra_frame_info);
Py_DECREF(extra_frame_info);
if (error) {
const char *e = "Failed to append extra frame to frame info list";
set_exception_cause(unwinder, PyExc_RuntimeError, e);
if (PyList_Append(frame_info, extra_frame_info) < 0) {
Py_DECREF(extra_frame_info);
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append extra frame");
return -1;
}
// Extra frames use 0 as address (they're synthetic)
if (frame_addrs && *num_addrs < max_addrs) {
frame_addrs[(*num_addrs)++] = 0;
}
Py_DECREF(extra_frame_info);
}
if (frame) {
if (prev_frame_addr && frame_addr != prev_frame_addr) {
const char *f = "Broken frame chain: expected frame at 0x%lx, got 0x%lx";
PyErr_Format(PyExc_RuntimeError, f, prev_frame_addr, frame_addr);
Py_DECREF(frame);
const char *e = "Frame chain consistency check failed";
set_exception_cause(unwinder, PyExc_RuntimeError, e);
set_exception_cause(unwinder, PyExc_RuntimeError, "Frame chain consistency check failed");
return -1;
}
if (PyList_Append(frame_info, frame) == -1) {
if (PyList_Append(frame_info, frame) < 0) {
Py_DECREF(frame);
const char *e = "Failed to append frame to frame info list";
set_exception_cause(unwinder, PyExc_RuntimeError, e);
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append frame");
return -1;
}
// Track the address for this frame
if (frame_addrs && *num_addrs < max_addrs) {
frame_addrs[(*num_addrs)++] = frame_addr;
}
Py_DECREF(frame);
}
@@ -349,3 +380,208 @@ process_frame_chain(
return 0;
}
// Reset the last_profiled_frame field on every thread of every interpreter
// in the target process. This runs when profiling starts so that stale
// values left behind by a previous profiler cannot cause us to stop frame
// walking early. All remote-memory failures here are best-effort: they are
// cleared and either skipped or abandoned, never propagated.
int
clear_last_profiled_frames(RemoteUnwinderObject *unwinder)
{
    uintptr_t zero = 0;

    for (uintptr_t interp = unwinder->interpreter_addr; interp != 0;) {
        // Fetch the head of this interpreter's thread list.
        uintptr_t thread = 0;
        if (_Py_RemoteDebug_PagedReadRemoteMemory(
                &unwinder->handle,
                interp + unwinder->debug_offsets.interpreter_state.threads_head,
                sizeof(void*),
                &thread) < 0) {
            // Non-fatal: give up on clearing entirely.
            PyErr_Clear();
            return 0;
        }

        // Walk every thread state in this interpreter, zeroing the field.
        while (thread != 0) {
            uintptr_t lpf = thread
                + unwinder->debug_offsets.thread_state.last_profiled_frame;
            if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf,
                                                  sizeof(uintptr_t), &zero) < 0) {
                // Non-fatal: leave this thread's value and keep going.
                PyErr_Clear();
            }
            if (_Py_RemoteDebug_PagedReadRemoteMemory(
                    &unwinder->handle,
                    thread + unwinder->debug_offsets.thread_state.next,
                    sizeof(void*),
                    &thread) < 0) {
                PyErr_Clear();
                break;
            }
        }

        // Advance to the next interpreter in the chain.
        if (_Py_RemoteDebug_PagedReadRemoteMemory(
                &unwinder->handle,
                interp + unwinder->debug_offsets.interpreter_state.next,
                sizeof(void*),
                &interp) < 0) {
            PyErr_Clear();
            break;
        }
    }
    return 0;
}
// Fast path: detect a full cache hit, i.e. the thread's current frame is the
// same frame we profiled last time, so every parent frame is unchanged and
// can be reused from the cache. The top frame is always re-read from remote
// memory because its line number may have moved between samples.
// Returns 1 on a full hit (frame_info now holds fresh top + cached parents),
// 0 on a miss, -1 on error with an exception set.
static int
try_full_cache_hit(
    RemoteUnwinderObject *unwinder,
    uintptr_t frame_addr,
    uintptr_t last_profiled_frame,
    uint64_t thread_id,
    PyObject *frame_info)
{
    // Caching must be enabled, a previous sample must exist, and the stack
    // top must not have moved since then.
    if (unwinder->frame_cache == NULL || last_profiled_frame == 0
        || frame_addr != last_profiled_frame) {
        return 0;
    }

    FrameCacheEntry *cached = frame_cache_find(unwinder, thread_id);
    if (cached == NULL || cached->frame_list == NULL) {
        return 0;
    }
    // Sanity check: the cached stack must start at the same frame address.
    if (cached->num_addrs == 0 || cached->addrs[0] != frame_addr) {
        return 0;
    }

    // Re-read the top frame from remote memory for an up-to-date line number.
    PyObject *top_frame = NULL;
    uintptr_t code_addr = 0;
    uintptr_t prev_addr = 0;
    if (parse_frame_object(unwinder, &top_frame, frame_addr,
                           &code_addr, &prev_addr) < 0) {
        return -1;
    }

    // Slice out the cached parents before mutating frame_info so a failure
    // leaves the output list untouched.
    Py_ssize_t cached_len = PyList_GET_SIZE(cached->frame_list);
    PyObject *parents = NULL;
    if (cached_len > 1) {
        parents = PyList_GetSlice(cached->frame_list, 1, cached_len);
        if (parents == NULL) {
            Py_XDECREF(top_frame);
            return -1;
        }
    }

    // Append the freshly read top frame (parse may legitimately yield NULL).
    if (top_frame != NULL) {
        int rc = PyList_Append(frame_info, top_frame);
        Py_DECREF(top_frame);
        if (rc < 0) {
            Py_XDECREF(parents);
            return -1;
        }
        STATS_ADD(unwinder, frames_read_from_memory, 1);
    }

    // Splice the cached parents onto the end of frame_info.
    if (parents != NULL) {
        Py_ssize_t tail = PyList_GET_SIZE(frame_info);
        int rc = PyList_SetSlice(frame_info, tail, tail, parents);
        Py_DECREF(parents);
        if (rc < 0) {
            return -1;
        }
        STATS_ADD(unwinder, frames_read_from_cache, cached_len - 1);
    }

    STATS_INC(unwinder, frame_cache_hits);
    return 1;
}
// High-level helper: build the complete frame_info list for one thread,
// using the per-thread frame cache where possible. Handles all cache logic
// (full hit, partial hit, miss, and cache refresh) internally.
// Returns 0 on success, -1 on error with an exception set.
int
collect_frames_with_cache(
    RemoteUnwinderObject *unwinder,
    uintptr_t frame_addr,
    StackChunkList *chunks,
    PyObject *frame_info,
    uintptr_t gc_frame,
    uintptr_t last_profiled_frame,
    uint64_t thread_id)
{
    // Cheapest case first: the whole stack is unchanged since the previous
    // sample, so cached parents (plus a re-read top frame) suffice.
    int hit = try_full_cache_hit(unwinder, frame_addr, last_profiled_frame,
                                 thread_id, frame_info);
    if (hit < 0) {
        return -1;
    }
    if (hit > 0) {
        return 0;
    }

    uintptr_t walked_addrs[FRAME_CACHE_MAX_FRAMES];
    Py_ssize_t walked_count = 0;
    int hit_cached_frame = 0;

    // Walk live frames from the top, stopping early if we reach the frame
    // recorded by the previous sample.
    Py_ssize_t size_at_start = PyList_GET_SIZE(frame_info);
    if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, gc_frame,
                            last_profiled_frame, &hit_cached_frame,
                            walked_addrs, &walked_count,
                            FRAME_CACHE_MAX_FRAMES) < 0) {
        return -1;
    }
    STATS_ADD(unwinder, frames_read_from_memory,
              PyList_GET_SIZE(frame_info) - size_at_start);

    if (hit_cached_frame) {
        // Try to splice in the cached continuation (frames and addresses)
        // below the point where the walk stopped.
        Py_ssize_t size_before_splice = PyList_GET_SIZE(frame_info);
        int found = frame_cache_lookup_and_extend(unwinder, thread_id,
                                                  last_profiled_frame,
                                                  frame_info, walked_addrs,
                                                  &walked_count,
                                                  FRAME_CACHE_MAX_FRAMES);
        if (found < 0) {
            return -1;
        }
        if (found != 0) {
            // Partial cache hit: walked the new suffix, reused the rest.
            STATS_INC(unwinder, frame_cache_partial_hits);
            STATS_ADD(unwinder, frames_read_from_cache,
                      PyList_GET_SIZE(frame_info) - size_before_splice);
        }
        else {
            // Stale cache: resume walking from last_profiled_frame to pick
            // up the remaining frames from remote memory.
            STATS_INC(unwinder, frame_cache_misses);
            Py_ssize_t size_before_resume = PyList_GET_SIZE(frame_info);
            if (process_frame_chain(unwinder, last_profiled_frame, chunks,
                                    frame_info, gc_frame, 0, NULL,
                                    walked_addrs, &walked_count,
                                    FRAME_CACHE_MAX_FRAMES) < 0) {
                return -1;
            }
            STATS_ADD(unwinder, frames_read_from_memory,
                      PyList_GET_SIZE(frame_info) - size_before_resume);
        }
    }
    else if (last_profiled_frame == 0) {
        // The cache could not participate in this sample at all.
        STATS_INC(unwinder, frame_cache_misses);
    }

    // Refresh the cache with what was just collected; frame_cache_store
    // truncates if walked_count exceeds FRAME_CACHE_MAX_FRAMES.
    if (frame_cache_store(unwinder, thread_id, frame_info,
                          walked_addrs, walked_count) < 0) {
        return -1;
    }
    return 0;
}