[3.15] gh-149584: Fix excessive overhead in the Tachyon profiler regarding the cache behavior (GH-149649) (#150152)

This commit is contained in:
Miss Islington (bot) 2026-05-20 13:59:10 +02:00 committed by GitHub
parent 7f29fa5032
commit 034c536d56
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 739 additions and 127 deletions

View file

@ -327,6 +327,33 @@ def _print_unwinder_stats(self):
print(f" Hits: {code_hits:n} ({ANSIColors.GREEN}{fmt(code_hits_pct)}%{ANSIColors.RESET})")
print(f" Misses: {code_misses:n} ({ANSIColors.RED}{fmt(code_misses_pct)}%{ANSIColors.RESET})")
batched_attempts = stats.get('batched_read_attempts', 0)
batched_successes = stats.get('batched_read_successes', 0)
batched_misses = stats.get('batched_read_misses', 0)
segments_requested = stats.get('batched_read_segments_requested', 0)
segments_completed = stats.get('batched_read_segments_completed', 0)
if batched_attempts > 0:
batched_success_rate = stats.get('batched_read_success_rate', 0.0)
batched_miss_rate = 100.0 - batched_success_rate
segment_completion_rate = stats.get(
'batched_read_segment_completion_rate', 0.0
)
print(f" {ANSIColors.CYAN}Batched Reads:{ANSIColors.RESET}")
print(f" Attempts: {batched_attempts:n}")
print(
f" Successes: {batched_successes:n} "
f"({ANSIColors.GREEN}{fmt(batched_success_rate)}%{ANSIColors.RESET})"
)
print(
f" Misses: {batched_misses:n} "
f"({ANSIColors.RED}{fmt(batched_miss_rate)}%{ANSIColors.RESET})"
)
print(
f" Segments read: {segments_completed:n}/{segments_requested:n} "
f"({ANSIColors.GREEN}{fmt(segment_completion_rate)}%{ANSIColors.RESET})"
)
# Memory operations
memory_reads = stats.get('memory_reads', 0)
memory_bytes = stats.get('memory_bytes_read', 0)

View file

@ -3767,6 +3767,13 @@ def test_get_stats(self):
"frames_read_from_cache",
"frames_read_from_memory",
"frame_cache_hit_rate",
"batched_read_attempts",
"batched_read_successes",
"batched_read_misses",
"batched_read_segments_requested",
"batched_read_segments_completed",
"batched_read_success_rate",
"batched_read_segment_completion_rate",
]
for key in expected_keys:
self.assertIn(key, stats)

View file

@ -0,0 +1,4 @@
Fix excessive overhead in the Tachyon profiler when inspecting a remote
process by avoiding repeated remote page-cache scans, batching predicted
remote reads, and reusing cached profiler result objects. Patch by Pablo
Galindo and Maurycy Pawłowski-Wieroński.

View file

@ -30,6 +30,7 @@ extern "C" {
#include "internal/pycore_llist.h" // struct llist_node
#include "internal/pycore_long.h" // _PyLong_GetZero
#include "internal/pycore_pyerrors.h" // _PyErr_FormatFromCause
#include "internal/pycore_pyhash.h" // _Py_HashPointerRaw
#include "internal/pycore_stackref.h" // Py_TAG_BITS
#include "../../Python/remote_debug.h"
@ -215,6 +216,8 @@ typedef struct {
PyObject *file_name;
int first_lineno;
PyObject *linetable; // bytes
PyObject *last_frame_info;
ptrdiff_t last_addrq;
uintptr_t addr_code_adaptive;
} CachedCodeMetadata;
@ -224,11 +227,41 @@ typedef struct {
// One slot of the per-thread frame cache: the frame list stored for a thread
// together with the addresses that were stored alongside it.
typedef struct {
uint64_t thread_id; // 0 = empty slot
uintptr_t thread_state_addr; // remote thread state address stored with the entry; reset to 0 on invalidation
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; // addresses copied in by frame_cache_store(); only the first num_addrs are valid
Py_ssize_t num_addrs; // number of valid entries in addrs[] (never above FRAME_CACHE_MAX_FRAMES)
PyObject *thread_id_obj; // owned reference, NULL if empty
PyObject *frame_list; // owned reference, NULL if empty
} FrameCacheEntry;
// Number of slots in the per-interpreter lookup tables (cached_tstates[] and
// cached_generations[]). The slot index is obtained by masking a pointer hash
// with (INTERPRETER_THREAD_CACHE_SIZE - 1), which is only a valid reduction
// when the size is a power of two -- hence the compile-time check below.
#define INTERPRETER_THREAD_CACHE_SIZE 32
#if (INTERPRETER_THREAD_CACHE_SIZE & (INTERPRETER_THREAD_CACHE_SIZE - 1)) != 0
# error "INTERPRETER_THREAD_CACHE_SIZE must be a power of two"
#endif
// The two per-interpreter L2 caches below are split into per-field tables so
// that a writer rebinding one slot cannot leave stale data in a field owned by
// the other when the slot is reused across interpreters.
// Table slot remembering the thread state address last recorded for one
// remote interpreter.
typedef struct {
uintptr_t interpreter_addr; // remote interpreter address owning this slot; 0 after the table is zeroed
uintptr_t thread_state_addr; // thread state address recorded for that interpreter
} InterpreterTstateCacheEntry;
// Table slot remembering the code object generation last observed for one
// remote interpreter.
typedef struct {
uintptr_t interpreter_addr; // remote interpreter address owning this slot; 0 after the table is zeroed
uint64_t code_object_generation; // generation value last read from that interpreter's state
} InterpreterGenerationCacheEntry;
// Carries already-read thread state and/or frame buffers across helpers so the
// downstream callee can skip a remote read. Address fields are caller-supplied
// inputs; buffer pointers (tstate, frame) are NULL unless a prior batched read
// successfully populated them.
typedef struct {
const char *tstate; // local copy of the thread state read from tstate_addr, or NULL if not read
uintptr_t tstate_addr; // remote thread state address (input; 0 = nothing to prefetch)
const char *frame; // local copy of the interpreter frame read from frame_addr, or NULL if not read
uintptr_t frame_addr; // remote frame address (input; 0 = no frame requested)
} RemoteReadPrefetch;
/* Statistics for profiling performance analysis */
typedef struct {
uint64_t total_samples; // Total number of get_stack_trace calls
@ -242,14 +275,44 @@ typedef struct {
uint64_t code_object_cache_hits; // Code object cache hits
uint64_t code_object_cache_misses; // Code object cache misses
uint64_t stale_cache_invalidations; // Times stale entries were cleared
uint64_t batched_read_attempts; // Batched remote-read attempts
uint64_t batched_read_successes; // Attempts that read all requested segments
uint64_t batched_read_misses; // Attempts that fell back or partially read
uint64_t batched_read_segments_requested; // Segments requested by batched reads
uint64_t batched_read_segments_completed; // Segments completed by batched reads
} UnwinderStats;
// Branch hint: tells GCC/Clang that `value` is expected to be false. On other
// compilers it expands to the bare expression, so it never changes semantics.
#if defined(__GNUC__) || defined(__clang__)
# define REMOTE_DEBUG_UNLIKELY(value) __builtin_expect(!!(value), 0)
#else
# define REMOTE_DEBUG_UNLIKELY(value) (value)
#endif
/* Stats tracking macros - no-op when stats collection is disabled */
#define STATS_INC(unwinder, field) \
do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field++; } while(0)
#define STATS_ADD(unwinder, field, val) \
do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field += (val); } while(0)
// Record the outcome of one batched remote read when stats collection is on:
// bumps the attempt counter, accumulates requested/completed segment counts,
// and classifies the attempt as a success only when every requested segment
// completed (anything else counts as a miss).
// NOTE: `requested` and `completed` are expanded more than once, so callers
// must pass side-effect-free expressions.
// Without HAVE_PROCESS_VM_READV the macro is a no-op.
#if HAVE_PROCESS_VM_READV
# define STATS_BATCHED_READ(unwinder, requested, completed) \
do { \
if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) { \
(unwinder)->stats.batched_read_attempts++; \
(unwinder)->stats.batched_read_segments_requested += (uint64_t)(requested); \
(unwinder)->stats.batched_read_segments_completed += (uint64_t)(completed); \
if ((completed) == (requested)) { \
(unwinder)->stats.batched_read_successes++; \
} \
else { \
(unwinder)->stats.batched_read_misses++; \
} \
} \
} while(0)
#else
# define STATS_BATCHED_READ(unwinder, requested, completed) ((void)0)
#endif
typedef struct {
PyTypeObject *RemoteDebugging_Type;
@ -290,7 +353,6 @@ typedef struct {
struct _Py_AsyncioModuleDebugOffsets async_debug_offsets;
uintptr_t interpreter_addr;
uintptr_t tstate_addr;
uint64_t code_object_generation;
_Py_hashtable_t *code_object_cache;
int debug;
int only_active_thread;
@ -302,9 +364,17 @@ typedef struct {
int cache_frames;
int collect_stats; // whether to collect statistics
uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale
// L1 single-entry shortcut over cached_tstates[]: most workloads sample one
// interpreter, so check these pairs before hashing into the table below.
uintptr_t cached_tstate_interpreter_addr;
uintptr_t cached_tstate_addr;
uintptr_t cached_generation_interpreter_addr;
uint64_t cached_code_object_generation;
RemoteDebuggingState *cached_state;
FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries
UnwinderStats stats; // statistics for performance analysis
InterpreterTstateCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE];
InterpreterGenerationCacheEntry cached_generations[INTERPRETER_THREAD_CACHE_SIZE];
#ifdef Py_GIL_DISABLED
uint32_t tlbc_generation;
_Py_hashtable_t *tlbc_cache;
@ -361,11 +431,13 @@ typedef struct {
typedef struct {
/* Inputs */
uintptr_t frame_addr; // Starting frame address
uintptr_t thread_state_addr; // Owning thread state address
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
uintptr_t gc_frame; // GC frame address (0 if not tracking)
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
StackChunkList *chunks; // Pre-copied stack chunks
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers
/* Outputs */
PyObject *frame_info; // List to append FrameInfo objects
@ -548,6 +620,7 @@ extern int process_frame_chain(
extern int frame_cache_init(RemoteUnwinderObject *unwinder);
extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
extern int frame_cache_lookup_and_extend(
@ -566,6 +639,7 @@ extern int frame_cache_store(
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited);
@ -605,7 +679,8 @@ extern PyObject* unwind_stack_for_thread(
uintptr_t *current_tstate,
uintptr_t gil_holder_tstate,
uintptr_t gc_frame,
uintptr_t main_thread_tstate
uintptr_t main_thread_tstate,
const RemoteReadPrefetch *prefetch
);
/* Thread stopping functions (for blocking mode) */

View file

@ -411,8 +411,15 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__,
" - code_object_cache_hits: Code object cache hits\n"
" - code_object_cache_misses: Code object cache misses\n"
" - stale_cache_invalidations: Times stale cache entries were cleared\n"
" - batched_read_attempts: Batched remote-read attempts\n"
" - batched_read_successes: Attempts that read all requested segments\n"
" - batched_read_misses: Attempts that fell back or partially read\n"
" - batched_read_segments_requested: Segments requested by batched reads\n"
" - batched_read_segments_completed: Segments completed by batched reads\n"
" - frame_cache_hit_rate: Percentage of samples that hit the cache\n"
" - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n"
" - batched_read_success_rate: Percentage of batched reads that completed all segments\n"
" - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads\n"
"\n"
"Raises:\n"
" RuntimeError: If stats collection was not enabled (stats=False)");
@ -1540,4 +1547,4 @@ skip_optional_kwonly:
exit:
return return_value;
}
/*[clinic end generated code: output=5e2a29746a0c5d65 input=a9049054013a1b77]*/
/*[clinic end generated code: output=884914b100e9c90c input=a9049054013a1b77]*/

View file

@ -405,6 +405,8 @@ parse_code_object(RemoteUnwinderObject *unwinder,
meta->func_name = func;
meta->file_name = file;
meta->linetable = linetable;
meta->last_frame_info = NULL;
meta->last_addrq = -1;
meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno);
meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive;
@ -482,6 +484,12 @@ parse_code_object(RemoteUnwinderObject *unwinder,
addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive;
#endif
; // Empty statement to avoid C23 extension warning
if (!unwinder->opcodes && meta->last_frame_info != NULL && meta->last_addrq == addrq) {
*result = Py_NewRef(meta->last_frame_info);
return 0;
}
LocationInfo info = {0};
bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable),
PyBytes_GET_SIZE(meta->linetable),
@ -529,6 +537,11 @@ parse_code_object(RemoteUnwinderObject *unwinder,
goto error;
}
if (!unwinder->opcodes) {
Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple));
meta->last_addrq = addrq;
}
*result = tuple;
return 0;

View file

@ -30,6 +30,7 @@ frame_cache_cleanup(RemoteUnwinderObject *unwinder)
return;
}
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
}
PyMem_Free(unwinder->frame_cache);
@ -53,6 +54,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
return NULL;
}
// Find the frame cache slot recorded for a remote thread state address.
//
// Returns a pointer into unwinder->frame_cache (not a new allocation), or
// NULL when the cache has not been allocated, when tstate_addr is 0, or when
// no slot carries that address. Address 0 is rejected up front because
// invalidated slots have their thread_state_addr reset to 0.
FrameCacheEntry *
frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
{
    FrameCacheEntry *slots = unwinder->frame_cache;
    if (slots == NULL || tstate_addr == 0) {
        return NULL;
    }

    FrameCacheEntry *end = slots + FRAME_CACHE_MAX_THREADS;
    for (FrameCacheEntry *slot = slots; slot < end; slot++) {
        if (slot->thread_state_addr != tstate_addr) {
            continue;
        }
        assert(slot->num_addrs <= FRAME_CACHE_MAX_FRAMES);
        return slot;
    }
    return NULL;
}
// Allocate a cache slot for a thread
// Returns NULL if cache is full (graceful degradation)
static FrameCacheEntry *
@ -127,8 +143,10 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
}
if (!found) {
// Clear this entry
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
unwinder->frame_cache[i].thread_id = 0;
unwinder->frame_cache[i].thread_state_addr = 0;
unwinder->frame_cache[i].num_addrs = 0;
STATS_INC(unwinder, stale_cache_invalidations);
}
@ -216,6 +234,7 @@ frame_cache_store(
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
@ -257,6 +276,13 @@ frame_cache_store(
return -1;
}
entry->thread_id = thread_id;
entry->thread_state_addr = thread_state_addr;
if (entry->thread_id_obj == NULL) {
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
if (entry->thread_id_obj == NULL) {
return -1;
}
}
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
entry->num_addrs = num_addrs;
assert(entry->num_addrs == num_addrs);

View file

@ -186,30 +186,16 @@ is_frame_valid(
return 1;
}
int
parse_frame_object(
static int
parse_frame_buffer(
RemoteUnwinderObject *unwinder,
PyObject** result,
uintptr_t address,
const char *frame,
uintptr_t* address_of_code_object,
uintptr_t* previous_frame
) {
char frame[SIZEOF_INTERP_FRAME];
*address_of_code_object = 0;
Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
&unwinder->handle,
address,
SIZEOF_INTERP_FRAME,
frame
);
if (bytes_read < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
return -1;
}
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
*previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous);
uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable);
int frame_valid = is_frame_valid(unwinder, (uintptr_t)frame, code_object);
@ -237,6 +223,31 @@ parse_frame_object(
return parse_code_object(unwinder, result, &code_ctx);
}
// Read one interpreter frame from the remote process at `address` and hand
// the raw bytes to parse_frame_buffer().
//
// Returns -1 with a RuntimeError cause set when the remote read fails;
// otherwise returns whatever parse_frame_buffer() returns. The outputs
// (*result, *address_of_code_object, *previous_frame) are filled in by
// parse_frame_buffer().
int
parse_frame_object(
    RemoteUnwinderObject *unwinder,
    PyObject** result,
    uintptr_t address,
    uintptr_t* address_of_code_object,
    uintptr_t* previous_frame
) {
    char raw_frame[SIZEOF_INTERP_FRAME];

    if (_Py_RemoteDebug_ReadRemoteMemory(&unwinder->handle, address,
                                         SIZEOF_INTERP_FRAME, raw_frame) < 0) {
        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
        return -1;
    }

    // Account for the read only once it has succeeded.
    STATS_INC(unwinder, memory_reads);
    STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);

    return parse_frame_buffer(unwinder, result, raw_frame,
                              address_of_code_object, previous_frame);
}
int
parse_frame_from_chunks(
RemoteUnwinderObject *unwinder,
@ -312,15 +323,32 @@ process_frame_chain(
}
assert(frame_count <= MAX_FRAMES);
if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) < 0) {
if (ctx->chunks && ctx->chunks->count > 0) {
if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) == 0) {
goto parsed_frame;
}
PyErr_Clear();
}
{
uintptr_t address_of_code_object = 0;
if (parse_frame_object(unwinder, &frame, frame_addr, &address_of_code_object, &next_frame_addr) < 0) {
int parse_result;
if (ctx->prefetch.frame && ctx->prefetch.frame_addr == frame_addr) {
parse_result = parse_frame_buffer(
unwinder, &frame, ctx->prefetch.frame,
&address_of_code_object, &next_frame_addr);
}
else {
parse_result = parse_frame_object(
unwinder, &frame, frame_addr,
&address_of_code_object, &next_frame_addr);
}
if (parse_result < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to parse frame object in chain");
return -1;
}
}
parsed_frame:
// Skip first frame if requested (used for cache miss continuation)
if (ctx->skip_first_frame && frame_count == 1) {
Py_XDECREF(frame);
@ -501,41 +529,37 @@ try_full_cache_hit(
PyObject *current_frame = NULL;
uintptr_t code_object_addr = 0;
uintptr_t previous_frame = 0;
int parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr,
int parse_result;
if (ctx->prefetch.frame && ctx->prefetch.frame_addr == ctx->frame_addr) {
parse_result = parse_frame_buffer(unwinder, &current_frame,
ctx->prefetch.frame,
&code_object_addr, &previous_frame);
}
else {
parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr,
&code_object_addr, &previous_frame);
}
if (parse_result < 0) {
return -1;
}
Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
PyObject *parent_slice = NULL;
if (cached_size > 1) {
parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size);
if (!parent_slice) {
Py_XDECREF(current_frame);
return -1;
}
}
if (current_frame != NULL) {
if (PyList_Append(ctx->frame_info, current_frame) < 0) {
Py_DECREF(current_frame);
Py_XDECREF(parent_slice);
return -1;
}
Py_DECREF(current_frame);
STATS_ADD(unwinder, frames_read_from_memory, 1);
}
if (parent_slice) {
Py_ssize_t cur_size = PyList_GET_SIZE(ctx->frame_info);
int result = PyList_SetSlice(ctx->frame_info, cur_size, cur_size, parent_slice);
Py_DECREF(parent_slice);
if (result < 0) {
Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
for (Py_ssize_t i = 1; i < cached_size; i++) {
PyObject *cached_frame = PyList_GET_ITEM(entry->frame_list, i);
if (PyList_Append(ctx->frame_info, cached_frame) < 0) {
return -1;
}
STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1);
}
STATS_ADD(unwinder, frames_read_from_cache, cached_size > 1 ? cached_size - 1 : 0);
STATS_INC(unwinder, frame_cache_hits);
return 1;
@ -606,7 +630,8 @@ collect_frames_with_cache(
}
if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs,
ctx->base_frame_addr, ctx->last_frame_visited) < 0) {
ctx->thread_state_addr, ctx->base_frame_addr,
ctx->last_frame_visited) < 0) {
return -1;
}

View file

@ -166,6 +166,7 @@ cached_code_metadata_destroy(void *ptr)
Py_DECREF(meta->func_name);
Py_DECREF(meta->file_name);
Py_DECREF(meta->linetable);
Py_XDECREF(meta->last_frame_info);
PyMem_RawFree(meta);
}
@ -360,6 +361,10 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
self->cache_frames = cache_frames;
self->collect_stats = stats;
self->stale_invalidation_counter = 0;
self->cached_tstate_interpreter_addr = 0;
self->cached_tstate_addr = 0;
memset(self->cached_tstates, 0, sizeof(self->cached_tstates));
memset(self->cached_generations, 0, sizeof(self->cached_generations));
self->debug = debug;
self->only_active_thread = only_active_thread;
self->mode = mode;
@ -473,6 +478,172 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
return 0;
}
// Map a remote interpreter address to a slot of the direct-mapped
// per-interpreter tables.
//
// Only the slot number is derived here. Entries store the full interpreter
// address and callers compare it on lookup, so two addresses landing in the
// same slot result in a miss rather than a value from the wrong interpreter.
static inline size_t
interpreter_thread_cache_index(uintptr_t interpreter_addr)
{
    const size_t slot_mask = INTERPRETER_THREAD_CACHE_SIZE - 1;
    const size_t hashed =
        (size_t)_Py_HashPointerRaw((const void *)interpreter_addr);
    return hashed & slot_mask;
}
// Return the thread state address previously recorded for `interpreter_addr`,
// or 0 when nothing is recorded (or when interpreter_addr itself is 0).
//
// The single-entry shortcut (cached_tstate_interpreter_addr /
// cached_tstate_addr) is consulted first; on a table hit the shortcut is
// refreshed so the next lookup for the same interpreter skips the hashing.
static inline uintptr_t
get_cached_tstate_for_interpreter(
    RemoteUnwinderObject *self,
    uintptr_t interpreter_addr)
{
    if (interpreter_addr == 0) {
        return 0;
    }
    if (interpreter_addr == self->cached_tstate_interpreter_addr) {
        return self->cached_tstate_addr;
    }

    const size_t slot = interpreter_thread_cache_index(interpreter_addr);
    const InterpreterTstateCacheEntry *hit = &self->cached_tstates[slot];
    if (hit->interpreter_addr != interpreter_addr) {
        // Empty slot, or slot currently bound to a different interpreter.
        return 0;
    }

    self->cached_tstate_interpreter_addr = interpreter_addr;
    self->cached_tstate_addr = hit->thread_state_addr;
    return hit->thread_state_addr;
}
// Record `thread_state_addr` as the thread state for `interpreter_addr`, both
// in the table slot for that interpreter and in the single-entry shortcut.
//
// Does nothing when either address is 0: 0 is what lookups return for
// "nothing recorded", so it is never stored.
static inline void
set_cached_tstate_for_interpreter(
    RemoteUnwinderObject *self,
    uintptr_t interpreter_addr,
    uintptr_t thread_state_addr)
{
    if (interpreter_addr == 0 || thread_state_addr == 0) {
        return;
    }

    // Rebind the table slot (overwriting any previous occupant) ...
    const size_t slot_index = interpreter_thread_cache_index(interpreter_addr);
    InterpreterTstateCacheEntry *slot = &self->cached_tstates[slot_index];
    slot->interpreter_addr = interpreter_addr;
    slot->thread_state_addr = thread_state_addr;

    // ... and keep the single-entry shortcut pointing at the same pair.
    self->cached_tstate_interpreter_addr = interpreter_addr;
    self->cached_tstate_addr = thread_state_addr;
}
// Compare the generation counters found in an already-read interpreter state
// buffer with the values remembered for that interpreter, and clear the
// dependent caches when they differ.
//
// self                 unwinder owning the caches
// interpreter_addr     remote address the buffer was read from
// interp_state_buffer  local copy of the remote interpreter state
//
// code_object_cache is cleared when the code object generation changed, or
// when the table slot for this interpreter was bound to another interpreter.
// On free-threaded builds, tlbc_cache is cleared when the TLBC generation
// changed.
static void
refresh_generation_caches_from_interp_state(
    RemoteUnwinderObject *self,
    uintptr_t interpreter_addr,
    const char *interp_state_buffer)
{
    const uint64_t observed_generation = GET_MEMBER(uint64_t, interp_state_buffer,
        self->debug_offsets.interpreter_state.code_object_generation);
    int must_flush;

    if (self->cached_generation_interpreter_addr == interpreter_addr) {
        // Shortcut pair already tracks this interpreter: only the generation
        // value can have moved.
        must_flush = (observed_generation != self->cached_code_object_generation);
    }
    else {
        const size_t slot_index = interpreter_thread_cache_index(interpreter_addr);
        InterpreterGenerationCacheEntry *slot = &self->cached_generations[slot_index];
        // A slot rebound from another interpreter must be treated as changed:
        // the code_object_cache is global, so even if the new generation
        // numerically matches what the previous occupant had, stale entries
        // from that occupant could still be served.
        must_flush = (slot->interpreter_addr != interpreter_addr)
            || (slot->code_object_generation != observed_generation);
        slot->interpreter_addr = interpreter_addr;
        slot->code_object_generation = observed_generation;
        self->cached_generation_interpreter_addr = interpreter_addr;
    }

    self->cached_code_object_generation = observed_generation;
    if (must_flush) {
        _Py_hashtable_clear(self->code_object_cache);
    }

#ifdef Py_GIL_DISABLED
    const uint32_t observed_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
        self->debug_offsets.interpreter_state.tlbc_generation);
    if (self->tlbc_generation != observed_tlbc_generation) {
        self->tlbc_generation = observed_tlbc_generation;
        _Py_hashtable_clear(self->tlbc_cache);
    }
#endif
}
// Read the interpreter state at `interpreter_addr` from the remote process
// and refresh the generation-dependent caches from it.
//
// Returns 0 on success. Returns -1 with a RuntimeError cause set when the
// remote read fails; the caches are left untouched in that case.
static int
refresh_generation_caches_for_interpreter(
    RemoteUnwinderObject *self,
    uintptr_t interpreter_addr)
{
    char state_bytes[INTERP_STATE_BUFFER_SIZE];
    const int read_ok = _Py_RemoteDebug_ReadRemoteMemory(
        &self->handle,
        interpreter_addr,
        INTERP_STATE_BUFFER_SIZE,
        state_bytes) >= 0;

    if (read_ok) {
        refresh_generation_caches_from_interp_state(self, interpreter_addr, state_bytes);
        return 0;
    }

    set_exception_cause(self, PyExc_RuntimeError,
                        "Failed to read interpreter state buffer");
    return -1;
}
// Read the interpreter state and, when a thread state address is predicted,
// try to fetch the thread state (and optionally one frame) in the same
// batched remote read.
//
// unwinder             unwinder whose handle is used for the reads
// interpreter_addr     remote address of the interpreter state
// interp_state_buffer  receives INTERP_STATE_BUFFER_SIZE bytes
// tstate_buffer        receives the thread state when the batch gets that far
// frame_buffer         receives SIZEOF_INTERP_FRAME bytes when the batch
//                      completes all three segments
// prefetch             in: tstate_addr / frame_addr (0 = not requested);
//                      out: tstate / frame point at the buffers above when the
//                      corresponding segment was read, NULL otherwise
//
// The byte count returned by the batched read is interpreted as a running
// total over the segments in order (interpreter state, thread state, frame).
// If not even the interpreter state segment completed, or no thread state
// address was supplied, the function falls back to a plain read of the
// interpreter state and returns that call's result. Returns 0 when the batch
// delivered at least the interpreter state.
static int
read_interp_state_and_maybe_thread_frame(
    RemoteUnwinderObject *unwinder,
    uintptr_t interpreter_addr,
    char *interp_state_buffer,
    char *tstate_buffer,
    char *frame_buffer,
    RemoteReadPrefetch *prefetch)
{
    const uintptr_t tstate_addr = prefetch->tstate_addr;
    const uintptr_t frame_addr = prefetch->frame_addr;

    // Outputs start out as "nothing prefetched".
    prefetch->tstate = NULL;
    prefetch->frame = NULL;

    if (tstate_addr != 0) {
        const size_t tstate_size = (size_t)unwinder->debug_offsets.thread_state.size;
        const int want_frame = (frame_addr != 0);
        const int nsegs = want_frame ? 3 : 2;
        _Py_RemoteReadSegment segments[3] = {
            {interpreter_addr, interp_state_buffer, INTERP_STATE_BUFFER_SIZE},
            {tstate_addr, tstate_buffer, tstate_size},
            {frame_addr, frame_buffer, SIZEOF_INTERP_FRAME},
        };
        const Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory(
            &unwinder->handle, segments, nsegs);

        // Translate the byte total into a count of fully read segments.
        int completed = 0;
        if (nread >= (Py_ssize_t)INTERP_STATE_BUFFER_SIZE) {
            const Py_ssize_t through_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE
                                              + (Py_ssize_t)tstate_size;
            if (want_frame
                && nread == through_tstate + (Py_ssize_t)SIZEOF_INTERP_FRAME) {
                completed = 3;
            }
            else if (nread >= through_tstate) {
                completed = 2;
            }
            else {
                completed = 1;
            }
        }
        STATS_BATCHED_READ(unwinder, nsegs, completed);

        if (completed > 0) {
            prefetch->tstate = (completed >= 2) ? tstate_buffer : NULL;
            prefetch->frame = (completed >= 3) ? frame_buffer : NULL;
            return 0;
        }
    }

    // No batch attempted, or it did not even cover the interpreter state.
    return _Py_RemoteDebug_ReadRemoteMemory(
        &unwinder->handle,
        interpreter_addr,
        INTERP_STATE_BUFFER_SIZE,
        interp_state_buffer);
}
/*[clinic input]
@permit_long_docstring_body
@critical_section
@ -537,15 +708,32 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
while (current_interpreter != 0) {
// Read interpreter state to get the interpreter ID
char interp_state_buffer[INTERP_STATE_BUFFER_SIZE];
if (_Py_RemoteDebug_PagedReadRemoteMemory(
&self->handle,
char prefetched_tstate[SIZEOF_THREAD_STATE];
char prefetched_frame[SIZEOF_INTERP_FRAME];
RemoteReadPrefetch prefetch = {0};
if (self->cache_frames) {
prefetch.tstate_addr = get_cached_tstate_for_interpreter(
self, current_interpreter);
}
if (prefetch.tstate_addr != 0) {
FrameCacheEntry *entry = frame_cache_find_by_tstate(self, prefetch.tstate_addr);
if (entry && entry->num_addrs > 0) {
prefetch.frame_addr = entry->addrs[0];
}
}
if (read_interp_state_and_maybe_thread_frame(
self,
current_interpreter,
INTERP_STATE_BUFFER_SIZE,
interp_state_buffer) < 0) {
interp_state_buffer,
prefetched_tstate,
prefetched_frame,
&prefetch) < 0) {
set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer");
Py_CLEAR(result);
goto exit;
}
refresh_generation_caches_from_interp_state(self, current_interpreter, interp_state_buffer);
uintptr_t gc_frame = 0;
if (self->gc) {
@ -557,25 +745,6 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
int64_t interpreter_id = GET_MEMBER(int64_t, interp_state_buffer,
self->debug_offsets.interpreter_state.id);
// Get code object generation from buffer
uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer,
self->debug_offsets.interpreter_state.code_object_generation);
if (code_object_generation != self->code_object_generation) {
self->code_object_generation = code_object_generation;
_Py_hashtable_clear(self->code_object_cache);
}
#ifdef Py_GIL_DISABLED
// Check TLBC generation and invalidate cache if needed
uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
self->debug_offsets.interpreter_state.tlbc_generation);
if (current_tlbc_generation != self->tlbc_generation) {
self->tlbc_generation = current_tlbc_generation;
_Py_hashtable_clear(self->tlbc_cache);
}
#endif
// Create a list to hold threads for this interpreter
PyObject *interpreter_threads = PyList_New(0);
if (!interpreter_threads) {
@ -611,6 +780,9 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
// Target specific thread (only process first interpreter)
current_tstate = self->tstate_addr;
}
if (current_tstate != 0 && self->cache_frames) {
set_cached_tstate_for_interpreter(self, current_interpreter, current_tstate);
}
// Acquire main thread state information
uintptr_t main_thread_tstate = GET_MEMBER(uintptr_t, interp_state_buffer,
@ -621,7 +793,8 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
PyObject* frame_info = unwind_stack_for_thread(self, &current_tstate,
gil_holder_tstate,
gc_frame,
main_thread_tstate);
main_thread_tstate,
&prefetch);
if (!frame_info) {
// Check if this was an intentional skip due to mode-based filtering
if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL ||
@ -771,6 +944,9 @@ _remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *s
if (ensure_async_debug_offsets(self) < 0) {
return NULL;
}
if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) {
return NULL;
}
PyObject *result = PyList_New(0);
if (result == NULL) {
@ -860,6 +1036,9 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject
if (ensure_async_debug_offsets(self) < 0) {
return NULL;
}
if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) {
return NULL;
}
PyObject *result = PyList_New(0);
if (result == NULL) {
@ -904,8 +1083,15 @@ RemoteUnwinder was created with stats=True.
- code_object_cache_hits: Code object cache hits
- code_object_cache_misses: Code object cache misses
- stale_cache_invalidations: Times stale cache entries were cleared
- batched_read_attempts: Batched remote-read attempts
- batched_read_successes: Attempts that read all requested segments
- batched_read_misses: Attempts that fell back or partially read
- batched_read_segments_requested: Segments requested by batched reads
- batched_read_segments_completed: Segments completed by batched reads
- frame_cache_hit_rate: Percentage of samples that hit the cache
- code_object_cache_hit_rate: Percentage of code object lookups that hit cache
- batched_read_success_rate: Percentage of batched reads that completed all segments
- batched_read_segment_completion_rate: Percentage of requested segments read by batched reads
Raises:
RuntimeError: If stats collection was not enabled (stats=False)
@ -913,7 +1099,7 @@ RemoteUnwinder was created with stats=True.
static PyObject *
_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/
/*[clinic end generated code: output=21e36477122be2a0 input=0392d62b278e9c35]*/
{
if (!self->collect_stats) {
PyErr_SetString(PyExc_RuntimeError,
@ -948,9 +1134,24 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
ADD_STAT(code_object_cache_hits);
ADD_STAT(code_object_cache_misses);
ADD_STAT(stale_cache_invalidations);
ADD_STAT(batched_read_attempts);
ADD_STAT(batched_read_successes);
ADD_STAT(batched_read_misses);
ADD_STAT(batched_read_segments_requested);
ADD_STAT(batched_read_segments_completed);
#undef ADD_STAT
// Store a computed floating-point statistic in `result` under key `name`.
// On failure (float creation or dict insertion) it releases the temporary and
// `result`, then returns NULL from the enclosing function -- so it may only
// be used where `result` is the sole object that needs cleanup.
#define ADD_DERIVED_STAT(name, value) do { \
PyObject *val = PyFloat_FromDouble(value); \
if (!val || PyDict_SetItemString(result, name, val) < 0) { \
Py_XDECREF(val); \
Py_DECREF(result); \
return NULL; \
} \
Py_DECREF(val); \
} while(0)
// Calculate and add derived statistics
// Hit rate is calculated as (hits + partial_hits) / total_cache_lookups
double frame_cache_hit_rate = 0.0;
@ -959,26 +1160,33 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits)
/ (double)total_cache_lookups;
}
PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate);
if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) {
Py_XDECREF(hit_rate);
Py_DECREF(result);
return NULL;
}
Py_DECREF(hit_rate);
ADD_DERIVED_STAT("frame_cache_hit_rate", frame_cache_hit_rate);
double code_object_hit_rate = 0.0;
uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses;
if (total_code_lookups > 0) {
code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups;
}
PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate);
if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) {
Py_XDECREF(code_hit_rate);
Py_DECREF(result);
return NULL;
ADD_DERIVED_STAT("code_object_cache_hit_rate", code_object_hit_rate);
double batched_read_success_rate = 0.0;
if (self->stats.batched_read_attempts > 0) {
batched_read_success_rate =
100.0 * (double)self->stats.batched_read_successes
/ (double)self->stats.batched_read_attempts;
}
Py_DECREF(code_hit_rate);
ADD_DERIVED_STAT("batched_read_success_rate", batched_read_success_rate);
double batched_read_segment_completion_rate = 0.0;
if (self->stats.batched_read_segments_requested > 0) {
batched_read_segment_completion_rate =
100.0 * (double)self->stats.batched_read_segments_completed
/ (double)self->stats.batched_read_segments_requested;
}
ADD_DERIVED_STAT("batched_read_segment_completion_rate",
batched_read_segment_completion_rate);
#undef ADD_DERIVED_STAT
return result;
}

View file

@ -289,28 +289,110 @@ typedef struct {
unsigned int :24;
} _thread_status;
// Read the thread state at `tstate_addr` into `tstate_buffer` and, when a
// predicted frame address is supplied, try to fetch that interpreter frame
// into `frame_buffer` with the same batched read.
//
// `*frame_read` is set to 1 only when both segments were read completely;
// otherwise it is 0 and `frame_buffer` must not be trusted.
//
// Returns 0 when the batched read delivered at least the thread state.
// In every other case (no prediction, or the batched read did not cover the
// thread state) the result of a plain _Py_RemoteDebug_ReadRemoteMemory() call
// for the thread state alone is returned.
static int
read_thread_state_and_maybe_frame(
    RemoteUnwinderObject *unwinder,
    uintptr_t tstate_addr,
    size_t tstate_size,
    char *tstate_buffer,
    uintptr_t predicted_frame_addr,
    char *frame_buffer,
    int *frame_read)
{
    *frame_read = 0;

    // Without a predicted frame there is nothing to batch.
    if (predicted_frame_addr == 0) {
        return _Py_RemoteDebug_ReadRemoteMemory(
            &unwinder->handle, tstate_addr, tstate_size, tstate_buffer);
    }

    _Py_RemoteReadSegment segments[2] = {
        {tstate_addr, tstate_buffer, tstate_size},
        {predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME},
    };
    Py_ssize_t got = _Py_RemoteDebug_BatchedReadRemoteMemory(
        &unwinder->handle, segments, 2);

    // Translate the byte count into "how many leading segments are complete".
    // A negative count (batched read unavailable or failed) counts as zero.
    int segments_done;
    if (got < (Py_ssize_t)tstate_size) {
        segments_done = 0;
    }
    else if (got == (Py_ssize_t)(tstate_size + SIZEOF_INTERP_FRAME)) {
        segments_done = 2;
    }
    else {
        segments_done = 1;
    }
    STATS_BATCHED_READ(unwinder, 2, segments_done);

    if (segments_done > 0) {
        *frame_read = (segments_done == 2);
        return 0;
    }

    // The batched attempt did not even cover the thread state: read it alone.
    return _Py_RemoteDebug_ReadRemoteMemory(
        &unwinder->handle, tstate_addr, tstate_size, tstate_buffer);
}
PyObject*
unwind_stack_for_thread(
RemoteUnwinderObject *unwinder,
uintptr_t *current_tstate,
uintptr_t gil_holder_tstate,
uintptr_t gc_frame,
uintptr_t main_thread_tstate
uintptr_t main_thread_tstate,
const RemoteReadPrefetch *prefetch
) {
PyObject *frame_info = NULL;
PyObject *thread_id = NULL;
PyObject *result = NULL;
StackChunkList chunks = {0};
char ts[SIZEOF_THREAD_STATE];
int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
&unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts);
if (bytes_read < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
goto error;
char local_ts[SIZEOF_THREAD_STATE];
char local_prefetched_frame[SIZEOF_INTERP_FRAME];
const char *ts;
RemoteReadPrefetch ctx_prefetch = {0};
if (prefetch->tstate && prefetch->tstate_addr == *current_tstate) {
ts = prefetch->tstate;
if (prefetch->frame) {
ctx_prefetch.frame = prefetch->frame;
ctx_prefetch.frame_addr = prefetch->frame_addr;
}
}
else if (unwinder->cache_frames) {
uintptr_t predicted_frame_addr = 0;
int have_prefetched_frame = 0;
FrameCacheEntry *entry = frame_cache_find_by_tstate(unwinder, *current_tstate);
if (entry && entry->num_addrs > 0) {
predicted_frame_addr = entry->addrs[0];
}
int rc = read_thread_state_and_maybe_frame(
unwinder,
*current_tstate,
(size_t)unwinder->debug_offsets.thread_state.size,
local_ts,
predicted_frame_addr,
local_prefetched_frame,
&have_prefetched_frame);
if (rc < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
goto error;
}
ts = local_ts;
if (have_prefetched_frame) {
ctx_prefetch.frame = local_prefetched_frame;
ctx_prefetch.frame_addr = predicted_frame_addr;
}
}
else {
int rc = _Py_RemoteDebug_ReadRemoteMemory(
&unwinder->handle,
*current_tstate,
(size_t)unwinder->debug_offsets.thread_state.size,
local_ts);
if (rc < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
goto error;
}
ts = local_ts;
}
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size);
if (ctx_prefetch.frame) {
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
}
long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);
@ -432,9 +514,11 @@ unwind_stack_for_thread(
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
FrameWalkContext ctx = {
.frame_addr = frame_addr,
.thread_state_addr = *current_tstate,
.base_frame_addr = base_frame_addr,
.gc_frame = gc_frame,
.chunks = &chunks,
.prefetch = ctx_prefetch,
.frame_info = frame_info,
.frame_addrs = addrs,
.num_addrs = 0,
@ -467,10 +551,18 @@ unwind_stack_for_thread(
*current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);
thread_id = PyLong_FromLongLong(tid);
if (unwinder->cache_frames) {
FrameCacheEntry *entry = frame_cache_find(unwinder, (uint64_t)tid);
if (entry && entry->thread_id_obj) {
thread_id = Py_NewRef(entry->thread_id_obj);
}
}
if (thread_id == NULL) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
goto error;
thread_id = PyLong_FromLongLong(tid);
if (thread_id == NULL) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
goto error;
}
}
RemoteDebuggingState *state = RemoteDebugging_GetStateFromObject((PyObject*)unwinder);

View file

@ -147,6 +147,7 @@ typedef struct {
int memfd;
#endif
page_cache_entry_t pages[MAX_PAGES];
int page_cache_count;
Py_ssize_t page_size;
} proc_handle_t;
@ -185,14 +186,16 @@ _Py_RemoteDebug_FreePageCache(proc_handle_t *handle)
handle->pages[i].data = NULL;
handle->pages[i].valid = 0;
}
handle->page_cache_count = 0;
}
UNUSED static void
_Py_RemoteDebug_ClearCache(proc_handle_t *handle)
{
for (int i = 0; i < MAX_PAGES; i++) {
for (int i = 0; i < handle->page_cache_count; i++) {
handle->pages[i].valid = 0;
}
handle->page_cache_count = 0;
}
#if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX
@ -222,6 +225,7 @@ _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) {
handle->memfd = -1;
#endif
handle->page_size = get_page_size();
handle->page_cache_count = 0;
for (int i = 0; i < MAX_PAGES; i++) {
handle->pages[i].data = NULL;
handle->pages[i].valid = 0;
@ -1287,8 +1291,9 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out);
}
// Search for valid cached page
for (int i = 0; i < MAX_PAGES; i++) {
// Search only the pages used since the last clear. The cache is cleared
// between profiler samples, so entries are packed at the front.
for (int i = 0; i < handle->page_cache_count; i++) {
page_cache_entry_t *entry = &handle->pages[i];
if (entry->valid && entry->page_addr == page_base) {
memcpy(out, entry->data + offset_in_page, size);
@ -1296,33 +1301,31 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
}
}
// Find reusable slot
for (int i = 0; i < MAX_PAGES; i++) {
page_cache_entry_t *entry = &handle->pages[i];
if (!entry->valid) {
if (handle->page_cache_count < MAX_PAGES) {
page_cache_entry_t *entry = &handle->pages[handle->page_cache_count];
if (entry->data == NULL) {
entry->data = PyMem_RawMalloc(page_size);
if (entry->data == NULL) {
entry->data = PyMem_RawMalloc(page_size);
if (entry->data == NULL) {
PyErr_NoMemory();
_set_debug_exception_cause(PyExc_MemoryError,
"Cannot allocate %zu bytes for page cache entry "
"during read from PID %d at address 0x%lx",
page_size, handle->pid, addr);
return -1;
}
PyErr_NoMemory();
_set_debug_exception_cause(PyExc_MemoryError,
"Cannot allocate %zu bytes for page cache entry "
"during read from PID %d at address 0x%lx",
page_size, handle->pid, addr);
return -1;
}
if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) {
// Try to just copy the exact amount as a fallback
PyErr_Clear();
goto fallback;
}
entry->page_addr = page_base;
entry->valid = 1;
memcpy(out, entry->data + offset_in_page, size);
return 0;
}
if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) {
// Try to just copy the exact amount as a fallback
PyErr_Clear();
goto fallback;
}
entry->page_addr = page_base;
entry->valid = 1;
handle->page_cache_count++;
memcpy(out, entry->data + offset_in_page, size);
return 0;
}
fallback:
@ -1330,6 +1333,49 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out);
}
// One region of a batched remote read: `size` bytes are copied from
// `remote_addr` in the target process into `local_buf` in this process.
typedef struct {
uintptr_t remote_addr;  // source address in the remote process
void *local_buf;  // destination buffer in the local process
size_t size;  // number of bytes to transfer for this segment
} _Py_RemoteReadSegment;
// Upper bound on the number of segments in one batched read. It sizes the
// on-stack iovec arrays in _Py_RemoteDebug_BatchedReadRemoteMemory, which
// returns -1 for requests with more segments than this.
#define _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS 4
// Read several remote memory regions with one process_vm_readv() call where
// that syscall is available (Linux builds with HAVE_PROCESS_VM_READV).
//
// Returns the total number of bytes transferred (>= 0) on success. Returns -1
// when the batched path is not taken (unsupported platform, handle->memfd is
// not -1, or nsegs is outside 1.._PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS) or
// when the syscall itself fails. This function does not set a Python
// exception; callers are expected to fall back to another read path.
//
// Callers compare the returned byte count with the cumulative segment sizes
// to work out which leading segments were filled completely.
UNUSED static Py_ssize_t
_Py_RemoteDebug_BatchedReadRemoteMemory(
    proc_handle_t *handle,
    const _Py_RemoteReadSegment *segments,
    int nsegs)
{
#if defined(__linux__) && HAVE_PROCESS_VM_READV
    if (handle->memfd != -1
        || nsegs <= 0
        || nsegs > _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS) {
        return -1;
    }

    struct iovec local_iov[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS];
    struct iovec remote_iov[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS];
    for (int seg = 0; seg < nsegs; seg++) {
        const _Py_RemoteReadSegment *src = &segments[seg];
        local_iov[seg].iov_base = src->local_buf;
        local_iov[seg].iov_len = src->size;
        remote_iov[seg].iov_base = (void *)src->remote_addr;
        remote_iov[seg].iov_len = src->size;
    }

    ssize_t transferred = process_vm_readv(
        handle->pid, local_iov, nsegs, remote_iov, nsegs, 0);
    if (transferred < 0) {
        return -1;
    }
    return (Py_ssize_t)transferred;
#else
    (void)handle;
    (void)segments;
    (void)nsegs;
    return -1;
#endif
}
UNUSED static int
_Py_RemoteDebug_ReadDebugOffsets(
proc_handle_t *handle,

View file

@ -151,6 +151,45 @@ def create_threads(n):
time.sleep(0.05)
'''
ASYNC_CODE = '''\
import asyncio
import contextlib
import math
def compute_slice(seed):
result = 0.0
for i in range(2000):
result += math.sin(seed + i) * math.sqrt(i + 1)
return result
async def leaf_task(seed):
total = 0.0
while True:
total += compute_slice(seed)
await asyncio.sleep(0)
async def parent_task(seed):
child = asyncio.create_task(leaf_task(seed + 1000), name=f"leaf-{seed}")
try:
while True:
compute_slice(seed)
await asyncio.sleep(0.001)
finally:
child.cancel()
with contextlib.suppress(asyncio.CancelledError):
await child
async def main():
tasks = [
asyncio.create_task(parent_task(i), name=f"parent-{i}")
for i in range(8)
]
await asyncio.gather(*tasks)
if __name__ == "__main__":
asyncio.run(main())
'''
CODE_EXAMPLES = {
"basic": {
"code": CODE,
@ -164,10 +203,29 @@ def create_threads(n):
"code": CODE_WITH_TONS_OF_THREADS,
"description": "Tons of threads doing mixed CPU/IO work",
},
"asyncio": {
"code": ASYNC_CODE,
"description": "Asyncio tasks with active and awaited coroutine chains",
},
}
# Benchmarkable unwinder operations, keyed by the value accepted by the
# --operation command-line option. For each entry, "method" is the attribute
# name looked up on the unwinder object and called once per sample, and
# "label" is the human-readable name shown in the benchmark output.
OPERATIONS = {
"stack_trace": {
"method": "get_stack_trace",
"label": "get_stack_trace()",
},
"async_stack_trace": {
"method": "get_async_stack_trace",
"label": "get_async_stack_trace()",
},
"all_awaited_by": {
"method": "get_all_awaited_by",
"label": "get_all_awaited_by()",
},
}
def benchmark(unwinder, duration_seconds=10, blocking=False):
def benchmark(unwinder, duration_seconds=10, blocking=False, operation="stack_trace"):
"""Benchmark mode - measure raw sampling speed for specified duration"""
sample_count = 0
fail_count = 0
@ -175,11 +233,14 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
start_time = time.perf_counter()
end_time = start_time + duration_seconds
total_attempts = 0
operation_info = OPERATIONS[operation]
operation_method = getattr(unwinder, operation_info["method"])
colors = get_colors(can_colorize())
print(
f"{colors.BOLD_BLUE}Benchmarking sampling speed for {duration_seconds} seconds...{colors.RESET}"
f"{colors.BOLD_BLUE}Benchmarking {operation_info['label']} speed "
f"for {duration_seconds} seconds...{colors.RESET}"
)
try:
@ -190,8 +251,8 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
if blocking:
unwinder.pause_threads()
try:
stack_trace = unwinder.get_stack_trace()
if stack_trace:
sample = operation_method()
if sample:
sample_count += 1
finally:
if blocking:
@ -239,6 +300,7 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
(sample_count / total_attempts) * 100 if total_attempts > 0 else 0
),
"total_work_time": total_work_time,
"operation": operation_info["label"],
"avg_work_time_us": (
(total_work_time / total_attempts) * 1e6 if total_attempts > 0 else 0
),
@ -252,7 +314,7 @@ def print_benchmark_results(results):
colors = get_colors(can_colorize())
print(f"\n{colors.BOLD_GREEN}{'='*60}{colors.RESET}")
print(f"{colors.BOLD_GREEN}get_stack_trace() Benchmark Results{colors.RESET}")
print(f"{colors.BOLD_GREEN}{results['operation']} Benchmark Results{colors.RESET}")
print(f"{colors.BOLD_GREEN}{'='*60}{colors.RESET}")
# Basic statistics
@ -329,6 +391,8 @@ def parse_arguments():
%(prog)s -d 60 # Run basic benchmark for 60 seconds
%(prog)s --code deep_static # Run deep static call stack benchmark
%(prog)s --code deep_static -d 30 # Run deep static benchmark for 30 seconds
%(prog)s --operation async_stack_trace
%(prog)s --operation all_awaited_by
Available code examples:
{examples_desc}
@ -348,8 +412,15 @@ def parse_arguments():
"--code",
"-c",
choices=list(CODE_EXAMPLES.keys()),
default="basic",
help="Code example to benchmark (default: basic)",
default=None,
help="Code example to benchmark (default: basic, or asyncio for async operations)",
)
parser.add_argument(
"--operation",
choices=list(OPERATIONS.keys()),
default="stack_trace",
help="Remote unwinder operation to benchmark (default: stack_trace)",
)
parser.add_argument(
@ -365,7 +436,10 @@ def parse_arguments():
help="Stop all threads before sampling for consistent snapshots",
)
return parser.parse_args()
args = parser.parse_args()
if args.code is None:
args.code = "asyncio" if args.operation != "stack_trace" else "basic"
return args
def create_target_process(temp_file, code_example="basic"):
@ -420,6 +494,9 @@ def main():
print(
f"{colors.CYAN}Benchmark Duration:{colors.RESET} {colors.YELLOW}{args.duration}{colors.RESET} seconds"
)
print(
f"{colors.CYAN}Operation:{colors.RESET} {colors.GREEN}{OPERATIONS[args.operation]['label']}{colors.RESET}"
)
print(
f"{colors.CYAN}Blocking Mode:{colors.RESET} {colors.GREEN if args.blocking else colors.YELLOW}{'enabled' if args.blocking else 'disabled'}{colors.RESET}"
)
@ -451,7 +528,12 @@ def main():
unwinder = _remote_debugging.RemoteUnwinder(
process.pid, cache_frames=True, **kwargs
)
results = benchmark(unwinder, duration_seconds=args.duration, blocking=args.blocking)
results = benchmark(
unwinder,
duration_seconds=args.duration,
blocking=args.blocking,
operation=args.operation,
)
finally:
cleanup_process(process, temp_file_path)