mirror of
https://github.com/python/cpython.git
synced 2026-06-23 01:21:05 +00:00
[3.15] gh-149584: Fix excessive overhead in the Tachyon profiler regarding the cache behavior (GH-149649) (#150152)
This commit is contained in:
parent
7f29fa5032
commit
034c536d56
12 changed files with 739 additions and 127 deletions
|
|
@ -327,6 +327,33 @@ def _print_unwinder_stats(self):
|
|||
print(f" Hits: {code_hits:n} ({ANSIColors.GREEN}{fmt(code_hits_pct)}%{ANSIColors.RESET})")
|
||||
print(f" Misses: {code_misses:n} ({ANSIColors.RED}{fmt(code_misses_pct)}%{ANSIColors.RESET})")
|
||||
|
||||
batched_attempts = stats.get('batched_read_attempts', 0)
|
||||
batched_successes = stats.get('batched_read_successes', 0)
|
||||
batched_misses = stats.get('batched_read_misses', 0)
|
||||
segments_requested = stats.get('batched_read_segments_requested', 0)
|
||||
segments_completed = stats.get('batched_read_segments_completed', 0)
|
||||
if batched_attempts > 0:
|
||||
batched_success_rate = stats.get('batched_read_success_rate', 0.0)
|
||||
batched_miss_rate = 100.0 - batched_success_rate
|
||||
segment_completion_rate = stats.get(
|
||||
'batched_read_segment_completion_rate', 0.0
|
||||
)
|
||||
|
||||
print(f" {ANSIColors.CYAN}Batched Reads:{ANSIColors.RESET}")
|
||||
print(f" Attempts: {batched_attempts:n}")
|
||||
print(
|
||||
f" Successes: {batched_successes:n} "
|
||||
f"({ANSIColors.GREEN}{fmt(batched_success_rate)}%{ANSIColors.RESET})"
|
||||
)
|
||||
print(
|
||||
f" Misses: {batched_misses:n} "
|
||||
f"({ANSIColors.RED}{fmt(batched_miss_rate)}%{ANSIColors.RESET})"
|
||||
)
|
||||
print(
|
||||
f" Segments read: {segments_completed:n}/{segments_requested:n} "
|
||||
f"({ANSIColors.GREEN}{fmt(segment_completion_rate)}%{ANSIColors.RESET})"
|
||||
)
|
||||
|
||||
# Memory operations
|
||||
memory_reads = stats.get('memory_reads', 0)
|
||||
memory_bytes = stats.get('memory_bytes_read', 0)
|
||||
|
|
|
|||
|
|
@ -3767,6 +3767,13 @@ def test_get_stats(self):
|
|||
"frames_read_from_cache",
|
||||
"frames_read_from_memory",
|
||||
"frame_cache_hit_rate",
|
||||
"batched_read_attempts",
|
||||
"batched_read_successes",
|
||||
"batched_read_misses",
|
||||
"batched_read_segments_requested",
|
||||
"batched_read_segments_completed",
|
||||
"batched_read_success_rate",
|
||||
"batched_read_segment_completion_rate",
|
||||
]
|
||||
for key in expected_keys:
|
||||
self.assertIn(key, stats)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,4 @@
|
|||
Fix excessive overhead in the Tachyon profiler when inspecting a remote
|
||||
process by avoiding repeated remote page-cache scans, batching predicted
|
||||
remote reads, and reusing cached profiler result objects. Patch by Pablo
|
||||
Galindo and Maurycy Pawłowski-Wieroński.
|
||||
|
|
@ -30,6 +30,7 @@ extern "C" {
|
|||
#include "internal/pycore_llist.h" // struct llist_node
|
||||
#include "internal/pycore_long.h" // _PyLong_GetZero
|
||||
#include "internal/pycore_pyerrors.h" // _PyErr_FormatFromCause
|
||||
#include "internal/pycore_pyhash.h" // _Py_HashPointerRaw
|
||||
#include "internal/pycore_stackref.h" // Py_TAG_BITS
|
||||
#include "../../Python/remote_debug.h"
|
||||
|
||||
|
|
@ -215,6 +216,8 @@ typedef struct {
|
|||
PyObject *file_name;
|
||||
int first_lineno;
|
||||
PyObject *linetable; // bytes
|
||||
PyObject *last_frame_info;
|
||||
ptrdiff_t last_addrq;
|
||||
uintptr_t addr_code_adaptive;
|
||||
} CachedCodeMetadata;
|
||||
|
||||
|
|
@ -224,11 +227,41 @@ typedef struct {
|
|||
|
||||
typedef struct {
|
||||
uint64_t thread_id; // 0 = empty slot
|
||||
uintptr_t thread_state_addr;
|
||||
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
|
||||
Py_ssize_t num_addrs;
|
||||
PyObject *thread_id_obj; // owned reference, NULL if empty
|
||||
PyObject *frame_list; // owned reference, NULL if empty
|
||||
} FrameCacheEntry;
|
||||
|
||||
#define INTERPRETER_THREAD_CACHE_SIZE 32
|
||||
#if (INTERPRETER_THREAD_CACHE_SIZE & (INTERPRETER_THREAD_CACHE_SIZE - 1)) != 0
|
||||
# error "INTERPRETER_THREAD_CACHE_SIZE must be a power of two"
|
||||
#endif
|
||||
|
||||
// The two per-interpreter L2 caches below are split into per-field tables so
|
||||
// that a writer rebinding one slot cannot leave stale data in a field owned by
|
||||
// the other when the slot is reused across interpreters.
|
||||
typedef struct {
|
||||
uintptr_t interpreter_addr;
|
||||
uintptr_t thread_state_addr;
|
||||
} InterpreterTstateCacheEntry;
|
||||
typedef struct {
|
||||
uintptr_t interpreter_addr;
|
||||
uint64_t code_object_generation;
|
||||
} InterpreterGenerationCacheEntry;
|
||||
|
||||
// Carries already-read thread state and/or frame buffers across helpers so the
|
||||
// downstream callee can skip a remote read. Address fields are caller-supplied
|
||||
// inputs; buffer pointers (tstate, frame) are NULL unless a prior batched read
|
||||
// successfully populated them.
|
||||
typedef struct {
|
||||
const char *tstate;
|
||||
uintptr_t tstate_addr;
|
||||
const char *frame;
|
||||
uintptr_t frame_addr;
|
||||
} RemoteReadPrefetch;
|
||||
|
||||
/* Statistics for profiling performance analysis */
|
||||
typedef struct {
|
||||
uint64_t total_samples; // Total number of get_stack_trace calls
|
||||
|
|
@ -242,14 +275,44 @@ typedef struct {
|
|||
uint64_t code_object_cache_hits; // Code object cache hits
|
||||
uint64_t code_object_cache_misses; // Code object cache misses
|
||||
uint64_t stale_cache_invalidations; // Times stale entries were cleared
|
||||
uint64_t batched_read_attempts; // Batched remote-read attempts
|
||||
uint64_t batched_read_successes; // Attempts that read all requested segments
|
||||
uint64_t batched_read_misses; // Attempts that fell back or partially read
|
||||
uint64_t batched_read_segments_requested; // Segments requested by batched reads
|
||||
uint64_t batched_read_segments_completed; // Segments completed by batched reads
|
||||
} UnwinderStats;
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
# define REMOTE_DEBUG_UNLIKELY(value) __builtin_expect(!!(value), 0)
|
||||
#else
|
||||
# define REMOTE_DEBUG_UNLIKELY(value) (value)
|
||||
#endif
|
||||
|
||||
/* Stats tracking macros - no-op when stats collection is disabled */
|
||||
#define STATS_INC(unwinder, field) \
|
||||
do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
|
||||
do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field++; } while(0)
|
||||
|
||||
#define STATS_ADD(unwinder, field, val) \
|
||||
do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
|
||||
do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field += (val); } while(0)
|
||||
|
||||
#if HAVE_PROCESS_VM_READV
|
||||
# define STATS_BATCHED_READ(unwinder, requested, completed) \
|
||||
do { \
|
||||
if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) { \
|
||||
(unwinder)->stats.batched_read_attempts++; \
|
||||
(unwinder)->stats.batched_read_segments_requested += (uint64_t)(requested); \
|
||||
(unwinder)->stats.batched_read_segments_completed += (uint64_t)(completed); \
|
||||
if ((completed) == (requested)) { \
|
||||
(unwinder)->stats.batched_read_successes++; \
|
||||
} \
|
||||
else { \
|
||||
(unwinder)->stats.batched_read_misses++; \
|
||||
} \
|
||||
} \
|
||||
} while(0)
|
||||
#else
|
||||
# define STATS_BATCHED_READ(unwinder, requested, completed) ((void)0)
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
PyTypeObject *RemoteDebugging_Type;
|
||||
|
|
@ -290,7 +353,6 @@ typedef struct {
|
|||
struct _Py_AsyncioModuleDebugOffsets async_debug_offsets;
|
||||
uintptr_t interpreter_addr;
|
||||
uintptr_t tstate_addr;
|
||||
uint64_t code_object_generation;
|
||||
_Py_hashtable_t *code_object_cache;
|
||||
int debug;
|
||||
int only_active_thread;
|
||||
|
|
@ -302,9 +364,17 @@ typedef struct {
|
|||
int cache_frames;
|
||||
int collect_stats; // whether to collect statistics
|
||||
uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale
|
||||
// L1 single-entry shortcut over cached_tstates[]: most workloads sample one
|
||||
// interpreter, so check these pairs before hashing into the table below.
|
||||
uintptr_t cached_tstate_interpreter_addr;
|
||||
uintptr_t cached_tstate_addr;
|
||||
uintptr_t cached_generation_interpreter_addr;
|
||||
uint64_t cached_code_object_generation;
|
||||
RemoteDebuggingState *cached_state;
|
||||
FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries
|
||||
UnwinderStats stats; // statistics for performance analysis
|
||||
InterpreterTstateCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE];
|
||||
InterpreterGenerationCacheEntry cached_generations[INTERPRETER_THREAD_CACHE_SIZE];
|
||||
#ifdef Py_GIL_DISABLED
|
||||
uint32_t tlbc_generation;
|
||||
_Py_hashtable_t *tlbc_cache;
|
||||
|
|
@ -361,11 +431,13 @@ typedef struct {
|
|||
typedef struct {
|
||||
/* Inputs */
|
||||
uintptr_t frame_addr; // Starting frame address
|
||||
uintptr_t thread_state_addr; // Owning thread state address
|
||||
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
|
||||
uintptr_t gc_frame; // GC frame address (0 if not tracking)
|
||||
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
|
||||
StackChunkList *chunks; // Pre-copied stack chunks
|
||||
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
|
||||
RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers
|
||||
|
||||
/* Outputs */
|
||||
PyObject *frame_info; // List to append FrameInfo objects
|
||||
|
|
@ -548,6 +620,7 @@ extern int process_frame_chain(
|
|||
extern int frame_cache_init(RemoteUnwinderObject *unwinder);
|
||||
extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
|
||||
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
|
||||
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
|
||||
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
|
||||
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
|
||||
extern int frame_cache_lookup_and_extend(
|
||||
|
|
@ -566,6 +639,7 @@ extern int frame_cache_store(
|
|||
PyObject *frame_list,
|
||||
const uintptr_t *addrs,
|
||||
Py_ssize_t num_addrs,
|
||||
uintptr_t thread_state_addr,
|
||||
uintptr_t base_frame_addr,
|
||||
uintptr_t last_frame_visited);
|
||||
|
||||
|
|
@ -605,7 +679,8 @@ extern PyObject* unwind_stack_for_thread(
|
|||
uintptr_t *current_tstate,
|
||||
uintptr_t gil_holder_tstate,
|
||||
uintptr_t gc_frame,
|
||||
uintptr_t main_thread_tstate
|
||||
uintptr_t main_thread_tstate,
|
||||
const RemoteReadPrefetch *prefetch
|
||||
);
|
||||
|
||||
/* Thread stopping functions (for blocking mode) */
|
||||
|
|
|
|||
9
Modules/_remote_debugging/clinic/module.c.h
generated
9
Modules/_remote_debugging/clinic/module.c.h
generated
|
|
@ -411,8 +411,15 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__,
|
|||
" - code_object_cache_hits: Code object cache hits\n"
|
||||
" - code_object_cache_misses: Code object cache misses\n"
|
||||
" - stale_cache_invalidations: Times stale cache entries were cleared\n"
|
||||
" - batched_read_attempts: Batched remote-read attempts\n"
|
||||
" - batched_read_successes: Attempts that read all requested segments\n"
|
||||
" - batched_read_misses: Attempts that fell back or partially read\n"
|
||||
" - batched_read_segments_requested: Segments requested by batched reads\n"
|
||||
" - batched_read_segments_completed: Segments completed by batched reads\n"
|
||||
" - frame_cache_hit_rate: Percentage of samples that hit the cache\n"
|
||||
" - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n"
|
||||
" - batched_read_success_rate: Percentage of batched reads that completed all segments\n"
|
||||
" - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads\n"
|
||||
"\n"
|
||||
"Raises:\n"
|
||||
" RuntimeError: If stats collection was not enabled (stats=False)");
|
||||
|
|
@ -1540,4 +1547,4 @@ skip_optional_kwonly:
|
|||
exit:
|
||||
return return_value;
|
||||
}
|
||||
/*[clinic end generated code: output=5e2a29746a0c5d65 input=a9049054013a1b77]*/
|
||||
/*[clinic end generated code: output=884914b100e9c90c input=a9049054013a1b77]*/
|
||||
|
|
|
|||
|
|
@ -405,6 +405,8 @@ parse_code_object(RemoteUnwinderObject *unwinder,
|
|||
meta->func_name = func;
|
||||
meta->file_name = file;
|
||||
meta->linetable = linetable;
|
||||
meta->last_frame_info = NULL;
|
||||
meta->last_addrq = -1;
|
||||
meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno);
|
||||
meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive;
|
||||
|
||||
|
|
@ -482,6 +484,12 @@ parse_code_object(RemoteUnwinderObject *unwinder,
|
|||
addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive;
|
||||
#endif
|
||||
; // Empty statement to avoid C23 extension warning
|
||||
|
||||
if (!unwinder->opcodes && meta->last_frame_info != NULL && meta->last_addrq == addrq) {
|
||||
*result = Py_NewRef(meta->last_frame_info);
|
||||
return 0;
|
||||
}
|
||||
|
||||
LocationInfo info = {0};
|
||||
bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable),
|
||||
PyBytes_GET_SIZE(meta->linetable),
|
||||
|
|
@ -529,6 +537,11 @@ parse_code_object(RemoteUnwinderObject *unwinder,
|
|||
goto error;
|
||||
}
|
||||
|
||||
if (!unwinder->opcodes) {
|
||||
Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple));
|
||||
meta->last_addrq = addrq;
|
||||
}
|
||||
|
||||
*result = tuple;
|
||||
return 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ frame_cache_cleanup(RemoteUnwinderObject *unwinder)
|
|||
return;
|
||||
}
|
||||
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
||||
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
|
||||
Py_CLEAR(unwinder->frame_cache[i].frame_list);
|
||||
}
|
||||
PyMem_Free(unwinder->frame_cache);
|
||||
|
|
@ -53,6 +54,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
FrameCacheEntry *
|
||||
frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
|
||||
{
|
||||
if (!unwinder->frame_cache || tstate_addr == 0) {
|
||||
return NULL;
|
||||
}
|
||||
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
|
||||
if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) {
|
||||
assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
|
||||
return &unwinder->frame_cache[i];
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Allocate a cache slot for a thread
|
||||
// Returns NULL if cache is full (graceful degradation)
|
||||
static FrameCacheEntry *
|
||||
|
|
@ -127,8 +143,10 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
|
|||
}
|
||||
if (!found) {
|
||||
// Clear this entry
|
||||
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
|
||||
Py_CLEAR(unwinder->frame_cache[i].frame_list);
|
||||
unwinder->frame_cache[i].thread_id = 0;
|
||||
unwinder->frame_cache[i].thread_state_addr = 0;
|
||||
unwinder->frame_cache[i].num_addrs = 0;
|
||||
STATS_INC(unwinder, stale_cache_invalidations);
|
||||
}
|
||||
|
|
@ -216,6 +234,7 @@ frame_cache_store(
|
|||
PyObject *frame_list,
|
||||
const uintptr_t *addrs,
|
||||
Py_ssize_t num_addrs,
|
||||
uintptr_t thread_state_addr,
|
||||
uintptr_t base_frame_addr,
|
||||
uintptr_t last_frame_visited)
|
||||
{
|
||||
|
|
@ -257,6 +276,13 @@ frame_cache_store(
|
|||
return -1;
|
||||
}
|
||||
entry->thread_id = thread_id;
|
||||
entry->thread_state_addr = thread_state_addr;
|
||||
if (entry->thread_id_obj == NULL) {
|
||||
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
|
||||
if (entry->thread_id_obj == NULL) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
|
||||
entry->num_addrs = num_addrs;
|
||||
assert(entry->num_addrs == num_addrs);
|
||||
|
|
|
|||
|
|
@ -186,30 +186,16 @@ is_frame_valid(
|
|||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
parse_frame_object(
|
||||
static int
|
||||
parse_frame_buffer(
|
||||
RemoteUnwinderObject *unwinder,
|
||||
PyObject** result,
|
||||
uintptr_t address,
|
||||
const char *frame,
|
||||
uintptr_t* address_of_code_object,
|
||||
uintptr_t* previous_frame
|
||||
) {
|
||||
char frame[SIZEOF_INTERP_FRAME];
|
||||
*address_of_code_object = 0;
|
||||
|
||||
Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
|
||||
&unwinder->handle,
|
||||
address,
|
||||
SIZEOF_INTERP_FRAME,
|
||||
frame
|
||||
);
|
||||
if (bytes_read < 0) {
|
||||
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
|
||||
return -1;
|
||||
}
|
||||
STATS_INC(unwinder, memory_reads);
|
||||
STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
|
||||
|
||||
*previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous);
|
||||
uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable);
|
||||
int frame_valid = is_frame_valid(unwinder, (uintptr_t)frame, code_object);
|
||||
|
|
@ -237,6 +223,31 @@ parse_frame_object(
|
|||
return parse_code_object(unwinder, result, &code_ctx);
|
||||
}
|
||||
|
||||
int
|
||||
parse_frame_object(
|
||||
RemoteUnwinderObject *unwinder,
|
||||
PyObject** result,
|
||||
uintptr_t address,
|
||||
uintptr_t* address_of_code_object,
|
||||
uintptr_t* previous_frame
|
||||
) {
|
||||
char frame[SIZEOF_INTERP_FRAME];
|
||||
Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory(
|
||||
&unwinder->handle,
|
||||
address,
|
||||
SIZEOF_INTERP_FRAME,
|
||||
frame
|
||||
);
|
||||
if (bytes_read < 0) {
|
||||
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
|
||||
return -1;
|
||||
}
|
||||
STATS_INC(unwinder, memory_reads);
|
||||
STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
|
||||
|
||||
return parse_frame_buffer(unwinder, result, frame, address_of_code_object, previous_frame);
|
||||
}
|
||||
|
||||
int
|
||||
parse_frame_from_chunks(
|
||||
RemoteUnwinderObject *unwinder,
|
||||
|
|
@ -312,15 +323,32 @@ process_frame_chain(
|
|||
}
|
||||
assert(frame_count <= MAX_FRAMES);
|
||||
|
||||
if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) < 0) {
|
||||
if (ctx->chunks && ctx->chunks->count > 0) {
|
||||
if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) == 0) {
|
||||
goto parsed_frame;
|
||||
}
|
||||
PyErr_Clear();
|
||||
}
|
||||
{
|
||||
uintptr_t address_of_code_object = 0;
|
||||
if (parse_frame_object(unwinder, &frame, frame_addr, &address_of_code_object, &next_frame_addr) < 0) {
|
||||
int parse_result;
|
||||
if (ctx->prefetch.frame && ctx->prefetch.frame_addr == frame_addr) {
|
||||
parse_result = parse_frame_buffer(
|
||||
unwinder, &frame, ctx->prefetch.frame,
|
||||
&address_of_code_object, &next_frame_addr);
|
||||
}
|
||||
else {
|
||||
parse_result = parse_frame_object(
|
||||
unwinder, &frame, frame_addr,
|
||||
&address_of_code_object, &next_frame_addr);
|
||||
}
|
||||
if (parse_result < 0) {
|
||||
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to parse frame object in chain");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
parsed_frame:
|
||||
// Skip first frame if requested (used for cache miss continuation)
|
||||
if (ctx->skip_first_frame && frame_count == 1) {
|
||||
Py_XDECREF(frame);
|
||||
|
|
@ -501,41 +529,37 @@ try_full_cache_hit(
|
|||
PyObject *current_frame = NULL;
|
||||
uintptr_t code_object_addr = 0;
|
||||
uintptr_t previous_frame = 0;
|
||||
int parse_result = parse_frame_object(unwinder, ¤t_frame, ctx->frame_addr,
|
||||
int parse_result;
|
||||
if (ctx->prefetch.frame && ctx->prefetch.frame_addr == ctx->frame_addr) {
|
||||
parse_result = parse_frame_buffer(unwinder, ¤t_frame,
|
||||
ctx->prefetch.frame,
|
||||
&code_object_addr, &previous_frame);
|
||||
}
|
||||
else {
|
||||
parse_result = parse_frame_object(unwinder, ¤t_frame, ctx->frame_addr,
|
||||
&code_object_addr, &previous_frame);
|
||||
}
|
||||
if (parse_result < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
|
||||
PyObject *parent_slice = NULL;
|
||||
if (cached_size > 1) {
|
||||
parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size);
|
||||
if (!parent_slice) {
|
||||
Py_XDECREF(current_frame);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (current_frame != NULL) {
|
||||
if (PyList_Append(ctx->frame_info, current_frame) < 0) {
|
||||
Py_DECREF(current_frame);
|
||||
Py_XDECREF(parent_slice);
|
||||
return -1;
|
||||
}
|
||||
Py_DECREF(current_frame);
|
||||
STATS_ADD(unwinder, frames_read_from_memory, 1);
|
||||
}
|
||||
|
||||
if (parent_slice) {
|
||||
Py_ssize_t cur_size = PyList_GET_SIZE(ctx->frame_info);
|
||||
int result = PyList_SetSlice(ctx->frame_info, cur_size, cur_size, parent_slice);
|
||||
Py_DECREF(parent_slice);
|
||||
if (result < 0) {
|
||||
Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
|
||||
for (Py_ssize_t i = 1; i < cached_size; i++) {
|
||||
PyObject *cached_frame = PyList_GET_ITEM(entry->frame_list, i);
|
||||
if (PyList_Append(ctx->frame_info, cached_frame) < 0) {
|
||||
return -1;
|
||||
}
|
||||
STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1);
|
||||
}
|
||||
STATS_ADD(unwinder, frames_read_from_cache, cached_size > 1 ? cached_size - 1 : 0);
|
||||
|
||||
STATS_INC(unwinder, frame_cache_hits);
|
||||
return 1;
|
||||
|
|
@ -606,7 +630,8 @@ collect_frames_with_cache(
|
|||
}
|
||||
|
||||
if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs,
|
||||
ctx->base_frame_addr, ctx->last_frame_visited) < 0) {
|
||||
ctx->thread_state_addr, ctx->base_frame_addr,
|
||||
ctx->last_frame_visited) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -166,6 +166,7 @@ cached_code_metadata_destroy(void *ptr)
|
|||
Py_DECREF(meta->func_name);
|
||||
Py_DECREF(meta->file_name);
|
||||
Py_DECREF(meta->linetable);
|
||||
Py_XDECREF(meta->last_frame_info);
|
||||
PyMem_RawFree(meta);
|
||||
}
|
||||
|
||||
|
|
@ -360,6 +361,10 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
|
|||
self->cache_frames = cache_frames;
|
||||
self->collect_stats = stats;
|
||||
self->stale_invalidation_counter = 0;
|
||||
self->cached_tstate_interpreter_addr = 0;
|
||||
self->cached_tstate_addr = 0;
|
||||
memset(self->cached_tstates, 0, sizeof(self->cached_tstates));
|
||||
memset(self->cached_generations, 0, sizeof(self->cached_generations));
|
||||
self->debug = debug;
|
||||
self->only_active_thread = only_active_thread;
|
||||
self->mode = mode;
|
||||
|
|
@ -473,6 +478,172 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline size_t
|
||||
interpreter_thread_cache_index(uintptr_t interpreter_addr)
|
||||
{
|
||||
// Direct-mapped table indexed by the remote interpreter address. Each entry
|
||||
// stores the full address and verifies it on lookup, so hash collisions
|
||||
// degrade to misses and cannot return a value from the wrong interpreter.
|
||||
return (size_t)_Py_HashPointerRaw((const void *)interpreter_addr)
|
||||
& (INTERPRETER_THREAD_CACHE_SIZE - 1);
|
||||
}
|
||||
|
||||
static inline uintptr_t
|
||||
get_cached_tstate_for_interpreter(
|
||||
RemoteUnwinderObject *self,
|
||||
uintptr_t interpreter_addr)
|
||||
{
|
||||
if (interpreter_addr == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (self->cached_tstate_interpreter_addr == interpreter_addr) {
|
||||
return self->cached_tstate_addr;
|
||||
}
|
||||
|
||||
InterpreterTstateCacheEntry *entry =
|
||||
&self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)];
|
||||
if (entry->interpreter_addr == interpreter_addr) {
|
||||
self->cached_tstate_interpreter_addr = interpreter_addr;
|
||||
self->cached_tstate_addr = entry->thread_state_addr;
|
||||
return entry->thread_state_addr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void
|
||||
set_cached_tstate_for_interpreter(
|
||||
RemoteUnwinderObject *self,
|
||||
uintptr_t interpreter_addr,
|
||||
uintptr_t thread_state_addr)
|
||||
{
|
||||
if (interpreter_addr == 0 || thread_state_addr == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
self->cached_tstate_interpreter_addr = interpreter_addr;
|
||||
self->cached_tstate_addr = thread_state_addr;
|
||||
|
||||
InterpreterTstateCacheEntry *entry =
|
||||
&self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)];
|
||||
entry->interpreter_addr = interpreter_addr;
|
||||
entry->thread_state_addr = thread_state_addr;
|
||||
}
|
||||
|
||||
static void
|
||||
refresh_generation_caches_from_interp_state(
|
||||
RemoteUnwinderObject *self,
|
||||
uintptr_t interpreter_addr,
|
||||
const char *interp_state_buffer)
|
||||
{
|
||||
uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer,
|
||||
self->debug_offsets.interpreter_state.code_object_generation);
|
||||
|
||||
if (self->cached_generation_interpreter_addr == interpreter_addr) {
|
||||
if (code_object_generation != self->cached_code_object_generation) {
|
||||
self->cached_code_object_generation = code_object_generation;
|
||||
_Py_hashtable_clear(self->code_object_cache);
|
||||
}
|
||||
}
|
||||
else {
|
||||
InterpreterGenerationCacheEntry *entry =
|
||||
&self->cached_generations[interpreter_thread_cache_index(interpreter_addr)];
|
||||
// A slot rebound from another interpreter must be treated as changed:
|
||||
// the code_object_cache is global, so even if the new generation
|
||||
// numerically matches what the previous occupant had, stale entries
|
||||
// from that occupant could still be served.
|
||||
int changed = entry->interpreter_addr != interpreter_addr
|
||||
|| entry->code_object_generation != code_object_generation;
|
||||
entry->interpreter_addr = interpreter_addr;
|
||||
entry->code_object_generation = code_object_generation;
|
||||
if (changed) {
|
||||
_Py_hashtable_clear(self->code_object_cache);
|
||||
}
|
||||
self->cached_generation_interpreter_addr = interpreter_addr;
|
||||
self->cached_code_object_generation = code_object_generation;
|
||||
}
|
||||
|
||||
#ifdef Py_GIL_DISABLED
|
||||
uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
|
||||
self->debug_offsets.interpreter_state.tlbc_generation);
|
||||
if (current_tlbc_generation != self->tlbc_generation) {
|
||||
self->tlbc_generation = current_tlbc_generation;
|
||||
_Py_hashtable_clear(self->tlbc_cache);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static int
|
||||
refresh_generation_caches_for_interpreter(
|
||||
RemoteUnwinderObject *self,
|
||||
uintptr_t interpreter_addr)
|
||||
{
|
||||
char interp_state_buffer[INTERP_STATE_BUFFER_SIZE];
|
||||
if (_Py_RemoteDebug_ReadRemoteMemory(
|
||||
&self->handle,
|
||||
interpreter_addr,
|
||||
INTERP_STATE_BUFFER_SIZE,
|
||||
interp_state_buffer) < 0) {
|
||||
set_exception_cause(self, PyExc_RuntimeError,
|
||||
"Failed to read interpreter state buffer");
|
||||
return -1;
|
||||
}
|
||||
refresh_generation_caches_from_interp_state(self, interpreter_addr, interp_state_buffer);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
read_interp_state_and_maybe_thread_frame(
|
||||
RemoteUnwinderObject *unwinder,
|
||||
uintptr_t interpreter_addr,
|
||||
char *interp_state_buffer,
|
||||
char *tstate_buffer,
|
||||
char *frame_buffer,
|
||||
RemoteReadPrefetch *prefetch)
|
||||
{
|
||||
prefetch->tstate = NULL;
|
||||
prefetch->frame = NULL;
|
||||
if (prefetch->tstate_addr != 0) {
|
||||
size_t tstate_size = (size_t)unwinder->debug_offsets.thread_state.size;
|
||||
_Py_RemoteReadSegment segments[3] = {
|
||||
{interpreter_addr, interp_state_buffer, INTERP_STATE_BUFFER_SIZE},
|
||||
{prefetch->tstate_addr, tstate_buffer, tstate_size},
|
||||
{prefetch->frame_addr, frame_buffer, SIZEOF_INTERP_FRAME},
|
||||
};
|
||||
int nsegs = prefetch->frame_addr != 0 ? 3 : 2;
|
||||
Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory(
|
||||
&unwinder->handle, segments, nsegs);
|
||||
int completed = 0;
|
||||
if (nread >= (Py_ssize_t)INTERP_STATE_BUFFER_SIZE) {
|
||||
completed = 1;
|
||||
Py_ssize_t with_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE
|
||||
+ (Py_ssize_t)tstate_size;
|
||||
if (nread >= with_tstate) {
|
||||
completed = 2;
|
||||
}
|
||||
if (nsegs == 3
|
||||
&& nread == with_tstate + (Py_ssize_t)SIZEOF_INTERP_FRAME) {
|
||||
completed = 3;
|
||||
}
|
||||
}
|
||||
STATS_BATCHED_READ(unwinder, nsegs, completed);
|
||||
if (completed >= 1) {
|
||||
if (completed >= 2) {
|
||||
prefetch->tstate = tstate_buffer;
|
||||
}
|
||||
if (completed >= 3) {
|
||||
prefetch->frame = frame_buffer;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return _Py_RemoteDebug_ReadRemoteMemory(
|
||||
&unwinder->handle,
|
||||
interpreter_addr,
|
||||
INTERP_STATE_BUFFER_SIZE,
|
||||
interp_state_buffer);
|
||||
}
|
||||
|
||||
/*[clinic input]
|
||||
@permit_long_docstring_body
|
||||
@critical_section
|
||||
|
|
@ -537,15 +708,32 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
|
|||
while (current_interpreter != 0) {
|
||||
// Read interpreter state to get the interpreter ID
|
||||
char interp_state_buffer[INTERP_STATE_BUFFER_SIZE];
|
||||
if (_Py_RemoteDebug_PagedReadRemoteMemory(
|
||||
&self->handle,
|
||||
char prefetched_tstate[SIZEOF_THREAD_STATE];
|
||||
char prefetched_frame[SIZEOF_INTERP_FRAME];
|
||||
RemoteReadPrefetch prefetch = {0};
|
||||
if (self->cache_frames) {
|
||||
prefetch.tstate_addr = get_cached_tstate_for_interpreter(
|
||||
self, current_interpreter);
|
||||
}
|
||||
if (prefetch.tstate_addr != 0) {
|
||||
FrameCacheEntry *entry = frame_cache_find_by_tstate(self, prefetch.tstate_addr);
|
||||
if (entry && entry->num_addrs > 0) {
|
||||
prefetch.frame_addr = entry->addrs[0];
|
||||
}
|
||||
}
|
||||
|
||||
if (read_interp_state_and_maybe_thread_frame(
|
||||
self,
|
||||
current_interpreter,
|
||||
INTERP_STATE_BUFFER_SIZE,
|
||||
interp_state_buffer) < 0) {
|
||||
interp_state_buffer,
|
||||
prefetched_tstate,
|
||||
prefetched_frame,
|
||||
&prefetch) < 0) {
|
||||
set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer");
|
||||
Py_CLEAR(result);
|
||||
goto exit;
|
||||
}
|
||||
refresh_generation_caches_from_interp_state(self, current_interpreter, interp_state_buffer);
|
||||
|
||||
uintptr_t gc_frame = 0;
|
||||
if (self->gc) {
|
||||
|
|
@ -557,25 +745,6 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
|
|||
int64_t interpreter_id = GET_MEMBER(int64_t, interp_state_buffer,
|
||||
self->debug_offsets.interpreter_state.id);
|
||||
|
||||
// Get code object generation from buffer
|
||||
uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer,
|
||||
self->debug_offsets.interpreter_state.code_object_generation);
|
||||
|
||||
if (code_object_generation != self->code_object_generation) {
|
||||
self->code_object_generation = code_object_generation;
|
||||
_Py_hashtable_clear(self->code_object_cache);
|
||||
}
|
||||
|
||||
#ifdef Py_GIL_DISABLED
|
||||
// Check TLBC generation and invalidate cache if needed
|
||||
uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
|
||||
self->debug_offsets.interpreter_state.tlbc_generation);
|
||||
if (current_tlbc_generation != self->tlbc_generation) {
|
||||
self->tlbc_generation = current_tlbc_generation;
|
||||
_Py_hashtable_clear(self->tlbc_cache);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Create a list to hold threads for this interpreter
|
||||
PyObject *interpreter_threads = PyList_New(0);
|
||||
if (!interpreter_threads) {
|
||||
|
|
@ -611,6 +780,9 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
|
|||
// Target specific thread (only process first interpreter)
|
||||
current_tstate = self->tstate_addr;
|
||||
}
|
||||
if (current_tstate != 0 && self->cache_frames) {
|
||||
set_cached_tstate_for_interpreter(self, current_interpreter, current_tstate);
|
||||
}
|
||||
|
||||
// Acquire main thread state information
|
||||
uintptr_t main_thread_tstate = GET_MEMBER(uintptr_t, interp_state_buffer,
|
||||
|
|
@ -621,7 +793,8 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
|
|||
PyObject* frame_info = unwind_stack_for_thread(self, ¤t_tstate,
|
||||
gil_holder_tstate,
|
||||
gc_frame,
|
||||
main_thread_tstate);
|
||||
main_thread_tstate,
|
||||
&prefetch);
|
||||
if (!frame_info) {
|
||||
// Check if this was an intentional skip due to mode-based filtering
|
||||
if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL ||
|
||||
|
|
@ -771,6 +944,9 @@ _remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *s
|
|||
if (ensure_async_debug_offsets(self) < 0) {
|
||||
return NULL;
|
||||
}
|
||||
if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
PyObject *result = PyList_New(0);
|
||||
if (result == NULL) {
|
||||
|
|
@ -860,6 +1036,9 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject
|
|||
if (ensure_async_debug_offsets(self) < 0) {
|
||||
return NULL;
|
||||
}
|
||||
if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
PyObject *result = PyList_New(0);
|
||||
if (result == NULL) {
|
||||
|
|
@ -904,8 +1083,15 @@ RemoteUnwinder was created with stats=True.
|
|||
- code_object_cache_hits: Code object cache hits
|
||||
- code_object_cache_misses: Code object cache misses
|
||||
- stale_cache_invalidations: Times stale cache entries were cleared
|
||||
- batched_read_attempts: Batched remote-read attempts
|
||||
- batched_read_successes: Attempts that read all requested segments
|
||||
- batched_read_misses: Attempts that fell back or partially read
|
||||
- batched_read_segments_requested: Segments requested by batched reads
|
||||
- batched_read_segments_completed: Segments completed by batched reads
|
||||
- frame_cache_hit_rate: Percentage of samples that hit the cache
|
||||
- code_object_cache_hit_rate: Percentage of code object lookups that hit cache
|
||||
- batched_read_success_rate: Percentage of batched reads that completed all segments
|
||||
- batched_read_segment_completion_rate: Percentage of requested segments read by batched reads
|
||||
|
||||
Raises:
|
||||
RuntimeError: If stats collection was not enabled (stats=False)
|
||||
|
|
@ -913,7 +1099,7 @@ RemoteUnwinder was created with stats=True.
|
|||
|
||||
static PyObject *
|
||||
_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
|
||||
/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/
|
||||
/*[clinic end generated code: output=21e36477122be2a0 input=0392d62b278e9c35]*/
|
||||
{
|
||||
if (!self->collect_stats) {
|
||||
PyErr_SetString(PyExc_RuntimeError,
|
||||
|
|
@ -948,9 +1134,24 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
|
|||
ADD_STAT(code_object_cache_hits);
|
||||
ADD_STAT(code_object_cache_misses);
|
||||
ADD_STAT(stale_cache_invalidations);
|
||||
ADD_STAT(batched_read_attempts);
|
||||
ADD_STAT(batched_read_successes);
|
||||
ADD_STAT(batched_read_misses);
|
||||
ADD_STAT(batched_read_segments_requested);
|
||||
ADD_STAT(batched_read_segments_completed);
|
||||
|
||||
#undef ADD_STAT
|
||||
|
||||
#define ADD_DERIVED_STAT(name, value) do { \
|
||||
PyObject *val = PyFloat_FromDouble(value); \
|
||||
if (!val || PyDict_SetItemString(result, name, val) < 0) { \
|
||||
Py_XDECREF(val); \
|
||||
Py_DECREF(result); \
|
||||
return NULL; \
|
||||
} \
|
||||
Py_DECREF(val); \
|
||||
} while(0)
|
||||
|
||||
// Calculate and add derived statistics
|
||||
// Hit rate is calculated as (hits + partial_hits) / total_cache_lookups
|
||||
double frame_cache_hit_rate = 0.0;
|
||||
|
|
@ -959,26 +1160,33 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
|
|||
frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits)
|
||||
/ (double)total_cache_lookups;
|
||||
}
|
||||
PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate);
|
||||
if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) {
|
||||
Py_XDECREF(hit_rate);
|
||||
Py_DECREF(result);
|
||||
return NULL;
|
||||
}
|
||||
Py_DECREF(hit_rate);
|
||||
ADD_DERIVED_STAT("frame_cache_hit_rate", frame_cache_hit_rate);
|
||||
|
||||
double code_object_hit_rate = 0.0;
|
||||
uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses;
|
||||
if (total_code_lookups > 0) {
|
||||
code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups;
|
||||
}
|
||||
PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate);
|
||||
if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) {
|
||||
Py_XDECREF(code_hit_rate);
|
||||
Py_DECREF(result);
|
||||
return NULL;
|
||||
ADD_DERIVED_STAT("code_object_cache_hit_rate", code_object_hit_rate);
|
||||
|
||||
double batched_read_success_rate = 0.0;
|
||||
if (self->stats.batched_read_attempts > 0) {
|
||||
batched_read_success_rate =
|
||||
100.0 * (double)self->stats.batched_read_successes
|
||||
/ (double)self->stats.batched_read_attempts;
|
||||
}
|
||||
Py_DECREF(code_hit_rate);
|
||||
ADD_DERIVED_STAT("batched_read_success_rate", batched_read_success_rate);
|
||||
|
||||
double batched_read_segment_completion_rate = 0.0;
|
||||
if (self->stats.batched_read_segments_requested > 0) {
|
||||
batched_read_segment_completion_rate =
|
||||
100.0 * (double)self->stats.batched_read_segments_completed
|
||||
/ (double)self->stats.batched_read_segments_requested;
|
||||
}
|
||||
ADD_DERIVED_STAT("batched_read_segment_completion_rate",
|
||||
batched_read_segment_completion_rate);
|
||||
|
||||
#undef ADD_DERIVED_STAT
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -289,28 +289,110 @@ typedef struct {
|
|||
unsigned int :24;
|
||||
} _thread_status;
|
||||
|
||||
static int
|
||||
read_thread_state_and_maybe_frame(
|
||||
RemoteUnwinderObject *unwinder,
|
||||
uintptr_t tstate_addr,
|
||||
size_t tstate_size,
|
||||
char *tstate_buffer,
|
||||
uintptr_t predicted_frame_addr,
|
||||
char *frame_buffer,
|
||||
int *frame_read)
|
||||
{
|
||||
*frame_read = 0;
|
||||
if (predicted_frame_addr != 0) {
|
||||
_Py_RemoteReadSegment segments[2] = {
|
||||
{tstate_addr, tstate_buffer, tstate_size},
|
||||
{predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME},
|
||||
};
|
||||
Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory(
|
||||
&unwinder->handle, segments, 2);
|
||||
int completed = 0;
|
||||
if (nread >= (Py_ssize_t)tstate_size) {
|
||||
completed = 1;
|
||||
if (nread == (Py_ssize_t)(tstate_size + SIZEOF_INTERP_FRAME)) {
|
||||
completed = 2;
|
||||
}
|
||||
}
|
||||
STATS_BATCHED_READ(unwinder, 2, completed);
|
||||
if (completed >= 1) {
|
||||
*frame_read = completed == 2;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return _Py_RemoteDebug_ReadRemoteMemory(
|
||||
&unwinder->handle, tstate_addr, tstate_size, tstate_buffer);
|
||||
}
|
||||
|
||||
PyObject*
|
||||
unwind_stack_for_thread(
|
||||
RemoteUnwinderObject *unwinder,
|
||||
uintptr_t *current_tstate,
|
||||
uintptr_t gil_holder_tstate,
|
||||
uintptr_t gc_frame,
|
||||
uintptr_t main_thread_tstate
|
||||
uintptr_t main_thread_tstate,
|
||||
const RemoteReadPrefetch *prefetch
|
||||
) {
|
||||
PyObject *frame_info = NULL;
|
||||
PyObject *thread_id = NULL;
|
||||
PyObject *result = NULL;
|
||||
StackChunkList chunks = {0};
|
||||
|
||||
char ts[SIZEOF_THREAD_STATE];
|
||||
int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
|
||||
&unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts);
|
||||
if (bytes_read < 0) {
|
||||
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
|
||||
goto error;
|
||||
char local_ts[SIZEOF_THREAD_STATE];
|
||||
char local_prefetched_frame[SIZEOF_INTERP_FRAME];
|
||||
const char *ts;
|
||||
RemoteReadPrefetch ctx_prefetch = {0};
|
||||
if (prefetch->tstate && prefetch->tstate_addr == *current_tstate) {
|
||||
ts = prefetch->tstate;
|
||||
if (prefetch->frame) {
|
||||
ctx_prefetch.frame = prefetch->frame;
|
||||
ctx_prefetch.frame_addr = prefetch->frame_addr;
|
||||
}
|
||||
}
|
||||
else if (unwinder->cache_frames) {
|
||||
uintptr_t predicted_frame_addr = 0;
|
||||
int have_prefetched_frame = 0;
|
||||
FrameCacheEntry *entry = frame_cache_find_by_tstate(unwinder, *current_tstate);
|
||||
if (entry && entry->num_addrs > 0) {
|
||||
predicted_frame_addr = entry->addrs[0];
|
||||
}
|
||||
|
||||
int rc = read_thread_state_and_maybe_frame(
|
||||
unwinder,
|
||||
*current_tstate,
|
||||
(size_t)unwinder->debug_offsets.thread_state.size,
|
||||
local_ts,
|
||||
predicted_frame_addr,
|
||||
local_prefetched_frame,
|
||||
&have_prefetched_frame);
|
||||
if (rc < 0) {
|
||||
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
|
||||
goto error;
|
||||
}
|
||||
ts = local_ts;
|
||||
if (have_prefetched_frame) {
|
||||
ctx_prefetch.frame = local_prefetched_frame;
|
||||
ctx_prefetch.frame_addr = predicted_frame_addr;
|
||||
}
|
||||
}
|
||||
else {
|
||||
int rc = _Py_RemoteDebug_ReadRemoteMemory(
|
||||
&unwinder->handle,
|
||||
*current_tstate,
|
||||
(size_t)unwinder->debug_offsets.thread_state.size,
|
||||
local_ts);
|
||||
if (rc < 0) {
|
||||
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
|
||||
goto error;
|
||||
}
|
||||
ts = local_ts;
|
||||
}
|
||||
STATS_INC(unwinder, memory_reads);
|
||||
STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size);
|
||||
if (ctx_prefetch.frame) {
|
||||
STATS_INC(unwinder, memory_reads);
|
||||
STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
|
||||
}
|
||||
|
||||
long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);
|
||||
|
||||
|
|
@ -432,9 +514,11 @@ unwind_stack_for_thread(
|
|||
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
|
||||
FrameWalkContext ctx = {
|
||||
.frame_addr = frame_addr,
|
||||
.thread_state_addr = *current_tstate,
|
||||
.base_frame_addr = base_frame_addr,
|
||||
.gc_frame = gc_frame,
|
||||
.chunks = &chunks,
|
||||
.prefetch = ctx_prefetch,
|
||||
.frame_info = frame_info,
|
||||
.frame_addrs = addrs,
|
||||
.num_addrs = 0,
|
||||
|
|
@ -467,10 +551,18 @@ unwind_stack_for_thread(
|
|||
|
||||
*current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);
|
||||
|
||||
thread_id = PyLong_FromLongLong(tid);
|
||||
if (unwinder->cache_frames) {
|
||||
FrameCacheEntry *entry = frame_cache_find(unwinder, (uint64_t)tid);
|
||||
if (entry && entry->thread_id_obj) {
|
||||
thread_id = Py_NewRef(entry->thread_id_obj);
|
||||
}
|
||||
}
|
||||
if (thread_id == NULL) {
|
||||
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
|
||||
goto error;
|
||||
thread_id = PyLong_FromLongLong(tid);
|
||||
if (thread_id == NULL) {
|
||||
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
RemoteDebuggingState *state = RemoteDebugging_GetStateFromObject((PyObject*)unwinder);
|
||||
|
|
|
|||
|
|
@ -147,6 +147,7 @@ typedef struct {
|
|||
int memfd;
|
||||
#endif
|
||||
page_cache_entry_t pages[MAX_PAGES];
|
||||
int page_cache_count;
|
||||
Py_ssize_t page_size;
|
||||
} proc_handle_t;
|
||||
|
||||
|
|
@ -185,14 +186,16 @@ _Py_RemoteDebug_FreePageCache(proc_handle_t *handle)
|
|||
handle->pages[i].data = NULL;
|
||||
handle->pages[i].valid = 0;
|
||||
}
|
||||
handle->page_cache_count = 0;
|
||||
}
|
||||
|
||||
UNUSED static void
|
||||
_Py_RemoteDebug_ClearCache(proc_handle_t *handle)
|
||||
{
|
||||
for (int i = 0; i < MAX_PAGES; i++) {
|
||||
for (int i = 0; i < handle->page_cache_count; i++) {
|
||||
handle->pages[i].valid = 0;
|
||||
}
|
||||
handle->page_cache_count = 0;
|
||||
}
|
||||
|
||||
#if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX
|
||||
|
|
@ -222,6 +225,7 @@ _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) {
|
|||
handle->memfd = -1;
|
||||
#endif
|
||||
handle->page_size = get_page_size();
|
||||
handle->page_cache_count = 0;
|
||||
for (int i = 0; i < MAX_PAGES; i++) {
|
||||
handle->pages[i].data = NULL;
|
||||
handle->pages[i].valid = 0;
|
||||
|
|
@ -1287,8 +1291,9 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
|
|||
return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out);
|
||||
}
|
||||
|
||||
// Search for valid cached page
|
||||
for (int i = 0; i < MAX_PAGES; i++) {
|
||||
// Search only the pages used since the last clear. The cache is cleared
|
||||
// between profiler samples, so entries are packed at the front.
|
||||
for (int i = 0; i < handle->page_cache_count; i++) {
|
||||
page_cache_entry_t *entry = &handle->pages[i];
|
||||
if (entry->valid && entry->page_addr == page_base) {
|
||||
memcpy(out, entry->data + offset_in_page, size);
|
||||
|
|
@ -1296,33 +1301,31 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
|
|||
}
|
||||
}
|
||||
|
||||
// Find reusable slot
|
||||
for (int i = 0; i < MAX_PAGES; i++) {
|
||||
page_cache_entry_t *entry = &handle->pages[i];
|
||||
if (!entry->valid) {
|
||||
if (handle->page_cache_count < MAX_PAGES) {
|
||||
page_cache_entry_t *entry = &handle->pages[handle->page_cache_count];
|
||||
if (entry->data == NULL) {
|
||||
entry->data = PyMem_RawMalloc(page_size);
|
||||
if (entry->data == NULL) {
|
||||
entry->data = PyMem_RawMalloc(page_size);
|
||||
if (entry->data == NULL) {
|
||||
PyErr_NoMemory();
|
||||
_set_debug_exception_cause(PyExc_MemoryError,
|
||||
"Cannot allocate %zu bytes for page cache entry "
|
||||
"during read from PID %d at address 0x%lx",
|
||||
page_size, handle->pid, addr);
|
||||
return -1;
|
||||
}
|
||||
PyErr_NoMemory();
|
||||
_set_debug_exception_cause(PyExc_MemoryError,
|
||||
"Cannot allocate %zu bytes for page cache entry "
|
||||
"during read from PID %d at address 0x%lx",
|
||||
page_size, handle->pid, addr);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) {
|
||||
// Try to just copy the exact amount as a fallback
|
||||
PyErr_Clear();
|
||||
goto fallback;
|
||||
}
|
||||
|
||||
entry->page_addr = page_base;
|
||||
entry->valid = 1;
|
||||
memcpy(out, entry->data + offset_in_page, size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) {
|
||||
// Try to just copy the exact amount as a fallback
|
||||
PyErr_Clear();
|
||||
goto fallback;
|
||||
}
|
||||
|
||||
entry->page_addr = page_base;
|
||||
entry->valid = 1;
|
||||
handle->page_cache_count++;
|
||||
memcpy(out, entry->data + offset_in_page, size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
fallback:
|
||||
|
|
@ -1330,6 +1333,49 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
|
|||
return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out);
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
uintptr_t remote_addr;
|
||||
void *local_buf;
|
||||
size_t size;
|
||||
} _Py_RemoteReadSegment;
|
||||
|
||||
#define _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS 4
|
||||
|
||||
// Batched read of multiple remote regions in a single syscall when supported.
|
||||
// Returns total bytes read (>= 0) on success, -1 if batched reads are
|
||||
// unavailable or the syscall failed. Callers compare the return value against
|
||||
// cumulative segment sizes to determine which segments were fully populated.
|
||||
UNUSED static Py_ssize_t
|
||||
_Py_RemoteDebug_BatchedReadRemoteMemory(
|
||||
proc_handle_t *handle,
|
||||
const _Py_RemoteReadSegment *segments,
|
||||
int nsegs)
|
||||
{
|
||||
#if defined(__linux__) && HAVE_PROCESS_VM_READV
|
||||
if (handle->memfd == -1
|
||||
&& nsegs > 0
|
||||
&& nsegs <= _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS) {
|
||||
struct iovec local[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS];
|
||||
struct iovec remote[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS];
|
||||
for (int i = 0; i < nsegs; i++) {
|
||||
local[i].iov_base = segments[i].local_buf;
|
||||
local[i].iov_len = segments[i].size;
|
||||
remote[i].iov_base = (void *)segments[i].remote_addr;
|
||||
remote[i].iov_len = segments[i].size;
|
||||
}
|
||||
ssize_t nread = process_vm_readv(handle->pid, local, nsegs, remote, nsegs, 0);
|
||||
if (nread >= 0) {
|
||||
return (Py_ssize_t)nread;
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)handle;
|
||||
(void)segments;
|
||||
(void)nsegs;
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
|
||||
UNUSED static int
|
||||
_Py_RemoteDebug_ReadDebugOffsets(
|
||||
proc_handle_t *handle,
|
||||
|
|
|
|||
|
|
@ -151,6 +151,45 @@ def create_threads(n):
|
|||
time.sleep(0.05)
|
||||
'''
|
||||
|
||||
# Source of the asyncio benchmark target: eight parent tasks, each of which
# spawns a child "leaf" task, so the sampled process has both running
# coroutines and tasks awaited by other tasks.
ASYNC_CODE = '''\
import asyncio
import contextlib
import math

def compute_slice(seed):
    result = 0.0
    for i in range(2000):
        result += math.sin(seed + i) * math.sqrt(i + 1)
    return result

async def leaf_task(seed):
    total = 0.0
    while True:
        total += compute_slice(seed)
        await asyncio.sleep(0)

async def parent_task(seed):
    child = asyncio.create_task(leaf_task(seed + 1000), name=f"leaf-{seed}")
    try:
        while True:
            compute_slice(seed)
            await asyncio.sleep(0.001)
    finally:
        child.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await child

async def main():
    tasks = [
        asyncio.create_task(parent_task(i), name=f"parent-{i}")
        for i in range(8)
    ]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    asyncio.run(main())
'''
|
||||
|
||||
CODE_EXAMPLES = {
|
||||
"basic": {
|
||||
"code": CODE,
|
||||
|
|
@ -164,10 +203,29 @@ def create_threads(n):
|
|||
"code": CODE_WITH_TONS_OF_THREADS,
|
||||
"description": "Tons of threads doing mixed CPU/IO work",
|
||||
},
|
||||
"asyncio": {
|
||||
"code": ASYNC_CODE,
|
||||
"description": "Asyncio tasks with active and awaited coroutine chains",
|
||||
},
|
||||
}
|
||||
|
||||
# Benchmarkable RemoteUnwinder operations, keyed by the --operation CLI value.
# "method" is the attribute name looked up on the unwinder; "label" is the
# human-readable name printed in the benchmark output.
OPERATIONS = {
    "stack_trace": {
        "method": "get_stack_trace",
        "label": "get_stack_trace()",
    },
    "async_stack_trace": {
        "method": "get_async_stack_trace",
        "label": "get_async_stack_trace()",
    },
    "all_awaited_by": {
        "method": "get_all_awaited_by",
        "label": "get_all_awaited_by()",
    },
}
|
||||
|
||||
|
||||
def benchmark(unwinder, duration_seconds=10, blocking=False):
|
||||
def benchmark(unwinder, duration_seconds=10, blocking=False, operation="stack_trace"):
|
||||
"""Benchmark mode - measure raw sampling speed for specified duration"""
|
||||
sample_count = 0
|
||||
fail_count = 0
|
||||
|
|
@ -175,11 +233,14 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
|
|||
start_time = time.perf_counter()
|
||||
end_time = start_time + duration_seconds
|
||||
total_attempts = 0
|
||||
operation_info = OPERATIONS[operation]
|
||||
operation_method = getattr(unwinder, operation_info["method"])
|
||||
|
||||
colors = get_colors(can_colorize())
|
||||
|
||||
print(
|
||||
f"{colors.BOLD_BLUE}Benchmarking sampling speed for {duration_seconds} seconds...{colors.RESET}"
|
||||
f"{colors.BOLD_BLUE}Benchmarking {operation_info['label']} speed "
|
||||
f"for {duration_seconds} seconds...{colors.RESET}"
|
||||
)
|
||||
|
||||
try:
|
||||
|
|
@ -190,8 +251,8 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
|
|||
if blocking:
|
||||
unwinder.pause_threads()
|
||||
try:
|
||||
stack_trace = unwinder.get_stack_trace()
|
||||
if stack_trace:
|
||||
sample = operation_method()
|
||||
if sample:
|
||||
sample_count += 1
|
||||
finally:
|
||||
if blocking:
|
||||
|
|
@ -239,6 +300,7 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
|
|||
(sample_count / total_attempts) * 100 if total_attempts > 0 else 0
|
||||
),
|
||||
"total_work_time": total_work_time,
|
||||
"operation": operation_info["label"],
|
||||
"avg_work_time_us": (
|
||||
(total_work_time / total_attempts) * 1e6 if total_attempts > 0 else 0
|
||||
),
|
||||
|
|
@ -252,7 +314,7 @@ def print_benchmark_results(results):
|
|||
colors = get_colors(can_colorize())
|
||||
|
||||
print(f"\n{colors.BOLD_GREEN}{'='*60}{colors.RESET}")
|
||||
print(f"{colors.BOLD_GREEN}get_stack_trace() Benchmark Results{colors.RESET}")
|
||||
print(f"{colors.BOLD_GREEN}{results['operation']} Benchmark Results{colors.RESET}")
|
||||
print(f"{colors.BOLD_GREEN}{'='*60}{colors.RESET}")
|
||||
|
||||
# Basic statistics
|
||||
|
|
@ -329,6 +391,8 @@ def parse_arguments():
|
|||
%(prog)s -d 60 # Run basic benchmark for 60 seconds
|
||||
%(prog)s --code deep_static # Run deep static call stack benchmark
|
||||
%(prog)s --code deep_static -d 30 # Run deep static benchmark for 30 seconds
|
||||
%(prog)s --operation async_stack_trace
|
||||
%(prog)s --operation all_awaited_by
|
||||
|
||||
Available code examples:
|
||||
{examples_desc}
|
||||
|
|
@ -348,8 +412,15 @@ def parse_arguments():
|
|||
"--code",
|
||||
"-c",
|
||||
choices=list(CODE_EXAMPLES.keys()),
|
||||
default="basic",
|
||||
help="Code example to benchmark (default: basic)",
|
||||
default=None,
|
||||
help="Code example to benchmark (default: basic, or asyncio for async operations)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--operation",
|
||||
choices=list(OPERATIONS.keys()),
|
||||
default="stack_trace",
|
||||
help="Remote unwinder operation to benchmark (default: stack_trace)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
|
|
@ -365,7 +436,10 @@ def parse_arguments():
|
|||
help="Stop all threads before sampling for consistent snapshots",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
args = parser.parse_args()
|
||||
if args.code is None:
|
||||
args.code = "asyncio" if args.operation != "stack_trace" else "basic"
|
||||
return args
|
||||
|
||||
|
||||
def create_target_process(temp_file, code_example="basic"):
|
||||
|
|
@ -420,6 +494,9 @@ def main():
|
|||
print(
|
||||
f"{colors.CYAN}Benchmark Duration:{colors.RESET} {colors.YELLOW}{args.duration}{colors.RESET} seconds"
|
||||
)
|
||||
print(
|
||||
f"{colors.CYAN}Operation:{colors.RESET} {colors.GREEN}{OPERATIONS[args.operation]['label']}{colors.RESET}"
|
||||
)
|
||||
print(
|
||||
f"{colors.CYAN}Blocking Mode:{colors.RESET} {colors.GREEN if args.blocking else colors.YELLOW}{'enabled' if args.blocking else 'disabled'}{colors.RESET}"
|
||||
)
|
||||
|
|
@ -451,7 +528,12 @@ def main():
|
|||
unwinder = _remote_debugging.RemoteUnwinder(
|
||||
process.pid, cache_frames=True, **kwargs
|
||||
)
|
||||
results = benchmark(unwinder, duration_seconds=args.duration, blocking=args.blocking)
|
||||
results = benchmark(
|
||||
unwinder,
|
||||
duration_seconds=args.duration,
|
||||
blocking=args.blocking,
|
||||
operation=args.operation,
|
||||
)
|
||||
finally:
|
||||
cleanup_process(process, temp_file_path)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue