cpython/Python/perf_jit_trampoline.c
Diego Russo c7b7ca2cd5
GH-126910: Add gdb support for unwinding JIT frames (#146071)
Co-authored-by: Pablo Galindo Salgado <pablogsal@gmail.com>
2026-05-02 13:42:03 +00:00

825 lines
31 KiB
C

/*
* Python Perf Trampoline Support - JIT Dump Implementation
*
* This file implements the perf jitdump API for Python's performance profiling
* integration. It allows perf (Linux performance analysis tool) to understand
* and profile dynamically generated Python bytecode by creating JIT dump files
* that perf can inject into its analysis.
*
*
* IMPORTANT: This file exports specific callback functions that are part of
* Python's internal API. Do not modify the function signatures or behavior
* of exported functions without coordinating with the Python core team.
*
* Usually the binary and libraries are mapped in separate region like below:
*
* address ->
* --+---------------------+--//--+---------------------+--
* | .text | .data | ... | | .text | .data | ... |
* --+---------------------+--//--+---------------------+--
* myprog libc.so
*
* So it'd be easy and straight-forward to find a mapped binary or library from an
* address.
*
* But for JIT code, the code arena only cares about the code section. But the
* resulting DSOs (which is generated by perf inject -j) contain ELF headers and
* unwind info too. Then it'd generate following address space with synthesized
* MMAP events. Let's say it has a sample between address B and C.
*
* sample
* |
* address -> A B v C
* ---------------------------------------------------------------------------------------------------
* /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
* /tmp/jitted-PID-1.so | (headers) | .text | unwind info |
* /tmp/jitted-PID-2.so | (headers) | .text | unwind info |
* ...
* ---------------------------------------------------------------------------------------------------
*
* If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
* the unwind info. If it maps both .text section and unwind sections, the sample
* could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
* which one is right. So to make perf happy we have non-overlapping ranges for each
* DSO:
*
* address ->
* -------------------------------------------------------------------------------------------------------
* /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
* /tmp/jitted-PID-1.so | (headers) | .text | unwind info |
* /tmp/jitted-PID-2.so | (headers) | .text | unwind info |
* ...
* -------------------------------------------------------------------------------------------------------
*
* As the trampolines are constant, we add a constant padding but in general the padding needs to have the
* size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50
*/
#include "Python.h"
#include "pycore_ceval.h" // _PyPerf_Callbacks
#include "pycore_frame.h"
#include "pycore_interp.h"
#include "pycore_mmap.h" // _PyAnnotateMemoryMap()
#include "pycore_jit_unwind.h"
#include "pycore_runtime.h" // _PyRuntime
#ifdef PY_HAVE_PERF_TRAMPOLINE
/* Standard library includes for perf jitdump implementation */
#if defined(__linux__)
# include <elf.h> // ELF architecture constants
#endif
#include <fcntl.h> // File control operations
#include <stdio.h> // Standard I/O operations
#include <stdlib.h> // Standard library functions
#include <string.h> // memcpy, strlen
#include <sys/mman.h> // Memory mapping functions (mmap)
#include <sys/types.h> // System data types
#include <unistd.h> // System calls (sysconf, getpid)
#include <sys/time.h> // Time functions (gettimeofday)
#if defined(__linux__)
# include <sys/syscall.h> // System call interface
#endif
// =============================================================================
// CONSTANTS AND CONFIGURATION
// =============================================================================
/*
* Memory layout considerations for perf jitdump:
*
* Perf expects non-overlapping memory regions for each JIT-compiled function.
* When perf processes the jitdump file, it creates synthetic DSO (Dynamic
* Shared Object) files that contain:
* - ELF headers
* - .text section (actual machine code)
* - Unwind information (for stack traces)
*
* To ensure proper address space layout, we add padding between code regions.
* This prevents address conflicts when perf maps the synthesized DSOs.
*
* Memory layout example:
* /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
* /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding]
*
* The padding size is now calculated automatically during initialization
* based on the actual unwind information requirements.
*/
/* These constants are defined inside <elf.h>, which we can't use outside of linux. */
#if !defined(__linux__)
# if defined(__i386__) || defined(_M_IX86)
# define EM_386 3
# elif defined(__arm__) || defined(_M_ARM)
# define EM_ARM 40
# elif defined(__x86_64__) || defined(_M_X64)
# define EM_X86_64 62
# elif defined(__aarch64__)
# define EM_AARCH64 183
# elif defined(__riscv)
# define EM_RISCV 243
# endif
#endif
/* Convenient access to the global trampoline API state */
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
/* Type aliases for clarity and portability */
typedef uint64_t uword; // Word-sized unsigned integer
typedef const char* CodeComments; // Code comment strings
/* Memory size constants */
#define MB (1024 * 1024) // 1 Megabyte for buffer sizing
// =============================================================================
// ARCHITECTURE-SPECIFIC DEFINITIONS
// =============================================================================
/*
 * Returns the ELF machine architecture constant for the current platform.
 * This is required for the jitdump header to correctly identify the target
 * architecture for perf processing.
 *
 * On non-Linux platforms the EM_* values come from the fallback definitions
 * earlier in this file, since <elf.h> is only included on Linux.
 */
static uint64_t GetElfMachineArchitecture(void) {
#if defined(__x86_64__) || defined(_M_X64)
    return EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    return EM_386;
#elif defined(__aarch64__)
    return EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    return EM_ARM;
#elif defined(__riscv)
    return EM_RISCV;
#else
    Py_UNREACHABLE(); // Unsupported architecture - should never reach here
    return 0;         // Not reached; keeps compilers happy about a return value
#endif
}
// =============================================================================
// PERF JITDUMP DATA STRUCTURES
// =============================================================================
/*
* Perf jitdump file format structures
*
* These structures define the binary format that perf expects for JIT dump files.
* The format is documented in the Linux perf tools source code and must match
* exactly for proper perf integration.
*/
/*
 * Jitdump file header - written once at the beginning of each jitdump file
 * Contains metadata about the process and jitdump format version.
 *
 * NOTE: the field order and widths are dictated by the perf jitdump file
 * format; do not reorder or resize fields.
 */
typedef struct {
    uint32_t magic;           // Magic number (0x4A695444 = "JiTD")
    uint32_t version;         // Jitdump format version (currently 1)
    uint32_t size;            // Size of this header structure (for validation)
    uint32_t elf_mach_target; // Target architecture (from GetElfMachineArchitecture)
    uint32_t reserved;        // Reserved field (must be 0)
    uint32_t process_id;      // Process ID of the JIT compiler
    uint64_t time_stamp;      // Timestamp when jitdump was created
    uint64_t flags;           // Feature flags (currently unused, written as 0)
} Header;
/*
 * Perf event types supported by the jitdump format
 * Each event type has a corresponding structure format.
 *
 * This file only emits PerfLoad and PerfUnwindingInfo records; the other
 * values are listed for completeness of the format.
 */
enum PerfEvent {
    PerfLoad = 0,          // Code load event (new JIT function)
    PerfMove = 1,          // Code move event (function relocated)
    PerfDebugInfo = 2,     // Debug information event
    PerfClose = 3,         // JIT session close event
    PerfUnwindingInfo = 4  // Stack unwinding information event
};
/*
 * Base event structure - common header for all perf events
 * Every event in the jitdump file starts with this structure.
 */
struct BaseEvent {
    uint32_t event;      // Event type (from PerfEvent enum)
    uint32_t size;       // Total size of this event including payload
    uint64_t time_stamp; // Timestamp when event occurred (monotonic ticks)
};
/*
 * Code load event - indicates a new JIT-compiled function is available
 * This is the most important event type for Python profiling.
 */
typedef struct {
    struct BaseEvent base;  // Common event header
    uint32_t process_id;    // Process ID where code was generated
#if defined(__APPLE__)
    uint64_t thread_id;     // Thread ID (pthread_threadid_np yields 64 bits)
#else
    uint32_t thread_id;     // Thread ID where code was generated
#endif
    uint64_t vma;           // Virtual memory address where code is loaded
    uint64_t code_address;  // Address of the actual machine code
    uint64_t code_size;     // Size of the machine code in bytes
    uint64_t code_id;       // Unique identifier for this code region
    /* Followed by:
     * - null-terminated function name string
     * - raw machine code bytes
     */
} CodeLoadEvent;
/*
 * Code unwinding information event - provides DWARF data for stack traces
 * Essential for proper stack unwinding during profiling.
 */
typedef struct {
    struct BaseEvent base;      // Common event header
    uint64_t unwind_data_size;  // Size of the unwinding data (header + eh_frame)
    uint64_t eh_frame_hdr_size; // Size of the EH frame header
    uint64_t mapped_size;       // Total mapped size (rounded up for padding)
    /* Followed by:
     * - EH frame header
     * - DWARF unwinding information
     * - Padding to alignment boundary
     */
} CodeUnwindingInfoEvent;
/*
 * EH Frame Header structure for DWARF unwinding
 *
 * This header provides metadata about the .eh_frame data that follows.
 * It uses PC-relative and data-relative encodings to keep the synthesized
 * DSO self-contained when perf injects it.
 */
typedef struct __attribute__((packed)) {
    uint8_t version;           // Set to 1 when emitted below
    uint8_t eh_frame_ptr_enc;  // Encoding of eh_frame_ptr (pcrel | sdata4)
    uint8_t fde_count_enc;     // Encoding of eh_fde_count (udata4)
    uint8_t table_enc;         // Encoding of search-table entries (datarel | sdata4)
    int32_t eh_frame_ptr;      // Relative pointer back to the eh_frame data
    uint32_t eh_fde_count;     // FDE count (we emit exactly one per function)
    int32_t from;              // Search-table entry: code start offset
    int32_t to;                // Search-table entry: offset of the matching FDE
} EhFrameHeader;
_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch");
// =============================================================================
// GLOBAL STATE MANAGEMENT
// =============================================================================
/*
 * Global state for the perf jitdump implementation
 *
 * This structure maintains all the state needed for generating jitdump files.
 * It's designed as a singleton since there's typically only one jitdump file
 * per Python process.
 *
 * map_lock guards perf_map (all writes to the file), code_id and lazy
 * initialization; acquire it before touching any of those fields.
 */
typedef struct {
    FILE* perf_map;         // File handle for the jitdump file
    PyMutex map_lock;       // Thread synchronization lock
    void* mapped_buffer;    // Memory-mapped region (signals perf we're active)
    size_t mapped_size;     // Size of the mapped region
    uint32_t code_id;       // Counter for unique code region identifiers
    uint64_t build_id_salt; // Per-process salt for unique synthetic DSOs
} PerfMapJitState;
/* Global singleton instance (zero-initialized static storage) */
static PerfMapJitState perf_jit_map_state;
// =============================================================================
// TIME UTILITIES
// =============================================================================
/* Time conversion constant */
static const intptr_t nanoseconds_per_second = 1000000000;
/*
 * Get current monotonic time in nanoseconds.
 *
 * Monotonic time is preferred for event timestamps because it is not
 * affected by system clock adjustments, keeping event ordering consistent
 * even if the wall clock is changed.
 *
 * Returns: Current monotonic time in nanoseconds since an arbitrary epoch
 */
static int64_t get_current_monotonic_ticks(void) {
    struct timespec now;
    if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) {
        Py_UNREACHABLE(); // Should never fail on supported systems
        return 0;
    }
    /* Widen seconds before multiplying to avoid 32-bit overflow */
    return ((int64_t)now.tv_sec * nanoseconds_per_second) + now.tv_nsec;
}
/*
 * Get current wall clock time in microseconds.
 *
 * Used for the jitdump file header timestamp. Unlike monotonic time, this
 * represents actual wall clock time that can be correlated with other
 * system events.
 *
 * Returns: Current time in microseconds since the Unix epoch
 */
static int64_t get_current_time_microseconds(void) {
    struct timeval now;
    if (gettimeofday(&now, NULL) < 0) {
        Py_UNREACHABLE(); // Should never fail on supported systems
        return 0;
    }
    int64_t micros = (int64_t)now.tv_sec * 1000000;
    micros += now.tv_usec;
    return micros;
}
// =============================================================================
// FILE I/O UTILITIES
// =============================================================================
/*
 * Write data to the jitdump file with error handling.
 *
 * Loops until every byte has been written, handling the partial writes
 * that can occur with large buffers or when the system is under load.
 *
 * Args:
 *   buffer: Pointer to data to write
 *   size:   Number of bytes to write
 */
static void perf_map_jit_write_fully(const void* buffer, size_t size) {
    FILE* dest = perf_jit_map_state.perf_map;
    const char* cursor = (const char*)(buffer);
    size_t remaining = size;
    while (remaining > 0) {
        size_t chunk = fwrite(cursor, 1, remaining, dest);
        if (chunk == 0) {
            Py_UNREACHABLE(); // Write failure - should be very rare
            break;
        }
        cursor += chunk;
        remaining -= chunk;
    }
}
/*
* Write the jitdump file header
*
* The header must be written exactly once at the beginning of each jitdump
* file. It provides metadata that perf uses to parse the rest of the file.
*
* Args:
* pid: Process ID to include in the header
* out_file: File handle to write to (currently unused, uses global state)
*/
static void perf_map_jit_write_header(int pid, FILE* out_file) {
Header header;
/* Initialize header with required values */
header.magic = 0x4A695444; // "JiTD" magic number
header.version = 1; // Current jitdump version
header.size = sizeof(Header); // Header size for validation
header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture
header.reserved = 0; // padding reserved for future use
header.process_id = pid; // Process identifier
header.time_stamp = get_current_time_microseconds(); // Creation time
header.flags = 0; // No special flags currently used
perf_map_jit_write_fully(&header, sizeof(header));
}
// =============================================================================
// JITDUMP INITIALIZATION
// =============================================================================
/*
 * Initialize the perf jitdump interface
 *
 * This function sets up everything needed to generate jitdump files:
 * 1. Creates the jitdump file with a unique name
 * 2. Maps the first page to signal perf that we're using the interface
 * 3. Writes the jitdump header
 * 4. Computes the code padding from the unwind info requirements
 *
 * The memory mapping is crucial - perf detects jitdump files by scanning
 * for processes that have mapped files matching the pattern /tmp/jit-*.dump
 *
 * Idempotent: if the file is already open, returns the existing state.
 *
 * Returns: Pointer to initialized state, or NULL on failure
 */
static void* perf_map_jit_init(void) {
    PyMutex_Lock(&perf_jit_map_state.map_lock);
    if (perf_jit_map_state.perf_map != NULL) {
        /* Already initialized by an earlier caller */
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        return &perf_jit_map_state;
    }
    char filename[100];
    int pid = getpid();
    /* Create unique filename based on process ID. Note: snprintf() already
     * reserves room for the NUL terminator within the size it is given, so
     * the full buffer size is passed. */
    snprintf(filename, sizeof(filename), "/tmp/jit-%d.dump", pid);
    /* Create/open the jitdump file with appropriate permissions */
    const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
    if (fd == -1) {
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        return NULL; // Failed to create file
    }
    /* Get system page size for memory mapping */
    const long page_size = sysconf(_SC_PAGESIZE);
    if (page_size == -1) {
        close(fd);
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        return NULL; // Failed to get page size
    }
#if defined(__APPLE__)
    // On macOS, samply uses a preload to find jitdumps and this mmap can be slow.
    perf_jit_map_state.mapped_buffer = NULL;
#else
    /*
     * Map the first page of the jitdump file
     *
     * This memory mapping serves as a signal to perf that this process
     * is generating JIT code. Perf scans /proc/.../maps looking for mapped
     * files that match the jitdump naming pattern.
     *
     * The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
     */
    perf_jit_map_state.mapped_buffer = mmap(
        NULL,                   // Let kernel choose address
        page_size,              // Map one page
        PROT_READ | PROT_EXEC,  // Read and execute permissions (required by perf)
        MAP_PRIVATE,            // Private mapping
        fd,                     // File descriptor
        0                       // Offset 0 (first page)
    );
    if (perf_jit_map_state.mapped_buffer == MAP_FAILED) {
        perf_jit_map_state.mapped_buffer = NULL;
        close(fd);
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        return NULL; // Memory mapping failed
    }
    (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size,
                               "cpython:perf_jit_trampoline");
#endif
    perf_jit_map_state.mapped_size = page_size;
    /* Convert file descriptor to FILE* for easier I/O operations */
    perf_jit_map_state.perf_map = fdopen(fd, "w+");
    if (perf_jit_map_state.perf_map == NULL) {
        /* Bug fix: also release the signalling mapping created above.
         * Previously a failed fdopen() leaked one mapped page and left a
         * stale mapped_buffer/mapped_size behind, which a later retry of
         * this function would silently overwrite. */
        if (perf_jit_map_state.mapped_buffer != NULL) {
            munmap(perf_jit_map_state.mapped_buffer,
                   perf_jit_map_state.mapped_size);
            perf_jit_map_state.mapped_buffer = NULL;
        }
        perf_jit_map_state.mapped_size = 0;
        close(fd);
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        return NULL; // Failed to create FILE*
    }
    /*
     * Set up file buffering for better performance
     *
     * We use a large buffer (2MB) because jitdump files can be written
     * frequently during program execution. Buffering reduces system call
     * overhead and improves overall performance.
     */
    setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
    /* Write the jitdump file header */
    perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
    /* Initialize code ID counter */
    perf_jit_map_state.code_id = 0;
    /* Salt mixes pid and a monotonic timestamp so synthetic DSO build-ids
     * differ between processes and runs */
    perf_jit_map_state.build_id_salt =
        ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks();
    /* Calculate padding size based on actual unwind info requirements */
    size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0);
    size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
    trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16);
    trampoline_api.code_alignment = 32;
    PyMutex_Unlock(&perf_jit_map_state.map_lock);
    return &perf_jit_map_state;
}
// =============================================================================
// MAIN JITDUMP ENTRY WRITING
// =============================================================================
/*
 * Write a complete jitdump entry for a code region with a provided name.
 *
 * This shares the same implementation as the trampoline callback, but
 * allows callers that don't have a PyCodeObject to reuse the jitdump
 * infrastructure.
 *
 * Emits two records: an unwinding-info event followed by a code-load
 * event. Errors (init failure, allocation failure, unwind build failure)
 * are handled by silently dropping the entry.
 */
static void perf_map_jit_write_entry_with_name(
    void *state,
    const void *code_addr,
    size_t code_size,
    const char *entry,
    const char *filename
)
{
    /* Initialize jitdump system on first use (no-op if already set up) */
    void* ret = perf_map_jit_init();
    if (ret == NULL) {
        return; // Initialization failed, silently abort
    }
    /* Callers without metadata may pass NULL; fall back to empty strings */
    if (entry == NULL) {
        entry = "";
    }
    if (filename == NULL) {
        filename = "";
    }
    /*
     * Create formatted function name for perf display
     *
     * Format: "py::<function_name>:<filename>"
     * The "py::" prefix helps identify Python functions in mixed-language
     * profiles (e.g., when profiling C extensions alongside Python code).
     *
     * The first snprintf() with a NULL buffer only measures the required
     * length (+1 for the terminator).
     */
    size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
    char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
    if (perf_map_entry == NULL) {
        return; // Memory allocation failed
    }
    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
    const size_t name_length = strlen(perf_map_entry);
    uword base = (uword)code_addr;
    uword size = code_size;
    /*
     * Generate DWARF unwinding information
     *
     * DWARF data is essential for proper stack unwinding during profiling.
     * Without it, perf cannot generate accurate call graphs, especially
     * in optimized code where frame pointers may be omitted.
     */
    uint8_t buffer[1024]; // Buffer for DWARF data (1KB should be sufficient)
    size_t eh_frame_size = _PyJitUnwind_BuildEhFrame(
        buffer, sizeof(buffer), code_addr, code_size, 0);
    if (eh_frame_size == 0) {
        PyMem_RawFree(perf_map_entry); // Don't leak the name on failure
        return;
    }
    /*
     * A logical jitdump entry is written as multiple records and also consumes
     * a process-global code_id. Serialize the whole sequence so concurrent JIT
     * compilation cannot interleave records or reuse an ID.
     */
    PyMutex_Lock(&perf_jit_map_state.map_lock);
    /*
     * Write Code Unwinding Information Event
     *
     * This event must be written before the code load event to ensure
     * perf has the unwinding information available when it processes
     * the code region.
     */
    CodeUnwindingInfoEvent ev2;
    ev2.base.event = PerfUnwindingInfo;
    ev2.base.time_stamp = get_current_monotonic_ticks();
    ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
    /* Verify we don't exceed the padding budget computed at init time */
    assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);
    ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
    ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16); // 16-byte alignment
    /* Calculate total event size with padding (record must be 8-byte aligned) */
    int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size);
    int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size; // 8-byte align
    ev2.base.size = (uint32_t)(content_size + padding_size);
    /* Write the unwinding info event header */
    perf_map_jit_write_fully(&ev2, sizeof(ev2));
    /*
     * Write EH Frame Header
     *
     * The EH frame header provides metadata about the DWARF unwinding
     * information. It includes pointers and counts that help perf navigate
     * the unwinding data efficiently. Note that the .eh_frame data is
     * written first, with the header after it (see writes below); the
     * header's offsets are therefore negative, pointing back at the data.
     */
    EhFrameHeader f;
    f.version = 1;
    f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel;
    f.fde_count_enc = DWRF_EH_PE_udata4;
    f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel;
    /* Calculate relative offsets for EH frame navigation. The extra
     * 4 bytes account for the four encoding bytes that precede the
     * eh_frame_ptr field itself. */
    f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char));
    f.eh_fde_count = 1; // We generate exactly one FDE per function
    f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size);
    /* The CIE length word sits at the start of the buffer; the FDE follows
     * the CIE, so its offset is eh_frame_size minus the CIE's total size. */
    uint32_t cie_payload_size;
    memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size));
    int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size);
    f.to = -(int32_t)(eh_frame_size - cie_size);
    /* Write EH frame data and header (data first - see note above) */
    perf_map_jit_write_fully(buffer, eh_frame_size);
    perf_map_jit_write_fully(&f, sizeof(f));
    /* Write padding to maintain alignment (at most 7 zero bytes needed) */
    char padding_bytes[] = "\0\0\0\0\0\0\0\0";
    perf_map_jit_write_fully(&padding_bytes, padding_size);
    /*
     * Write Code Load Event
     *
     * This event tells perf about the new code region. It includes:
     * - Memory addresses and sizes
     * - Process and thread identification
     * - Function name for symbol resolution
     * - The actual machine code bytes
     */
    CodeLoadEvent ev;
    ev.base.event = PerfLoad;
    ev.base.size = sizeof(ev) + (name_length+1) + size;
    ev.base.time_stamp = get_current_monotonic_ticks();
    ev.process_id = getpid();
#if defined(__APPLE__)
    pthread_threadid_np(NULL, &ev.thread_id);
#else
    ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call
#endif
    ev.vma = base;          // Virtual memory address
    ev.code_address = base; // Same as VMA for our use case
    ev.code_size = size;
    /* Assign unique code ID and increment counter (under map_lock) */
    perf_jit_map_state.code_id += 1;
    ev.code_id = perf_jit_map_state.code_id;
    /* Write code load event and associated data */
    perf_map_jit_write_fully(&ev, sizeof(ev));
    perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator
    /*
     * Ensure each synthetic DSO has unique .text bytes.
     *
     * perf merges DSOs that share a build-id. Since trampolines can share
     * identical code and unwind bytes, perf may resolve all JIT frames to
     * the first symbol it saw (including entries from previous runs when
     * build-id caching is enabled). Patch a small marker in the emitted
     * bytes to make the build-id depend on a per-process salt and code id
     * without modifying the live code.
     */
    uint64_t marker = perf_jit_map_state.build_id_salt ^
                      ((uint64_t)perf_jit_map_state.code_id << 32) ^
                      (uint64_t)code_size;
    if (size >= sizeof(marker)) {
        /* Enough room: emit the code as-is except its final 8 bytes,
         * which are replaced by the marker in the file copy only. */
        size_t prefix = size - sizeof(marker);
        perf_map_jit_write_fully((void *)(base), prefix);
        perf_map_jit_write_fully(&marker, sizeof(marker));
    }
    else if (size > 0) {
        /* Tiny region: XOR the marker into a temporary copy instead */
        uint8_t tmp[sizeof(marker)];
        memcpy(tmp, (void *)(base), size);
        for (size_t i = 0; i < size; i++) {
            tmp[i] ^= (uint8_t)(marker >> (i * 8));
        }
        perf_map_jit_write_fully(tmp, size);
    }
    /* Release the lock before freeing; the name is no longer needed */
    PyMutex_Unlock(&perf_jit_map_state.map_lock);
    PyMem_RawFree(perf_map_entry);
}
/*
 * Write a complete jitdump entry for a Python function
 *
 * This is the main function called by Python's trampoline system whenever
 * a new piece of JIT-compiled code needs to be recorded. It extracts the
 * qualified name and filename from the code object and delegates to
 * perf_map_jit_write_entry_with_name() for the actual record writing.
 *
 * Args:
 *   state:     Jitdump state (currently unused, uses global state)
 *   code_addr: Address where the compiled code resides
 *   code_size: Size of the compiled code in bytes
 *   co:        Python code object containing metadata (may be NULL)
 *
 * IMPORTANT: This function signature is part of Python's internal API
 * and must not be changed without coordinating with core Python development.
 */
static void perf_map_jit_write_entry(void *state, const void *code_addr,
                                     size_t code_size, PyCodeObject *co)
{
    const char *entry = "";
    const char *filename = "";
    if (co != NULL) {
        if (co->co_qualname != NULL) {
            const char *qualname = PyUnicode_AsUTF8(co->co_qualname);
            if (qualname != NULL) {
                entry = qualname;
            }
            else {
                /* Bug fix: PyUnicode_AsUTF8() returns NULL *and sets an
                 * exception* on failure. A profiling callback must not
                 * leak an exception into unrelated interpreter code, so
                 * clear it and keep the empty-name fallback. */
                PyErr_Clear();
            }
        }
        if (co->co_filename != NULL) {
            const char *fname = PyUnicode_AsUTF8(co->co_filename);
            if (fname != NULL) {
                filename = fname;
            }
            else {
                PyErr_Clear(); // Same rationale as above
            }
        }
    }
    perf_map_jit_write_entry_with_name(state, code_addr, code_size,
                                       entry, filename);
}
/*
 * Record a named code region in the jitdump file.
 *
 * Public entry point for callers that don't have a PyCodeObject (see
 * perf_map_jit_write_entry_with_name). NULL entry/filename are tolerated
 * by the callee, which substitutes empty strings.
 */
void
_PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size,
                          const char *entry, const char *filename)
{
    perf_map_jit_write_entry_with_name(
        NULL, code_addr, code_size, entry, filename);
}
// =============================================================================
// CLEANUP AND FINALIZATION
// =============================================================================
/*
 * Finalize and cleanup the perf jitdump system
 *
 * This function is called when Python is shutting down or when the
 * perf trampoline system is being disabled. It ensures all resources
 * are properly released and all buffered data is flushed to disk.
 *
 * Args:
 *   state: Jitdump state (unused; operates on the global singleton)
 *
 * Returns: 0 on success
 *
 * IMPORTANT: This function signature is part of Python's internal API
 * and must not be changed without coordinating with core Python development.
 */
static int perf_map_jit_fini(void* state) {
    /*
     * Close jitdump file with proper synchronization
     *
     * We need to acquire the lock to ensure no other threads are
     * writing to the file when we close it. This prevents corruption
     * and ensures all data is properly flushed.
     */
    PyMutex_Lock(&perf_jit_map_state.map_lock);
    if (perf_jit_map_state.perf_map != NULL) {
        fclose(perf_jit_map_state.perf_map); // This also flushes buffers
        perf_jit_map_state.perf_map = NULL;
    }
    PyMutex_Unlock(&perf_jit_map_state.map_lock);
    /*
     * Unmap the memory region (done outside the lock; once perf_map is
     * NULL no writer touches the mapping)
     *
     * This removes the signal to perf that we were generating JIT code.
     * After this point, perf will no longer detect this process as
     * having JIT capabilities.
     */
    if (perf_jit_map_state.mapped_buffer != NULL) {
        munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
        perf_jit_map_state.mapped_buffer = NULL;
    }
    /* Clear global state reference */
    trampoline_api.state = NULL;
    return 0; // Success
}
// =============================================================================
// PUBLIC API EXPORT
// =============================================================================
/*
 * Python Perf Callbacks Structure
 *
 * This structure defines the callback interface that Python's trampoline
 * system uses to integrate with perf profiling. It contains function
 * pointers for initialization, event writing, and cleanup. Note that the
 * init callback is also invoked lazily on the first write (see
 * perf_map_jit_write_entry_with_name).
 *
 * CRITICAL: This structure and its contents are part of Python's internal
 * API. The function signatures and behavior must remain stable to maintain
 * compatibility with the Python interpreter's perf integration system.
 *
 * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h
 */
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
    &perf_map_jit_init,        // Initialization function
    &perf_map_jit_write_entry, // Event writing function
    &perf_map_jit_fini,        // Cleanup function
};
#endif /* PY_HAVE_PERF_TRAMPOLINE */