mirror of
https://github.com/python/cpython.git
synced 2026-05-04 09:31:02 +00:00
825 lines
31 KiB
C
825 lines
31 KiB
C
/*
|
|
* Python Perf Trampoline Support - JIT Dump Implementation
|
|
*
|
|
* This file implements the perf jitdump API for Python's performance profiling
|
|
* integration. It allows perf (Linux performance analysis tool) to understand
|
|
* and profile dynamically generated Python bytecode by creating JIT dump files
|
|
* that perf can inject into its analysis.
|
|
*
|
|
*
|
|
* IMPORTANT: This file exports specific callback functions that are part of
|
|
* Python's internal API. Do not modify the function signatures or behavior
|
|
* of exported functions without coordinating with the Python core team.
|
|
*
|
|
* Usually the binary and libraries are mapped in separate regions like below:
|
|
*
|
|
* address ->
|
|
* --+---------------------+--//--+---------------------+--
|
|
* | .text | .data | ... | | .text | .data | ... |
|
|
* --+---------------------+--//--+---------------------+--
|
|
* myprog libc.so
|
|
*
|
|
* So it'd be easy and straight-forward to find a mapped binary or library from an
|
|
* address.
|
|
*
|
|
* But for JIT code, the code arena only cares about the code section. But the
|
|
* resulting DSOs (which are generated by perf inject -j) contain ELF headers and
|
|
* unwind info too. Then it'd generate following address space with synthesized
|
|
* MMAP events. Let's say it has a sample between address B and C.
|
|
*
|
|
* sample
|
|
* |
|
|
* address -> A B v C
|
|
* ---------------------------------------------------------------------------------------------------
|
|
* /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
|
|
* /tmp/jitted-PID-1.so | (headers) | .text | unwind info |
|
|
* /tmp/jitted-PID-2.so | (headers) | .text | unwind info |
|
|
* ...
|
|
* ---------------------------------------------------------------------------------------------------
|
|
*
|
|
* If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
|
|
* the unwind info. If it maps both .text section and unwind sections, the sample
|
|
* could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
|
|
* which one is right. So to make perf happy we have non-overlapping ranges for each
|
|
* DSO:
|
|
*
|
|
* address ->
|
|
* -------------------------------------------------------------------------------------------------------
|
|
* /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
|
|
* /tmp/jitted-PID-1.so | (headers) | .text | unwind info |
|
|
* /tmp/jitted-PID-2.so | (headers) | .text | unwind info |
|
|
* ...
|
|
* -------------------------------------------------------------------------------------------------------
|
|
*
|
|
* As the trampolines are constant, we add a constant padding but in general the padding needs to have the
|
|
* size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50
|
|
*/
|
|
|
|
|
|
|
|
#include "Python.h"
|
|
#include "pycore_ceval.h" // _PyPerf_Callbacks
|
|
#include "pycore_frame.h"
|
|
#include "pycore_interp.h"
|
|
#include "pycore_mmap.h" // _PyAnnotateMemoryMap()
|
|
#include "pycore_jit_unwind.h"
|
|
#include "pycore_runtime.h" // _PyRuntime
|
|
|
|
#ifdef PY_HAVE_PERF_TRAMPOLINE
|
|
|
|
/* Standard library includes for perf jitdump implementation */
|
|
#if defined(__linux__)
|
|
# include <elf.h> // ELF architecture constants
|
|
#endif
|
|
#include <fcntl.h> // File control operations
|
|
#include <stdio.h> // Standard I/O operations
|
|
#include <stdlib.h> // Standard library functions
|
|
#include <string.h> // memcpy, strlen
|
|
#include <sys/mman.h> // Memory mapping functions (mmap)
|
|
#include <sys/types.h> // System data types
|
|
#include <unistd.h> // System calls (sysconf, getpid)
|
|
#include <sys/time.h> // Time functions (gettimeofday)
|
|
#if defined(__linux__)
|
|
# include <sys/syscall.h> // System call interface
|
|
#endif
|
|
|
|
// =============================================================================
|
|
// CONSTANTS AND CONFIGURATION
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Memory layout considerations for perf jitdump:
|
|
*
|
|
* Perf expects non-overlapping memory regions for each JIT-compiled function.
|
|
* When perf processes the jitdump file, it creates synthetic DSO (Dynamic
|
|
* Shared Object) files that contain:
|
|
* - ELF headers
|
|
* - .text section (actual machine code)
|
|
* - Unwind information (for stack traces)
|
|
*
|
|
* To ensure proper address space layout, we add padding between code regions.
|
|
* This prevents address conflicts when perf maps the synthesized DSOs.
|
|
*
|
|
* Memory layout example:
|
|
* /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
|
|
* /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding]
|
|
*
|
|
* The padding size is now calculated automatically during initialization
|
|
* based on the actual unwind information requirements.
|
|
*/
|
|
|
|
|
|
/*
 * ELF machine identifiers. On Linux these come from <elf.h>; on other
 * platforms that header is not used, so the one constant matching the
 * architecture being compiled is defined here instead. The branches are
 * listed in the same order as in GetElfMachineArchitecture().
 */
#if !defined(__linux__)
#  if defined(__x86_64__) || defined(_M_X64)
#    define EM_X86_64 62
#  elif defined(__i386__) || defined(_M_IX86)
#    define EM_386 3
#  elif defined(__aarch64__)
#    define EM_AARCH64 183
#  elif defined(__arm__) || defined(_M_ARM)
#    define EM_ARM 40
#  elif defined(__riscv)
#    define EM_RISCV 243
#  endif
#endif

/* Shorthand for the trampoline API state stored in the global runtime. */
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api

/* Local type aliases. */
typedef uint64_t uword;            // 64-bit unsigned word (addresses, sizes)
typedef const char *CodeComments;  // Read-only comment string

/* Number of bytes in one megabyte; used when sizing the stdio buffer. */
#define MB (1024 * 1024)
|
|
|
|
// =============================================================================
|
|
// ARCHITECTURE-SPECIFIC DEFINITIONS
|
|
// =============================================================================
|
|
|
|
/*
 * Report the ELF machine constant (EM_*) for the architecture this file is
 * compiled for. The value is stored in the jitdump header so the target
 * architecture of the recorded code can be identified.
 *
 * Returns: one of EM_X86_64, EM_386, EM_AARCH64, EM_ARM or EM_RISCV.
 * On any other architecture the function hits Py_UNREACHABLE().
 */
static uint64_t GetElfMachineArchitecture(void) {
    uint64_t machine;
#if defined(__x86_64__) || defined(_M_X64)
    machine = EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    machine = EM_386;
#elif defined(__aarch64__)
    machine = EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    machine = EM_ARM;
#elif defined(__riscv)
    machine = EM_RISCV;
#else
    Py_UNREACHABLE();  // No ELF machine constant known for this architecture
    machine = 0;
#endif
    return machine;
}
|
|
|
|
// =============================================================================
|
|
// PERF JITDUMP DATA STRUCTURES
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Perf jitdump file format structures
|
|
*
|
|
* These structures define the binary format that perf expects for JIT dump files.
|
|
* The format is documented in the Linux perf tools source code and must match
|
|
* exactly for proper perf integration.
|
|
*/
|
|
|
|
/*
 * Jitdump file header.
 *
 * One instance is written at the very start of the jitdump file (see
 * perf_map_jit_write_header). Field order and widths are part of the
 * on-disk format and must not be changed.
 */
typedef struct {
    uint32_t magic;            // 0x4A695444, i.e. "JiTD"
    uint32_t version;          // Format version; this file writes 1
    uint32_t size;             // sizeof(Header)
    uint32_t elf_mach_target;  // ELF machine constant of the target architecture
    uint32_t reserved;         // Written as 0
    uint32_t process_id;       // PID of the process producing the dump
    uint64_t time_stamp;       // Creation time of the dump
    uint64_t flags;            // Feature flags; this file writes 0
} Header;
|
|
|
|
/*
 * Record types that can appear in a jitdump file. The numeric value is
 * stored in BaseEvent.event, so the values are part of the file format.
 * This file emits PerfLoad and PerfUnwindingInfo records.
 */
enum PerfEvent {
    PerfLoad          = 0,  // New code region loaded
    PerfMove          = 1,  // Code region relocated
    PerfDebugInfo     = 2,  // Debug information
    PerfClose         = 3,  // End of the JIT session
    PerfUnwindingInfo = 4   // Stack unwinding information
};
|
|
|
|
/*
 * Common prefix shared by every jitdump record. Each event structure in
 * this file embeds it as its first member.
 */
struct BaseEvent {
    uint32_t event;       // Record type, a value from enum PerfEvent
    uint32_t size;        // Size of the whole record, payload included
    uint64_t time_stamp;  // Time at which the event was recorded
};
|
|
|
|
/*
|
|
* Code load event - indicates a new JIT-compiled function is available
|
|
* This is the most important event type for Python profiling
|
|
*/
|
|
typedef struct {
|
|
struct BaseEvent base; // Common event header
|
|
uint32_t process_id; // Process ID where code was generated
|
|
#if defined(__APPLE__)
|
|
uint64_t thread_id; // Thread ID where code was generated
|
|
#else
|
|
uint32_t thread_id; // Thread ID where code was generated
|
|
#endif
|
|
uint64_t vma; // Virtual memory address where code is loaded
|
|
uint64_t code_address; // Address of the actual machine code
|
|
uint64_t code_size; // Size of the machine code in bytes
|
|
uint64_t code_id; // Unique identifier for this code region
|
|
/* Followed by:
|
|
* - null-terminated function name string
|
|
* - raw machine code bytes
|
|
*/
|
|
} CodeLoadEvent;
|
|
|
|
/*
|
|
* Code unwinding information event - provides DWARF data for stack traces
|
|
* Essential for proper stack unwinding during profiling
|
|
*/
|
|
typedef struct {
|
|
struct BaseEvent base; // Common event header
|
|
uint64_t unwind_data_size; // Size of the unwinding data
|
|
uint64_t eh_frame_hdr_size; // Size of the EH frame header
|
|
uint64_t mapped_size; // Total mapped size (with padding)
|
|
/* Followed by:
|
|
* - EH frame header
|
|
* - DWARF unwinding information
|
|
* - Padding to alignment boundary
|
|
*/
|
|
} CodeUnwindingInfoEvent;
|
|
|
|
/*
 * EH frame header emitted alongside the .eh_frame data of each code region.
 *
 * The structure is packed and exactly 20 bytes long (checked below). The
 * offsets it stores are relative (PC-relative / data-relative encodings)
 * so the DSO synthesized by perf stays self-contained.
 */
typedef struct __attribute__((packed)) {
    uint8_t version;           // Header version
    uint8_t eh_frame_ptr_enc;  // Encoding used for eh_frame_ptr
    uint8_t fde_count_enc;     // Encoding used for eh_fde_count
    uint8_t table_enc;         // Encoding used for the table entry (from/to)
    int32_t eh_frame_ptr;      // Relative offset to the .eh_frame data
    uint32_t eh_fde_count;     // Number of FDEs described
    int32_t from;              // First field of the single table entry
    int32_t to;                // Second field of the single table entry
} EhFrameHeader;

_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch");
|
|
|
|
// =============================================================================
|
|
// GLOBAL STATE MANAGEMENT
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Global state for the perf jitdump implementation
|
|
*
|
|
* This structure maintains all the state needed for generating jitdump files.
|
|
* It's designed as a singleton since there's typically only one jitdump file
|
|
* per Python process.
|
|
*/
|
|
typedef struct {
|
|
FILE* perf_map; // File handle for the jitdump file
|
|
PyMutex map_lock; // Thread synchronization lock
|
|
void* mapped_buffer; // Memory-mapped region (signals perf we're active)
|
|
size_t mapped_size; // Size of the mapped region
|
|
uint32_t code_id; // Counter for unique code region identifiers
|
|
uint64_t build_id_salt; // Per-process salt for unique synthetic DSOs
|
|
} PerfMapJitState;
|
|
|
|
/* Global singleton instance */
|
|
static PerfMapJitState perf_jit_map_state;
|
|
|
|
// =============================================================================
|
|
// TIME UTILITIES
|
|
// =============================================================================
|
|
|
|
/* Time conversion constant */
|
|
static const intptr_t nanoseconds_per_second = 1000000000;
|
|
|
|
/*
|
|
* Get current monotonic time in nanoseconds
|
|
*
|
|
* Monotonic time is preferred for event timestamps because it's not affected
|
|
* by system clock adjustments. This ensures consistent timing relationships
|
|
* between events even if the system clock is changed.
|
|
*
|
|
* Returns: Current monotonic time in nanoseconds since an arbitrary epoch
|
|
*/
|
|
static int64_t get_current_monotonic_ticks(void) {
|
|
struct timespec ts;
|
|
if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
|
|
Py_UNREACHABLE(); // Should never fail on supported systems
|
|
return 0;
|
|
}
|
|
|
|
/* Convert to nanoseconds for maximum precision */
|
|
int64_t result = ts.tv_sec;
|
|
result *= nanoseconds_per_second;
|
|
result += ts.tv_nsec;
|
|
return result;
|
|
}
|
|
|
|
/*
 * Read the wall clock with gettimeofday() and return it in microseconds.
 *
 * This value is used for the timestamp in the jitdump file header.
 *
 * Returns: seconds * 1e6 + microseconds as reported by gettimeofday().
 * A gettimeofday() failure is treated as unreachable.
 */
static int64_t get_current_time_microseconds(void) {
    struct timeval now;
    if (gettimeofday(&now, NULL) < 0) {
        Py_UNREACHABLE();  // Not expected to fail on supported systems
        return 0;
    }

    /* Widen the seconds to 64 bits before scaling them to microseconds. */
    const int64_t whole_seconds_us = (int64_t)(now.tv_sec) * 1000000;
    return whole_seconds_us + now.tv_usec;
}
|
|
|
|
// =============================================================================
|
|
// FILE I/O UTILITIES
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Write data to the jitdump file with error handling
|
|
*
|
|
* This function ensures that all data is written to the file, handling
|
|
* partial writes that can occur with large buffers or when the system
|
|
* is under load.
|
|
*
|
|
* Args:
|
|
* buffer: Pointer to data to write
|
|
* size: Number of bytes to write
|
|
*/
|
|
static void perf_map_jit_write_fully(const void* buffer, size_t size) {
|
|
FILE* out_file = perf_jit_map_state.perf_map;
|
|
const char* ptr = (const char*)(buffer);
|
|
|
|
while (size > 0) {
|
|
const size_t written = fwrite(ptr, 1, size, out_file);
|
|
if (written == 0) {
|
|
Py_UNREACHABLE(); // Write failure - should be very rare
|
|
break;
|
|
}
|
|
size -= written;
|
|
ptr += written;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Write the jitdump file header
|
|
*
|
|
* The header must be written exactly once at the beginning of each jitdump
|
|
* file. It provides metadata that perf uses to parse the rest of the file.
|
|
*
|
|
* Args:
|
|
* pid: Process ID to include in the header
|
|
* out_file: File handle to write to (currently unused, uses global state)
|
|
*/
|
|
static void perf_map_jit_write_header(int pid, FILE* out_file) {
|
|
Header header;
|
|
|
|
/* Initialize header with required values */
|
|
header.magic = 0x4A695444; // "JiTD" magic number
|
|
header.version = 1; // Current jitdump version
|
|
header.size = sizeof(Header); // Header size for validation
|
|
header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture
|
|
header.reserved = 0; // padding reserved for future use
|
|
header.process_id = pid; // Process identifier
|
|
header.time_stamp = get_current_time_microseconds(); // Creation time
|
|
header.flags = 0; // No special flags currently used
|
|
|
|
perf_map_jit_write_fully(&header, sizeof(header));
|
|
}
|
|
|
|
// =============================================================================
|
|
// JITDUMP INITIALIZATION
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Initialize the perf jitdump interface
|
|
*
|
|
* This function sets up everything needed to generate jitdump files:
|
|
* 1. Creates the jitdump file with a unique name
|
|
* 2. Maps the first page to signal perf that we're using the interface
|
|
* 3. Writes the jitdump header
|
|
* 4. Initializes synchronization primitives
|
|
*
|
|
* The memory mapping is crucial - perf detects jitdump files by scanning
|
|
* for processes that have mapped files matching the pattern /tmp/jit-*.dump
|
|
*
|
|
* Returns: Pointer to initialized state, or NULL on failure
|
|
*/
|
|
static void* perf_map_jit_init(void) {
|
|
PyMutex_Lock(&perf_jit_map_state.map_lock);
|
|
if (perf_jit_map_state.perf_map != NULL) {
|
|
PyMutex_Unlock(&perf_jit_map_state.map_lock);
|
|
return &perf_jit_map_state;
|
|
}
|
|
|
|
char filename[100];
|
|
int pid = getpid();
|
|
|
|
/* Create unique filename based on process ID */
|
|
snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid);
|
|
|
|
/* Create/open the jitdump file with appropriate permissions */
|
|
const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
|
|
if (fd == -1) {
|
|
PyMutex_Unlock(&perf_jit_map_state.map_lock);
|
|
return NULL; // Failed to create file
|
|
}
|
|
|
|
/* Get system page size for memory mapping */
|
|
const long page_size = sysconf(_SC_PAGESIZE);
|
|
if (page_size == -1) {
|
|
close(fd);
|
|
PyMutex_Unlock(&perf_jit_map_state.map_lock);
|
|
return NULL; // Failed to get page size
|
|
}
|
|
|
|
#if defined(__APPLE__)
|
|
// On macOS, samply uses a preload to find jitdumps and this mmap can be slow.
|
|
perf_jit_map_state.mapped_buffer = NULL;
|
|
#else
|
|
/*
|
|
* Map the first page of the jitdump file
|
|
*
|
|
* This memory mapping serves as a signal to perf that this process
|
|
* is generating JIT code. Perf scans /proc/.../maps looking for mapped
|
|
* files that match the jitdump naming pattern.
|
|
*
|
|
* The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
|
|
*/
|
|
perf_jit_map_state.mapped_buffer = mmap(
|
|
NULL, // Let kernel choose address
|
|
page_size, // Map one page
|
|
PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf)
|
|
MAP_PRIVATE, // Private mapping
|
|
fd, // File descriptor
|
|
0 // Offset 0 (first page)
|
|
);
|
|
|
|
if (perf_jit_map_state.mapped_buffer == MAP_FAILED) {
|
|
perf_jit_map_state.mapped_buffer = NULL;
|
|
close(fd);
|
|
PyMutex_Unlock(&perf_jit_map_state.map_lock);
|
|
return NULL; // Memory mapping failed
|
|
}
|
|
(void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size,
|
|
"cpython:perf_jit_trampoline");
|
|
#endif
|
|
|
|
perf_jit_map_state.mapped_size = page_size;
|
|
|
|
/* Convert file descriptor to FILE* for easier I/O operations */
|
|
perf_jit_map_state.perf_map = fdopen(fd, "w+");
|
|
if (perf_jit_map_state.perf_map == NULL) {
|
|
close(fd);
|
|
PyMutex_Unlock(&perf_jit_map_state.map_lock);
|
|
return NULL; // Failed to create FILE*
|
|
}
|
|
|
|
/*
|
|
* Set up file buffering for better performance
|
|
*
|
|
* We use a large buffer (2MB) because jitdump files can be written
|
|
* frequently during program execution. Buffering reduces system call
|
|
* overhead and improves overall performance.
|
|
*/
|
|
setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
|
|
|
|
/* Write the jitdump file header */
|
|
perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
|
|
|
|
/* Initialize code ID counter */
|
|
perf_jit_map_state.code_id = 0;
|
|
perf_jit_map_state.build_id_salt =
|
|
((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks();
|
|
|
|
/* Calculate padding size based on actual unwind info requirements */
|
|
size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0);
|
|
size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
|
|
trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16);
|
|
trampoline_api.code_alignment = 32;
|
|
|
|
PyMutex_Unlock(&perf_jit_map_state.map_lock);
|
|
return &perf_jit_map_state;
|
|
}
|
|
|
|
// =============================================================================
|
|
// MAIN JITDUMP ENTRY WRITING
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Write a complete jitdump entry for a code region with a provided name.
|
|
*
|
|
* This shares the same implementation as the trampoline callback, but
|
|
* allows callers that don't have a PyCodeObject to reuse the jitdump
|
|
* infrastructure.
|
|
*/
|
|
static void perf_map_jit_write_entry_with_name(
|
|
void *state,
|
|
const void *code_addr,
|
|
size_t code_size,
|
|
const char *entry,
|
|
const char *filename
|
|
)
|
|
{
|
|
/* Initialize jitdump system on first use */
|
|
void* ret = perf_map_jit_init();
|
|
if (ret == NULL) {
|
|
return; // Initialization failed, silently abort
|
|
}
|
|
|
|
if (entry == NULL) {
|
|
entry = "";
|
|
}
|
|
if (filename == NULL) {
|
|
filename = "";
|
|
}
|
|
|
|
/*
|
|
* Create formatted function name for perf display
|
|
*
|
|
* Format: "py::<function_name>:<filename>"
|
|
* The "py::" prefix helps identify Python functions in mixed-language
|
|
* profiles (e.g., when profiling C extensions alongside Python code).
|
|
*/
|
|
size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
|
|
char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
|
|
if (perf_map_entry == NULL) {
|
|
return; // Memory allocation failed
|
|
}
|
|
snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
|
|
|
|
const size_t name_length = strlen(perf_map_entry);
|
|
uword base = (uword)code_addr;
|
|
uword size = code_size;
|
|
|
|
/*
|
|
* Generate DWARF unwinding information
|
|
*
|
|
* DWARF data is essential for proper stack unwinding during profiling.
|
|
* Without it, perf cannot generate accurate call graphs, especially
|
|
* in optimized code where frame pointers may be omitted.
|
|
*/
|
|
uint8_t buffer[1024]; // Buffer for DWARF data (1KB should be sufficient)
|
|
size_t eh_frame_size = _PyJitUnwind_BuildEhFrame(
|
|
buffer, sizeof(buffer), code_addr, code_size, 0);
|
|
if (eh_frame_size == 0) {
|
|
PyMem_RawFree(perf_map_entry);
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* A logical jitdump entry is written as multiple records and also consumes
|
|
* a process-global code_id. Serialize the whole sequence so concurrent JIT
|
|
* compilation cannot interleave records or reuse an ID.
|
|
*/
|
|
PyMutex_Lock(&perf_jit_map_state.map_lock);
|
|
|
|
/*
|
|
* Write Code Unwinding Information Event
|
|
*
|
|
* This event must be written before the code load event to ensure
|
|
* perf has the unwinding information available when it processes
|
|
* the code region.
|
|
*/
|
|
CodeUnwindingInfoEvent ev2;
|
|
ev2.base.event = PerfUnwindingInfo;
|
|
ev2.base.time_stamp = get_current_monotonic_ticks();
|
|
ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
|
|
|
|
/* Verify we don't exceed our padding budget */
|
|
assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);
|
|
|
|
ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
|
|
ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16); // 16-byte alignment
|
|
|
|
/* Calculate total event size with padding */
|
|
int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size);
|
|
int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size; // 8-byte align
|
|
ev2.base.size = (uint32_t)(content_size + padding_size);
|
|
|
|
/* Write the unwinding info event header */
|
|
perf_map_jit_write_fully(&ev2, sizeof(ev2));
|
|
|
|
/*
|
|
* Write EH Frame Header
|
|
*
|
|
* The EH frame header provides metadata about the DWARF unwinding
|
|
* information that follows. It includes pointers and counts that
|
|
* help perf navigate the unwinding data efficiently.
|
|
*/
|
|
EhFrameHeader f;
|
|
f.version = 1;
|
|
f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel;
|
|
f.fde_count_enc = DWRF_EH_PE_udata4;
|
|
f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel;
|
|
|
|
/* Calculate relative offsets for EH frame navigation */
|
|
f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char));
|
|
f.eh_fde_count = 1; // We generate exactly one FDE per function
|
|
f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size);
|
|
uint32_t cie_payload_size;
|
|
memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size));
|
|
int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size);
|
|
f.to = -(int32_t)(eh_frame_size - cie_size);
|
|
|
|
/* Write EH frame data and header */
|
|
perf_map_jit_write_fully(buffer, eh_frame_size);
|
|
perf_map_jit_write_fully(&f, sizeof(f));
|
|
|
|
/* Write padding to maintain alignment */
|
|
char padding_bytes[] = "\0\0\0\0\0\0\0\0";
|
|
perf_map_jit_write_fully(&padding_bytes, padding_size);
|
|
|
|
/*
|
|
* Write Code Load Event
|
|
*
|
|
* This event tells perf about the new code region. It includes:
|
|
* - Memory addresses and sizes
|
|
* - Process and thread identification
|
|
* - Function name for symbol resolution
|
|
* - The actual machine code bytes
|
|
*/
|
|
CodeLoadEvent ev;
|
|
ev.base.event = PerfLoad;
|
|
ev.base.size = sizeof(ev) + (name_length+1) + size;
|
|
ev.base.time_stamp = get_current_monotonic_ticks();
|
|
ev.process_id = getpid();
|
|
#if defined(__APPLE__)
|
|
pthread_threadid_np(NULL, &ev.thread_id);
|
|
#else
|
|
ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call
|
|
#endif
|
|
ev.vma = base; // Virtual memory address
|
|
ev.code_address = base; // Same as VMA for our use case
|
|
ev.code_size = size;
|
|
|
|
/* Assign unique code ID and increment counter */
|
|
perf_jit_map_state.code_id += 1;
|
|
ev.code_id = perf_jit_map_state.code_id;
|
|
|
|
/* Write code load event and associated data */
|
|
perf_map_jit_write_fully(&ev, sizeof(ev));
|
|
perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator
|
|
/*
|
|
* Ensure each synthetic DSO has unique .text bytes.
|
|
*
|
|
* perf merges DSOs that share a build-id. Since trampolines can share
|
|
* identical code and unwind bytes, perf may resolve all JIT frames to
|
|
* the first symbol it saw (including entries from previous runs when
|
|
* build-id caching is enabled). Patch a small marker in the emitted
|
|
* bytes to make the build-id depend on a per-process salt and code id
|
|
* without modifying the live code.
|
|
*/
|
|
uint64_t marker = perf_jit_map_state.build_id_salt ^
|
|
((uint64_t)perf_jit_map_state.code_id << 32) ^
|
|
(uint64_t)code_size;
|
|
if (size >= sizeof(marker)) {
|
|
size_t prefix = size - sizeof(marker);
|
|
perf_map_jit_write_fully((void *)(base), prefix);
|
|
perf_map_jit_write_fully(&marker, sizeof(marker));
|
|
}
|
|
else if (size > 0) {
|
|
uint8_t tmp[sizeof(marker)];
|
|
memcpy(tmp, (void *)(base), size);
|
|
for (size_t i = 0; i < size; i++) {
|
|
tmp[i] ^= (uint8_t)(marker >> (i * 8));
|
|
}
|
|
perf_map_jit_write_fully(tmp, size);
|
|
}
|
|
|
|
/* Clean up allocated memory */
|
|
PyMutex_Unlock(&perf_jit_map_state.map_lock);
|
|
PyMem_RawFree(perf_map_entry);
|
|
}
|
|
|
|
/*
|
|
* Write a complete jitdump entry for a Python function
|
|
*
|
|
* This is the main function called by Python's trampoline system whenever
|
|
* a new piece of JIT-compiled code needs to be recorded. It writes both
|
|
* the unwinding information and the code load event to the jitdump file.
|
|
*
|
|
* The function performs these steps:
|
|
* 1. Initialize jitdump system if not already done
|
|
* 2. Extract function name and filename from Python code object
|
|
* 3. Generate DWARF unwinding information
|
|
* 4. Write unwinding info event to jitdump file
|
|
* 5. Write code load event to jitdump file
|
|
*
|
|
* Args:
|
|
* state: Jitdump state (currently unused, uses global state)
|
|
* code_addr: Address where the compiled code resides
|
|
* code_size: Size of the compiled code in bytes
|
|
* co: Python code object containing metadata
|
|
*
|
|
* IMPORTANT: This function signature is part of Python's internal API
|
|
* and must not be changed without coordinating with core Python development.
|
|
*/
|
|
static void perf_map_jit_write_entry(void *state, const void *code_addr,
|
|
size_t code_size, PyCodeObject *co)
|
|
{
|
|
const char *entry = "";
|
|
const char *filename = "";
|
|
if (co != NULL) {
|
|
if (co->co_qualname != NULL) {
|
|
entry = PyUnicode_AsUTF8(co->co_qualname);
|
|
}
|
|
if (co->co_filename != NULL) {
|
|
filename = PyUnicode_AsUTF8(co->co_filename);
|
|
}
|
|
}
|
|
perf_map_jit_write_entry_with_name(state, code_addr, code_size,
|
|
entry, filename);
|
|
}
|
|
|
|
/*
 * Write a jitdump entry for a code region identified by explicit names
 * rather than by a code object.
 *
 * Forwards to perf_map_jit_write_entry_with_name() with a NULL state.
 *
 * Args:
 *   code_addr: Address of the code region
 *   code_size: Size of the code region in bytes
 *   entry: Function name to record
 *   filename: File name to record
 */
void
_PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size,
                          const char *entry, const char *filename)
{
    perf_map_jit_write_entry_with_name(NULL, code_addr, code_size,
                                       entry, filename);
}
|
|
|
|
// =============================================================================
|
|
// CLEANUP AND FINALIZATION
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Finalize and cleanup the perf jitdump system
|
|
*
|
|
* This function is called when Python is shutting down or when the
|
|
* perf trampoline system is being disabled. It ensures all resources
|
|
* are properly released and all buffered data is flushed to disk.
|
|
*
|
|
* Args:
|
|
* state: Jitdump state (currently unused, uses global state)
|
|
*
|
|
* Returns: 0 on success
|
|
*
|
|
* IMPORTANT: This function signature is part of Python's internal API
|
|
* and must not be changed without coordinating with core Python development.
|
|
*/
|
|
static int perf_map_jit_fini(void* state) {
|
|
/*
|
|
* Close jitdump file with proper synchronization
|
|
*
|
|
* We need to acquire the lock to ensure no other threads are
|
|
* writing to the file when we close it. This prevents corruption
|
|
* and ensures all data is properly flushed.
|
|
*/
|
|
PyMutex_Lock(&perf_jit_map_state.map_lock);
|
|
if (perf_jit_map_state.perf_map != NULL) {
|
|
fclose(perf_jit_map_state.perf_map); // This also flushes buffers
|
|
perf_jit_map_state.perf_map = NULL;
|
|
}
|
|
PyMutex_Unlock(&perf_jit_map_state.map_lock);
|
|
|
|
/*
|
|
* Unmap the memory region
|
|
*
|
|
* This removes the signal to perf that we were generating JIT code.
|
|
* After this point, perf will no longer detect this process as
|
|
* having JIT capabilities.
|
|
*/
|
|
if (perf_jit_map_state.mapped_buffer != NULL) {
|
|
munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
|
|
perf_jit_map_state.mapped_buffer = NULL;
|
|
}
|
|
|
|
/* Clear global state reference */
|
|
trampoline_api.state = NULL;
|
|
|
|
return 0; // Success
|
|
}
|
|
|
|
// =============================================================================
|
|
// PUBLIC API EXPORT
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Python Perf Callbacks Structure
|
|
*
|
|
* This structure defines the callback interface that Python's trampoline
|
|
* system uses to integrate with perf profiling. It contains function
|
|
* pointers for initialization, event writing, and cleanup.
|
|
*
|
|
* CRITICAL: This structure and its contents are part of Python's internal
|
|
* API. The function signatures and behavior must remain stable to maintain
|
|
* compatibility with the Python interpreter's perf integration system.
|
|
*
|
|
* Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h
|
|
*/
|
|
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
|
|
&perf_map_jit_init, // Initialization function
|
|
&perf_map_jit_write_entry, // Event writing function
|
|
&perf_map_jit_fini, // Cleanup function
|
|
};
|
|
|
|
#endif /* PY_HAVE_PERF_TRAMPOLINE */
|