/* * Python Perf Trampoline Support - JIT Dump Implementation * * This file implements the perf jitdump API for Python's performance profiling * integration. It allows perf (Linux performance analysis tool) to understand * and profile dynamically generated Python bytecode by creating JIT dump files * that perf can inject into its analysis. * * * IMPORTANT: This file exports specific callback functions that are part of * Python's internal API. Do not modify the function signatures or behavior * of exported functions without coordinating with the Python core team. * * Usually the binary and libraries are mapped in separate region like below: * * address -> * --+---------------------+--//--+---------------------+-- * | .text | .data | ... | | .text | .data | ... | * --+---------------------+--//--+---------------------+-- * myprog libc.so * * So it'd be easy and straight-forward to find a mapped binary or library from an * address. * * But for JIT code, the code arena only cares about the code section. But the * resulting DSOs (which is generated by perf inject -j) contain ELF headers and * unwind info too. Then it'd generate following address space with synthesized * MMAP events. Let's say it has a sample between address B and C. * * sample * | * address -> A B v C * --------------------------------------------------------------------------------------------------- * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | * /tmp/jitted-PID-1.so | (headers) | .text | unwind info | * /tmp/jitted-PID-2.so | (headers) | .text | unwind info | * ... * --------------------------------------------------------------------------------------------------- * * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see * the unwind info. If it maps both .text section and unwind sections, the sample * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing * which one is right. 
So to make perf happy we have non-overlapping ranges for each DSO:
 *
 * address ->
 * -------------------------------------------------------------------------------------------------------
 * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | (padding) |
 * /tmp/jitted-PID-1.so                                               | (headers) | .text | unwind info |
 * /tmp/jitted-PID-2.so ...
 * -------------------------------------------------------------------------------------------------------
 *
 * As the trampolines are constant, we add a constant padding but in general
 * the padding needs to have the size of the unwind info rounded to 16 bytes.
 * In general, for our trampolines this is 0x50
 */

#include "Python.h"
#include "pycore_ceval.h"       // _PyPerf_Callbacks
#include "pycore_frame.h"
#include "pycore_interp.h"
#include "pycore_mmap.h"        // _PyAnnotateMemoryMap()
#include "pycore_jit_unwind.h"
#include "pycore_runtime.h"     // _PyRuntime

#ifdef PY_HAVE_PERF_TRAMPOLINE

/* Standard library includes for perf jitdump implementation */
#if defined(__linux__)
#  include <elf.h>          // ELF architecture constants
#endif
#include <fcntl.h>          // File control operations
#include <stdio.h>          // Standard I/O operations
#include <stdlib.h>         // Standard library functions
#include <string.h>         // memcpy, strlen
#include <sys/mman.h>       // Memory mapping functions (mmap)
#include <sys/types.h>      // System data types
#include <time.h>           // clock_gettime (CLOCK_MONOTONIC)
#include <unistd.h>         // System calls (sysconf, getpid)
#include <sys/time.h>       // Time functions (gettimeofday)
#if defined(__linux__)
#  include <sys/syscall.h>  // System call interface
#endif

// =============================================================================
// CONSTANTS AND CONFIGURATION
// =============================================================================

/*
 * Memory layout considerations for perf jitdump:
 *
 * Perf expects non-overlapping memory regions for each JIT-compiled function.
* When perf processes the jitdump file, it creates synthetic DSO (Dynamic * Shared Object) files that contain: * - ELF headers * - .text section (actual machine code) * - Unwind information (for stack traces) * * To ensure proper address space layout, we add padding between code regions. * This prevents address conflicts when perf maps the synthesized DSOs. * * Memory layout example: * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding] * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding] * * The padding size is now calculated automatically during initialization * based on the actual unwind information requirements. */ /* These constants are defined inside , which we can't use outside of linux. */ #if !defined(__linux__) # if defined(__i386__) || defined(_M_IX86) # define EM_386 3 # elif defined(__arm__) || defined(_M_ARM) # define EM_ARM 40 # elif defined(__x86_64__) || defined(_M_X64) # define EM_X86_64 62 # elif defined(__aarch64__) # define EM_AARCH64 183 # elif defined(__riscv) # define EM_RISCV 243 # endif #endif /* Convenient access to the global trampoline API state */ #define trampoline_api _PyRuntime.ceval.perf.trampoline_api /* Type aliases for clarity and portability */ typedef uint64_t uword; // Word-sized unsigned integer typedef const char* CodeComments; // Code comment strings /* Memory size constants */ #define MB (1024 * 1024) // 1 Megabyte for buffer sizing // ============================================================================= // ARCHITECTURE-SPECIFIC DEFINITIONS // ============================================================================= /* * Returns the ELF machine architecture constant for the current platform. * This is required for the jitdump header to correctly identify the target * architecture for perf processing. 
* */ static uint64_t GetElfMachineArchitecture(void) { #if defined(__x86_64__) || defined(_M_X64) return EM_X86_64; #elif defined(__i386__) || defined(_M_IX86) return EM_386; #elif defined(__aarch64__) return EM_AARCH64; #elif defined(__arm__) || defined(_M_ARM) return EM_ARM; #elif defined(__riscv) return EM_RISCV; #else Py_UNREACHABLE(); // Unsupported architecture - should never reach here return 0; #endif } // ============================================================================= // PERF JITDUMP DATA STRUCTURES // ============================================================================= /* * Perf jitdump file format structures * * These structures define the binary format that perf expects for JIT dump files. * The format is documented in the Linux perf tools source code and must match * exactly for proper perf integration. */ /* * Jitdump file header - written once at the beginning of each jitdump file * Contains metadata about the process and jitdump format version */ typedef struct { uint32_t magic; // Magic number (0x4A695444 = "JiTD") uint32_t version; // Jitdump format version (currently 1) uint32_t size; // Size of this header structure uint32_t elf_mach_target; // Target architecture (from GetElfMachineArchitecture) uint32_t reserved; // Reserved field (must be 0) uint32_t process_id; // Process ID of the JIT compiler uint64_t time_stamp; // Timestamp when jitdump was created uint64_t flags; // Feature flags (currently unused) } Header; /* * Perf event types supported by the jitdump format * Each event type has a corresponding structure format */ enum PerfEvent { PerfLoad = 0, // Code load event (new JIT function) PerfMove = 1, // Code move event (function relocated) PerfDebugInfo = 2, // Debug information event PerfClose = 3, // JIT session close event PerfUnwindingInfo = 4 // Stack unwinding information event }; /* * Base event structure - common header for all perf events * Every event in the jitdump file starts with this structure */ 
struct BaseEvent { uint32_t event; // Event type (from PerfEvent enum) uint32_t size; // Total size of this event including payload uint64_t time_stamp; // Timestamp when event occurred }; /* * Code load event - indicates a new JIT-compiled function is available * This is the most important event type for Python profiling */ typedef struct { struct BaseEvent base; // Common event header uint32_t process_id; // Process ID where code was generated #if defined(__APPLE__) uint64_t thread_id; // Thread ID where code was generated #else uint32_t thread_id; // Thread ID where code was generated #endif uint64_t vma; // Virtual memory address where code is loaded uint64_t code_address; // Address of the actual machine code uint64_t code_size; // Size of the machine code in bytes uint64_t code_id; // Unique identifier for this code region /* Followed by: * - null-terminated function name string * - raw machine code bytes */ } CodeLoadEvent; /* * Code unwinding information event - provides DWARF data for stack traces * Essential for proper stack unwinding during profiling */ typedef struct { struct BaseEvent base; // Common event header uint64_t unwind_data_size; // Size of the unwinding data uint64_t eh_frame_hdr_size; // Size of the EH frame header uint64_t mapped_size; // Total mapped size (with padding) /* Followed by: * - EH frame header * - DWARF unwinding information * - Padding to alignment boundary */ } CodeUnwindingInfoEvent; /* * EH Frame Header structure for DWARF unwinding * * This header provides metadata about the .eh_frame data that follows. * It uses PC-relative and data-relative encodings to keep the synthesized * DSO self-contained when perf injects it. 
*/ typedef struct __attribute__((packed)) { uint8_t version; uint8_t eh_frame_ptr_enc; uint8_t fde_count_enc; uint8_t table_enc; int32_t eh_frame_ptr; uint32_t eh_fde_count; int32_t from; int32_t to; } EhFrameHeader; _Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch"); // ============================================================================= // GLOBAL STATE MANAGEMENT // ============================================================================= /* * Global state for the perf jitdump implementation * * This structure maintains all the state needed for generating jitdump files. * It's designed as a singleton since there's typically only one jitdump file * per Python process. */ typedef struct { FILE* perf_map; // File handle for the jitdump file PyMutex map_lock; // Thread synchronization lock void* mapped_buffer; // Memory-mapped region (signals perf we're active) size_t mapped_size; // Size of the mapped region uint32_t code_id; // Counter for unique code region identifiers uint64_t build_id_salt; // Per-process salt for unique synthetic DSOs } PerfMapJitState; /* Global singleton instance */ static PerfMapJitState perf_jit_map_state; // ============================================================================= // TIME UTILITIES // ============================================================================= /* Time conversion constant */ static const intptr_t nanoseconds_per_second = 1000000000; /* * Get current monotonic time in nanoseconds * * Monotonic time is preferred for event timestamps because it's not affected * by system clock adjustments. This ensures consistent timing relationships * between events even if the system clock is changed. 
* * Returns: Current monotonic time in nanoseconds since an arbitrary epoch */ static int64_t get_current_monotonic_ticks(void) { struct timespec ts; if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { Py_UNREACHABLE(); // Should never fail on supported systems return 0; } /* Convert to nanoseconds for maximum precision */ int64_t result = ts.tv_sec; result *= nanoseconds_per_second; result += ts.tv_nsec; return result; } /* * Get current wall clock time in microseconds * * Used for the jitdump file header timestamp. Unlike monotonic time, * this represents actual wall clock time that can be correlated with * other system events. * * Returns: Current time in microseconds since Unix epoch */ static int64_t get_current_time_microseconds(void) { struct timeval tv; if (gettimeofday(&tv, NULL) < 0) { Py_UNREACHABLE(); // Should never fail on supported systems return 0; } return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; } // ============================================================================= // FILE I/O UTILITIES // ============================================================================= /* * Write data to the jitdump file with error handling * * This function ensures that all data is written to the file, handling * partial writes that can occur with large buffers or when the system * is under load. * * Args: * buffer: Pointer to data to write * size: Number of bytes to write */ static void perf_map_jit_write_fully(const void* buffer, size_t size) { FILE* out_file = perf_jit_map_state.perf_map; const char* ptr = (const char*)(buffer); while (size > 0) { const size_t written = fwrite(ptr, 1, size, out_file); if (written == 0) { Py_UNREACHABLE(); // Write failure - should be very rare break; } size -= written; ptr += written; } } /* * Write the jitdump file header * * The header must be written exactly once at the beginning of each jitdump * file. It provides metadata that perf uses to parse the rest of the file. 
* * Args: * pid: Process ID to include in the header * out_file: File handle to write to (currently unused, uses global state) */ static void perf_map_jit_write_header(int pid, FILE* out_file) { Header header; /* Initialize header with required values */ header.magic = 0x4A695444; // "JiTD" magic number header.version = 1; // Current jitdump version header.size = sizeof(Header); // Header size for validation header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture header.reserved = 0; // padding reserved for future use header.process_id = pid; // Process identifier header.time_stamp = get_current_time_microseconds(); // Creation time header.flags = 0; // No special flags currently used perf_map_jit_write_fully(&header, sizeof(header)); } // ============================================================================= // JITDUMP INITIALIZATION // ============================================================================= /* * Initialize the perf jitdump interface * * This function sets up everything needed to generate jitdump files: * 1. Creates the jitdump file with a unique name * 2. Maps the first page to signal perf that we're using the interface * 3. Writes the jitdump header * 4. 
Initializes synchronization primitives * * The memory mapping is crucial - perf detects jitdump files by scanning * for processes that have mapped files matching the pattern /tmp/jit-*.dump * * Returns: Pointer to initialized state, or NULL on failure */ static void* perf_map_jit_init(void) { PyMutex_Lock(&perf_jit_map_state.map_lock); if (perf_jit_map_state.perf_map != NULL) { PyMutex_Unlock(&perf_jit_map_state.map_lock); return &perf_jit_map_state; } char filename[100]; int pid = getpid(); /* Create unique filename based on process ID */ snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); /* Create/open the jitdump file with appropriate permissions */ const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); if (fd == -1) { PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Failed to create file } /* Get system page size for memory mapping */ const long page_size = sysconf(_SC_PAGESIZE); if (page_size == -1) { close(fd); PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Failed to get page size } #if defined(__APPLE__) // On macOS, samply uses a preload to find jitdumps and this mmap can be slow. perf_jit_map_state.mapped_buffer = NULL; #else /* * Map the first page of the jitdump file * * This memory mapping serves as a signal to perf that this process * is generating JIT code. Perf scans /proc/.../maps looking for mapped * files that match the jitdump naming pattern. * * The mapping must be PROT_READ | PROT_EXEC to be detected by perf. 
*/ perf_jit_map_state.mapped_buffer = mmap( NULL, // Let kernel choose address page_size, // Map one page PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf) MAP_PRIVATE, // Private mapping fd, // File descriptor 0 // Offset 0 (first page) ); if (perf_jit_map_state.mapped_buffer == MAP_FAILED) { perf_jit_map_state.mapped_buffer = NULL; close(fd); PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Memory mapping failed } (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size, "cpython:perf_jit_trampoline"); #endif perf_jit_map_state.mapped_size = page_size; /* Convert file descriptor to FILE* for easier I/O operations */ perf_jit_map_state.perf_map = fdopen(fd, "w+"); if (perf_jit_map_state.perf_map == NULL) { close(fd); PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Failed to create FILE* } /* * Set up file buffering for better performance * * We use a large buffer (2MB) because jitdump files can be written * frequently during program execution. Buffering reduces system call * overhead and improves overall performance. 
*/ setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); /* Write the jitdump file header */ perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); /* Initialize code ID counter */ perf_jit_map_state.code_id = 0; perf_jit_map_state.build_id_salt = ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks(); /* Calculate padding size based on actual unwind info requirements */ size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0); size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16); trampoline_api.code_alignment = 32; PyMutex_Unlock(&perf_jit_map_state.map_lock); return &perf_jit_map_state; } // ============================================================================= // MAIN JITDUMP ENTRY WRITING // ============================================================================= /* * Write a complete jitdump entry for a code region with a provided name. * * This shares the same implementation as the trampoline callback, but * allows callers that don't have a PyCodeObject to reuse the jitdump * infrastructure. */ static void perf_map_jit_write_entry_with_name( void *state, const void *code_addr, size_t code_size, const char *entry, const char *filename ) { /* Initialize jitdump system on first use */ void* ret = perf_map_jit_init(); if (ret == NULL) { return; // Initialization failed, silently abort } if (entry == NULL) { entry = ""; } if (filename == NULL) { filename = ""; } /* * Create formatted function name for perf display * * Format: "py:::" * The "py::" prefix helps identify Python functions in mixed-language * profiles (e.g., when profiling C extensions alongside Python code). 
*/ size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); if (perf_map_entry == NULL) { return; // Memory allocation failed } snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); const size_t name_length = strlen(perf_map_entry); uword base = (uword)code_addr; uword size = code_size; /* * Generate DWARF unwinding information * * DWARF data is essential for proper stack unwinding during profiling. * Without it, perf cannot generate accurate call graphs, especially * in optimized code where frame pointers may be omitted. */ uint8_t buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( buffer, sizeof(buffer), code_addr, code_size, 0); if (eh_frame_size == 0) { PyMem_RawFree(perf_map_entry); return; } /* * A logical jitdump entry is written as multiple records and also consumes * a process-global code_id. Serialize the whole sequence so concurrent JIT * compilation cannot interleave records or reuse an ID. */ PyMutex_Lock(&perf_jit_map_state.map_lock); /* * Write Code Unwinding Information Event * * This event must be written before the code load event to ensure * perf has the unwinding information available when it processes * the code region. 
*/ CodeUnwindingInfoEvent ev2; ev2.base.event = PerfUnwindingInfo; ev2.base.time_stamp = get_current_monotonic_ticks(); ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; /* Verify we don't exceed our padding budget */ assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding); ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16); // 16-byte alignment /* Calculate total event size with padding */ int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size); int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size; // 8-byte align ev2.base.size = (uint32_t)(content_size + padding_size); /* Write the unwinding info event header */ perf_map_jit_write_fully(&ev2, sizeof(ev2)); /* * Write EH Frame Header * * The EH frame header provides metadata about the DWARF unwinding * information that follows. It includes pointers and counts that * help perf navigate the unwinding data efficiently. 
*/ EhFrameHeader f; f.version = 1; f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel; f.fde_count_enc = DWRF_EH_PE_udata4; f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel; /* Calculate relative offsets for EH frame navigation */ f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char)); f.eh_fde_count = 1; // We generate exactly one FDE per function f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size); uint32_t cie_payload_size; memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size)); int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size); f.to = -(int32_t)(eh_frame_size - cie_size); /* Write EH frame data and header */ perf_map_jit_write_fully(buffer, eh_frame_size); perf_map_jit_write_fully(&f, sizeof(f)); /* Write padding to maintain alignment */ char padding_bytes[] = "\0\0\0\0\0\0\0\0"; perf_map_jit_write_fully(&padding_bytes, padding_size); /* * Write Code Load Event * * This event tells perf about the new code region. It includes: * - Memory addresses and sizes * - Process and thread identification * - Function name for symbol resolution * - The actual machine code bytes */ CodeLoadEvent ev; ev.base.event = PerfLoad; ev.base.size = sizeof(ev) + (name_length+1) + size; ev.base.time_stamp = get_current_monotonic_ticks(); ev.process_id = getpid(); #if defined(__APPLE__) pthread_threadid_np(NULL, &ev.thread_id); #else ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call #endif ev.vma = base; // Virtual memory address ev.code_address = base; // Same as VMA for our use case ev.code_size = size; /* Assign unique code ID and increment counter */ perf_jit_map_state.code_id += 1; ev.code_id = perf_jit_map_state.code_id; /* Write code load event and associated data */ perf_map_jit_write_fully(&ev, sizeof(ev)); perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator /* * Ensure each synthetic DSO has unique .text bytes. * * perf merges DSOs that share a build-id. 
Since trampolines can share * identical code and unwind bytes, perf may resolve all JIT frames to * the first symbol it saw (including entries from previous runs when * build-id caching is enabled). Patch a small marker in the emitted * bytes to make the build-id depend on a per-process salt and code id * without modifying the live code. */ uint64_t marker = perf_jit_map_state.build_id_salt ^ ((uint64_t)perf_jit_map_state.code_id << 32) ^ (uint64_t)code_size; if (size >= sizeof(marker)) { size_t prefix = size - sizeof(marker); perf_map_jit_write_fully((void *)(base), prefix); perf_map_jit_write_fully(&marker, sizeof(marker)); } else if (size > 0) { uint8_t tmp[sizeof(marker)]; memcpy(tmp, (void *)(base), size); for (size_t i = 0; i < size; i++) { tmp[i] ^= (uint8_t)(marker >> (i * 8)); } perf_map_jit_write_fully(tmp, size); } /* Clean up allocated memory */ PyMutex_Unlock(&perf_jit_map_state.map_lock); PyMem_RawFree(perf_map_entry); } /* * Write a complete jitdump entry for a Python function * * This is the main function called by Python's trampoline system whenever * a new piece of JIT-compiled code needs to be recorded. It writes both * the unwinding information and the code load event to the jitdump file. * * The function performs these steps: * 1. Initialize jitdump system if not already done * 2. Extract function name and filename from Python code object * 3. Generate DWARF unwinding information * 4. Write unwinding info event to jitdump file * 5. Write code load event to jitdump file * * Args: * state: Jitdump state (currently unused, uses global state) * code_addr: Address where the compiled code resides * code_size: Size of the compiled code in bytes * co: Python code object containing metadata * * IMPORTANT: This function signature is part of Python's internal API * and must not be changed without coordinating with core Python development. 
*/ static void perf_map_jit_write_entry(void *state, const void *code_addr, size_t code_size, PyCodeObject *co) { const char *entry = ""; const char *filename = ""; if (co != NULL) { if (co->co_qualname != NULL) { entry = PyUnicode_AsUTF8(co->co_qualname); } if (co->co_filename != NULL) { filename = PyUnicode_AsUTF8(co->co_filename); } } perf_map_jit_write_entry_with_name(state, code_addr, code_size, entry, filename); } void _PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size, const char *entry, const char *filename) { perf_map_jit_write_entry_with_name( NULL, code_addr, code_size, entry, filename); } // ============================================================================= // CLEANUP AND FINALIZATION // ============================================================================= /* * Finalize and cleanup the perf jitdump system * * This function is called when Python is shutting down or when the * perf trampoline system is being disabled. It ensures all resources * are properly released and all buffered data is flushed to disk. * * Args: * state: Jitdump state (currently unused, uses global state) * * Returns: 0 on success * * IMPORTANT: This function signature is part of Python's internal API * and must not be changed without coordinating with core Python development. */ static int perf_map_jit_fini(void* state) { /* * Close jitdump file with proper synchronization * * We need to acquire the lock to ensure no other threads are * writing to the file when we close it. This prevents corruption * and ensures all data is properly flushed. */ PyMutex_Lock(&perf_jit_map_state.map_lock); if (perf_jit_map_state.perf_map != NULL) { fclose(perf_jit_map_state.perf_map); // This also flushes buffers perf_jit_map_state.perf_map = NULL; } PyMutex_Unlock(&perf_jit_map_state.map_lock); /* * Unmap the memory region * * This removes the signal to perf that we were generating JIT code. 
* After this point, perf will no longer detect this process as * having JIT capabilities. */ if (perf_jit_map_state.mapped_buffer != NULL) { munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); perf_jit_map_state.mapped_buffer = NULL; } /* Clear global state reference */ trampoline_api.state = NULL; return 0; // Success } // ============================================================================= // PUBLIC API EXPORT // ============================================================================= /* * Python Perf Callbacks Structure * * This structure defines the callback interface that Python's trampoline * system uses to integrate with perf profiling. It contains function * pointers for initialization, event writing, and cleanup. * * CRITICAL: This structure and its contents are part of Python's internal * API. The function signatures and behavior must remain stable to maintain * compatibility with the Python interpreter's perf integration system. * * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h */ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { &perf_map_jit_init, // Initialization function &perf_map_jit_write_entry, // Event writing function &perf_map_jit_fini, // Cleanup function }; #endif /* PY_HAVE_PERF_TRAMPOLINE */