cpython/Python/jit_unwind.c

/*
 * Python JIT - DWARF .eh_frame builder
 *
 * This file contains the DWARF CFI generator used to build .eh_frame
 * data for JIT code (perf jitdump and other unwinders).
 */

#include "Python.h"
#include "pycore_jit_unwind.h"
#include "pycore_lock.h"

#if defined(PY_HAVE_JIT_GDB_UNWIND)
#  include "jit_unwind_info.h"
#  if !JIT_UNWIND_INFO_SUPPORTED
#    error "JIT unwind info was not generated for this target"
#  endif
#endif

#if defined(PY_HAVE_PERF_TRAMPOLINE) || defined(PY_HAVE_JIT_GDB_UNWIND)

#if defined(PY_HAVE_JIT_GDB_UNWIND)
#  include <elf.h>
#endif
#include <stdio.h>
#include <string.h>

// =============================================================================
//                              DWARF CONSTANTS
// =============================================================================

/*
 * DWARF (Debug With Arbitrary Record Formats) constants
 *
 * DWARF is a debugging data format used to provide stack unwinding information.
 * These constants define the various encoding types and opcodes used in
 * DWARF Call Frame Information (CFI) records.
 */

/*
 * DWARF Call Frame Information version.
 *
 * Written as the version byte of every CIE this file emits (both the perf
 * and the GDB flavor use it).
 */
#define DWRF_CIE_VERSION 1

/*
 * DWARF CFA (Call Frame Address) opcodes.
 *
 * DWRF_CFA_advance_loc, DWRF_CFA_offset and DWRF_CFA_restore are emitted in
 * this file OR-ed with a small operand (a location delta or a register
 * number) in the same byte; the other opcodes are emitted as a bare byte
 * followed by their LEB128 operands.
 */
enum {
    DWRF_CFA_nop = 0x0,                    // No operation (also used as padding)
    DWRF_CFA_offset_extended = 0x5,        // Extended offset instruction
    DWRF_CFA_def_cfa = 0xc,               // Define CFA rule (register + offset)
    DWRF_CFA_def_cfa_register = 0xd,      // Define CFA register (offset unchanged)
    DWRF_CFA_def_cfa_offset = 0xe,        // Define CFA offset (register unchanged)
    DWRF_CFA_offset_extended_sf = 0x11,   // Extended signed offset
    DWRF_CFA_advance_loc = 0x40,          // Advance location counter (delta OR-ed in)
    DWRF_CFA_offset = 0x80,               // Simple offset instruction (register OR-ed in)
    DWRF_CFA_restore = 0xc0               // Restore register (register OR-ed in, no operand byte)
};

/*
 * Architecture-specific DWARF register numbers
 *
 * These constants define the register numbering scheme used by DWARF
 * for each supported architecture. The numbers must match the ABI
 * specification for proper stack unwinding.
 *
 * Only x86_64 and little-endian LP64 AArch64 are handled; any other target
 * stops the build with #error.
 */
enum {
#ifdef __x86_64__
    /*
     * x86_64 register numbering (note: order is defined by x86_64 ABI).
     * The enumerators carry no explicit values, so each number is simply its
     * position in this list: do not reorder or insert entries.
     */
    DWRF_REG_AX,    // RAX (0)
    DWRF_REG_DX,    // RDX (1)
    DWRF_REG_CX,    // RCX (2)
    DWRF_REG_BX,    // RBX (3)
    DWRF_REG_SI,    // RSI (4)
    DWRF_REG_DI,    // RDI (5)
    DWRF_REG_BP,    // RBP (6)
    DWRF_REG_SP,    // RSP (7)
    DWRF_REG_8,     // R8
    DWRF_REG_9,     // R9
    DWRF_REG_10,    // R10
    DWRF_REG_11,    // R11
    DWRF_REG_12,    // R12
    DWRF_REG_13,    // R13
    DWRF_REG_14,    // R14
    DWRF_REG_15,    // R15
    DWRF_REG_RA,    // Return address (RIP) (16)
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
    /* AArch64 register numbering (only the registers this file needs) */
    DWRF_REG_FP = 29,  // Frame Pointer
    DWRF_REG_RA = 30,  // Link register (return address)
    DWRF_REG_SP = 31,  // Stack pointer
#else
#    error "Unsupported target architecture"
#endif
};

// =============================================================================
//                              ELF OBJECT CONTEXT
// =============================================================================

/*
 * Context for building ELF/DWARF structures
 *
 * This structure maintains state while constructing DWARF unwind information.
 * It acts as a simple buffer manager with pointers to track current position
 * and important landmarks within the buffer.
 *
 * The structure does not record the buffer's capacity: the emitters write
 * through `p` without bounds checks, so callers must supply a buffer that is
 * large enough.
 */
typedef struct ELFObjectContext {
    uint8_t* p;            // Current write position in buffer
    uint8_t* startp;       // Start of buffer (for offset calculations)
    uint8_t* fde_p;        // Address of the FDE's PC-offset field; set by the perf flavor so the field can be patched after emission, left as initialised (NULL) by the GDB flavor
    uintptr_t code_addr;   // Address of the code section
    size_t code_size;      // Size of the code section
} ELFObjectContext;

// =============================================================================
//                              DWARF GENERATION UTILITIES
// =============================================================================

/*
 * Copy a C string, terminating NUL included, to the context's write cursor.
 *
 * Args:
 *   ctx: ELF object context; ctx->p is advanced past the copied bytes
 *   str: NUL-terminated string to copy
 *
 * Returns: Offset of the first copied byte, measured from ctx->startp
 *
 * No bounds checking is performed; the caller guarantees enough room.
 */
static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) {
    uint8_t *out = ctx->p;
    const uint32_t start_offset = (uint32_t)(out - ctx->startp);

    /* Byte-wise copy that stops right after the terminator has been stored */
    for (;;) {
        const char ch = *str++;
        *out++ = (uint8_t)ch;
        if (ch == '\0') {
            break;
        }
    }

    ctx->p = out;
    return start_offset;
}

/*
 * Append a SLEB128 (Signed Little Endian Base 128) value
 *
 * SLEB128 is a variable-length encoding used extensively in DWARF.
 * It efficiently encodes small numbers in fewer bytes: each output byte
 * carries 7 payload bits, and the top bit says whether more bytes follow.
 *
 * Args:
 *   ctx: ELF object context; ctx->p is advanced past the encoded bytes
 *   v: Signed value to encode (at most 5 bytes are written)
 *
 * No bounds checking is performed; the caller guarantees enough room.
 */
static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) {
    uint8_t* p = ctx->p;

    /*
     * Keep emitting continuation bytes while v is outside [-0x40, 0x3f],
     * i.e. while it does not fit the 7-bit signed payload of a final byte.
     * The range test is done in unsigned arithmetic: adding 0x40 to a value
     * close to INT32_MAX as a signed int would overflow, which is undefined.
     * The right shift relies on arithmetic (sign-preserving) shifting of
     * negative values, as the original code did.
     */
    while ((uint32_t)v + 0x40u >= 0x80u) {
        *p++ = (uint8_t)((v & 0x7f) | 0x80);  // Set continuation bit
        v >>= 7;
    }
    *p++ = (uint8_t)(v & 0x7f);  // Final byte without continuation bit

    ctx->p = p;
}

/*
 * Append a ULEB128 (Unsigned Little Endian Base 128) value
 *
 * Similar to SLEB128 but for unsigned values: 7 payload bits per byte,
 * least-significant group first, top bit set on every byte except the last.
 *
 * Args:
 *   ctx: ELF object context; ctx->p is advanced past the encoded bytes
 *   v: Unsigned value to encode (at most 5 bytes are written)
 *
 * No bounds checking is performed; the caller guarantees enough room.
 */
static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
    uint8_t* p = ctx->p;

    /*
     * Encode 7 bits at a time, with continuation bit in MSB.  The bytes are
     * cast to uint8_t (the buffer's element type) rather than char: values
     * with the top bit set do not fit a signed char.
     */
    for (; v >= 0x80; v >>= 7) {
        *p++ = (uint8_t)((v & 0x7f) | 0x80);  // Set continuation bit
    }
    *p++ = (uint8_t)v;  // Final byte without continuation bit (v < 0x80 here)

    ctx->p = p;
}

/*
 * Macros for generating DWARF structures
 *
 * These macros provide a convenient way to write various data types
 * to the DWARF buffer while automatically advancing the pointer.
 *
 * They are not self-contained: every macro expects a local `uint8_t *p`
 * (the write cursor) in the expanding scope, and DWRF_UV, DWRF_SV, DWRF_STR
 * and DWRF_ALIGNNOP additionally expect a local `ctx` pointer to the
 * ELFObjectContext being filled.
 *
 * Multi-byte values are stored with memcpy() instead of through a casted
 * pointer, so the stores stay well-defined even when the cursor is not
 * naturally aligned for the stored type.
 */
#define DWRF_U8(x) (*p++ = (x))                                    // Write unsigned 8-bit
#define DWRF_I8(x) (*(int8_t*)p = (x), p++)                       // Write signed 8-bit
#define DWRF_U16(x) (memcpy(p, &(uint16_t){(x)}, sizeof(uint16_t)), p += 2)   // Write unsigned 16-bit
#define DWRF_U32(x) (memcpy(p, &(uint32_t){(x)}, sizeof(uint32_t)), p += 4)   // Write unsigned 32-bit
#define DWRF_ADDR(x) (memcpy(p, &(uintptr_t){(x)}, sizeof(uintptr_t)), p += sizeof(uintptr_t)) // Write address
#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128
#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128
#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string

/*
 * Pad with DW_CFA_nop bytes up to the specified boundary (a power of two).
 *
 * The boundary is measured from ctx->startp, not from the absolute address
 * of the cursor.  That keeps the number of padding bytes - and therefore the
 * total size of the generated data - identical no matter where the output
 * buffer happens to live, which the "measure into scratch, then write into
 * the real buffer" scheme depends on.  For a buffer that starts on an
 * s-aligned address the result is the same as padding on the address.
 */
#define DWRF_ALIGNNOP(s)                                          \
    while ((size_t)(p - ctx->startp) & ((s)-1)) {                 \
        *p++ = DWRF_CFA_nop;                                       \
    }

/*
 * Write a DWARF section with automatic size calculation.
 *
 * Reserves a 4-byte length field, runs `stmt` to emit the body, then stores
 * the body length (excluding the length field itself) into the reserved
 * slot.  `name` is only pasted into the helper locals to keep them unique.
 */
#define DWRF_SECTION(name, stmt)                                  \
    {                                                             \
        uint8_t* szp_##name = p;                                  \
        p += 4;                                                   \
        stmt;                                                     \
        uint32_t sz_##name = (uint32_t)((p - szp_##name) - 4);    \
        memcpy(szp_##name, &sz_##name, sizeof(sz_##name));        \
    }

// =============================================================================
//                              DWARF EH FRAME GENERATION
// =============================================================================

/*
 * Forward declarations of the two flavor-specific generators, which are
 * defined after the dispatcher and the public entry points that call them.
 * The GDB flavor only exists when PY_HAVE_JIT_GDB_UNWIND is defined.
 */
static void elf_init_ehframe_perf(ELFObjectContext* ctx);
#if defined(PY_HAVE_JIT_GDB_UNWIND)
static void elf_init_ehframe_gdb(ELFObjectContext* ctx);
#endif

/*
 * Dispatch to the .eh_frame generator matching the requested flavor.
 *
 * Args:
 *   ctx: ELF object context to emit into
 *   absolute_addr: zero selects the perf (PC-relative) flavor, non-zero the
 *                  GDB (absolute address) flavor
 *
 * Asking for the GDB flavor in a build without PY_HAVE_JIT_GDB_UNWIND is a
 * programming error and hits Py_UNREACHABLE().
 */
static inline void elf_init_ehframe(ELFObjectContext* ctx, int absolute_addr) {
    if (!absolute_addr) {
        elf_init_ehframe_perf(ctx);
        return;
    }
#if defined(PY_HAVE_JIT_GDB_UNWIND)
    elf_init_ehframe_gdb(ctx);
#else
    Py_UNREACHABLE();
#endif
}

/*
 * Return the number of bytes of .eh_frame data generated for one flavor.
 *
 * Args:
 *   absolute_addr: flavor selector, passed straight to elf_init_ehframe()
 *
 * Returns: size in bytes of the generated CIE + FDE
 *
 * The size is obtained by actually generating the frame for a dummy code
 * region into a local scratch buffer and measuring how far the write cursor
 * moved.
 */
size_t
_PyJitUnwind_EhFrameSize(int absolute_addr)
{
    /*
     * The .eh_frame we emit is small and bounded; keep a generous buffer.
     *
     * The buffer is explicitly pointer-aligned: the generators pad each
     * CIE/FDE to a pointer-size boundary and store fixed-width fields, so
     * measuring in an arbitrarily aligned stack array could report a
     * different size than writing into an aligned destination buffer does.
     */
    _Alignas(uintptr_t) uint8_t scratch[512];
    _Static_assert(sizeof(scratch) >= 256,
                   "scratch buffer may be too small for elf_init_ehframe");
    ELFObjectContext ctx;
    ctx.code_size = 1;     /* dummy region: only the emitted size matters */
    ctx.code_addr = 0;
    ctx.startp = ctx.p = scratch;
    ctx.fde_p = NULL;
    /* Generate once into scratch to learn the required size. */
    elf_init_ehframe(&ctx, absolute_addr);
    ptrdiff_t size = ctx.p - ctx.startp;
    /* Sanity check only: an overrun would already have happened above. */
    assert(size <= (ptrdiff_t)sizeof(scratch));
    return (size_t)size;
}

size_t
_PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size,
                        const void *code_addr, size_t code_size,
                        int absolute_addr)
{
    if (buffer == NULL || code_addr == NULL || code_size == 0) {
        return 0;
    }
    /* Generate the frame twice: once to size-check, once to write. */
    size_t required = _PyJitUnwind_EhFrameSize(absolute_addr);
    if (required == 0 || required > buffer_size) {
        return 0;
    }
    ELFObjectContext ctx;
    ctx.code_size = code_size;
    ctx.code_addr = (uintptr_t)code_addr;
    ctx.startp = ctx.p = buffer;
    ctx.fde_p = NULL;
    elf_init_ehframe(&ctx, absolute_addr);
    size_t written = (size_t)(ctx.p - ctx.startp);
    /* The frame size is independent of code_addr/code_size (fixed-width fields). */
    assert(written == required);
    return written;
}

/*
 * Generate a minimal .eh_frame for a single JIT code region.
 *
 * The .eh_frame section contains Call Frame Information (CFI) that describes
 * how to unwind the stack at any point in the code. This is essential for
 * unwinding through JIT-generated code.
 *
 * The generated data contains:
 * 1. A CIE (Common Information Entry) describing the calling convention.
 * 2. An FDE (Frame Description Entry) describing how to unwind the JIT frame.
 *
 * Two flavors are emitted, dispatched on the absolute_addr flag:
 *
 * - absolute_addr == 0 (elf_init_ehframe_perf): PC-relative FDE address
 *   encoding for perf's synthesized DSO layout. The CIE describes the
 *   trampoline's entry state and the FDE walks through the prologue and
 *   epilogue with advance_loc instructions. This matches the pre-existing
 *   perf_jit_trampoline behavior byte-for-byte.
 *
 * - absolute_addr == 1 (elf_init_ehframe_gdb): absolute FDE address
 *   encoding for the GDB JIT in-memory ELF. The CIE describes the
 *   steady-state frame layout (CFA = %rbp+16 / x29+16, with saved fp and
 *   return-address column at fixed offsets) and the FDE emits no further
 *   CFI. The same rule applies at every PC in the registered region,
 *   which is correct for executor stencils (they pin the frame pointer
 *   across the region). This is the GDB-side fix; see elf_init_ehframe_gdb
 *   for details.
 */
static void elf_init_ehframe_perf(ELFObjectContext* ctx) {
    int fde_ptr_enc = DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4;
    uint8_t* p = ctx->p;
    uint8_t* framep = p;  // Remember start of frame data

    /*
    * DWARF Unwind Table for Trampoline Function
    *
    * This section defines DWARF Call Frame Information (CFI) using encoded macros
    * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function
    * preserves and restores registers. This is used by profiling tools (e.g., `perf`)
    * and debuggers for stack unwinding in JIT-compiled code.
    *
    * -------------------------------------------------
    * TO REGENERATE THIS TABLE FROM GCC OBJECTS:
    * -------------------------------------------------
    *
    * 1. Create a trampoline source file (e.g., `trampoline.c`):
    *
    *      #include <Python.h>
    *      typedef PyObject* (*py_evaluator)(void*, void*, int);
    *      PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) {
    *          return evaluator(ts, f, throwflag);
    *      }
    *
    * 2. Compile to an object file with frame pointer preservation:
    *
    *      gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
    *
    * 3. Extract DWARF unwind info from the object file:
    *
    *      readelf -w trampoline.o
    *
    *    Example output from `.eh_frame` (this example is from an AArch64
    *    build: r29 = x29/fp, r30 = x30/lr, r31 = sp):
    *
    *      00000000 CIE
    *        Version:               1
    *        Augmentation:          "zR"
    *        Code alignment factor: 4
    *        Data alignment factor: -8
    *        Return address column: 30
    *        DW_CFA_def_cfa: r31 (sp) ofs 0
    *
    *      00000014 FDE cie=00000000 pc=0..14
    *        DW_CFA_advance_loc: 4
    *        DW_CFA_def_cfa_offset: 16
    *        DW_CFA_offset: r29 at cfa-16
    *        DW_CFA_offset: r30 at cfa-8
    *        DW_CFA_advance_loc: 12
    *        DW_CFA_restore: r30
    *        DW_CFA_restore: r29
    *        DW_CFA_def_cfa_offset: 0
    *
    * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`.
    *
    * ----------------------------------
    * HOW TO TRANSLATE TO DWRF_* MACROS:
    * ----------------------------------
    *
    * After compiling your trampoline with:
    *
    *     gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
    *
    * run:
    *
    *     readelf -w trampoline.o
    *
    * to inspect the generated `.eh_frame` data. You will see two main components:
    *
    *     1. A CIE (Common Information Entry): shared configuration used by all FDEs.
    *     2. An FDE (Frame Description Entry): function-specific unwind instructions.
    *
    * ---------------------
    * Translating the CIE:
    * ---------------------
    * From `readelf -w`, you might see:
    *
    *   00000000 0000000000000010 00000000 CIE
    *     Version:               1
    *     Augmentation:          "zR"
    *     Code alignment factor: 4
    *     Data alignment factor: -8
    *     Return address column: 30
    *     Augmentation data:     1b
    *     DW_CFA_def_cfa: r31 (sp) ofs 0
    *
    * Map this to:
    *
    *     DWRF_SECTION(CIE,
    *         DWRF_U32(0);                             // CIE ID (always 0 for CIEs)
    *         DWRF_U8(DWRF_CIE_VERSION);              // Version: 1
    *         DWRF_STR("zR");                         // Augmentation string "zR"
    *         DWRF_UV(4);                             // Code alignment factor = 4
    *         DWRF_SV(-8);                            // Data alignment factor = -8
    *         DWRF_U8(DWRF_REG_RA);                   // Return address register (e.g., x30 = 30)
    *         DWRF_UV(1);                             // Augmentation data length = 1
    *         DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers
    *
    *         DWRF_U8(DWRF_CFA_def_cfa);              // DW_CFA_def_cfa
    *         DWRF_UV(DWRF_REG_SP);                   // Register: SP (r31)
    *         DWRF_UV(0);                             // Offset = 0
    *
    *         DWRF_ALIGNNOP(sizeof(uintptr_t));       // Align to pointer size boundary
    *     )
    *
    * Notes:
    *   - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128.
    *   - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants.
    *
    * ---------------------
    * Translating the FDE:
    * ---------------------
    * From `readelf -w`:
    *
    *   00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014
    *     DW_CFA_advance_loc: 4
    *     DW_CFA_def_cfa_offset: 16
    *     DW_CFA_offset: r29 at cfa-16
    *     DW_CFA_offset: r30 at cfa-8
    *     DW_CFA_advance_loc: 12
    *     DW_CFA_restore: r30
    *     DW_CFA_restore: r29
    *     DW_CFA_def_cfa_offset: 0
    *
    * Map the FDE header and instructions to:
    *
    *     DWRF_SECTION(FDE,
    *         DWRF_U32((uint32_t)(p - framep));       // Offset to CIE (relative from here)
    *         DWRF_U32(pc_relative_offset);           // PC-relative location of the code (calculated dynamically)
    *         DWRF_U32(ctx->code_size);               // Code range covered by this FDE
    *         DWRF_U8(0);                             // Augmentation data length (none)
    *
    *         DWRF_U8(DWRF_CFA_advance_loc | 1);      // Advance location by 1 unit (1 * 4 = 4 bytes)
    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP + 16
    *         DWRF_UV(16);
    *
    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer)
    *         DWRF_UV(2);                             // At offset 2 * 8 = 16 bytes
    *
    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address)
    *         DWRF_UV(1);                             // At offset 1 * 8 = 8 bytes
    *
    *         DWRF_U8(DWRF_CFA_advance_loc | 3);      // Advance location by 3 units (3 * 4 = 12 bytes)
    *
    *         DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 (single byte, no operand)
    *         DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 (single byte, no operand)
    *
    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP
    *         DWRF_UV(0);
    *     )
    *
    * To regenerate:
    *   1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE.
    *   2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as
    *      the code is in a different address space every time.
    *   3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro:
    *        - `DW_CFA_def_cfa_offset`     → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value)
    *        - `DW_CFA_offset: rX`         → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset)
    *        - `DW_CFA_restore: rX`        → DWRF_U8(DWRF_CFA_restore | reg) // one byte, NO DWRF_UV() operand
    *        - `DW_CFA_advance_loc: N`     → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor))
    *   4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers.
    *   5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment.
    */

    /*
     * Emit DWARF EH CIE (Common Information Entry)
     *
     * The CIE describes the calling conventions and basic unwinding rules
     * that apply to all functions in this compilation unit.
     */
    DWRF_SECTION(CIE,
        DWRF_U32(0);                           // CIE ID (0 indicates this is a CIE)
        DWRF_U8(DWRF_CIE_VERSION);            // CIE version (1)
        DWRF_STR("zR");                       // Augmentation string ("z" = augmentation data present, "R" = it holds the FDE pointer encoding)
#ifdef __x86_64__
        DWRF_UV(1);                           // Code alignment factor (x86_64: 1 byte)
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
        DWRF_UV(4);                           // Code alignment factor (AArch64: 4 bytes per instruction)
#endif
        DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative)
        DWRF_U8(DWRF_REG_RA);                 // Return address register number
        DWRF_UV(1);                           // Augmentation data length
        DWRF_U8(fde_ptr_enc);                 // FDE pointer encoding

        /* Initial CFI instructions - describe default calling convention */
#ifdef __x86_64__
        /* x86_64 initial CFI state */
        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame Address)
        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
        DWRF_UV(sizeof(uintptr_t));           // CFA = SP + pointer_size
        DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved
        DWRF_UV(1);                           // Factored offset 1: at CFA + 1 * data alignment factor
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
        /* AArch64 initial CFI state */
        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame Address)
        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
        DWRF_UV(0);                           // CFA = SP + 0 (AArch64 starts with offset 0)
        // No initial register saves in AArch64 CIE
#endif
        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
    )

    /*
     * Emit DWARF EH FDE (Frame Description Entry)
     *
     * The FDE describes unwinding information specific to this function.
     * It references the CIE and provides function-specific CFI instructions.
     *
     * The PC-relative offset is calculated after the entire EH frame is built
     * to ensure accurate positioning relative to the synthesized DSO layout.
     */
    DWRF_SECTION(FDE,
        DWRF_U32((uint32_t)(p - framep));     // Offset to CIE (backwards reference)
        /*
         * In perf jitdump mode the FDE PC field is encoded PC-relative and
         * points back to code_start. Record where that field lives so we can
         * patch in the final offset after the rest of the synthetic DSO
         * layout is known.
         */
        ctx->fde_p = p;                       // Remember where PC offset field is located for later calculation
        DWRF_U32(0);                          // Placeholder for PC-relative offset (calculated below)
        DWRF_U32(ctx->code_size);             // Address range covered by this FDE (code length)
        DWRF_U8(0);                           // Augmentation data length (none)

        /*
         * Architecture-specific CFI instructions
         *
         * These instructions describe how registers are saved and restored
         * during function calls. Each architecture has different calling
         * conventions and register usage patterns.
         */
#ifdef __x86_64__
        /* x86_64 calling convention unwinding rules */
#  if defined(__CET__) && (__CET__ & 1)
        DWRF_U8(DWRF_CFA_advance_loc | 4);    // Advance past endbr64 (4 bytes)
#  endif
        DWRF_U8(DWRF_CFA_advance_loc | 1);    // Advance past push %rbp (1 byte)
        DWRF_U8(DWRF_CFA_def_cfa_offset);     // def_cfa_offset 16
        DWRF_UV(16);                          // New offset: SP + 16
        DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16
        DWRF_UV(2);                           // Offset factor: 2 * 8 = 16 bytes
        DWRF_U8(DWRF_CFA_advance_loc | 3);    // Advance past mov %rsp,%rbp (3 bytes)
        DWRF_U8(DWRF_CFA_def_cfa_register);   // def_cfa_register r6
        DWRF_UV(DWRF_REG_BP);                 // Use base pointer register
        DWRF_U8(DWRF_CFA_advance_loc | 3);    // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3
        DWRF_U8(DWRF_CFA_def_cfa);            // def_cfa r7 ofs 8
        DWRF_UV(DWRF_REG_SP);                 // Use stack pointer register
        DWRF_UV(8);                           // New offset: SP + 8
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
        /* AArch64 calling convention unwinding rules */
        DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance by 1 instruction (4 bytes)
        DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 16
        DWRF_UV(16);                              // Stack pointer moved by 16 bytes
        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // x29 (frame pointer) saved
        DWRF_UV(2);                               // At CFA-16 (2 * 8 = 16 bytes from CFA)
        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // x30 (link register) saved
        DWRF_UV(1);                               // At CFA-8 (1 * 8 = 8 bytes from CFA)
        DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions (12 bytes)
        DWRF_U8(DWRF_CFA_def_cfa_register);       // CFA = FP (x29) + 16
        DWRF_UV(DWRF_REG_FP);
        DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA);  // Restore x30 - NO DWRF_UV() after this!
        DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP);  // Restore x29 - NO DWRF_UV() after this!
        DWRF_U8(DWRF_CFA_def_cfa);                // CFA = SP + 0 (stack restored)
        DWRF_UV(DWRF_REG_SP);
        DWRF_UV(0);

#else
#    error "Unsupported target architecture"
#endif

        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
    )

    ctx->p = p;  // Update context pointer to end of generated data

    /* Calculate and update the PC-relative offset in the FDE
     *
     * When perf processes the jitdump, it creates a synthesized DSO with this layout:
     *
     *     Synthesized DSO Memory Layout:
     *     ┌─────────────────────────────────────────────────────────────┐ < code_start
     *     │                        Code Section                         │
     *     │                    (round_up(code_size, 8) bytes)           │
     *     ├─────────────────────────────────────────────────────────────┤ < start of EH frame data
     *     │                      EH Frame Data                          │
     *     │  ┌─────────────────────────────────────────────────────┐    │
     *     │  │                 CIE data                            │    │
     *     │  └─────────────────────────────────────────────────────┘    │
     *     │  ┌─────────────────────────────────────────────────────┐    │
     *     │  │ FDE Header:                                         │    │
     *     │  │   - CIE offset (4 bytes)                            │    │
     *     │  │   - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start
     *     │  │   - address range (4 bytes)                         │    │   (this specific field)
     *     │  │ CFI Instructions...                                 │    │
     *     │  └─────────────────────────────────────────────────────┘    │
     *     ├─────────────────────────────────────────────────────────────┤ < reference_point
     *     │                    EhFrameHeader                            │
     *     │                 (navigation metadata)                       │
     *     └─────────────────────────────────────────────────────────────┘
     *
     * The PC offset field in the FDE must contain the distance from itself to code_start:
     *
     *   distance = code_start - fde_pc_field
     *
     * Where:
     *   fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame
     *   code_start_location = reference_point - eh_frame_size - round_up(code_size, 8)
     *
     * Therefore:
     *   distance = code_start_location - fde_pc_field_location
     *            = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame)
     *            = -rounded_code_size - fde_offset_in_frame
     *            = -(round_up(code_size, 8) + fde_offset_in_frame)
     *
     * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field.
     *
     */
    int32_t rounded_code_size =
        (int32_t)_Py_SIZE_ROUND_UP(ctx->code_size, 8);
    int32_t fde_offset_in_frame = (int32_t)(ctx->fde_p - framep);
    int32_t pc_rel_offset = -(rounded_code_size + fde_offset_in_frame);
    /*
     * Patch the placeholder with memcpy() rather than a casted store: the
     * destination is a byte buffer, so this avoids type-punning it as int32_t
     * and does not depend on the field being naturally aligned.
     */
    memcpy(ctx->fde_p, &pc_rel_offset, sizeof(pc_rel_offset));
}

/*
 * Build .eh_frame data for the GDB JIT interface.
 *
 * The executor runs inside the frame established by _PyJIT_Entry, but the
 * synthetic executor FDE collapses that state into a single logical JIT frame
 * that unwinds directly into _PyEval_*. Executor stencils never touch the
 * frame pointer - enforced by Tools/jit/_optimizers.py _validate() and
 * -mframe-pointer=reserved - so the steady-state rule is valid at every PC
 * and the FDE body is empty. Tools/jit/_targets.py derives the initial CFI
 * rules from the row active at the executor call in the compiled shim object.
 *
 * Unlike the perf flavor, the FDE stores the code start and length as
 * absolute, pointer-sized values (DWRF_EH_PE_absptr), nothing is patched
 * after emission, and ctx->fde_p is not touched.  All JIT_UNWIND_* values
 * come from the generated jit_unwind_info.h header.
 */
#if defined(PY_HAVE_JIT_GDB_UNWIND)
static void elf_init_ehframe_gdb(ELFObjectContext* ctx) {
    int fde_ptr_enc = DWRF_EH_PE_absptr;
    uint8_t* p = ctx->p;
    uint8_t* framep = p;                      // Start of the CIE; the FDE's CIE pointer is measured back to here

    DWRF_SECTION(CIE,
        DWRF_U32(0);                          // CIE ID
        DWRF_U8(DWRF_CIE_VERSION);
        DWRF_STR("zR");                       // aug data length + FDE ptr encoding follow
        DWRF_UV(JIT_UNWIND_CODE_ALIGNMENT_FACTOR);
        DWRF_SV(JIT_UNWIND_DATA_ALIGNMENT_FACTOR);
        DWRF_U8(JIT_UNWIND_RA_REG);           // Return address column
        DWRF_UV(1);                           // Augmentation data length
        DWRF_U8(fde_ptr_enc);                 // FDE pointer encoding

        /* Executor steady-state rule (our invariant, not the compiler's). */
        DWRF_U8(DWRF_CFA_def_cfa);            // CFA = JIT_UNWIND_CFA_REG + JIT_UNWIND_CFA_OFFSET
        DWRF_UV(JIT_UNWIND_CFA_REG);
        DWRF_UV(JIT_UNWIND_CFA_OFFSET);
        DWRF_U8(DWRF_CFA_offset | JIT_UNWIND_FP_REG);  // Saved frame pointer location
        DWRF_UV(JIT_UNWIND_FP_OFFSET);
        DWRF_U8(DWRF_CFA_offset | JIT_UNWIND_RA_REG);  // Saved return address location
        DWRF_UV(JIT_UNWIND_RA_OFFSET);
        DWRF_ALIGNNOP(sizeof(uintptr_t));
    )

    DWRF_SECTION(FDE,
        DWRF_U32((uint32_t)(p - framep));     // Offset to CIE (backwards reference)
        DWRF_ADDR(ctx->code_addr);            // Absolute code start
        DWRF_ADDR((uintptr_t)ctx->code_size); // Code range covered
        DWRF_U8(0);                           // Augmentation data length (none)
        /* No CFI instructions: the CIE's initial rules apply to the whole range. */
        DWRF_ALIGNNOP(sizeof(uintptr_t));
    )

    ctx->p = p;                               // Publish the end of the generated data
}
#endif

#if defined(PY_HAVE_JIT_GDB_UNWIND)
/*
 * Values stored in jit_descriptor.action_flag.
 * NOTE(review): these presumably mirror the action codes documented for
 * GDB's JIT compilation interface - confirm against the GDB manual before
 * changing any value.
 */
enum {
    JIT_NOACTION = 0,       // Nothing pending (initial state of the descriptor)
    JIT_REGISTER_FN = 1,    // presumably: relevant_entry was just added
    JIT_UNREGISTER_FN = 2,  // presumably: relevant_entry is about to be removed
};

/*
 * One node of the doubly linked list of registered in-memory symbol files.
 *
 * NOTE(review): the first four fields look like the jit_code_entry layout
 * that GDB's JIT interface reads from the inferior, so their order and types
 * must not change; code_addr appears to be a private trailing field used
 * only by this file - confirm against the GDB manual and the code that
 * consumes the entry.
 */
struct jit_code_entry {
    struct jit_code_entry *next;   // Next entry in the list
    struct jit_code_entry *prev;   // Previous entry in the list
    const char *symfile_addr;      // Start of the in-memory symbol file (the ELF image built for the region)
    uint64_t symfile_size;         // Size of that symbol file in bytes
    const void *code_addr;         // Code address associated with this entry
};

/*
 * Head structure through which registered entries are published.
 *
 * NOTE(review): field order and widths presumably follow the descriptor
 * layout documented for GDB's JIT interface - confirm before changing.
 */
struct jit_descriptor {
    uint32_t version;                        // Interface version (initialised to 1 below)
    uint32_t action_flag;                    // One of the JIT_* action values
    struct jit_code_entry *relevant_entry;   // Entry the current action refers to
    struct jit_code_entry *first_entry;      // Head of the entry list
};

/*
 * Lock associated with the JIT debug descriptor.
 * NOTE(review): presumably held while the entry list is modified; the code
 * that takes it is outside this part of the file - confirm.
 */
PyMutex _Py_jit_debug_mutex = {0};

/*
 * The exported descriptor: version 1, no pending action, empty entry list.
 * It is volatile and exported under this exact name so that reads and writes
 * are not optimised away and the symbol stays visible to external tools.
 */
Py_EXPORTED_SYMBOL volatile struct jit_descriptor __jit_debug_descriptor = {
    1, JIT_NOACTION, NULL, NULL
};

/*
 * Intentionally (almost) empty hook function.
 *
 * It is exported and marked noinline so that every call is a real call to
 * this symbol.  The body only performs a volatile read of the descriptor's
 * action flag and a compiler-level memory barrier; it changes no state.
 * NOTE(review): GDB's JIT interface is documented as placing a breakpoint on
 * this symbol to learn about list changes - confirm against the GDB manual.
 */
Py_EXPORTED_SYMBOL void __attribute__((noinline))
__jit_debug_register_code(void)
{
    /* Keep this call visible to debuggers and not optimized away. */
    (void)__jit_debug_descriptor.action_flag;
#if defined(__GNUC__) || defined(__clang__)
    /* Empty asm with a "memory" clobber: stops the compiler from moving
     * memory accesses across this point. */
    __asm__ __volatile__("" ::: "memory");
#endif
}

/*
 * Return the ELF e_machine value for the target this file is compiled for.
 *
 * Returns: EM_X86_64 or EM_AARCH64 for the supported targets, 0 for any
 *          other target, which callers treat as "skip registration".
 */
static uint16_t
gdb_jit_machine_id(void)
{
    uint16_t machine = 0;   /* default: unsupported target */
#if defined(__x86_64__) || defined(_M_X64)
    machine = EM_X86_64;
#elif defined(__aarch64__) && !defined(__ILP32__)
    machine = EM_AARCH64;
#endif
    return machine;
}

/*
 * Register one JIT code region with debuggers through the GDB JIT interface.
 *
 * A self-contained ELF64 image is built in a single heap buffer, laid out as
 *
 *     Elf64_Ehdr | .text | .eh_frame | .shstrtab | .strtab | .symtab | shdrs
 *
 * and a new jit_code_entry pointing at it is pushed onto the head of the
 * list in __jit_debug_descriptor.  The debugger is then notified via
 * __jit_debug_register_code() while _Py_jit_debug_mutex is held.
 *
 * Parameters:
 *   code_addr      Start of the JIT code.  Used as the .text sh_addr and as
 *                  the value of the function symbol; code_size bytes are
 *                  also copied from it into the image.
 *   code_size      Size of the code in bytes; must be non-zero.
 *   symname        NUL-terminated symbol name; copied into .strtab, so the
 *                  caller keeps ownership of the string.
 *   eh_frame       Unwind data copied verbatim into .eh_frame.  May be NULL
 *                  only when eh_frame_size is 0.
 *   eh_frame_size  Size of the unwind data in bytes.
 *
 * Returns the new entry (which owns the ELF image), or NULL when an argument
 * is invalid, the target architecture is unsupported, or an allocation
 * fails.  Nothing is registered in the NULL case.
 */
static struct jit_code_entry *
gdb_jit_register_code(
    const void *code_addr,
    size_t code_size,
    const char *symname,
    const uint8_t *eh_frame,
    size_t eh_frame_size
)
{
    if (code_addr == NULL || code_size == 0 || symname == NULL) {
        return NULL;
    }
    /* Copying from a null pointer is undefined; reject instead of crashing. */
    if (eh_frame == NULL && eh_frame_size != 0) {
        return NULL;
    }

    const uint16_t machine = gdb_jit_machine_id();
    if (machine == 0) {
        return NULL;
    }

    /* Section header indices. */
    enum {
        SH_NULL = 0,
        SH_TEXT,
        SH_EH_FRAME,
        SH_SHSTRTAB,
        SH_STRTAB,
        SH_SYMTAB,
        SH_NUM,
    };
    /* Symbol table indices; locals must precede globals. */
    enum {
        SYM_NULL = 0,
        SYM_SECTION,
        SYM_FUNC,
        SYM_NUM,
    };

    /* Section name table; the sh_* values below are offsets into it. */
    static const char shstrtab[] =
        "\0.text\0.eh_frame\0.shstrtab\0.strtab\0.symtab";
    _Static_assert(sizeof(shstrtab) ==
        1 + sizeof(".text") + sizeof(".eh_frame") +
            sizeof(".shstrtab") + sizeof(".strtab") + sizeof(".symtab"),
        "shstrtab size mismatch");
    const size_t shstrtab_size = sizeof(shstrtab);
    const size_t sh_text = 1;
    const size_t sh_eh_frame = sh_text + sizeof(".text");
    const size_t sh_shstrtab = sh_eh_frame + sizeof(".eh_frame");
    const size_t sh_strtab = sh_shstrtab + sizeof(".shstrtab");
    const size_t sh_symtab = sh_strtab + sizeof(".strtab");

    const size_t text_size = code_size;
    const size_t text_padded = _Py_SIZE_ROUND_UP(text_size, 8);
    const size_t symname_len = strlen(symname);
    /* Leading NUL (the empty name), the symbol name, and its terminator. */
    const size_t strtab_size = 1 + symname_len + 1;
    const size_t symtab_size = SYM_NUM * sizeof(Elf64_Sym);

    /* Compute the file offset of every piece of the image. */
    size_t offset = sizeof(Elf64_Ehdr);
    offset = _Py_SIZE_ROUND_UP(offset, 16);
    const size_t text_off = offset;
    const size_t eh_off = text_off + text_padded;
    offset = eh_off + eh_frame_size;
    const size_t shstr_off = offset;
    offset += shstrtab_size;
    const size_t str_off = offset;
    offset += strtab_size;
    /* Elf64_Sym requires 8-byte alignment for st_value/st_size. */
    offset = _Py_SIZE_ROUND_UP(offset, 8);
    const size_t sym_off = offset;
    offset += symtab_size;
    offset = _Py_SIZE_ROUND_UP(offset, sizeof(Elf64_Shdr));
    const size_t sh_off = offset;

    const size_t total_size = sh_off + SH_NUM * sizeof(Elf64_Shdr);
    /*
     * Zero-filled allocation: padding bytes, the null symbol, the null
     * section header and every field not assigned below are therefore 0.
     */
    uint8_t *buf = (uint8_t *)PyMem_RawCalloc(1, total_size);
    if (buf == NULL) {
        return NULL;
    }

    /* ELF header. */
    Elf64_Ehdr *ehdr = (Elf64_Ehdr *)buf;
    memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
    ehdr->e_ident[EI_CLASS] = ELFCLASS64;
    ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
    ehdr->e_ident[EI_VERSION] = EV_CURRENT;
    ehdr->e_ident[EI_OSABI] = ELFOSABI_NONE;
    ehdr->e_type = ET_DYN;
    ehdr->e_machine = machine;
    ehdr->e_version = EV_CURRENT;
    ehdr->e_entry = 0;
    ehdr->e_phoff = 0;
    ehdr->e_shoff = sh_off;
    ehdr->e_ehsize = sizeof(Elf64_Ehdr);
    ehdr->e_shentsize = sizeof(Elf64_Shdr);
    ehdr->e_shnum = SH_NUM;
    ehdr->e_shstrndx = SH_SHSTRTAB;

    /* Section contents. */
    memcpy(buf + text_off, code_addr, text_size);
    if (eh_frame_size != 0) {
        memcpy(buf + eh_off, eh_frame, eh_frame_size);
    }
    memcpy(buf + shstr_off, shstrtab, shstrtab_size);
    /* strtab[0] and the terminator after the name are already zero. */
    memcpy(buf + str_off + 1, symname, symname_len);

    /* Symbol table: null symbol, .text section symbol, the function. */
    Elf64_Sym *syms = (Elf64_Sym *)(buf + sym_off);
    /* Section symbol for .text (local) */
    syms[SYM_SECTION].st_info = ELF64_ST_INFO(STB_LOCAL, STT_SECTION);
    syms[SYM_SECTION].st_shndx = SH_TEXT;
    /* Function symbol; its name starts right after strtab's leading NUL. */
    syms[SYM_FUNC].st_name = 1;
    syms[SYM_FUNC].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC);
    syms[SYM_FUNC].st_other = STV_DEFAULT;
    syms[SYM_FUNC].st_shndx = SH_TEXT;
    /* For ET_DYN/ET_EXEC, st_value is the absolute virtual address. */
    syms[SYM_FUNC].st_value = (Elf64_Addr)(uintptr_t)code_addr;
    syms[SYM_FUNC].st_size = code_size;

    /* Section headers (shdrs[SH_NULL] stays all-zero). */
    Elf64_Shdr *shdrs = (Elf64_Shdr *)(buf + sh_off);

    shdrs[SH_TEXT].sh_name = sh_text;
    shdrs[SH_TEXT].sh_type = SHT_PROGBITS;
    shdrs[SH_TEXT].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
    shdrs[SH_TEXT].sh_addr = (Elf64_Addr)(uintptr_t)code_addr;
    shdrs[SH_TEXT].sh_offset = text_off;
    shdrs[SH_TEXT].sh_size = text_size;
    shdrs[SH_TEXT].sh_addralign = 16;

    shdrs[SH_EH_FRAME].sh_name = sh_eh_frame;
    shdrs[SH_EH_FRAME].sh_type = SHT_PROGBITS;
    shdrs[SH_EH_FRAME].sh_flags = SHF_ALLOC;
    /* Mirrors the file layout: .eh_frame follows the padded .text. */
    shdrs[SH_EH_FRAME].sh_addr =
        (Elf64_Addr)((uintptr_t)code_addr + text_padded);
    shdrs[SH_EH_FRAME].sh_offset = eh_off;
    shdrs[SH_EH_FRAME].sh_size = eh_frame_size;
    shdrs[SH_EH_FRAME].sh_addralign = 8;

    shdrs[SH_SHSTRTAB].sh_name = sh_shstrtab;
    shdrs[SH_SHSTRTAB].sh_type = SHT_STRTAB;
    shdrs[SH_SHSTRTAB].sh_offset = shstr_off;
    shdrs[SH_SHSTRTAB].sh_size = shstrtab_size;
    shdrs[SH_SHSTRTAB].sh_addralign = 1;

    shdrs[SH_STRTAB].sh_name = sh_strtab;
    shdrs[SH_STRTAB].sh_type = SHT_STRTAB;
    shdrs[SH_STRTAB].sh_offset = str_off;
    shdrs[SH_STRTAB].sh_size = strtab_size;
    shdrs[SH_STRTAB].sh_addralign = 1;

    shdrs[SH_SYMTAB].sh_name = sh_symtab;
    shdrs[SH_SYMTAB].sh_type = SHT_SYMTAB;
    shdrs[SH_SYMTAB].sh_offset = sym_off;
    shdrs[SH_SYMTAB].sh_size = symtab_size;
    shdrs[SH_SYMTAB].sh_link = SH_STRTAB;
    /* sh_info of a symbol table is the index of the first non-local symbol. */
    shdrs[SH_SYMTAB].sh_info = SYM_FUNC;
    shdrs[SH_SYMTAB].sh_addralign = 8;
    shdrs[SH_SYMTAB].sh_entsize = sizeof(Elf64_Sym);

    struct jit_code_entry *entry = PyMem_RawMalloc(sizeof(*entry));
    if (entry == NULL) {
        PyMem_RawFree(buf);
        return NULL;
    }
    entry->symfile_addr = (const char *)buf;
    entry->symfile_size = total_size;
    entry->code_addr = code_addr;

    /* Push onto the head of the list and notify the debugger. */
    PyMutex_Lock(&_Py_jit_debug_mutex);
    entry->prev = NULL;
    entry->next = __jit_debug_descriptor.first_entry;
    if (entry->next != NULL) {
        entry->next->prev = entry;
    }
    __jit_debug_descriptor.first_entry = entry;
    __jit_debug_descriptor.relevant_entry = entry;
    __jit_debug_descriptor.action_flag = JIT_REGISTER_FN;
    __jit_debug_register_code();
    /* Return the descriptor to its idle state before releasing the lock. */
    __jit_debug_descriptor.action_flag = JIT_NOACTION;
    __jit_debug_descriptor.relevant_entry = NULL;
    PyMutex_Unlock(&_Py_jit_debug_mutex);
    return entry;
}
#endif  // defined(PY_HAVE_JIT_GDB_UNWIND)

/*
 * Publish a JIT code region to debuggers under the symbol name
 * "py::<entry>:<filename>".
 *
 * Parameters:
 *   code_addr  Start address of the JIT code.
 *   code_size  Size of the JIT code in bytes.
 *   entry      Name used for the first component; NULL is treated as "".
 *   filename   Name used for the second component; NULL is treated as "".
 *
 * Returns an opaque handle to pass to _PyJitUnwind_GdbUnregisterCode(), or
 * NULL when the name cannot be formatted, memory cannot be allocated, the
 * .eh_frame data cannot be built, or registration is refused.  Builds
 * without PY_HAVE_JIT_GDB_UNWIND always return NULL.
 */
void *
_PyJitUnwind_GdbRegisterCode(const void *code_addr,
                             size_t code_size,
                             const char *entry,
                             const char *filename)
{
#if defined(PY_HAVE_JIT_GDB_UNWIND)
    /* GDB expects a stable symbol name and absolute addresses in .eh_frame. */
    if (entry == NULL) {
        entry = "";
    }
    if (filename == NULL) {
        filename = "";
    }

    /*
     * First pass only measures.  snprintf() returns a negative value on
     * failure; adding 1 to that unchecked would give a zero-sized buffer
     * that is later read as a C string without ever being written.
     */
    const int name_len = snprintf(NULL, 0, "py::%s:%s", entry, filename);
    if (name_len < 0) {
        return NULL;
    }
    const size_t name_size = (size_t)name_len + 1;
    char *name = (char *)PyMem_RawMalloc(name_size);
    if (name == NULL) {
        return NULL;
    }
    if (snprintf(name, name_size, "py::%s:%s", entry, filename) != name_len) {
        PyMem_RawFree(name);
        return NULL;
    }

    /*
     * NOTE(review): the final argument (1) presumably selects absolute
     * addressing as mentioned above -- confirm against
     * _PyJitUnwind_BuildEhFrame().
     */
    uint8_t buffer[1024];
    size_t eh_frame_size = _PyJitUnwind_BuildEhFrame(
        buffer, sizeof(buffer), code_addr, code_size, 1);
    if (eh_frame_size == 0) {
        PyMem_RawFree(name);
        return NULL;
    }

    void *handle = gdb_jit_register_code(code_addr, code_size, name,
                                         buffer, eh_frame_size);
    /* The name was copied into the ELF image; the temporary can go. */
    PyMem_RawFree(name);
    return handle;
#else
    (void)code_addr;
    (void)code_size;
    (void)entry;
    (void)filename;
    return NULL;
#endif
}

/*
 * Withdraw a JIT code region previously published with
 * _PyJitUnwind_GdbRegisterCode() and release everything it owns.
 *
 * handle is the value returned by the register call; NULL is accepted and
 * ignored.  Builds without PY_HAVE_JIT_GDB_UNWIND do nothing.
 */
void
_PyJitUnwind_GdbUnregisterCode(void *handle)
{
#if defined(PY_HAVE_JIT_GDB_UNWIND)
    if (handle == NULL) {
        return;
    }
    struct jit_code_entry *node = (struct jit_code_entry *)handle;

    PyMutex_Lock(&_Py_jit_debug_mutex);

    /* Splice the node out of the doubly linked list. */
    struct jit_code_entry *newer = node->prev;
    struct jit_code_entry *older = node->next;
    if (newer == NULL) {
        /* The node was the head, so its successor becomes the new head. */
        __jit_debug_descriptor.first_entry = older;
    }
    else {
        newer->next = older;
    }
    if (older != NULL) {
        older->prev = newer;
    }

    /* Tell the debugger which entry went away, then go back to idle. */
    __jit_debug_descriptor.relevant_entry = node;
    __jit_debug_descriptor.action_flag = JIT_UNREGISTER_FN;
    __jit_debug_register_code();
    __jit_debug_descriptor.action_flag = JIT_NOACTION;
    __jit_debug_descriptor.relevant_entry = NULL;

    PyMutex_Unlock(&_Py_jit_debug_mutex);

    /* The node is unreachable now; free the ELF image, then the node. */
    PyMem_RawFree((void *)node->symfile_addr);
    PyMem_RawFree(node);
#else
    (void)handle;
#endif
}

#endif  // defined(PY_HAVE_PERF_TRAMPOLINE) || defined(PY_HAVE_JIT_GDB_UNWIND)