GH-137959: Replace shim code in jitted code with a single trampoline function. (GH-137961)

This commit is contained in:
Mark Shannon 2025-08-21 10:40:53 +01:00 committed by GitHub
parent c056a089d8
commit a8d9d94784
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 166 additions and 104 deletions

View file

@ -2971,7 +2971,7 @@ dummy_func(
assert(tstate->current_executor == NULL);
assert(executor != tstate->interp->cold_executor);
tstate->jit_exit = NULL;
GOTO_TIER_TWO(executor);
TIER1_TO_TIER2(executor);
}
}
else {
@ -3037,7 +3037,7 @@ dummy_func(
}
assert(executor != tstate->interp->cold_executor);
tstate->jit_exit = NULL;
GOTO_TIER_TWO(executor);
TIER1_TO_TIER2(executor);
#else
Py_FatalError("ENTER_EXECUTOR is not supported in this build");
#endif /* _Py_TIER2 */
@ -5257,7 +5257,7 @@ dummy_func(
}
#endif
tstate->jit_exit = exit;
GOTO_TIER_TWO(exit->executor);
TIER2_TO_TIER2(exit->executor);
}
tier2 op(_CHECK_VALIDITY, (--)) {
@ -5353,7 +5353,7 @@ dummy_func(
tier2 op(_START_EXECUTOR, (executor/4 --)) {
#ifndef _Py_JIT
current_executor = (_PyExecutorObject*)executor;
assert(current_executor == (_PyExecutorObject*)executor);
#endif
assert(tstate->jit_exit == NULL || tstate->jit_exit->executor == current_executor);
tstate->current_executor = (PyObject *)executor;
@ -5434,7 +5434,7 @@ dummy_func(
}
assert(tstate->jit_exit == exit);
exit->executor = executor;
GOTO_TIER_TWO(exit->executor);
TIER2_TO_TIER2(exit->executor);
}
label(pop_2_error) {

View file

@ -275,7 +275,8 @@ maybe_lltrace_resume_frame(_PyInterpreterFrame *frame, PyObject *globals)
}
int r = PyDict_Contains(globals, &_Py_ID(__lltrace__));
if (r < 0) {
return -1;
PyErr_Clear();
return 0;
}
int lltrace = r * 5; // Levels 1-4 only trace uops
if (!lltrace) {
@ -1109,11 +1110,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
#endif
}
#if defined(_Py_TIER2) && !defined(_Py_JIT)
/* Tier 2 interpreter state */
_PyExecutorObject *current_executor = NULL;
const _PyUOpInstruction *next_uop = NULL;
#endif
#if Py_TAIL_CALL_INTERP
# if Py_STATS
return _TAIL_CALL_start_frame(frame, NULL, tstate, NULL, 0, lastopcode);
@ -1126,14 +1122,41 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
#endif
early_exit:
assert(_PyErr_Occurred(tstate));
_Py_LeaveRecursiveCallPy(tstate);
assert(frame->owner != FRAME_OWNED_BY_INTERPRETER);
// GH-99729: We need to unlink the frame *before* clearing it:
_PyInterpreterFrame *dying = frame;
frame = tstate->current_frame = dying->previous;
_PyEval_FrameClearAndPop(tstate, dying);
frame->return_offset = 0;
assert(frame->owner == FRAME_OWNED_BY_INTERPRETER);
/* Restore previous frame and exit */
tstate->current_frame = frame->previous;
return NULL;
}
#ifdef _Py_TIER2
// Tier 2 is also here!
enter_tier_two:
#ifdef _Py_JIT
assert(0);
_PyJitEntryFuncPtr _Py_jit_entry = _Py_LazyJitTrampoline;
#else
_PyJitEntryFuncPtr _Py_jit_entry = _PyTier2Interpreter;
#endif
#endif
#if defined(_Py_TIER2) && !defined(_Py_JIT)
_Py_CODEUNIT *
_PyTier2Interpreter(
_PyExecutorObject *current_executor, _PyInterpreterFrame *frame,
_PyStackRef *stack_pointer, PyThreadState *tstate
) {
const _PyUOpInstruction *next_uop;
int oparg;
tier2_start:
next_uop = current_executor->trace;
assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT);
#undef LOAD_IP
#define LOAD_IP(UNUSED) (void)0
@ -1151,7 +1174,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
#undef ENABLE_SPECIALIZATION_FT
#define ENABLE_SPECIALIZATION_FT 0
; // dummy statement after a label, before a declaration
uint16_t uopcode;
#ifdef Py_STATS
int lastuop = 0;
@ -1225,24 +1247,9 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
next_uop = current_executor->trace + target;
goto tier2_dispatch;
#endif // _Py_JIT
}
#endif // _Py_TIER2
early_exit:
assert(_PyErr_Occurred(tstate));
_Py_LeaveRecursiveCallPy(tstate);
assert(frame->owner != FRAME_OWNED_BY_INTERPRETER);
// GH-99729: We need to unlink the frame *before* clearing it:
_PyInterpreterFrame *dying = frame;
frame = tstate->current_frame = dying->previous;
_PyEval_FrameClearAndPop(tstate, dying);
frame->return_offset = 0;
assert(frame->owner == FRAME_OWNED_BY_INTERPRETER);
/* Restore previous frame and exit */
tstate->current_frame = frame->previous;
return NULL;
}
#ifdef DO_NOT_OPTIMIZE_INTERP_LOOP
# pragma optimize("", on)

View file

@ -133,9 +133,6 @@ do { \
_PyFrame_SetStackPointer(frame, stack_pointer); \
int lltrace = maybe_lltrace_resume_frame(frame, GLOBALS()); \
stack_pointer = _PyFrame_GetStackPointer(frame); \
if (lltrace < 0) { \
JUMP_TO_LABEL(exit_unwind); \
} \
frame->lltrace = lltrace; \
} while (0)
#else
@ -354,16 +351,10 @@ _PyFrame_SetStackPointer(frame, stack_pointer)
/* Tier-switching macros. */
#ifdef _Py_JIT
#define GOTO_TIER_TWO(EXECUTOR) \
#define TIER1_TO_TIER2(EXECUTOR) \
do { \
OPT_STAT_INC(traces_executed); \
_PyExecutorObject *_executor = (EXECUTOR); \
jit_func jitted = _executor->jit_code; \
/* Keep the shim frame alive via the executor: */ \
Py_INCREF(_executor); \
next_instr = jitted(frame, stack_pointer, tstate); \
Py_DECREF(_executor); \
next_instr = _Py_jit_entry((EXECUTOR), frame, stack_pointer, tstate); \
frame = tstate->current_frame; \
stack_pointer = _PyFrame_GetStackPointer(frame); \
if (next_instr == NULL) { \
@ -372,31 +363,21 @@ do { \
} \
DISPATCH(); \
} while (0)
#else
#define GOTO_TIER_TWO(EXECUTOR) \
do { \
OPT_STAT_INC(traces_executed); \
_PyExecutorObject *_executor = (EXECUTOR); \
next_uop = _executor->trace; \
assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); \
goto enter_tier_two; \
#define TIER2_TO_TIER2(EXECUTOR) \
do { \
OPT_STAT_INC(traces_executed); \
current_executor = (EXECUTOR); \
goto tier2_start; \
} while (0)
#endif
#define GOTO_TIER_ONE(TARGET) \
do \
{ \
tstate->current_executor = NULL; \
next_instr = (TARGET); \
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); \
_PyFrame_SetStackPointer(frame, stack_pointer); \
stack_pointer = _PyFrame_GetStackPointer(frame); \
if (next_instr == NULL) \
{ \
next_instr = frame->instr_ptr; \
goto error; \
} \
DISPATCH(); \
return TARGET; \
} while (0)
#define CURRENT_OPARG() (next_uop[-1].oparg)

View file

@ -7122,7 +7122,7 @@
}
#endif
tstate->jit_exit = exit;
GOTO_TIER_TWO(exit->executor);
TIER2_TO_TIER2(exit->executor);
break;
}
@ -7400,7 +7400,7 @@
case _START_EXECUTOR: {
PyObject *executor = (PyObject *)CURRENT_OPERAND0();
#ifndef _Py_JIT
current_executor = (_PyExecutorObject*)executor;
assert(current_executor == (_PyExecutorObject*)executor);
#endif
assert(tstate->jit_exit == NULL || tstate->jit_exit->executor == current_executor);
tstate->current_executor = (PyObject *)executor;
@ -7503,7 +7503,7 @@
}
assert(tstate->jit_exit == exit);
exit->executor = executor;
GOTO_TIER_TWO(exit->executor);
TIER2_TO_TIER2(exit->executor);
break;
}

View file

@ -5493,7 +5493,7 @@
}
assert(executor != tstate->interp->cold_executor);
tstate->jit_exit = NULL;
GOTO_TIER_TWO(executor);
TIER1_TO_TIER2(executor);
#else
Py_FatalError("ENTER_EXECUTOR is not supported in this build");
#endif /* _Py_TIER2 */
@ -7667,7 +7667,7 @@
assert(tstate->current_executor == NULL);
assert(executor != tstate->interp->cold_executor);
tstate->jit_exit = NULL;
GOTO_TIER_TWO(executor);
TIER1_TO_TIER2(executor);
}
}
else {

View file

@ -494,10 +494,6 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
size_t code_size = 0;
size_t data_size = 0;
jit_state state = {0};
group = &shim;
code_size += group->code_size;
data_size += group->data_size;
combine_symbol_mask(group->trampoline_mask, state.trampolines.mask);
for (size_t i = 0; i < length; i++) {
const _PyUOpInstruction *instruction = &trace[i];
group = &stencil_groups[instruction->opcode];
@ -539,13 +535,6 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
unsigned char *code = memory;
state.trampolines.mem = memory + code_size;
unsigned char *data = memory + code_size + state.trampolines.size + code_padding;
// Compile the shim, which handles converting between the native
// calling convention and the calling convention used by jitted code
// (which may be different for efficiency reasons).
group = &shim;
group->emit(code, data, executor, NULL, &state);
code += group->code_size;
data += group->data_size;
assert(trace[0].opcode == _START_EXECUTOR || trace[0].opcode == _COLD_EXIT);
for (size_t i = 0; i < length; i++) {
const _PyUOpInstruction *instruction = &trace[i];
@ -566,11 +555,75 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
return -1;
}
executor->jit_code = memory;
executor->jit_side_entry = memory + shim.code_size;
executor->jit_size = total_size;
return 0;
}
/* One-off compilation of the jit entry trampoline
* We compile this once only as it is effectively a normal
* function, but we need to use the JIT because it needs
* to understand the jit-specific calling convention.
*/
/* Build the JIT entry trampoline in freshly allocated JIT memory.
 *
 * Emits the single `trampoline` stencil group into a page-rounded
 * allocation laid out as
 *     [code][symbol trampolines][code padding][data][page padding]
 * and then marks the whole allocation executable.
 *
 * Returns the start of the allocation cast to a JIT entry function
 * pointer, or NULL if the allocation fails or the memory cannot be
 * marked executable (in which case the allocation is released).
 */
static _PyJitEntryFuncPtr
compile_trampoline(void)
{
    // Placeholder executor: only its address is handed to emit() below.
    _PyExecutorObject dummy;
    jit_state state = {0};
    const StencilGroup *group = &trampoline;
    size_t code_size = group->code_size;
    size_t data_size = group->data_size;
    combine_symbol_mask(group->trampoline_mask, state.trampolines.mask);

    // Work out the layout; the page size must be a power of two for the
    // masking arithmetic below to be valid.
    size_t page_size = get_page_size();
    assert((page_size & (page_size - 1)) == 0);
    size_t text_size = code_size + state.trampolines.size;
    size_t code_padding = DATA_ALIGN - (text_size & (DATA_ALIGN - 1));
    size_t data_offset = text_size + code_padding;
    size_t used_size = data_offset + data_size;
    // Pad the allocation out to a page boundary:
    size_t padding = page_size - (used_size & (page_size - 1));
    size_t total_size = used_size + padding;

    unsigned char *memory = jit_alloc(total_size);
    if (memory == NULL) {
        return NULL;
    }
    unsigned char *code = memory;
    unsigned char *data = memory + data_offset;
    state.trampolines.mem = memory + code_size;

    // Emit the entry trampoline, which converts between the native
    // calling convention and the calling convention used by jitted code.
    group->emit(code, data, &dummy, NULL, &state);
    code += group->code_size;
    data += group->data_size;
    // Sanity check: the emitted sizes must match the computed layout.
    assert(code == memory + code_size);
    assert(data == memory + used_size);

    if (mark_executable(memory, total_size) != 0) {
        jit_free(memory, total_size);
        return NULL;
    }
    return (_PyJitEntryFuncPtr)memory;
}
/* Guards the one-time replacement of _Py_jit_entry below. */
static PyMutex lazy_jit_mutex = { 0 };

/* First-use JIT entry point.
 *
 * While _Py_jit_entry still refers to this function, compile the real
 * entry trampoline (holding lazy_jit_mutex) and install it in
 * _Py_jit_entry. Then forward the call, with the same arguments, to
 * whatever _Py_jit_entry now points at and return its result.
 *
 * If the trampoline cannot be compiled the process is terminated with
 * Py_FatalError().
 */
_Py_CODEUNIT *
_Py_LazyJitTrampoline(
    _PyExecutorObject *executor, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate
) {
    PyMutex_Lock(&lazy_jit_mutex);
    int needs_compile = (_Py_jit_entry == _Py_LazyJitTrampoline);
    if (needs_compile) {
        _PyJitEntryFuncPtr compiled = compile_trampoline();
        if (compiled != NULL) {
            _Py_jit_entry = compiled;
        }
        else {
            // Release the lock before aborting.
            PyMutex_Unlock(&lazy_jit_mutex);
            Py_FatalError("Cannot allocate core JIT code");
        }
    }
    PyMutex_Unlock(&lazy_jit_mutex);
    return _Py_jit_entry(executor, frame, stack_pointer, tstate);
}
void
_PyJIT_Free(_PyExecutorObject *executor)
{
@ -578,7 +631,6 @@ _PyJIT_Free(_PyExecutorObject *executor)
size_t size = executor->jit_size;
if (memory) {
executor->jit_code = NULL;
executor->jit_side_entry = NULL;
executor->jit_size = 0;
if (jit_free(memory, size)) {
PyErr_FormatUnraisable("Exception ignored while "

View file

@ -1238,7 +1238,6 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil
#endif
#ifdef _Py_JIT
executor->jit_code = NULL;
executor->jit_side_entry = NULL;
executor->jit_size = 0;
// This is initialized to true so we can prevent the executor
// from being immediately detected as cold and invalidated.
@ -1490,7 +1489,6 @@ _PyExecutor_GetColdExecutor(void)
((_PyUOpInstruction *)cold->trace)->opcode = _COLD_EXIT;
#ifdef _Py_JIT
cold->jit_code = NULL;
cold->jit_side_entry = NULL;
cold->jit_size = 0;
// This is initialized to true so we can prevent the executor
// from being immediately detected as cold and invalidated.

View file

@ -494,6 +494,11 @@ free_interpreter(PyInterpreterState *interp)
static inline int check_interpreter_whence(long);
#endif
/* Declaration of the lazy JIT entry function; the definition lives in
 * another translation unit (hence `extern`).
 *
 * Takes the executor to enter together with the current frame, stack
 * pointer and thread state, and returns a pointer to a code unit.
 * NOTE(review): presumably the returned pointer is the next tier-1
 * instruction to run, with NULL signalling an error -- confirm against
 * the tier-switching macros that call through _Py_jit_entry.
 */
extern _Py_CODEUNIT *
_Py_LazyJitTrampoline(
struct _PyExecutorObject *exec, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate
);
/* Get the interpreter state to a minimal consistent state.
Further init happens in pylifecycle.c before it can be used.
All fields not initialized here are expected to be zeroed out,