GH-136410: Faster side exits by using a cold exit stub (GH-136411)

This commit is contained in:
Mark Shannon 2025-08-01 16:26:07 +01:00 committed by GitHub
parent 718e0c89ba
commit e7b55f564d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 387 additions and 267 deletions

View file

@ -2964,6 +2964,8 @@ dummy_func(
else {
this_instr[1].counter = initial_jump_backoff_counter();
assert(tstate->current_executor == NULL);
assert(executor != tstate->interp->cold_executor);
tstate->jit_exit = NULL;
GOTO_TIER_TWO(executor);
}
}
@ -3028,6 +3030,8 @@ dummy_func(
}
DISPATCH_GOTO();
}
assert(executor != tstate->interp->cold_executor);
tstate->jit_exit = NULL;
GOTO_TIER_TWO(executor);
#else
Py_FatalError("ENTER_EXECUTOR is not supported in this build");
@ -5238,9 +5242,8 @@ dummy_func(
tier2 op(_EXIT_TRACE, (exit_p/4 --)) {
_PyExitData *exit = (_PyExitData *)exit_p;
PyCodeObject *code = _PyFrame_GetCode(frame);
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
#if defined(Py_DEBUG) && !defined(_Py_JIT)
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
if (frame->lltrace >= 2) {
printf("SIDE EXIT: [UOp ");
@ -5251,32 +5254,7 @@ dummy_func(
_PyOpcode_OpName[target->op.code]);
}
#endif
if (exit->executor && !exit->executor->vm_data.valid) {
exit->temperature = initial_temperature_backoff_counter();
Py_CLEAR(exit->executor);
}
if (exit->executor == NULL) {
_Py_BackoffCounter temperature = exit->temperature;
if (!backoff_counter_triggers(temperature)) {
exit->temperature = advance_backoff_counter(temperature);
GOTO_TIER_ONE(target);
}
_PyExecutorObject *executor;
if (target->op.code == ENTER_EXECUTOR) {
executor = code->co_executors->executors[target->op.arg];
Py_INCREF(executor);
}
else {
int chain_depth = current_executor->vm_data.chain_depth + 1;
int optimized = _PyOptimizer_Optimize(frame, target, &executor, chain_depth);
if (optimized <= 0) {
exit->temperature = restart_backoff_counter(temperature);
GOTO_TIER_ONE(optimized < 0 ? NULL : target);
}
exit->temperature = initial_temperature_backoff_counter();
}
exit->executor = executor;
}
tstate->jit_exit = exit;
GOTO_TIER_TWO(exit->executor);
}
@ -5375,7 +5353,14 @@ dummy_func(
#ifndef _Py_JIT
current_executor = (_PyExecutorObject*)executor;
#endif
assert(((_PyExecutorObject *)executor)->vm_data.valid);
assert(tstate->jit_exit == NULL || tstate->jit_exit->executor == current_executor);
tstate->current_executor = (PyObject *)executor;
if (!current_executor->vm_data.valid) {
assert(tstate->jit_exit->executor == current_executor);
assert(tstate->current_executor == executor);
_PyExecutor_ClearExit(tstate->jit_exit);
DEOPT_IF(true);
}
}
tier2 op(_MAKE_WARM, (--)) {
@ -5414,6 +5399,37 @@ dummy_func(
assert(tstate->tracing || eval_breaker == FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version));
}
tier2 op(_COLD_EXIT, ( -- )) {
_PyExitData *exit = tstate->jit_exit;
assert(exit != NULL);
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
_Py_BackoffCounter temperature = exit->temperature;
if (!backoff_counter_triggers(temperature)) {
exit->temperature = advance_backoff_counter(temperature);
GOTO_TIER_ONE(target);
}
_PyExecutorObject *executor;
if (target->op.code == ENTER_EXECUTOR) {
PyCodeObject *code = _PyFrame_GetCode(frame);
executor = code->co_executors->executors[target->op.arg];
Py_INCREF(executor);
}
else {
_PyExecutorObject *previous_executor = _PyExecutor_FromExit(exit);
assert(tstate->current_executor == (PyObject *)previous_executor);
int chain_depth = previous_executor->vm_data.chain_depth + 1;
int optimized = _PyOptimizer_Optimize(frame, target, &executor, chain_depth);
if (optimized <= 0) {
exit->temperature = restart_backoff_counter(temperature);
GOTO_TIER_ONE(optimized < 0 ? NULL : target);
}
exit->temperature = initial_temperature_backoff_counter();
}
assert(tstate->jit_exit == exit);
exit->executor = executor;
GOTO_TIER_TWO(exit->executor);
}
label(pop_2_error) {
stack_pointer -= 2;
assert(WITHIN_STACK_BOUNDS());

View file

@ -1158,7 +1158,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
uint64_t trace_uop_execution_counter = 0;
#endif
assert(next_uop->opcode == _START_EXECUTOR);
assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT);
tier2_dispatch:
for (;;) {
uopcode = next_uop->opcode;

View file

@ -359,7 +359,6 @@ _PyFrame_SetStackPointer(frame, stack_pointer)
do { \
OPT_STAT_INC(traces_executed); \
_PyExecutorObject *_executor = (EXECUTOR); \
tstate->current_executor = (PyObject *)_executor; \
jit_func jitted = _executor->jit_code; \
/* Keep the shim frame alive via the executor: */ \
Py_INCREF(_executor); \
@ -378,9 +377,8 @@ do { \
do { \
OPT_STAT_INC(traces_executed); \
_PyExecutorObject *_executor = (EXECUTOR); \
tstate->current_executor = (PyObject *)_executor; \
next_uop = _executor->trace; \
assert(next_uop->opcode == _START_EXECUTOR); \
assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); \
goto enter_tier_two; \
} while (0)
#endif
@ -390,7 +388,6 @@ do { \
{ \
tstate->current_executor = NULL; \
next_instr = (TARGET); \
assert(tstate->current_executor == NULL); \
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); \
_PyFrame_SetStackPointer(frame, stack_pointer); \
stack_pointer = _PyFrame_GetStackPointer(frame); \

View file

@ -7113,9 +7113,8 @@
case _EXIT_TRACE: {
PyObject *exit_p = (PyObject *)CURRENT_OPERAND0();
_PyExitData *exit = (_PyExitData *)exit_p;
PyCodeObject *code = _PyFrame_GetCode(frame);
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
#if defined(Py_DEBUG) && !defined(_Py_JIT)
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
if (frame->lltrace >= 2) {
_PyFrame_SetStackPointer(frame, stack_pointer);
@ -7128,36 +7127,7 @@
stack_pointer = _PyFrame_GetStackPointer(frame);
}
#endif
if (exit->executor && !exit->executor->vm_data.valid) {
exit->temperature = initial_temperature_backoff_counter();
_PyFrame_SetStackPointer(frame, stack_pointer);
Py_CLEAR(exit->executor);
stack_pointer = _PyFrame_GetStackPointer(frame);
}
if (exit->executor == NULL) {
_Py_BackoffCounter temperature = exit->temperature;
if (!backoff_counter_triggers(temperature)) {
exit->temperature = advance_backoff_counter(temperature);
GOTO_TIER_ONE(target);
}
_PyExecutorObject *executor;
if (target->op.code == ENTER_EXECUTOR) {
executor = code->co_executors->executors[target->op.arg];
Py_INCREF(executor);
}
else {
int chain_depth = current_executor->vm_data.chain_depth + 1;
_PyFrame_SetStackPointer(frame, stack_pointer);
int optimized = _PyOptimizer_Optimize(frame, target, &executor, chain_depth);
stack_pointer = _PyFrame_GetStackPointer(frame);
if (optimized <= 0) {
exit->temperature = restart_backoff_counter(temperature);
GOTO_TIER_ONE(optimized < 0 ? NULL : target);
}
exit->temperature = initial_temperature_backoff_counter();
}
exit->executor = executor;
}
tstate->jit_exit = exit;
GOTO_TIER_TWO(exit->executor);
break;
}
@ -7438,7 +7408,19 @@
#ifndef _Py_JIT
current_executor = (_PyExecutorObject*)executor;
#endif
assert(((_PyExecutorObject *)executor)->vm_data.valid);
assert(tstate->jit_exit == NULL || tstate->jit_exit->executor == current_executor);
tstate->current_executor = (PyObject *)executor;
if (!current_executor->vm_data.valid) {
assert(tstate->jit_exit->executor == current_executor);
assert(tstate->current_executor == executor);
_PyFrame_SetStackPointer(frame, stack_pointer);
_PyExecutor_ClearExit(tstate->jit_exit);
stack_pointer = _PyFrame_GetStackPointer(frame);
if (true) {
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
}
}
break;
}
@ -7487,4 +7469,40 @@
break;
}
case _COLD_EXIT: {
_PyExitData *exit = tstate->jit_exit;
assert(exit != NULL);
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
_Py_BackoffCounter temperature = exit->temperature;
if (!backoff_counter_triggers(temperature)) {
exit->temperature = advance_backoff_counter(temperature);
GOTO_TIER_ONE(target);
}
_PyExecutorObject *executor;
if (target->op.code == ENTER_EXECUTOR) {
PyCodeObject *code = _PyFrame_GetCode(frame);
executor = code->co_executors->executors[target->op.arg];
Py_INCREF(executor);
}
else {
_PyFrame_SetStackPointer(frame, stack_pointer);
_PyExecutorObject *previous_executor = _PyExecutor_FromExit(exit);
stack_pointer = _PyFrame_GetStackPointer(frame);
assert(tstate->current_executor == (PyObject *)previous_executor);
int chain_depth = previous_executor->vm_data.chain_depth + 1;
_PyFrame_SetStackPointer(frame, stack_pointer);
int optimized = _PyOptimizer_Optimize(frame, target, &executor, chain_depth);
stack_pointer = _PyFrame_GetStackPointer(frame);
if (optimized <= 0) {
exit->temperature = restart_backoff_counter(temperature);
GOTO_TIER_ONE(optimized < 0 ? NULL : target);
}
exit->temperature = initial_temperature_backoff_counter();
}
assert(tstate->jit_exit == exit);
exit->executor = executor;
GOTO_TIER_TWO(exit->executor);
break;
}
#undef TIER_TWO

View file

@ -5595,6 +5595,8 @@
}
DISPATCH_GOTO();
}
assert(executor != tstate->interp->cold_executor);
tstate->jit_exit = NULL;
GOTO_TIER_TWO(executor);
#else
Py_FatalError("ENTER_EXECUTOR is not supported in this build");
@ -7793,6 +7795,8 @@
this_instr[1].counter = initial_jump_backoff_counter();
stack_pointer = _PyFrame_GetStackPointer(frame);
assert(tstate->current_executor == NULL);
assert(executor != tstate->interp->cold_executor);
tstate->jit_exit = NULL;
GOTO_TIER_TWO(executor);
}
}

View file

@ -546,7 +546,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
group->emit(code, data, executor, NULL, &state);
code += group->code_size;
data += group->data_size;
assert(trace[0].opcode == _START_EXECUTOR);
assert(trace[0].opcode == _START_EXECUTOR || trace[0].opcode == _COLD_EXIT);
for (size_t i = 0; i < length; i++) {
const _PyUOpInstruction *instruction = &trace[i];
group = &stencil_groups[instruction->opcode];

View file

@ -205,8 +205,8 @@ static int executor_clear(PyObject *executor);
static void unlink_executor(_PyExecutorObject *executor);
static void
free_executor(_PyExecutorObject *self)
void
_PyExecutor_Free(_PyExecutorObject *self)
{
#ifdef _Py_JIT
_PyJIT_Free(self);
@ -242,7 +242,7 @@ _Py_ClearExecutorDeletionList(PyInterpreterState *interp)
}
else {
*prev_to_next_ptr = exec->vm_data.links.next;
free_executor(exec);
_PyExecutor_Free(exec);
}
exec = *prev_to_next_ptr;
}
@ -1129,7 +1129,7 @@ sanity_check(_PyExecutorObject *executor)
}
bool ended = false;
uint32_t i = 0;
CHECK(executor->trace[0].opcode == _START_EXECUTOR);
CHECK(executor->trace[0].opcode == _START_EXECUTOR || executor->trace[0].opcode == _COLD_EXIT);
for (; i < executor->code_size; i++) {
const _PyUOpInstruction *inst = &executor->trace[i];
uint16_t opcode = inst->opcode;
@ -1182,9 +1182,11 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil
}
/* Initialize exits */
_PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
for (int i = 0; i < exit_count; i++) {
executor->exits[i].executor = NULL;
executor->exits[i].index = i;
executor->exits[i].temperature = initial_temperature_backoff_counter();
executor->exits[i].executor = cold;
}
int next_exit = exit_count-1;
_PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length];
@ -1462,6 +1464,46 @@ _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_s
link_executor(executor);
}
_PyExecutorObject *
_PyExecutor_GetColdExecutor(void)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->cold_executor != NULL) {
return interp->cold_executor;
}
_PyExecutorObject *cold = allocate_executor(0, 1);
if (cold == NULL) {
Py_FatalError("Cannot allocate core JIT code");
}
((_PyUOpInstruction *)cold->trace)->opcode = _COLD_EXIT;
#ifdef _Py_JIT
cold->jit_code = NULL;
cold->jit_side_entry = NULL;
cold->jit_size = 0;
// This is initialized to true so we can prevent the executor
// from being immediately detected as cold and invalidated.
cold->vm_data.warm = true;
if (_PyJIT_Compile(cold, cold->trace, 1)) {
Py_DECREF(cold);
Py_FatalError("Cannot allocate core JIT code");
}
#endif
_Py_SetImmortal((PyObject *)cold);
interp->cold_executor = cold;
return cold;
}
void
_PyExecutor_ClearExit(_PyExitData *exit)
{
if (exit == NULL) {
return;
}
_PyExecutorObject *old = exit->executor;
exit->executor = _PyExecutor_GetColdExecutor();
Py_DECREF(old);
}
/* Detaches the executor from the code object (if any) that
* holds a reference to it */
void
@ -1492,14 +1534,18 @@ executor_clear(PyObject *op)
assert(executor->vm_data.valid == 1);
unlink_executor(executor);
executor->vm_data.valid = 0;
/* It is possible for an executor to form a reference
* cycle with itself, so decref'ing a side exit could
* free the executor unless we hold a strong reference to it
*/
_PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
Py_INCREF(executor);
for (uint32_t i = 0; i < executor->exit_count; i++) {
executor->exits[i].temperature = initial_unreachable_backoff_counter();
Py_CLEAR(executor->exits[i].executor);
_PyExecutorObject *e = executor->exits[i].executor;
executor->exits[i].executor = cold;
Py_DECREF(e);
}
_Py_ExecutorDetach(executor);
Py_DECREF(executor);
@ -1741,4 +1787,11 @@ _PyDumpExecutors(FILE *out)
return -1;
}
void
_PyExecutor_Free(struct _PyExecutorObject *self)
{
/* This should never be called */
Py_UNREACHABLE();
}
#endif /* _Py_TIER2 */

View file

@ -3206,3 +3206,7 @@
break;
}
case _COLD_EXIT: {
break;
}

View file

@ -815,7 +815,13 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
/* Last garbage collection on this interpreter */
_PyGC_CollectNoFail(tstate);
_PyGC_Fini(interp);
struct _PyExecutorObject *cold = interp->cold_executor;
if (cold != NULL) {
interp->cold_executor = NULL;
assert(cold->vm_data.valid);
assert(cold->vm_data.warm);
_PyExecutor_Free(cold);
}
/* We don't clear sysdict and builtins until the end of this function.
Because clearing other attributes can execute arbitrary Python code
which requires sysdict and builtins. */
@ -1469,6 +1475,7 @@ init_threadstate(_PyThreadStateImpl *_tstate,
tstate->datastack_limit = NULL;
tstate->what_event = -1;
tstate->current_executor = NULL;
tstate->jit_exit = NULL;
tstate->dict_global_version = 0;
_tstate->c_stack_soft_limit = UINTPTR_MAX;