mirror of
https://github.com/python/cpython.git
synced 2025-10-20 08:23:47 +00:00
GH-136410: Faster side exits by using a cold exit stub (GH-136411)
This commit is contained in:
parent
718e0c89ba
commit
e7b55f564d
15 changed files with 387 additions and 267 deletions
|
@ -2964,6 +2964,8 @@ dummy_func(
|
|||
else {
|
||||
this_instr[1].counter = initial_jump_backoff_counter();
|
||||
assert(tstate->current_executor == NULL);
|
||||
assert(executor != tstate->interp->cold_executor);
|
||||
tstate->jit_exit = NULL;
|
||||
GOTO_TIER_TWO(executor);
|
||||
}
|
||||
}
|
||||
|
@ -3028,6 +3030,8 @@ dummy_func(
|
|||
}
|
||||
DISPATCH_GOTO();
|
||||
}
|
||||
assert(executor != tstate->interp->cold_executor);
|
||||
tstate->jit_exit = NULL;
|
||||
GOTO_TIER_TWO(executor);
|
||||
#else
|
||||
Py_FatalError("ENTER_EXECUTOR is not supported in this build");
|
||||
|
@ -5238,9 +5242,8 @@ dummy_func(
|
|||
|
||||
tier2 op(_EXIT_TRACE, (exit_p/4 --)) {
|
||||
_PyExitData *exit = (_PyExitData *)exit_p;
|
||||
PyCodeObject *code = _PyFrame_GetCode(frame);
|
||||
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
|
||||
#if defined(Py_DEBUG) && !defined(_Py_JIT)
|
||||
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
|
||||
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
|
||||
if (frame->lltrace >= 2) {
|
||||
printf("SIDE EXIT: [UOp ");
|
||||
|
@ -5251,32 +5254,7 @@ dummy_func(
|
|||
_PyOpcode_OpName[target->op.code]);
|
||||
}
|
||||
#endif
|
||||
if (exit->executor && !exit->executor->vm_data.valid) {
|
||||
exit->temperature = initial_temperature_backoff_counter();
|
||||
Py_CLEAR(exit->executor);
|
||||
}
|
||||
if (exit->executor == NULL) {
|
||||
_Py_BackoffCounter temperature = exit->temperature;
|
||||
if (!backoff_counter_triggers(temperature)) {
|
||||
exit->temperature = advance_backoff_counter(temperature);
|
||||
GOTO_TIER_ONE(target);
|
||||
}
|
||||
_PyExecutorObject *executor;
|
||||
if (target->op.code == ENTER_EXECUTOR) {
|
||||
executor = code->co_executors->executors[target->op.arg];
|
||||
Py_INCREF(executor);
|
||||
}
|
||||
else {
|
||||
int chain_depth = current_executor->vm_data.chain_depth + 1;
|
||||
int optimized = _PyOptimizer_Optimize(frame, target, &executor, chain_depth);
|
||||
if (optimized <= 0) {
|
||||
exit->temperature = restart_backoff_counter(temperature);
|
||||
GOTO_TIER_ONE(optimized < 0 ? NULL : target);
|
||||
}
|
||||
exit->temperature = initial_temperature_backoff_counter();
|
||||
}
|
||||
exit->executor = executor;
|
||||
}
|
||||
tstate->jit_exit = exit;
|
||||
GOTO_TIER_TWO(exit->executor);
|
||||
}
|
||||
|
||||
|
@ -5375,7 +5353,14 @@ dummy_func(
|
|||
#ifndef _Py_JIT
|
||||
current_executor = (_PyExecutorObject*)executor;
|
||||
#endif
|
||||
assert(((_PyExecutorObject *)executor)->vm_data.valid);
|
||||
assert(tstate->jit_exit == NULL || tstate->jit_exit->executor == current_executor);
|
||||
tstate->current_executor = (PyObject *)executor;
|
||||
if (!current_executor->vm_data.valid) {
|
||||
assert(tstate->jit_exit->executor == current_executor);
|
||||
assert(tstate->current_executor == executor);
|
||||
_PyExecutor_ClearExit(tstate->jit_exit);
|
||||
DEOPT_IF(true);
|
||||
}
|
||||
}
|
||||
|
||||
tier2 op(_MAKE_WARM, (--)) {
|
||||
|
@ -5414,6 +5399,37 @@ dummy_func(
|
|||
assert(tstate->tracing || eval_breaker == FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version));
|
||||
}
|
||||
|
||||
// _COLD_EXIT: uop run when a side exit is taken whose slot holds the
// cold executor. It reads the exit being taken from tstate->jit_exit and
// either resumes tier 1 at the exit's target or installs an executor in
// the exit and continues in tier 2.
tier2 op(_COLD_EXIT, ( -- )) {
|
||||
_PyExitData *exit = tstate->jit_exit;
|
||||
assert(exit != NULL);
|
||||
// Tier-1 instruction this exit resumes at.
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
|
||||
_Py_BackoffCounter temperature = exit->temperature;
|
||||
// Counter has not triggered yet: advance it and fall back to tier 1.
if (!backoff_counter_triggers(temperature)) {
|
||||
exit->temperature = advance_backoff_counter(temperature);
|
||||
GOTO_TIER_ONE(target);
|
||||
}
|
||||
_PyExecutorObject *executor;
|
||||
// The target is already an ENTER_EXECUTOR instruction: reuse the
// executor stored in the code object at index op.arg, taking a new
// reference for the exit slot.
if (target->op.code == ENTER_EXECUTOR) {
|
||||
PyCodeObject *code = _PyFrame_GetCode(frame);
|
||||
executor = code->co_executors->executors[target->op.arg];
|
||||
Py_INCREF(executor);
|
||||
}
|
||||
else {
|
||||
// Otherwise ask the optimizer for a new executor, one level deeper in
// the chain than the executor that owns this exit.
_PyExecutorObject *previous_executor = _PyExecutor_FromExit(exit);
|
||||
assert(tstate->current_executor == (PyObject *)previous_executor);
|
||||
int chain_depth = previous_executor->vm_data.chain_depth + 1;
|
||||
int optimized = _PyOptimizer_Optimize(frame, target, &executor, chain_depth);
|
||||
// optimized <= 0: no executor was produced. Restart the backoff and
// return to tier 1; a negative result passes NULL instead of target
// (presumably the error path -- confirm against GOTO_TIER_ONE).
if (optimized <= 0) {
|
||||
exit->temperature = restart_backoff_counter(temperature);
|
||||
GOTO_TIER_ONE(optimized < 0 ? NULL : target);
|
||||
}
|
||||
exit->temperature = initial_temperature_backoff_counter();
|
||||
}
|
||||
assert(tstate->jit_exit == exit);
|
||||
// Install the executor in the exit slot, then jump into it.
exit->executor = executor;
|
||||
GOTO_TIER_TWO(exit->executor);
|
||||
}
|
||||
|
||||
label(pop_2_error) {
|
||||
stack_pointer -= 2;
|
||||
assert(WITHIN_STACK_BOUNDS());
|
||||
|
|
|
@ -1158,7 +1158,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
|
|||
uint64_t trace_uop_execution_counter = 0;
|
||||
#endif
|
||||
|
||||
assert(next_uop->opcode == _START_EXECUTOR);
|
||||
assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT);
|
||||
tier2_dispatch:
|
||||
for (;;) {
|
||||
uopcode = next_uop->opcode;
|
||||
|
|
|
@ -359,7 +359,6 @@ _PyFrame_SetStackPointer(frame, stack_pointer)
|
|||
do { \
|
||||
OPT_STAT_INC(traces_executed); \
|
||||
_PyExecutorObject *_executor = (EXECUTOR); \
|
||||
tstate->current_executor = (PyObject *)_executor; \
|
||||
jit_func jitted = _executor->jit_code; \
|
||||
/* Keep the shim frame alive via the executor: */ \
|
||||
Py_INCREF(_executor); \
|
||||
|
@ -378,9 +377,8 @@ do { \
|
|||
do { \
|
||||
OPT_STAT_INC(traces_executed); \
|
||||
_PyExecutorObject *_executor = (EXECUTOR); \
|
||||
tstate->current_executor = (PyObject *)_executor; \
|
||||
next_uop = _executor->trace; \
|
||||
assert(next_uop->opcode == _START_EXECUTOR); \
|
||||
assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); \
|
||||
goto enter_tier_two; \
|
||||
} while (0)
|
||||
#endif
|
||||
|
@ -390,7 +388,6 @@ do { \
|
|||
{ \
|
||||
tstate->current_executor = NULL; \
|
||||
next_instr = (TARGET); \
|
||||
assert(tstate->current_executor == NULL); \
|
||||
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); \
|
||||
_PyFrame_SetStackPointer(frame, stack_pointer); \
|
||||
stack_pointer = _PyFrame_GetStackPointer(frame); \
|
||||
|
|
84
Python/executor_cases.c.h
generated
84
Python/executor_cases.c.h
generated
|
@ -7113,9 +7113,8 @@
|
|||
case _EXIT_TRACE: {
|
||||
PyObject *exit_p = (PyObject *)CURRENT_OPERAND0();
|
||||
_PyExitData *exit = (_PyExitData *)exit_p;
|
||||
PyCodeObject *code = _PyFrame_GetCode(frame);
|
||||
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
|
||||
#if defined(Py_DEBUG) && !defined(_Py_JIT)
|
||||
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
|
||||
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
|
||||
if (frame->lltrace >= 2) {
|
||||
_PyFrame_SetStackPointer(frame, stack_pointer);
|
||||
|
@ -7128,36 +7127,7 @@
|
|||
stack_pointer = _PyFrame_GetStackPointer(frame);
|
||||
}
|
||||
#endif
|
||||
if (exit->executor && !exit->executor->vm_data.valid) {
|
||||
exit->temperature = initial_temperature_backoff_counter();
|
||||
_PyFrame_SetStackPointer(frame, stack_pointer);
|
||||
Py_CLEAR(exit->executor);
|
||||
stack_pointer = _PyFrame_GetStackPointer(frame);
|
||||
}
|
||||
if (exit->executor == NULL) {
|
||||
_Py_BackoffCounter temperature = exit->temperature;
|
||||
if (!backoff_counter_triggers(temperature)) {
|
||||
exit->temperature = advance_backoff_counter(temperature);
|
||||
GOTO_TIER_ONE(target);
|
||||
}
|
||||
_PyExecutorObject *executor;
|
||||
if (target->op.code == ENTER_EXECUTOR) {
|
||||
executor = code->co_executors->executors[target->op.arg];
|
||||
Py_INCREF(executor);
|
||||
}
|
||||
else {
|
||||
int chain_depth = current_executor->vm_data.chain_depth + 1;
|
||||
_PyFrame_SetStackPointer(frame, stack_pointer);
|
||||
int optimized = _PyOptimizer_Optimize(frame, target, &executor, chain_depth);
|
||||
stack_pointer = _PyFrame_GetStackPointer(frame);
|
||||
if (optimized <= 0) {
|
||||
exit->temperature = restart_backoff_counter(temperature);
|
||||
GOTO_TIER_ONE(optimized < 0 ? NULL : target);
|
||||
}
|
||||
exit->temperature = initial_temperature_backoff_counter();
|
||||
}
|
||||
exit->executor = executor;
|
||||
}
|
||||
tstate->jit_exit = exit;
|
||||
GOTO_TIER_TWO(exit->executor);
|
||||
break;
|
||||
}
|
||||
|
@ -7438,7 +7408,19 @@
|
|||
#ifndef _Py_JIT
|
||||
current_executor = (_PyExecutorObject*)executor;
|
||||
#endif
|
||||
assert(((_PyExecutorObject *)executor)->vm_data.valid);
|
||||
assert(tstate->jit_exit == NULL || tstate->jit_exit->executor == current_executor);
|
||||
tstate->current_executor = (PyObject *)executor;
|
||||
if (!current_executor->vm_data.valid) {
|
||||
assert(tstate->jit_exit->executor == current_executor);
|
||||
assert(tstate->current_executor == executor);
|
||||
_PyFrame_SetStackPointer(frame, stack_pointer);
|
||||
_PyExecutor_ClearExit(tstate->jit_exit);
|
||||
stack_pointer = _PyFrame_GetStackPointer(frame);
|
||||
if (true) {
|
||||
UOP_STAT_INC(uopcode, miss);
|
||||
JUMP_TO_JUMP_TARGET();
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -7487,4 +7469,40 @@
|
|||
break;
|
||||
}
|
||||
|
||||
case _COLD_EXIT: {
|
||||
_PyExitData *exit = tstate->jit_exit;
|
||||
assert(exit != NULL);
|
||||
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
|
||||
_Py_BackoffCounter temperature = exit->temperature;
|
||||
if (!backoff_counter_triggers(temperature)) {
|
||||
exit->temperature = advance_backoff_counter(temperature);
|
||||
GOTO_TIER_ONE(target);
|
||||
}
|
||||
_PyExecutorObject *executor;
|
||||
if (target->op.code == ENTER_EXECUTOR) {
|
||||
PyCodeObject *code = _PyFrame_GetCode(frame);
|
||||
executor = code->co_executors->executors[target->op.arg];
|
||||
Py_INCREF(executor);
|
||||
}
|
||||
else {
|
||||
_PyFrame_SetStackPointer(frame, stack_pointer);
|
||||
_PyExecutorObject *previous_executor = _PyExecutor_FromExit(exit);
|
||||
stack_pointer = _PyFrame_GetStackPointer(frame);
|
||||
assert(tstate->current_executor == (PyObject *)previous_executor);
|
||||
int chain_depth = previous_executor->vm_data.chain_depth + 1;
|
||||
_PyFrame_SetStackPointer(frame, stack_pointer);
|
||||
int optimized = _PyOptimizer_Optimize(frame, target, &executor, chain_depth);
|
||||
stack_pointer = _PyFrame_GetStackPointer(frame);
|
||||
if (optimized <= 0) {
|
||||
exit->temperature = restart_backoff_counter(temperature);
|
||||
GOTO_TIER_ONE(optimized < 0 ? NULL : target);
|
||||
}
|
||||
exit->temperature = initial_temperature_backoff_counter();
|
||||
}
|
||||
assert(tstate->jit_exit == exit);
|
||||
exit->executor = executor;
|
||||
GOTO_TIER_TWO(exit->executor);
|
||||
break;
|
||||
}
|
||||
|
||||
#undef TIER_TWO
|
||||
|
|
4
Python/generated_cases.c.h
generated
4
Python/generated_cases.c.h
generated
|
@ -5595,6 +5595,8 @@
|
|||
}
|
||||
DISPATCH_GOTO();
|
||||
}
|
||||
assert(executor != tstate->interp->cold_executor);
|
||||
tstate->jit_exit = NULL;
|
||||
GOTO_TIER_TWO(executor);
|
||||
#else
|
||||
Py_FatalError("ENTER_EXECUTOR is not supported in this build");
|
||||
|
@ -7793,6 +7795,8 @@
|
|||
this_instr[1].counter = initial_jump_backoff_counter();
|
||||
stack_pointer = _PyFrame_GetStackPointer(frame);
|
||||
assert(tstate->current_executor == NULL);
|
||||
assert(executor != tstate->interp->cold_executor);
|
||||
tstate->jit_exit = NULL;
|
||||
GOTO_TIER_TWO(executor);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -546,7 +546,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
|
|||
group->emit(code, data, executor, NULL, &state);
|
||||
code += group->code_size;
|
||||
data += group->data_size;
|
||||
assert(trace[0].opcode == _START_EXECUTOR);
|
||||
assert(trace[0].opcode == _START_EXECUTOR || trace[0].opcode == _COLD_EXIT);
|
||||
for (size_t i = 0; i < length; i++) {
|
||||
const _PyUOpInstruction *instruction = &trace[i];
|
||||
group = &stencil_groups[instruction->opcode];
|
||||
|
|
|
@ -205,8 +205,8 @@ static int executor_clear(PyObject *executor);
|
|||
static void unlink_executor(_PyExecutorObject *executor);
|
||||
|
||||
|
||||
static void
|
||||
free_executor(_PyExecutorObject *self)
|
||||
void
|
||||
_PyExecutor_Free(_PyExecutorObject *self)
|
||||
{
|
||||
#ifdef _Py_JIT
|
||||
_PyJIT_Free(self);
|
||||
|
@ -242,7 +242,7 @@ _Py_ClearExecutorDeletionList(PyInterpreterState *interp)
|
|||
}
|
||||
else {
|
||||
*prev_to_next_ptr = exec->vm_data.links.next;
|
||||
free_executor(exec);
|
||||
_PyExecutor_Free(exec);
|
||||
}
|
||||
exec = *prev_to_next_ptr;
|
||||
}
|
||||
|
@ -1129,7 +1129,7 @@ sanity_check(_PyExecutorObject *executor)
|
|||
}
|
||||
bool ended = false;
|
||||
uint32_t i = 0;
|
||||
CHECK(executor->trace[0].opcode == _START_EXECUTOR);
|
||||
CHECK(executor->trace[0].opcode == _START_EXECUTOR || executor->trace[0].opcode == _COLD_EXIT);
|
||||
for (; i < executor->code_size; i++) {
|
||||
const _PyUOpInstruction *inst = &executor->trace[i];
|
||||
uint16_t opcode = inst->opcode;
|
||||
|
@ -1182,9 +1182,11 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil
|
|||
}
|
||||
|
||||
/* Initialize exits */
|
||||
_PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
|
||||
for (int i = 0; i < exit_count; i++) {
|
||||
executor->exits[i].executor = NULL;
|
||||
executor->exits[i].index = i;
|
||||
executor->exits[i].temperature = initial_temperature_backoff_counter();
|
||||
executor->exits[i].executor = cold;
|
||||
}
|
||||
int next_exit = exit_count-1;
|
||||
_PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length];
|
||||
|
@ -1462,6 +1464,46 @@ _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_s
|
|||
link_executor(executor);
|
||||
}
|
||||
|
||||
/* Return the current interpreter's cold executor, creating and caching it
 * in interp->cold_executor on first use.
 *
 * The executor's first trace instruction is set to _COLD_EXIT. Failure to
 * allocate it, or (under _Py_JIT) to compile it, is fatal: the function
 * calls Py_FatalError rather than returning NULL.
 *
 * The object is made immortal with _Py_SetImmortal before it is cached.
 */
_PyExecutorObject *
|
||||
_PyExecutor_GetColdExecutor(void)
|
||||
{
|
||||
PyInterpreterState *interp = _PyInterpreterState_GET();
|
||||
/* Fast path: already created for this interpreter. */
if (interp->cold_executor != NULL) {
|
||||
return interp->cold_executor;
|
||||
}
|
||||
/* NOTE(review): the arguments are presumably (exit count, trace length);
 * allocate_executor is not visible here -- confirm. */
_PyExecutorObject *cold = allocate_executor(0, 1);
|
||||
if (cold == NULL) {
|
||||
Py_FatalError("Cannot allocate core JIT code");
|
||||
}
|
||||
((_PyUOpInstruction *)cold->trace)->opcode = _COLD_EXIT;
|
||||
#ifdef _Py_JIT
|
||||
/* Reset the JIT fields before compiling the one-instruction trace. */
cold->jit_code = NULL;
|
||||
cold->jit_side_entry = NULL;
|
||||
cold->jit_size = 0;
|
||||
// This is initialized to true so we can prevent the executor
|
||||
// from being immediately detected as cold and invalidated.
|
||||
cold->vm_data.warm = true;
|
||||
/* A nonzero result from _PyJIT_Compile is treated as failure. */
if (_PyJIT_Compile(cold, cold->trace, 1)) {
|
||||
Py_DECREF(cold);
|
||||
Py_FatalError("Cannot allocate core JIT code");
|
||||
}
|
||||
#endif
|
||||
_Py_SetImmortal((PyObject *)cold);
|
||||
interp->cold_executor = cold;
|
||||
return cold;
|
||||
}
|
||||
|
||||
/* Point a side exit back at the cold executor and release the reference
 * the exit held to its previous executor.
 *
 * exit: the exit to reset; NULL is accepted and is a no-op.
 */
void
|
||||
_PyExecutor_ClearExit(_PyExitData *exit)
|
||||
{
|
||||
if (exit == NULL) {
|
||||
return;
|
||||
}
|
||||
_PyExecutorObject *old = exit->executor;
|
||||
/* The replacement is stored before the old reference is dropped, so the
 * slot never points at `old` while Py_DECREF runs. */
exit->executor = _PyExecutor_GetColdExecutor();
|
||||
Py_DECREF(old);
|
||||
}
|
||||
|
||||
/* Detaches the executor from the code object (if any) that
|
||||
* holds a reference to it */
|
||||
void
|
||||
|
@ -1492,14 +1534,18 @@ executor_clear(PyObject *op)
|
|||
assert(executor->vm_data.valid == 1);
|
||||
unlink_executor(executor);
|
||||
executor->vm_data.valid = 0;
|
||||
|
||||
/* It is possible for an executor to form a reference
|
||||
* cycle with itself, so decref'ing a side exit could
|
||||
* free the executor unless we hold a strong reference to it
|
||||
*/
|
||||
_PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
|
||||
Py_INCREF(executor);
|
||||
for (uint32_t i = 0; i < executor->exit_count; i++) {
|
||||
executor->exits[i].temperature = initial_unreachable_backoff_counter();
|
||||
Py_CLEAR(executor->exits[i].executor);
|
||||
_PyExecutorObject *e = executor->exits[i].executor;
|
||||
executor->exits[i].executor = cold;
|
||||
Py_DECREF(e);
|
||||
}
|
||||
_Py_ExecutorDetach(executor);
|
||||
Py_DECREF(executor);
|
||||
|
@ -1741,4 +1787,11 @@ _PyDumpExecutors(FILE *out)
|
|||
return -1;
|
||||
}
|
||||
|
||||
/* Placeholder definition of _PyExecutor_Free that must never be reached.
 * NOTE(review): presumably this sits in the branch compiled when _Py_TIER2
 * is not defined (it directly precedes the matching #endif) -- confirm.
 */
void
|
||||
_PyExecutor_Free(struct _PyExecutorObject *self)
|
||||
{
|
||||
/* This should never be called */
|
||||
Py_UNREACHABLE();
|
||||
}
|
||||
|
||||
#endif /* _Py_TIER2 */
|
||||
|
|
4
Python/optimizer_cases.c.h
generated
4
Python/optimizer_cases.c.h
generated
|
@ -3206,3 +3206,7 @@
|
|||
break;
|
||||
}
|
||||
|
||||
case _COLD_EXIT: {
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
|
@ -815,7 +815,13 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
|
|||
/* Last garbage collection on this interpreter */
|
||||
_PyGC_CollectNoFail(tstate);
|
||||
_PyGC_Fini(interp);
|
||||
|
||||
struct _PyExecutorObject *cold = interp->cold_executor;
|
||||
if (cold != NULL) {
|
||||
interp->cold_executor = NULL;
|
||||
assert(cold->vm_data.valid);
|
||||
assert(cold->vm_data.warm);
|
||||
_PyExecutor_Free(cold);
|
||||
}
|
||||
/* We don't clear sysdict and builtins until the end of this function.
|
||||
Because clearing other attributes can execute arbitrary Python code
|
||||
which requires sysdict and builtins. */
|
||||
|
@ -1469,6 +1475,7 @@ init_threadstate(_PyThreadStateImpl *_tstate,
|
|||
tstate->datastack_limit = NULL;
|
||||
tstate->what_event = -1;
|
||||
tstate->current_executor = NULL;
|
||||
tstate->jit_exit = NULL;
|
||||
tstate->dict_global_version = 0;
|
||||
|
||||
_tstate->c_stack_soft_limit = UINTPTR_MAX;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue