// Mirror of https://github.com/python/cpython.git (synced 2025-10-24 10:23:58 +00:00).
// Macros and other things needed by ceval.c, and bytecodes.c

/* Computed GOTOs, or
       the-optimization-commonly-but-improperly-known-as-"threaded code"
   using gcc's labels-as-values extension
   (http://gcc.gnu.org/onlinedocs/gcc/Labels-as-Values.html).

   The traditional bytecode evaluation loop uses a "switch" statement, which
   decent compilers will optimize as a single indirect branch instruction
   combined with a lookup table of jump addresses. However, since the
   indirect jump instruction is shared by all opcodes, the CPU will have a
   hard time making the right prediction for where to jump next (actually,
   it will be always wrong except in the uncommon case of a sequence of
   several identical opcodes).

   "Threaded code", in contrast, uses an explicit jump table and an explicit
   indirect jump instruction at the end of each opcode. Since the jump
   instruction is at a different address for each opcode, the CPU will make a
   separate prediction for each of these instructions, which is equivalent to
   predicting the second opcode of each opcode pair. These predictions have
   a much better chance to turn out valid, especially in small bytecode loops.

   A mispredicted branch on a modern CPU flushes the whole pipeline and
   can cost several CPU cycles (depending on the pipeline depth),
   and potentially many more instructions (depending on the pipeline width).
   A correctly predicted branch, however, is nearly free.

   At the time of this writing, the "threaded code" version is up to 15-20%
   faster than the normal "switch" version, depending on the compiler and the
   CPU architecture.

   NOTE: care must be taken that the compiler doesn't try to "optimize" the
   indirect jumps by sharing them between all opcodes. Such optimizations
   can be disabled on gcc by using the -fno-gcse flag (or possibly
   -fno-crossjumping).
*/

/* Use macros rather than inline functions, to make it as clear as possible
 * to the C compiler that the tracing check is a simple test then branch.
 * We want to be sure that the compiler knows this before it generates
 * the CFG.
 */

/* OR_DTRACE_LINE is a suffix for an integer expression.  With WITH_DTRACE
 * it ORs in 255 when PyDTrace_LINE_ENABLED() is true (and 0 otherwise);
 * without WITH_DTRACE it expands to nothing.
 */
#ifdef WITH_DTRACE
#define OR_DTRACE_LINE | (PyDTrace_LINE_ENABLED() ? 255 : 0)
#else
#define OR_DTRACE_LINE
#endif

/* Normalize USE_COMPUTED_GOTOS so that it is always defined afterwards:
 *   - compiler supports computed gotos: default to 1 unless the build
 *     already chose a value;
 *   - compiler does not support them: asking for them is a hard error,
 *     otherwise force the value to 0.
 */
#ifdef HAVE_COMPUTED_GOTOS
    #ifndef USE_COMPUTED_GOTOS
    #define USE_COMPUTED_GOTOS 1
    #endif
#else
    #if defined(USE_COMPUTED_GOTOS) && USE_COMPUTED_GOTOS
    #error "Computed gotos are not supported on this compiler."
    #endif
    #undef USE_COMPUTED_GOTOS
    #define USE_COMPUTED_GOTOS 0
#endif

/* INSTRUCTION_STATS(op): per-instruction statistics hook.
 * With Py_STATS it bumps the execution counter for `op`, records the
 * (lastopcode, op) pair when _Py_stats is non-NULL, and then remembers `op`
 * in `lastopcode`.  Without Py_STATS it is a no-op expression.
 */
#ifdef Py_STATS
#define INSTRUCTION_STATS(op) \
    do { \
        OPCODE_EXE_INC(op); \
        if (_Py_stats) { \
            _Py_stats->opcode_stats[lastopcode].pair_count[op]++; \
        } \
        lastopcode = op; \
    } while (0)
#else
#define INSTRUCTION_STATS(op) ((void)0)
#endif

/* Parameter list shared by every tail-call handler, and the matching
 * argument list used to forward the current state to the next handler.
 * The two must list the same names in the same order.
 */
#define TAIL_CALL_PARAMS _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate, _Py_CODEUNIT *next_instr, int oparg
#define TAIL_CALL_ARGS frame, stack_pointer, tstate, next_instr, oparg

/* Dispatch primitives.  Three flavours, chosen at compile time:
 *   1. Py_TAIL_CALL_INTERP: every instruction/label is a function named
 *      _TAIL_CALL_<name>; jumps are mandatory tail calls.
 *   2. USE_COMPUTED_GOTOS:  instructions are labels, dispatch goes through
 *      the opcode_targets table.
 *   3. otherwise:           instructions are `case` labels, dispatch jumps
 *      back to the `dispatch_opcode` label.
 *
 * Note that in flavours 2 and 3 JUMP_TO_LABEL()/JUMP_TO_PREDICTED() already
 * end in ';', while the tail-call versions do not; call sites should always
 * add their own ';' so they compile in every flavour.
 */
#if Py_TAIL_CALL_INTERP
    // Note: [[clang::musttail]] works for GCC 15, but not __attribute__((musttail)) at the moment.
#   define Py_MUSTTAIL [[clang::musttail]]
#   define Py_PRESERVE_NONE_CC __attribute__((preserve_none))
    Py_PRESERVE_NONE_CC typedef PyObject* (*py_tail_call_funcptr)(TAIL_CALL_PARAMS);

#   define TARGET(op) Py_PRESERVE_NONE_CC PyObject *_TAIL_CALL_##op(TAIL_CALL_PARAMS)
#   define DISPATCH_GOTO() \
        do { \
            Py_MUSTTAIL return (INSTRUCTION_TABLE[opcode])(TAIL_CALL_ARGS); \
        } while (0)
#   define JUMP_TO_LABEL(name) \
        do { \
            Py_MUSTTAIL return (_TAIL_CALL_##name)(TAIL_CALL_ARGS); \
        } while (0)
    /* Same as JUMP_TO_LABEL, but passes this_instr in the next_instr slot. */
#   define JUMP_TO_PREDICTED(name) \
        do { \
            Py_MUSTTAIL return (_TAIL_CALL_##name)(frame, stack_pointer, tstate, this_instr, oparg); \
        } while (0)
#   define LABEL(name) TARGET(name)
#elif USE_COMPUTED_GOTOS
#   define TARGET(op) TARGET_##op:
#   define DISPATCH_GOTO() goto *opcode_targets[opcode]
#   define JUMP_TO_LABEL(name) goto name;
#   define JUMP_TO_PREDICTED(name) goto PREDICTED_##name;
#   define LABEL(name) name:
#else
#   define TARGET(op) case op: TARGET_##op:
#   define DISPATCH_GOTO() goto dispatch_opcode
#   define JUMP_TO_LABEL(name) goto name;
#   define JUMP_TO_PREDICTED(name) goto PREDICTED_##name;
#   define LABEL(name) name:
#endif

/* PRE_DISPATCH_GOTO() does lltrace if enabled.  Normally a no-op.
 *
 * In Py_DEBUG builds it calls lltrace_instruction() when
 * frame->lltrace >= 5.  The debug form is wrapped in do { } while (0) so
 * that, like the release form, it is a single statement that needs a
 * trailing ';' and cannot capture a following `else`.
 */
#ifdef Py_DEBUG
#define PRE_DISPATCH_GOTO() \
    do { \
        if (frame->lltrace >= 5) { \
            lltrace_instruction(frame, stack_pointer, next_instr, opcode, oparg); \
        } \
    } while (0)
#else
#define PRE_DISPATCH_GOTO() ((void)0)
#endif

/* LLTRACE_RESUME_FRAME(): (Py_DEBUG only) recompute the lltrace level for
 * the current frame.  The stack pointer is saved to the frame around the
 * call to maybe_lltrace_resume_frame() and reloaded afterwards.  A negative
 * result jumps to exit_unwind; otherwise the result is stored in
 * frame->lltrace.  No-op in non-debug builds.
 */
#ifdef Py_DEBUG
#define LLTRACE_RESUME_FRAME() \
    do { \
        _PyFrame_SetStackPointer(frame, stack_pointer); \
        int lltrace = maybe_lltrace_resume_frame(frame, GLOBALS()); \
        stack_pointer = _PyFrame_GetStackPointer(frame); \
        if (lltrace < 0) { \
            JUMP_TO_LABEL(exit_unwind); \
        } \
        frame->lltrace = lltrace; \
    } while (0)
#else
#define LLTRACE_RESUME_FRAME() ((void)0)
#endif

/* QSBR_QUIESCENT_STATE(tstate): in Py_GIL_DISABLED builds, calls
 * _Py_qsbr_quiescent_state() on the qsbr member of the thread state
 * (viewed as a _PyThreadStateImpl).  Expands to nothing otherwise.
 * The argument is parenthesized so that any pointer expression can be
 * passed safely through the cast.
 */
#ifdef Py_GIL_DISABLED
#define QSBR_QUIESCENT_STATE(tstate) _Py_qsbr_quiescent_state(((_PyThreadStateImpl *)(tstate))->qsbr)
#else
#define QSBR_QUIESCENT_STATE(tstate)
#endif

/* Do interpreter dispatch accounting for tracing and instrumentation.
 *
 * Asserts that frame->stackpointer is NULL, loads opcode/oparg from
 * next_instr with NEXTOPARG(), runs the PRE_DISPATCH_GOTO() hook and then
 * transfers control with DISPATCH_GOTO().
 * Expands to a plain brace block (not do/while), so avoid using it as the
 * unbraced body of an if that has an else.
 */
#define DISPATCH() \
    { \
        assert(frame->stackpointer == NULL); \
        NEXTOPARG(); \
        PRE_DISPATCH_GOTO(); \
        DISPATCH_GOTO(); \
    }

/* Like DISPATCH(), but only the opcode is re-read from next_instr; the
 * current value of oparg is left untouched.
 * Expands to a plain brace block (not do/while).
 */
#define DISPATCH_SAME_OPARG() \
    { \
        opcode = next_instr->op.code; \
        PRE_DISPATCH_GOTO(); \
        DISPATCH_GOTO(); \
    }

/* DISPATCH_INLINED(NEW_FRAME): switch execution to NEW_FRAME.
 * Saves the stack pointer into the current frame, makes NEW_FRAME both
 * `frame` and tstate->current_frame, counts the inlined call and jumps to
 * start_frame.  Asserts that no custom eval_frame is installed and that
 * NEW_FRAME->previous is the current frame.
 */
#define DISPATCH_INLINED(NEW_FRAME) \
    do { \
        assert(tstate->interp->eval_frame == NULL); \
        _PyFrame_SetStackPointer(frame, stack_pointer); \
        assert((NEW_FRAME)->previous == frame); \
        frame = tstate->current_frame = (NEW_FRAME); \
        CALL_STAT_INC(inlined_py_calls); \
        JUMP_TO_LABEL(start_frame); \
    } while (0)

/* Tuple access macros */

/* GETITEM(v, i): fetch item i of tuple v via PyTuple_GET_ITEM.
 * In Py_DEBUG builds this is a function that first asserts that v is a
 * tuple and that 0 <= i < PyTuple_GET_SIZE(v); otherwise it is the bare
 * macro with no checks.
 */
#ifndef Py_DEBUG
#define GETITEM(v, i) PyTuple_GET_ITEM((v), (i))
#else
static inline PyObject *
GETITEM(PyObject *v, Py_ssize_t i) {
    assert(PyTuple_Check(v));
    assert(i >= 0);
    assert(i < PyTuple_GET_SIZE(v));
    return PyTuple_GET_ITEM(v, i);
}
#endif

/* Code access macros */

/* Offset of next_instr from the start of the frame's bytecode, in code
 * units, truncated to int.
 * The integer overflow is checked by an assertion below. */
#define INSTR_OFFSET() ((int)(next_instr - _PyFrame_GetBytecode(frame)))

/* Load the 16-bit code unit at next_instr (through
 * FT_ATOMIC_LOAD_UINT16_RELAXED) and split it into `opcode` and `oparg`.
 * next_instr itself is not advanced. */
#define NEXTOPARG()  do { \
        _Py_CODEUNIT word  = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr)}; \
        opcode = word.op.code; \
        oparg = word.op.arg; \
    } while (0)

/* JUMPBY makes the generator identify the instruction as a jump. SKIP_OVER is
 * for advancing to the next instruction, taking into account cache entries
 * and skipped instructions.
 *
 * Both expand to the same expression: next_instr is advanced by x code
 * units and the new value is the result.
 */
#define JUMPBY(x)       (next_instr += (x))
#define SKIP_OVER(x)    (next_instr += (x))

/* Stack manipulation macros */

/* The stack can grow at most MAXINT deep, as co_nlocals and
   co_stacksize are ints. */
#define STACK_LEVEL()     ((int)(stack_pointer - _PyFrame_Stackbase(frame)))
#define STACK_SIZE()      (_PyFrame_GetCode(frame)->co_stacksize)
#define EMPTY()           (STACK_LEVEL() == 0)

/* Peek/poke relative to the top: stack_pointer points one past the top
 * item, so index 1 is the top of the stack. */
#define TOP()             (stack_pointer[-1])
#define SECOND()          (stack_pointer[-2])
#define THIRD()           (stack_pointer[-3])
#define FOURTH()          (stack_pointer[-4])
#define PEEK(n)           (stack_pointer[-(n)])
#define POKE(n, v)        (stack_pointer[-(n)] = (v))
#define SET_TOP(v)        (stack_pointer[-1] = (v))
#define SET_SECOND(v)     (stack_pointer[-2] = (v))

/* Unchecked primitives; the argument of BASIC_STACKADJ is parenthesized so
 * any expression can be passed. */
#define BASIC_STACKADJ(n) (stack_pointer += (n))
#define BASIC_PUSH(v)     (*stack_pointer++ = (v))
#define BASIC_POP()       (*--stack_pointer)

/* Checked stack operations.
 * In Py_DEBUG builds every operation asserts that the stack level stays
 * within [0, STACK_SIZE()] and that grow/shrink amounts are non-negative;
 * in other builds they map directly onto the BASIC_* primitives.
 * The macro argument `n` is parenthesized inside the assertions so that
 * expressions such as `a & b` or `c ? x : y` are checked as a whole.
 */
#ifdef Py_DEBUG
#define PUSH(v)         do { \
                            BASIC_PUSH(v); \
                            assert(STACK_LEVEL() <= STACK_SIZE()); \
                        } while (0)
#define POP()           (assert(STACK_LEVEL() > 0), BASIC_POP())
#define STACK_GROW(n)   do { \
                            assert((n) >= 0); \
                            BASIC_STACKADJ(n); \
                            assert(STACK_LEVEL() <= STACK_SIZE()); \
                        } while (0)
#define STACK_SHRINK(n) do { \
                            assert((n) >= 0); \
                            assert(STACK_LEVEL() >= (n)); \
                            BASIC_STACKADJ(-(n)); \
                        } while (0)
#else
#define PUSH(v)                BASIC_PUSH(v)
#define POP()                  BASIC_POP()
#define STACK_GROW(n)          BASIC_STACKADJ(n)
#define STACK_SHRINK(n)        BASIC_STACKADJ(-(n))
#endif

/* True when the frame is owned by the interpreter (in which case the stack
 * level is not checked), or when the current stack level lies in
 * [0, STACK_SIZE()]. */
#define WITHIN_STACK_BOUNDS() \
    (frame->owner == FRAME_OWNED_BY_INTERPRETER || (STACK_LEVEL() >= 0 && STACK_LEVEL() <= STACK_SIZE()))

/* Data access macros */
/* co_consts / co_names of the code object of the current frame. */
#define FRAME_CO_CONSTS (_PyFrame_GetCode(frame)->co_consts)
#define FRAME_CO_NAMES  (_PyFrame_GetCode(frame)->co_names)

/* Local variable macros */

/* The frame's localsplus array, and element i of it (usable as an lvalue). */
#define LOCALS_ARRAY    (frame->localsplus)
#define GETLOCAL(i)     (frame->localsplus[i])

/* UPDATE_MISS_STATS(INSTNAME): (Py_STATS only) count a specialization miss,
 * both globally (`opcode`) and for INSTNAME, and additionally count a deopt
 * for INSTNAME when the adaptive counter stored in next_instr->cache
 * triggers.  No-op without Py_STATS.
 */
#ifdef Py_STATS
#define UPDATE_MISS_STATS(INSTNAME)                              \
    do {                                                         \
        STAT_INC(opcode, miss);                                  \
        STAT_INC((INSTNAME), miss);                              \
        /* The counter is always the first cache entry: */       \
        if (ADAPTIVE_COUNTER_TRIGGERS(next_instr->cache)) {      \
            STAT_INC((INSTNAME), deopt);                         \
        }                                                        \
    } while (0)
#else
#define UPDATE_MISS_STATS(INSTNAME) ((void)0)
#endif

// Try to lock an object in the free threading build, if it's not already
// locked. Use with a DEOPT_IF() to deopt if the object is already locked.
// These are no-ops in the default GIL build. The general pattern is:
//
// DEOPT_IF(!LOCK_OBJECT(op));
// if (/* condition fails */) {
//     UNLOCK_OBJECT(op);
//     DEOPT_IF(true);
//  }
//  ...
//  UNLOCK_OBJECT(op);
//
// NOTE: The object must be unlocked on every exit code path and you should
// avoid any potentially escaping calls (like PyStackRef_CLOSE) while the
// object is locked.
//
// In the default build LOCK_OBJECT() always evaluates to 1 and neither macro
// evaluates its argument.
#ifdef Py_GIL_DISABLED
#  define LOCK_OBJECT(op) PyMutex_LockFast(&(_PyObject_CAST(op))->ob_mutex)
#  define UNLOCK_OBJECT(op) PyMutex_Unlock(&(_PyObject_CAST(op))->ob_mutex)
#else
#  define LOCK_OBJECT(op) (1)
#  define UNLOCK_OBJECT(op) ((void)0)
#endif

/* Shorthand accessors for the current frame's namespaces and for the
 * constants/names tuples of its code object.  Each expansion is fully
 * parenthesized so it behaves as a single operand in any expression. */
#define GLOBALS() (frame->f_globals)
#define BUILTINS() (frame->f_builtins)
#define LOCALS() (frame->f_locals)
#define CONSTS() (_PyFrame_GetCode(frame)->co_consts)
#define NAMES() (_PyFrame_GetCode(frame)->co_names)

/* Fire the function-entry DTrace probe for the current frame when that
 * probe is enabled.
 * NOTE: this expands to a bare `if` statement (no do/while wrapper), so do
 * not use it as the unbraced body of an if that has an else. */
#define DTRACE_FUNCTION_ENTRY()  \
    if (PyDTrace_FUNCTION_ENTRY_ENABLED()) { \
        dtrace_function_entry(frame); \
    }

/* This takes a uint16_t instead of a _Py_BackoffCounter,
 * because it is used directly on the cache entry in generated code,
 * which is always an integral type. */
#define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \
    backoff_counter_triggers(forge_backoff_counter((COUNTER)))

/* Replace COUNTER (an lvalue) with advance_backoff_counter(COUNTER).
 * COUNTER is evaluated twice.  The expansion deliberately does NOT end in
 * ';' so the macro is a single statement that is safe inside if/else;
 * call sites supply the semicolon. */
#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        (COUNTER) = advance_backoff_counter((COUNTER)); \
    } while (0)

/* Replace COUNTER (an lvalue) with pause_backoff_counter(COUNTER).
 * COUNTER is evaluated twice.  The expansion deliberately does NOT end in
 * ';' so the macro is a single statement that is safe inside if/else;
 * call sites supply the semicolon. */
#define PAUSE_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        (COUNTER) = pause_backoff_counter((COUNTER)); \
    } while (0)

/* RECORD_BRANCH_TAKEN(bitset, flag): shift the 16-bit history in `bitset`
 * left by one and OR `flag` into the low bit.  Expands to nothing unless
 * ENABLE_SPECIALIZATION_FT is defined.
 */
#ifdef ENABLE_SPECIALIZATION_FT
/* Multiple threads may execute these concurrently if thread-local bytecode is
 * disabled and they all execute the main copy of the bytecode. Specialization
 * is disabled in that case so the value is unused, but the RMW cycle should be
 * free of data races.
 */
#define RECORD_BRANCH_TAKEN(bitset, flag) \
    FT_ATOMIC_STORE_UINT16_RELAXED(       \
        bitset, (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag))
#else
#define RECORD_BRANCH_TAKEN(bitset, flag)
#endif

/* printf-style format strings for name-resolution errors; each takes the
 * offending name as its single string argument. */
#define UNBOUNDLOCAL_ERROR_MSG \
    "cannot access local variable '%s' where it is not associated with a value"
#define UNBOUNDFREE_ERROR_MSG \
    "cannot access free variable '%s' where it is not associated with a value" \
    " in enclosing scope"
#define NAME_ERROR_MSG "name '%.200s' is not defined"

// If a trace function sets a new f_lineno and
// *then* raises, we use the destination when searching
// for an exception handler, displaying the traceback, and so on
//
// INSTRUMENTED_JUMP(src, dest, event):
//   - while tstate->tracing is set, simply continue at dest;
//   - otherwise report the jump through _Py_call_instrumentation_jump()
//     (stack pointer saved to the frame before, reloaded after) and continue
//     at whatever instruction it returns; a NULL result sets next_instr to
//     dest + 1 and jumps to the error label.
// The expansion does not end in ';' so the macro is a single statement that
// is safe inside if/else; call sites supply the semicolon.
#define INSTRUMENTED_JUMP(src, dest, event) \
do { \
    if (tstate->tracing) {\
        next_instr = (dest); \
    } else { \
        _PyFrame_SetStackPointer(frame, stack_pointer); \
        next_instr = _Py_call_instrumentation_jump(this_instr, tstate, event, frame, src, dest); \
        stack_pointer = _PyFrame_GetStackPointer(frame); \
        if (next_instr == NULL) { \
            next_instr = (dest)+1; \
            JUMP_TO_LABEL(error); \
        } \
    } \
} while (0)

/* Consume one unit of tstate->py_recursion_remaining.
 * Returns nonzero only when the budget was already <= 0 before the
 * decrement AND _Py_CheckRecursiveCallPy() reports a nonzero result; the
 * check function is not called while budget remains.  The counter is
 * decremented in every case. */
static inline int _Py_EnterRecursivePy(PyThreadState *tstate) {
    return (tstate->py_recursion_remaining-- <= 0) &&
        _Py_CheckRecursiveCallPy(tstate);
}

/* Give back one unit of tstate->py_recursion_remaining (the inverse of the
 * decrement done by _Py_EnterRecursivePy). */
static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate)  {
    tstate->py_recursion_remaining++;
}

/* Implementation of "macros" that modify the instruction pointer,
 * stack pointer, or frame pointer.
 * These need to be treated differently by tier 1 and 2.
 * The Tier 1 version is here; Tier 2 is inlined in ceval.c. */

/* Set next_instr to OFFSET code units past frame->instr_ptr. */
#define LOAD_IP(OFFSET) do { \
        next_instr = frame->instr_ptr + (OFFSET); \
    } while (0)

/* There's no STORE_IP(), it's inlined by the code generator. */

/* Reload the local stack_pointer from the frame. */
#define LOAD_SP() \
stack_pointer = _PyFrame_GetStackPointer(frame)

/* Store the local stack_pointer into the frame. */
#define SAVE_SP() \
_PyFrame_SetStackPointer(frame, stack_pointer)

/* Tier-switching macros. */

/* GOTO_TIER_TWO(EXECUTOR): leave tier 1 and run EXECUTOR.
 *
 * With _Py_JIT: call the executor's jit_code (holding a reference to the
 * executor for the duration of the call), clear tstate->previous_executor,
 * then reload frame and stack_pointer from tstate.  A NULL return sets
 * next_instr to frame->instr_ptr and jumps to error; otherwise tier 1
 * resumes at the returned instruction via DISPATCH().
 *
 * Without _Py_JIT: point next_uop at the executor's trace (which must begin
 * with _START_EXECUTOR) and jump to the enter_tier_two label.
 */
#ifdef _Py_JIT
#define GOTO_TIER_TWO(EXECUTOR)                        \
do {                                                   \
    OPT_STAT_INC(traces_executed);                     \
    _PyExecutorObject *_executor = (EXECUTOR);         \
    jit_func jitted = _executor->jit_code;             \
    /* Keep the shim frame alive via the executor: */  \
    Py_INCREF(_executor);                              \
    next_instr = jitted(frame, stack_pointer, tstate); \
    Py_DECREF(_executor);                              \
    Py_CLEAR(tstate->previous_executor);               \
    frame = tstate->current_frame;                     \
    stack_pointer = _PyFrame_GetStackPointer(frame);   \
    if (next_instr == NULL) {                          \
        next_instr = frame->instr_ptr;                 \
        JUMP_TO_LABEL(error);                          \
    }                                                  \
    DISPATCH();                                        \
} while (0)
#else
#define GOTO_TIER_TWO(EXECUTOR) \
do { \
    OPT_STAT_INC(traces_executed); \
    next_uop = (EXECUTOR)->trace; \
    assert(next_uop->opcode == _START_EXECUTOR); \
    goto enter_tier_two; \
} while (0)
#endif

/* GOTO_TIER_ONE(TARGET): resume tier 1 execution at instruction TARGET.
 * Records the trace run-length histogram, saves the stack pointer to the
 * frame, clears tstate->previous_executor and reloads the stack pointer.
 * A NULL TARGET sets next_instr to frame->instr_ptr and jumps to the error
 * label; otherwise execution continues with DISPATCH().
 */
#define GOTO_TIER_ONE(TARGET)                                         \
    do {                                                              \
        next_instr = (TARGET);                                        \
        OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); \
        _PyFrame_SetStackPointer(frame, stack_pointer);               \
        Py_CLEAR(tstate->previous_executor);                          \
        stack_pointer = _PyFrame_GetStackPointer(frame);              \
        if (next_instr == NULL) {                                     \
            next_instr = frame->instr_ptr;                            \
            goto error;                                               \
        }                                                             \
        DISPATCH();                                                   \
    } while (0)

/* Fields of the uop located just before next_uop (index -1). */
#define CURRENT_OPARG()    (next_uop[-1].oparg)
#define CURRENT_OPERAND0() (next_uop[-1].operand0)
#define CURRENT_OPERAND1() (next_uop[-1].operand1)
#define CURRENT_TARGET()   (next_uop[-1].target)

/* Jumps to the labels jump_to_jump_target / jump_to_error_target. */
#define JUMP_TO_JUMP_TARGET() goto jump_to_jump_target
#define JUMP_TO_ERROR() goto jump_to_error_target

/* Stackref macros */

/* How much scratch space to give stackref to PyObject* conversion. */
#define MAX_STACKREF_SCRATCH 10

/* Declares two locals: NAME_temp, a scratch array of
 * MAX_STACKREF_SCRATCH+1 PyObject pointers, and NAME, the result of
 * _PyObjectArray_FromStackRefArray(ARGS, ARG_COUNT, NAME_temp + 1).
 * Expands to declarations ending in ';', so it must appear where a
 * declaration is allowed.  Check the result with CONVERSION_FAILED(NAME)
 * and release it with STACKREFS_TO_PYOBJECTS_CLEANUP(NAME). */
#define STACKREFS_TO_PYOBJECTS(ARGS, ARG_COUNT, NAME) \
    /* +1 because vectorcall might use -1 to write self */ \
    PyObject *NAME##_temp[MAX_STACKREF_SCRATCH+1]; \
    PyObject **NAME = _PyObjectArray_FromStackRefArray(ARGS, ARG_COUNT, NAME##_temp + 1);

/* Hands NAME - 1 and the scratch array NAME_temp to _PyObjectArray_Free(). */
#define STACKREFS_TO_PYOBJECTS_CLEANUP(NAME) \
    /* +1 because we +1 previously */ \
    _PyObjectArray_Free(NAME - 1, NAME##_temp);

/* True when NAME is NULL. */
#define CONVERSION_FAILED(NAME) ((NAME) == NULL)