GH-139109: Partial reworking of JIT data structures (GH-144105)

* Halve size of buffers by reusing combined trace + optimizer buffers for TOS caching
* Add simple buffer struct for more maintainable handling of buffers
* Decouple JIT structs from thread state struct
* Ensure terminator is added to trace, when optimizer gives up
This commit is contained in:
Mark Shannon 2026-01-22 10:55:49 +00:00 committed by GitHub
parent fb690c38ca
commit d77aaa7311
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 228 additions and 215 deletions

View file

@ -203,14 +203,14 @@ static inline void
add_op(JitOptContext *ctx, _PyUOpInstruction *this_instr,
       uint16_t opcode, uint16_t oparg, uintptr_t operand0)
{
    /* Append a new uop to the optimizer's output buffer.
     *
     * The emitted instruction uses the given opcode/oparg/operand0, and
     * inherits format, target and operand1 from `this_instr` (the source
     * instruction currently being optimized), so debugging/side-exit
     * metadata is preserved.
     *
     * NOTE(review): no bounds check against the buffer end is visible
     * here — presumably callers guarantee room for at least one uop;
     * confirm against the buffer's capacity invariants.
     */
    _PyUOpInstruction *out = ctx->out_buffer.next;
    out->opcode = (opcode);
    out->format = this_instr->format;
    out->oparg = (oparg);
    out->target = this_instr->target;
    out->operand0 = (operand0);
    out->operand1 = this_instr->operand1;
    // Advance the write cursor past the instruction just emitted.
    ctx->out_buffer.next++;
}
/* Shortened forms for convenience, used in optimizer_bytecodes.c */
@ -430,6 +430,7 @@ optimize_uops(
_PyUOpInstruction *trace,
int trace_len,
int curr_stacklen,
_PyUOpInstruction *output,
_PyBloomFilter *dependencies
)
{
@ -440,7 +441,7 @@ optimize_uops(
JitOptContext *ctx = &tstate->jit_tracer_state->opt_context;
uint32_t opcode = UINT16_MAX;
ctx->out_buffer = tstate->jit_tracer_state->out_buffer;
uop_buffer_init(&ctx->out_buffer, output, UOP_MAX_TRACE_LENGTH);
// Make sure that watchers are set up
PyInterpreterState *interp = _PyInterpreterState_GET();
@ -458,14 +459,20 @@ optimize_uops(
ctx->curr_frame_depth++;
ctx->frame = frame;
ctx->out_len = 0;
_PyUOpInstruction *this_instr = NULL;
JitOptRef *stack_pointer = ctx->frame->stack_pointer;
for (int i = 0; !ctx->done; i++) {
assert(i < trace_len);
for (int i = 0; i < trace_len; i++) {
this_instr = &trace[i];
if (ctx->done) {
// Don't do any more optimization, but
// we still need to reach a terminator for correctness.
*(ctx->out_buffer.next++) = *this_instr;
if (is_terminator_uop(this_instr)) {
break;
}
continue;
}
int oparg = this_instr->oparg;
opcode = this_instr->opcode;
@ -485,6 +492,8 @@ optimize_uops(
}
#endif
_PyUOpInstruction *out_ptr = ctx->out_buffer.next;
switch (opcode) {
#include "optimizer_cases.c.h"
@ -494,8 +503,8 @@ optimize_uops(
Py_UNREACHABLE();
}
// If no ADD_OP was called during this iteration, copy the original instruction
if (ctx->out_len == i) {
ctx->out_buffer[ctx->out_len++] = *this_instr;
if (ctx->out_buffer.next == out_ptr) {
*(ctx->out_buffer.next++) = *this_instr;
}
assert(ctx->frame != NULL);
if (!CURRENT_FRAME_IS_INIT_SHIM()) {
@ -526,20 +535,11 @@ optimize_uops(
* would be no benefit in retrying later */
_Py_uop_abstractcontext_fini(ctx);
// Check that the trace ends with a proper terminator
if (ctx->out_len > 0) {
_PyUOpInstruction *last_uop = &ctx->out_buffer[ctx->out_len - 1];
if (!is_terminator_uop(last_uop)) {
// Copy remaining uops from original trace until we find a terminator
for (int i = ctx->out_len; i < trace_len; i++) {
ctx->out_buffer[ctx->out_len++] = trace[i];
if (is_terminator_uop(&trace[i])) {
break;
}
}
}
if (uop_buffer_length(&ctx->out_buffer) > 0) {
assert(is_terminator_uop(uop_buffer_last(&ctx->out_buffer)));
}
return ctx->out_len;
return uop_buffer_length(&ctx->out_buffer);
error:
DPRINTF(3, "\n");
@ -696,14 +696,15 @@ _Py_uop_analyze_and_optimize(
_PyUOpInstruction *buffer,
int length,
int curr_stacklen,
_PyUOpInstruction *output,
_PyBloomFilter *dependencies
)
{
OPT_STAT_INC(optimizer_attempts);
length = optimize_uops(
tstate, buffer,
length, curr_stacklen, dependencies);
tstate, buffer, length, curr_stacklen,
output, dependencies);
if (length == 0) {
return length;
@ -711,7 +712,7 @@ _Py_uop_analyze_and_optimize(
assert(length > 0);
length = remove_unneeded_uops(tstate->jit_tracer_state->out_buffer, length);
length = remove_unneeded_uops(output, length);
assert(length > 0);
OPT_STAT_INC(optimizer_successes);