gh-104584: Emit macro expansions to opcode_metadata.h (#106163)

This produces longer traces (superblocks?).
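
Why this lengthens traces: once every component of a macro instruction is itself a viable uop, the generator emits an expansion entry for the macro, so the trace builder no longer has to give up when it reaches one. Below is a rough sketch (not the generator's actual code) of how a macro's parts could be flattened into the (uop, cache size, cache offset) triples seen in the table further down; the CacheEntry/Part classes and the expansion_triples() helper are made-up stand-ins, and only the triple layout is taken from the diff.

    from dataclasses import dataclass

    # Hypothetical stand-ins for the generator's types, for illustration only.
    @dataclass
    class CacheEntry:
        name: str      # "unused" entries reserve space but are never read
        size: int      # size in code units

    @dataclass
    class Part:
        uop_name: str
        caches: list[CacheEntry]
        uses_oparg: bool

    def expansion_triples(parts: list[Part]) -> list[tuple[str, int, int]]:
        """Flatten a macro's components into (uop, cache size, cache offset) triples,
        matching the shape of the _PyOpcode_macro_expansion entries below."""
        offset = 0
        triples: list[tuple[str, int, int]] = []
        for part in parts:
            used = [(c, offset + sum(p.size for p in part.caches[:i]))
                    for i, c in enumerate(part.caches) if c.name != "unused"]
            if part.uses_oparg or not used:
                size, off = 0, 0          # operand comes from oparg, or none is needed
            else:
                assert len(used) == 1     # a viable uop has at most one used cache entry
                size, off = used[0][0].size, used[0][1]
            triples.append((part.uop_name, size, off))
            offset += sum(c.size for c in part.caches)
        return triples

    # Example: a BINARY_OP_ADD_INT-style macro whose only cache entry is unused.
    parts = [Part("_GUARD_BOTH_INT", [CacheEntry("unused", 1)], False),
             Part("_BINARY_OP_ADD_INT", [], False)]
    print(expansion_triples(parts))
    # [('_GUARD_BOTH_INT', 0, 0), ('_BINARY_OP_ADD_INT', 0, 0)]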

Also improved debug output (uop names are now printed instead of numeric opcodes). This would be simpler if the numeric opcode values were generated by generate_cases.py, but that's another project.
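
The uop name table and the numeric uop #defines now come from a single enumeration in the generator: write_uop_items() takes a formatting callback, so one pass emits "#define EXIT_TRACE 300" and friends, and a second pass emits the _PyOpcode_uop_name[512] initializers that the LLTRACE output indexes into. A minimal sketch of that callback pattern, with a shortened, hard-coded UOPS list standing in for the real scan over viable "op" instructions:

    import typing

    UOPS = ["EXIT_TRACE", "SET_IP", "_GUARD_BOTH_INT", "_BINARY_OP_ADD_INT"]  # abbreviated

    def write_uop_items(emit: typing.Callable[[str], None],
                        make_text: typing.Callable[[str, int], str]) -> None:
        # Numbering starts at 300, well clear of the 0..255 opcode range
        # (the TODO in the diff notes it may still collide with pseudo instructions).
        counter = 300
        for name in UOPS:
            emit(make_text(name, counter))
            counter += 1

    # First pass: the numeric definitions.
    write_uop_items(print, lambda name, counter: f"#define {name} {counter}")
    # Second pass: the debug name table, indexed by those same numbers.
    print("const char * const _PyOpcode_uop_name[512] = {")
    write_uop_items(print, lambda name, counter: f'    [{counter}] = "{name}",')
    print("};")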

Refactored some code in generate_cases.py so the essential cache-effect algorithm runs only once: deciding which cache effects are actually used, and what the total cache size is regardless of which entries are used.
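
Concretely, Instruction.__init__ now makes one pass over the cache effects, recording each used effect with its offset (ActiveCacheEffect) and the total cache size; the instruction format string, the uop-viability check, and the cache reads in write_body() are all derived from that result instead of re-walking cache_effects. A rough sketch of that single pass with simplified stand-in types (the example layout at the end is hypothetical):

    from dataclasses import dataclass

    @dataclass
    class CacheEffect:
        name: str    # "unused" entries occupy cache space but carry no value
        size: int    # size in code units

    @dataclass
    class ActiveCacheEffect:
        effect: CacheEffect
        offset: int  # position within the instruction's cache, in code units

    def analyze_caches(cache_effects: list[CacheEffect], has_arg: bool):
        """One pass: collect the used cache effects with their offsets and the total
        cache size, then derive the instruction format string from the total."""
        active: list[ActiveCacheEffect] = []
        offset = 0
        for effect in cache_effects:
            if effect.name != "unused":
                active.append(ActiveCacheEffect(effect, offset))
            offset += effect.size
        fmt = "IB" if has_arg else "IX"
        if offset:
            fmt += "C" + "0" * (offset - 1)
        return active, offset, fmt

    # Hypothetical layout: a 1-unit counter followed by 8 unused units.
    effects = [CacheEffect("counter", 1), CacheEffect("unused", 8)]
    print(analyze_caches(effects, has_arg=True))
    # ([ActiveCacheEffect(effect=CacheEffect(name='counter', size=1), offset=0)], 9, 'IBC00000000')
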
Guido van Rossum 2023-06-28 11:28:07 -07:00 committed by GitHub
parent c283a0cff5
commit 11731434df
4 changed files with 153 additions and 78 deletions


@@ -2817,10 +2817,10 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
oparg = (int)operand;
#ifdef LLTRACE
if (lltrace >= 3) {
const char *opname = opcode < 256 ? _PyOpcode_OpName[opcode] : "";
const char *opname = opcode < 256 ? _PyOpcode_OpName[opcode] : _PyOpcode_uop_name[opcode];
int stack_level = (int)(stack_pointer - _PyFrame_Stackbase(frame));
fprintf(stderr, " uop %s %d, operand %" PRIu64 ", stack_level %d\n",
opname, opcode, operand, stack_level);
fprintf(stderr, " uop %s, operand %" PRIu64 ", stack_level %d\n",
opname, operand, stack_level);
}
#endif
pc++;


@@ -913,6 +913,9 @@ struct opcode_macro_expansion {
#ifndef NEED_OPCODE_METADATA
extern const struct opcode_metadata _PyOpcode_opcode_metadata[512];
extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256];
#ifdef Py_DEBUG
extern const char * const _PyOpcode_uop_name[512];
#endif
#else
const struct opcode_metadata _PyOpcode_opcode_metadata[512] = {
[NOP] = { true, INSTR_FMT_IX, 0 },
@@ -1131,10 +1134,18 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = {
[STORE_FAST] = { .nuops = 1, .uops = { { STORE_FAST, 0, 0 } } },
[POP_TOP] = { .nuops = 1, .uops = { { POP_TOP, 0, 0 } } },
[PUSH_NULL] = { .nuops = 1, .uops = { { PUSH_NULL, 0, 0 } } },
[END_FOR] = { .nuops = 2, .uops = { { POP_TOP, 0, 0 }, { POP_TOP, 0, 0 } } },
[END_SEND] = { .nuops = 1, .uops = { { END_SEND, 0, 0 } } },
[UNARY_NEGATIVE] = { .nuops = 1, .uops = { { UNARY_NEGATIVE, 0, 0 } } },
[UNARY_NOT] = { .nuops = 1, .uops = { { UNARY_NOT, 0, 0 } } },
[UNARY_INVERT] = { .nuops = 1, .uops = { { UNARY_INVERT, 0, 0 } } },
[BINARY_OP_MULTIPLY_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_MULTIPLY_INT, 0, 0 } } },
[BINARY_OP_ADD_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_ADD_INT, 0, 0 } } },
[BINARY_OP_SUBTRACT_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_SUBTRACT_INT, 0, 0 } } },
[BINARY_OP_MULTIPLY_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_MULTIPLY_FLOAT, 0, 0 } } },
[BINARY_OP_ADD_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_ADD_FLOAT, 0, 0 } } },
[BINARY_OP_SUBTRACT_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_SUBTRACT_FLOAT, 0, 0 } } },
[BINARY_OP_ADD_UNICODE] = { .nuops = 2, .uops = { { _GUARD_BOTH_UNICODE, 0, 0 }, { _BINARY_OP_ADD_UNICODE, 0, 0 } } },
[BINARY_SLICE] = { .nuops = 1, .uops = { { BINARY_SLICE, 0, 0 } } },
[STORE_SLICE] = { .nuops = 1, .uops = { { STORE_SLICE, 0, 0 } } },
[BINARY_SUBSCR_LIST_INT] = { .nuops = 1, .uops = { { BINARY_SUBSCR_LIST_INT, 0, 0 } } },
@@ -1162,6 +1173,9 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = {
[DELETE_ATTR] = { .nuops = 1, .uops = { { DELETE_ATTR, 0, 0 } } },
[STORE_GLOBAL] = { .nuops = 1, .uops = { { STORE_GLOBAL, 0, 0 } } },
[DELETE_GLOBAL] = { .nuops = 1, .uops = { { DELETE_GLOBAL, 0, 0 } } },
[LOAD_LOCALS] = { .nuops = 1, .uops = { { _LOAD_LOCALS, 0, 0 } } },
[LOAD_NAME] = { .nuops = 2, .uops = { { _LOAD_LOCALS, 0, 0 }, { _LOAD_FROM_DICT_OR_GLOBALS, 0, 0 } } },
[LOAD_FROM_DICT_OR_GLOBALS] = { .nuops = 1, .uops = { { _LOAD_FROM_DICT_OR_GLOBALS, 0, 0 } } },
[DELETE_DEREF] = { .nuops = 1, .uops = { { DELETE_DEREF, 0, 0 } } },
[LOAD_FROM_DICT_OR_DEREF] = { .nuops = 1, .uops = { { LOAD_FROM_DICT_OR_DEREF, 0, 0 } } },
[LOAD_DEREF] = { .nuops = 1, .uops = { { LOAD_DEREF, 0, 0 } } },
@@ -1207,4 +1221,22 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = {
[COPY] = { .nuops = 1, .uops = { { COPY, 0, 0 } } },
[SWAP] = { .nuops = 1, .uops = { { SWAP, 0, 0 } } },
};
#ifdef Py_DEBUG
const char * const _PyOpcode_uop_name[512] = {
[300] = "EXIT_TRACE",
[301] = "SET_IP",
[302] = "_GUARD_BOTH_INT",
[303] = "_BINARY_OP_MULTIPLY_INT",
[304] = "_BINARY_OP_ADD_INT",
[305] = "_BINARY_OP_SUBTRACT_INT",
[306] = "_GUARD_BOTH_FLOAT",
[307] = "_BINARY_OP_MULTIPLY_FLOAT",
[308] = "_BINARY_OP_ADD_FLOAT",
[309] = "_BINARY_OP_SUBTRACT_FLOAT",
[310] = "_GUARD_BOTH_UNICODE",
[311] = "_BINARY_OP_ADD_UNICODE",
[312] = "_LOAD_LOCALS",
[313] = "_LOAD_FROM_DICT_OR_GLOBALS",
};
#endif
#endif


@@ -325,8 +325,8 @@ translate_bytecode_to_trace(
}
#define ADD_TO_TRACE(OPCODE, OPERAND) \
if (lltrace >= 2) { \
const char *opname = (OPCODE) < 256 ? _PyOpcode_OpName[(OPCODE)] : ""; \
fprintf(stderr, " ADD_TO_TRACE(%s %d, %" PRIu64 ")\n", opname, (OPCODE), (uint64_t)(OPERAND)); \
const char *opname = (OPCODE) < 256 ? _PyOpcode_OpName[(OPCODE)] : _PyOpcode_uop_name[(OPCODE)]; \
fprintf(stderr, " ADD_TO_TRACE(%s, %" PRIu64 ")\n", opname, (uint64_t)(OPERAND)); \
} \
trace[trace_length].opcode = (OPCODE); \
trace[trace_length].operand = (OPERAND); \
@@ -474,6 +474,8 @@ PyUnstable_Optimizer_NewUOpOptimizer(void)
}
opt->optimize = uop_optimize;
opt->resume_threshold = UINT16_MAX;
opt->backedge_threshold = 0;
// Need at least 3 iterations to settle specializations.
// A few lower bits of the counter are reserved for other flags.
opt->backedge_threshold = 3 << OPTIMIZER_BITS_IN_COUNTER;
return (PyObject *)opt;
}


@@ -300,6 +300,13 @@ def emit_macros(cls, out: Formatter):
f"(_PyOpcode_opcode_metadata[(OP)].flags & ({name}))")
@dataclasses.dataclass
class ActiveCacheEffect:
"""Wraps a CacheEffect that is actually used, in context."""
effect: parser.CacheEffect
offset: int
FORBIDDEN_NAMES_IN_UOPS = (
"resume_with_error", # Proxy for "goto", which isn't an IDENTIFIER
"unbound_local_error",
@@ -344,6 +351,7 @@ class Instruction:
unmoved_names: frozenset[str]
instr_fmt: str
instr_flags: InstructionFlags
active_caches: list[ActiveCacheEffect]
# Set later
family: parser.Family | None = None
@@ -375,15 +383,19 @@ def __init__(self, inst: parser.InstDef):
self.instr_flags = InstructionFlags.fromInstruction(inst)
self.active_caches = []
offset = 0
for effect in self.cache_effects:
if effect.name != UNUSED:
self.active_caches.append(ActiveCacheEffect(effect, offset))
offset += effect.size
if self.instr_flags.HAS_ARG_FLAG:
fmt = "IB"
else:
fmt = "IX"
cache = "C"
for ce in self.cache_effects:
for _ in range(ce.size):
fmt += cache
cache = "0"
if offset:
fmt += "C" + "0"*(offset-1)
self.instr_fmt = fmt
def is_viable_uop(self) -> bool:
@@ -392,18 +404,11 @@ def is_viable_uop(self) -> bool:
return False
if self.instr_flags.HAS_ARG_FLAG:
# If the instruction uses oparg, it cannot use any caches
for c in self.cache_effects:
if c.name != UNUSED:
if self.active_caches:
return False
else:
# If it doesn't use oparg, it can have one cache entry
caches: list[parser.CacheEffect] = []
cache_offset = 0
for c in self.cache_effects:
if c.name != UNUSED:
caches.append(c)
cache_offset += c.size
if len(caches) > 1:
if len(self.active_caches) > 1:
return False
for forbidden in FORBIDDEN_NAMES_IN_UOPS:
# TODO: Don't check in '#ifdef ENABLE_SPECIALIZATION' regions
@@ -458,7 +463,7 @@ def write(self, out: Formatter, tier: Tiers = TIER_ONE) -> None:
# out.emit(f"next_instr += OPSIZE({self.inst.name}) - 1;")
self.write_body(out, 0, tier=tier)
self.write_body(out, 0, self.active_caches, tier=tier)
# Skip the rest if the block always exits
if self.always_exits:
@@ -492,14 +497,13 @@ def write_body(
self,
out: Formatter,
dedent: int,
cache_adjust: int = 0,
active_caches: list[ActiveCacheEffect],
tier: Tiers = TIER_ONE,
) -> None:
"""Write the instruction body."""
# Write cache effect variable declarations and initializations
cache_offset = cache_adjust
for ceffect in self.cache_effects:
if ceffect.name != UNUSED:
for active in active_caches:
ceffect = active.effect
bits = ceffect.size * BITS_PER_CODE_UNIT
if bits == 64:
# NOTE: We assume that 64-bit data in the cache
@@ -513,12 +517,10 @@ def write_body(
func = f"read_u{bits}"
if tier == TIER_ONE:
out.emit(
f"{typ}{ceffect.name} = {func}(&next_instr[{cache_offset}].cache);"
f"{typ}{ceffect.name} = {func}(&next_instr[{active.offset}].cache);"
)
else:
out.emit(f"{typ}{ceffect.name} = operand;")
cache_offset += ceffect.size
assert cache_offset == self.cache_offset + cache_adjust
# Write the body, substituting a goto for ERROR_IF() and other stuff
assert dedent <= 0
@@ -583,8 +585,9 @@ class Component:
instr: Instruction
input_mapping: StackEffectMapping
output_mapping: StackEffectMapping
active_caches: list[ActiveCacheEffect]
def write_body(self, out: Formatter, cache_adjust: int) -> None:
def write_body(self, out: Formatter) -> None:
with out.block(""):
input_names = {ieffect.name for _, ieffect in self.input_mapping}
for var, ieffect in self.input_mapping:
@@ -593,7 +596,7 @@ def write_body(self, out: Formatter, cache_adjust: int) -> None:
if oeffect.name not in input_names:
out.declare(oeffect, None)
self.instr.write_body(out, dedent=-4, cache_adjust=cache_adjust)
self.instr.write_body(out, -4, self.active_caches)
for var, oeffect in self.output_mapping:
out.assign(var, oeffect)
@@ -611,6 +614,7 @@ class MacroInstruction:
instr_flags: InstructionFlags
macro: parser.Macro
parts: list[Component | parser.CacheEffect]
cache_offset: int
predicted: bool = False
@@ -873,11 +877,11 @@ def effect_counts(self, name: str) -> tuple[int, int, int]:
cache = instr.cache_offset
input = len(instr.input_effects)
output = len(instr.output_effects)
elif macro := self.macro_instrs.get(name):
cache, input, output = 0, 0, 0
for part in macro.parts:
elif mac := self.macro_instrs.get(name):
cache = mac.cache_offset
input, output = 0, 0
for part in mac.parts:
if isinstance(part, Component):
cache += part.instr.cache_offset
# A component may pop what the previous component pushed,
# so we offset the input/output counts by that.
delta_i = len(part.instr.input_effects)
@@ -885,9 +889,6 @@ def effect_counts(self, name: str) -> tuple[int, int, int]:
offset = min(delta_i, output)
input += delta_i - offset
output += delta_o - offset
else:
assert isinstance(part, parser.CacheEffect), part
cache += part.size
else:
assert False, f"Unknown instruction {name!r}"
return cache, input, output
@@ -906,29 +907,25 @@ def analyze_macro(self, macro: parser.Macro) -> MacroInstruction:
stack, initial_sp = self.stack_analysis(components)
sp = initial_sp
parts: list[Component | parser.CacheEffect] = []
format = "IB"
flags = InstructionFlags.newEmpty()
cache = "C"
offset = 0
for component in components:
match component:
case parser.CacheEffect() as ceffect:
parts.append(ceffect)
for _ in range(ceffect.size):
format += cache
cache = "0"
offset += ceffect.size
case Instruction() as instr:
part, sp = self.analyze_instruction(instr, stack, sp)
part, sp, offset = self.analyze_instruction(instr, stack, sp, offset)
parts.append(part)
for ce in instr.cache_effects:
for _ in range(ce.size):
format += cache
cache = "0"
flags.add(instr.instr_flags)
case _:
typing.assert_never(component)
final_sp = sp
format = "IB"
if offset:
format += "C" + "0"*(offset-1)
return MacroInstruction(
macro.name, stack, initial_sp, final_sp, format, flags, macro, parts
macro.name, stack, initial_sp, final_sp, format, flags, macro, parts, offset
)
def analyze_pseudo(self, pseudo: parser.Pseudo) -> PseudoInstruction:
@@ -941,8 +938,8 @@ def analyze_pseudo(self, pseudo: parser.Pseudo) -> PseudoInstruction:
return PseudoInstruction(pseudo.name, targets, fmts[0], targets[0].instr_flags)
def analyze_instruction(
self, instr: Instruction, stack: list[StackEffect], sp: int
) -> tuple[Component, int]:
self, instr: Instruction, stack: list[StackEffect], sp: int, offset: int
) -> tuple[Component, int, int]:
input_mapping: StackEffectMapping = []
for ieffect in reversed(instr.input_effects):
sp -= 1
@@ -951,7 +948,12 @@ def analyze_instruction(
for oeffect in instr.output_effects:
output_mapping.append((stack[sp], oeffect))
sp += 1
return Component(instr, input_mapping, output_mapping), sp
active_effects: list[ActiveCacheEffect] = []
for ceffect in instr.cache_effects:
if ceffect.name != UNUSED:
active_effects.append(ActiveCacheEffect(ceffect, offset))
offset += ceffect.size
return Component(instr, input_mapping, output_mapping, active_effects), sp, offset
def check_macro_components(
self, macro: parser.Macro
@@ -1030,7 +1032,7 @@ def stack_analysis(
def get_stack_effect_info(
self, thing: parser.InstDef | parser.Macro | parser.Pseudo
) -> tuple[AnyInstruction | None, str, str]:
) -> tuple[AnyInstruction | None, str | None, str | None]:
def effect_str(effects: list[StackEffect]) -> str:
n_effect, sym_effect = list_effect_size(effects)
if sym_effect:
@@ -1108,6 +1110,7 @@ def write_stack_effect_functions(self) -> None:
continue
instr, popped, pushed = self.get_stack_effect_info(thing)
if instr is not None:
assert popped is not None and pushed is not None
popped_data.append((instr, popped))
pushed_data.append((instr, pushed))
@@ -1182,7 +1185,8 @@ def write_metadata(self) -> None:
self.write_pseudo_instrs()
self.write_uop_defines()
self.out.emit("")
self.write_uop_items(lambda name, counter: f"#define {name} {counter}")
self.write_stack_effect_functions()
@@ -1213,6 +1217,9 @@ def write_metadata(self) -> None:
self.out.emit("#ifndef NEED_OPCODE_METADATA")
self.out.emit("extern const struct opcode_metadata _PyOpcode_opcode_metadata[512];")
self.out.emit("extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256];")
self.out.emit("#ifdef Py_DEBUG")
self.out.emit("extern const char * const _PyOpcode_uop_name[512];")
self.out.emit("#endif")
self.out.emit("#else")
self.out.emit("const struct opcode_metadata _PyOpcode_opcode_metadata[512] = {")
@@ -1246,19 +1253,27 @@ def write_metadata(self) -> None:
pass
case parser.InstDef(name=name):
instr = self.instrs[name]
# Since an 'op' is not a bytecode, it has no expansion
if instr.kind != "op" and instr.is_viable_uop():
# Double check there aren't any used cache effects.
# If this fails, see write_macro_expansions().
assert not instr.active_caches, (instr.name, instr.cache_effects)
self.out.emit(
f"[{name}] = "
f"{{ .nuops = 1, .uops = {{ {{ {name}, 0, 0 }} }} }},"
)
case parser.Macro():
# TODO: emit expansion if all parts are viable uops
pass
self.write_macro_expansions(self.macro_instrs[thing.name])
case parser.Pseudo():
pass
case _:
typing.assert_never(thing)
self.out.emit("#ifdef Py_DEBUG")
with self.out.block("const char * const _PyOpcode_uop_name[512] =", ";"):
self.write_uop_items(lambda name, counter: f"[{counter}] = \"{name}\",")
self.out.emit("#endif")
self.out.emit("#endif")
with open(self.pymetadata_filename, "w") as f:
@@ -1300,13 +1315,12 @@ def write_pseudo_instrs(self) -> None:
self.out.emit(f" ((OP) == {op}) || \\")
self.out.emit(f" 0")
def write_uop_defines(self) -> None:
def write_uop_items(self, make_text: typing.Callable[[str, int], str]) -> None:
"""Write '#define XXX NNN' for each uop"""
self.out.emit("")
counter = 300
counter = 300 # TODO: Avoid collision with pseudo instructions
def add(name: str) -> None:
nonlocal counter
self.out.emit(f"#define {name} {counter}")
self.out.emit(make_text(name, counter))
counter += 1
add("EXIT_TRACE")
add("SET_IP")
@@ -1314,6 +1328,32 @@ def add(name: str) -> None:
if instr.kind == "op" and instr.is_viable_uop():
add(instr.name)
def write_macro_expansions(self, mac: MacroInstruction) -> None:
"""Write the macro expansions for a macro-instruction."""
# TODO: Refactor to share code with write_body(), is_viable_uop(), etc.
offset = 0 # Cache effect offset
expansions: list[tuple[str, int, int]] = [] # [(name, size, offset), ...]
for part in mac.parts:
if isinstance(part, Component):
# All component instructions must be viable uops
if not part.instr.is_viable_uop():
print(f"NOTE: Part {part.instr.name} of {mac.name} is not a viable uop")
return
if part.instr.instr_flags.HAS_ARG_FLAG or not part.active_caches:
size, offset = 0, 0
else:
# If this assert triggers, is_viable_uop() lied
assert len(part.active_caches) == 1, (mac.name, part.instr.name)
cache = part.active_caches[0]
size, offset = cache.effect.size, cache.offset
expansions.append((part.instr.name, size, offset))
assert len(expansions) > 0, f"Macro {mac.name} has empty expansion?!"
pieces = [f"{{ {name}, {size}, {offset} }}" for name, size, offset in expansions]
self.out.emit(
f"[{mac.name}] = "
f"{{ .nuops = {len(expansions)}, .uops = {{ {', '.join(pieces)} }} }},"
)
def emit_metadata_entry(
self, name: str, fmt: str, flags: InstructionFlags
) -> None:
@@ -1379,6 +1419,7 @@ def write_executor_instructions(self) -> None:
for thing in self.everything:
match thing:
case OverriddenInstructionPlaceHolder():
# TODO: Is this helpful?
self.write_overridden_instr_place_holder(thing)
case parser.InstDef():
instr = self.instrs[thing.name]
@@ -1388,7 +1429,7 @@ def write_executor_instructions(self) -> None:
instr.write(self.out, tier=TIER_TWO)
self.out.emit("break;")
case parser.Macro():
pass # TODO
pass
case parser.Pseudo():
pass
case _:
@@ -1429,7 +1470,7 @@ def write_macro(self, mac: MacroInstruction) -> None:
cache_adjust += size
case Component() as comp:
last_instr = comp.instr
comp.write_body(self.out, cache_adjust)
comp.write_body(self.out)
cache_adjust += comp.instr.cache_offset
if cache_adjust: