"""Target-specific code generation, parsing, and processing.""" import asyncio import dataclasses import hashlib import json import os import pathlib import re import sys import tempfile import typing import shlex import _llvm import _optimizers import _schema import _stencils import _writer if sys.version_info < (3, 11): raise RuntimeError("Building the JIT compiler requires Python 3.11 or newer!") TOOLS_JIT_BUILD = pathlib.Path(__file__).resolve() TOOLS_JIT = TOOLS_JIT_BUILD.parent TOOLS = TOOLS_JIT.parent CPYTHON = TOOLS.parent EXTERNALS = CPYTHON / "externals" PYTHON_EXECUTOR_CASES_C_H = CPYTHON / "Python" / "executor_cases.c.h" TOOLS_JIT_TEMPLATE_C = TOOLS_JIT / "template.c" ASYNCIO_RUNNER = asyncio.Runner() _S = typing.TypeVar("_S", _schema.COFFSection, _schema.ELFSection, _schema.MachOSection) _R = typing.TypeVar( "_R", _schema.COFFRelocation, _schema.ELFRelocation, _schema.MachORelocation ) @dataclasses.dataclass class _Target(typing.Generic[_S, _R]): triple: str condition: str _: dataclasses.KW_ONLY args: typing.Sequence[str] = () optimizer: type[_optimizers.Optimizer] = _optimizers.Optimizer label_prefix: typing.ClassVar[str] symbol_prefix: typing.ClassVar[str] re_global: typing.ClassVar[re.Pattern[str]] stable: bool = False debug: bool = False verbose: bool = False cflags: str = "" llvm_version: str = _llvm._LLVM_VERSION known_symbols: dict[str, int] = dataclasses.field(default_factory=dict) pyconfig_dir: pathlib.Path = pathlib.Path.cwd().resolve() def _get_nop(self) -> bytes: if re.fullmatch(r"aarch64-.*", self.triple): nop = b"\x1f\x20\x03\xd5" elif re.fullmatch(r"x86_64-.*|i686.*", self.triple): nop = b"\x90" else: raise ValueError(f"NOP not defined for {self.triple}") return nop def _compute_digest(self) -> str: hasher = hashlib.sha256() hasher.update(self.triple.encode()) hasher.update(self.debug.to_bytes()) hasher.update(self.cflags.encode()) # These dependencies are also reflected in _JITSources in regen.targets: hasher.update(PYTHON_EXECUTOR_CASES_C_H.read_bytes()) hasher.update((self.pyconfig_dir / "pyconfig.h").read_bytes()) for dirpath, _, filenames in sorted(os.walk(TOOLS_JIT)): # Exclude cache files from digest computation to ensure reproducible builds. if dirpath.endswith("__pycache__"): continue for filename in sorted(filenames): hasher.update(pathlib.Path(dirpath, filename).read_bytes()) return hasher.hexdigest() async def _parse(self, path: pathlib.Path) -> _stencils.StencilGroup: group = _stencils.StencilGroup() args = ["--disassemble", "--reloc", f"{path}"] output = await _llvm.maybe_run( "llvm-objdump", args, echo=self.verbose, llvm_version=self.llvm_version ) if output is not None: # Make sure that full paths don't leak out (for reproducibility): long, short = str(path), str(path.name) group.code.disassembly.extend( line.expandtabs().strip().replace(long, short) for line in output.splitlines() ) args = [ "--elf-output-style=JSON", "--expand-relocs", # "--pretty-print", "--section-data", "--section-relocations", "--section-symbols", "--sections", f"{path}", ] output = await _llvm.run( "llvm-readobj", args, echo=self.verbose, llvm_version=self.llvm_version ) # --elf-output-style=JSON is only *slightly* broken on Mach-O... output = output.replace("PrivateExtern\n", "\n") output = output.replace("Extern\n", "\n") # ...and also COFF: output = output[output.index("[", 1, None) :] output = output[: output.rindex("]", None, -1) + 1] sections: list[dict[typing.Literal["Section"], _S]] = json.loads(output) for wrapped_section in sections: self._handle_section(wrapped_section["Section"], group) assert group.symbols["_JIT_ENTRY"] == (_stencils.HoleValue.CODE, 0) if group.data.body: line = f"0: {str(bytes(group.data.body)).removeprefix('b')}" group.data.disassembly.append(line) return group def _handle_section(self, section: _S, group: _stencils.StencilGroup) -> None: raise NotImplementedError(type(self)) def _handle_relocation( self, base: int, relocation: _R, raw: bytearray ) -> _stencils.Hole: raise NotImplementedError(type(self)) async def _compile( self, opname: str, c: pathlib.Path, tempdir: pathlib.Path ) -> _stencils.StencilGroup: s = tempdir / f"{opname}.s" o = tempdir / f"{opname}.o" args_s = [ f"--target={self.triple}", "-DPy_BUILD_CORE_MODULE", "-D_DEBUG" if self.debug else "-DNDEBUG", f"-DSUPPORTS_SMALL_CONSTS={1 if self.optimizer.supports_small_constants else 0}", f"-D_JIT_OPCODE={opname}", "-D_PyJIT_ACTIVE", "-D_Py_JIT", f"-I{self.pyconfig_dir}", f"-I{CPYTHON / 'Include'}", f"-I{CPYTHON / 'Include' / 'internal'}", f"-I{CPYTHON / 'Include' / 'internal' / 'mimalloc'}", f"-I{CPYTHON / 'Python'}", f"-I{CPYTHON / 'Tools' / 'jit'}", # -O2 and -O3 include some optimizations that make sense for # standalone functions, but not for snippets of code that are going # to be laid out end-to-end (like ours)... common examples include # passes like tail-duplication, or aligning jump targets with nops. # -Os is equivalent to -O2 with many of these problematic passes # disabled. Based on manual review, for *our* purposes it usually # generates better code than -O2 (and -O2 usually generates better # code than -O3). As a nice benefit, it uses less memory too: "-Os", "-S", # Shorten full absolute file paths in the generated code (like the # __FILE__ macro and assert failure messages) for reproducibility: f"-ffile-prefix-map={CPYTHON}=.", f"-ffile-prefix-map={tempdir}=.", # This debug info isn't necessary, and bloats out the JIT'ed code. # We *may* be able to re-enable this, process it, and JIT it for a # nicer debugging experience... but that needs a lot more research: "-fno-asynchronous-unwind-tables", # Don't call built-in functions that we can't find or patch: "-fno-builtin", # Don't call stack-smashing canaries that we can't find or patch: "-fno-stack-protector", "-std=c11", "-o", f"{s}", f"{c}", *self.args, # Allow user-provided CFLAGS to override any defaults *shlex.split(self.cflags), ] await _llvm.run( "clang", args_s, echo=self.verbose, llvm_version=self.llvm_version ) self.optimizer( s, label_prefix=self.label_prefix, symbol_prefix=self.symbol_prefix, re_global=self.re_global, ).run() args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"] await _llvm.run( "clang", args_o, echo=self.verbose, llvm_version=self.llvm_version ) return await self._parse(o) async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: generated_cases = PYTHON_EXECUTOR_CASES_C_H.read_text() cases_and_opnames = sorted( re.findall( r"\n {8}(case (\w+): \{\n.*?\n {8}\})", generated_cases, flags=re.DOTALL ) ) tasks = [] with tempfile.TemporaryDirectory() as tempdir: work = pathlib.Path(tempdir).resolve() async with asyncio.TaskGroup() as group: coro = self._compile("shim", TOOLS_JIT / "shim.c", work) tasks.append(group.create_task(coro, name="shim")) template = TOOLS_JIT_TEMPLATE_C.read_text() for case, opname in cases_and_opnames: # Write out a copy of the template with *only* this case # inserted. This is about twice as fast as #include'ing all # of executor_cases.c.h each time we compile (since the C # compiler wastes a bunch of time parsing the dead code for # all of the other cases): c = work / f"{opname}.c" c.write_text(template.replace("CASE", case)) coro = self._compile(opname, c, work) tasks.append(group.create_task(coro, name=opname)) stencil_groups = {task.get_name(): task.result() for task in tasks} for stencil_group in stencil_groups.values(): stencil_group.convert_labels_to_relocations() stencil_group.process_relocations(self.known_symbols) return stencil_groups def build( self, *, comment: str = "", force: bool = False, jit_stencils: pathlib.Path, ) -> None: """Build jit_stencils.h in the given directory.""" jit_stencils.parent.mkdir(parents=True, exist_ok=True) if not self.stable: warning = f"JIT support for {self.triple} is still experimental!" request = "Please report any issues you encounter.".center(len(warning)) if self.llvm_version != _llvm._LLVM_VERSION: request = f"Warning! Building with an LLVM version other than {_llvm._LLVM_VERSION} is not supported." outline = "=" * len(warning) print("\n".join(["", outline, warning, request, outline, ""])) digest = f"// {self._compute_digest()}\n" if ( not force and jit_stencils.exists() and jit_stencils.read_text().startswith(digest) ): return stencil_groups = ASYNCIO_RUNNER.run(self._build_stencils()) jit_stencils_new = jit_stencils.parent / "jit_stencils.h.new" try: with jit_stencils_new.open("w") as file: file.write(digest) if comment: file.write(f"// {comment}\n") file.write("\n") for line in _writer.dump(stencil_groups, self.known_symbols): file.write(f"{line}\n") try: jit_stencils_new.replace(jit_stencils) except FileNotFoundError: # another process probably already moved the file if not jit_stencils.is_file(): raise finally: jit_stencils_new.unlink(missing_ok=True) class _COFF( _Target[_schema.COFFSection, _schema.COFFRelocation] ): # pylint: disable = too-few-public-methods def _handle_section( self, section: _schema.COFFSection, group: _stencils.StencilGroup ) -> None: name = section["Name"]["Value"] if name == ".debug$S": # skip debug sections return flags = {flag["Name"] for flag in section["Characteristics"]["Flags"]} if "SectionData" in section: section_data_bytes = section["SectionData"]["Bytes"] else: # Zeroed BSS data, seen with printf debugging calls: section_data_bytes = [0] * section["RawDataSize"] if "IMAGE_SCN_MEM_EXECUTE" in flags: value = _stencils.HoleValue.CODE stencil = group.code elif "IMAGE_SCN_MEM_READ" in flags: value = _stencils.HoleValue.DATA stencil = group.data else: return base = len(stencil.body) group.symbols[section["Number"]] = value, base stencil.body.extend(section_data_bytes) for wrapped_symbol in section["Symbols"]: symbol = wrapped_symbol["Symbol"] offset = base + symbol["Value"] name = symbol["Name"] name = name.removeprefix(self.symbol_prefix) if name not in group.symbols: group.symbols[name] = value, offset for wrapped_relocation in section["Relocations"]: relocation = wrapped_relocation["Relocation"] hole = self._handle_relocation(base, relocation, stencil.body) stencil.holes.append(hole) def _unwrap_dllimport(self, name: str) -> tuple[_stencils.HoleValue, str | None]: if name.startswith("__imp_"): name = name.removeprefix("__imp_") name = name.removeprefix(self.symbol_prefix) return _stencils.HoleValue.GOT, name name = name.removeprefix(self.symbol_prefix) return _stencils.symbol_to_value(name) def _handle_relocation( self, base: int, relocation: _schema.COFFRelocation, raw: bytearray ) -> _stencils.Hole: match relocation: case { "Offset": offset, "Symbol": s, "Type": {"Name": "IMAGE_REL_I386_DIR32" as kind}, }: offset += base value, symbol = self._unwrap_dllimport(s) addend = int.from_bytes(raw[offset : offset + 4], "little") case { "Offset": offset, "Symbol": s, "Type": { "Name": "IMAGE_REL_AMD64_REL32" | "IMAGE_REL_I386_REL32" as kind }, }: offset += base value, symbol = self._unwrap_dllimport(s) addend = ( int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 ) case { "Offset": offset, "Symbol": s, "Type": { "Name": "IMAGE_REL_ARM64_BRANCH19" | "IMAGE_REL_ARM64_BRANCH26" | "IMAGE_REL_ARM64_PAGEBASE_REL21" | "IMAGE_REL_ARM64_PAGEOFFSET_12A" | "IMAGE_REL_ARM64_PAGEOFFSET_12L" as kind }, }: offset += base value, symbol = self._unwrap_dllimport(s) addend = 0 case _: raise NotImplementedError(relocation) return _stencils.Hole(offset, kind, value, symbol, addend) class _COFF32(_COFF): # These mangle like Mach-O and other "older" formats: label_prefix = "L" symbol_prefix = "_" re_global = re.compile(r'\s*\.def\s+(?P