diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index ee8eb1095fe541..f9507fda1606db 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -121,18 +121,11 @@ _PyEval_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwfl } #ifdef _Py_TIER2 -#ifdef _Py_JIT -_Py_CODEUNIT *_Py_LazyJitShim( - struct _PyExecutorObject *current_executor, _PyInterpreterFrame *frame, - _PyStackRef *stack_pointer, PyThreadState *tstate -); -#else _Py_CODEUNIT *_PyTier2Interpreter( struct _PyExecutorObject *current_executor, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate ); #endif -#endif extern _PyJitEntryFuncPtr _Py_jit_entry; diff --git a/Include/internal/pycore_jit.h b/Include/internal/pycore_jit.h index 70bccce4166c18..b3cadcce8247d0 100644 --- a/Include/internal/pycore_jit.h +++ b/Include/internal/pycore_jit.h @@ -23,9 +23,13 @@ typedef _Py_CODEUNIT *(*jit_func)( _PyStackRef _tos_cache0, _PyStackRef _tos_cache1, _PyStackRef _tos_cache2 ); +_Py_CODEUNIT *_PyJIT( + _PyExecutorObject *executor, _PyInterpreterFrame *frame, + _PyStackRef *stack_pointer, PyThreadState *tstate +); + int _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size_t length); void _PyJIT_Free(_PyExecutorObject *executor); -void _PyJIT_Fini(void); PyAPI_FUNC(int) _PyJIT_AddressInJitCode(PyInterpreterState *interp, uintptr_t addr); #endif // _Py_JIT diff --git a/Makefile.pre.in b/Makefile.pre.in index f869c1f7c93776..c0f6656ea6fc27 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -290,6 +290,7 @@ LDLIBRARYDIR= @LDLIBRARYDIR@ INSTSONAME= @INSTSONAME@ LIBRARY_DEPS= @LIBRARY_DEPS@ LINK_PYTHON_DEPS=@LINK_PYTHON_DEPS@ +JIT_OBJS= @JIT_SHIM_O@ PY_ENABLE_SHARED= @PY_ENABLE_SHARED@ STATIC_LIBPYTHON= @STATIC_LIBPYTHON@ @@ -469,6 +470,7 @@ PYTHON_OBJS= \ Python/instruction_sequence.o \ Python/intrinsics.o \ Python/jit.o \ + $(JIT_OBJS) \ Python/legacy_tracing.o \ Python/lock.o \ Python/marshal.o \ @@ -3203,21 +3205,37 @@ Python/emscripten_trampoline_inner.wasm: $(srcdir)/Python/emscripten_trampoline_ Python/emscripten_trampoline_wasm.c: Python/emscripten_trampoline_inner.wasm $(PYTHON_FOR_REGEN) $(srcdir)/Platforms/emscripten/prepare_external_wasm.py $< $@ getWasmTrampolineModule +JIT_SHIM_BUILD_OBJS= @JIT_SHIM_BUILD_O@ +JIT_BUILD_TARGETS= jit_stencils.h @JIT_STENCILS_H@ $(JIT_SHIM_BUILD_OBJS) +JIT_TARGETS= $(JIT_BUILD_TARGETS) $(filter-out $(JIT_SHIM_BUILD_OBJS),$(JIT_OBJS)) +JIT_GENERATED_STAMP= .jit-stamp + JIT_DEPS = \ $(srcdir)/Tools/jit/*.c \ + $(srcdir)/Tools/jit/*.h \ $(srcdir)/Tools/jit/*.py \ $(srcdir)/Python/executor_cases.c.h \ pyconfig.h -jit_stencils.h @JIT_STENCILS_H@: $(JIT_DEPS) +$(JIT_GENERATED_STAMP): $(JIT_DEPS) @REGEN_JIT_COMMAND@ + @touch $@ + +$(JIT_BUILD_TARGETS): $(JIT_GENERATED_STAMP) + @if test ! -f "$@"; then \ + rm -f $(JIT_GENERATED_STAMP); \ + $(MAKE) $(JIT_GENERATED_STAMP); \ + test -f "$@"; \ + fi + +jit_shim-universal2-apple-darwin.o: jit_shim-aarch64-apple-darwin.o jit_shim-x86_64-apple-darwin.o + lipo -create -output $@ jit_shim-aarch64-apple-darwin.o jit_shim-x86_64-apple-darwin.o Python/jit.o: $(srcdir)/Python/jit.c @JIT_STENCILS_H@ $(CC) -c $(PY_CORE_CFLAGS) -o $@ $< .PHONY: regen-jit -regen-jit: - @REGEN_JIT_COMMAND@ +regen-jit: $(JIT_TARGETS) # Some make's put the object file in the current directory .c.o: @@ -3341,7 +3359,7 @@ clean-profile: clean-retain-profile clean-bolt # gh-141808: The JIT stencils are deliberately kept in clean-profile .PHONY: clean-jit-stencils clean-jit-stencils: - -rm -f jit_stencils*.h + -rm -f $(JIT_TARGETS) $(JIT_GENERATED_STAMP) jit_stencils*.h jit_shim*.o .PHONY: clean clean: clean-profile clean-jit-stencils diff --git a/PCbuild/pyproject.props b/PCbuild/pyproject.props index 94ae718d58c4ba..f79608e1d58dbc 100644 --- a/PCbuild/pyproject.props +++ b/PCbuild/pyproject.props @@ -12,8 +12,9 @@ $(IntDir.Replace(`\\`, `\`)) $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)_frozen\ $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)$(ArchName)_$(Configuration)\zlib-ng\ - $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)_$(Configuration) - $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)_PGInstrument + $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)_$(Configuration)\ + $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)_PGInstrument\ + $(GeneratedJitStencilsDir.Replace(`\\`, `\`)) $(ProjectName) $(TargetName)$(PyDebugExt) false diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 61bee29c0af3d6..5bae504b474665 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -115,6 +115,9 @@ version.lib;ws2_32.lib;pathcch.lib;bcrypt.lib;%(AdditionalDependencies) zlib-ng$(PyDebugExt).lib;%(AdditionalDependencies) + $(GeneratedJitStencilsDir)jit_shim-aarch64-pc-windows-msvc.o;%(AdditionalDependencies) + $(GeneratedJitStencilsDir)jit_shim-i686-pc-windows-msvc.o;%(AdditionalDependencies) + $(GeneratedJitStencilsDir)jit_shim-x86_64-pc-windows-msvc.o;%(AdditionalDependencies) diff --git a/PCbuild/regen.targets b/PCbuild/regen.targets index bb059f382eb375..9552e73ef6a2ec 100644 --- a/PCbuild/regen.targets +++ b/PCbuild/regen.targets @@ -35,6 +35,9 @@ <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_stencils-aarch64-pc-windows-msvc.h" Condition="$(Platform) == 'ARM64'"/> <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_stencils-i686-pc-windows-msvc.h" Condition="$(Platform) == 'Win32'"/> <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_stencils-x86_64-pc-windows-msvc.h" Condition="$(Platform) == 'x64'"/> + <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_shim-aarch64-pc-windows-msvc.o" Condition="$(Platform) == 'ARM64'"/> + <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_shim-i686-pc-windows-msvc.o" Condition="$(Platform) == 'Win32'"/> + <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_shim-x86_64-pc-windows-msvc.o" Condition="$(Platform) == 'x64'"/> <_CasesSources Include="$(PySourcePath)Python\bytecodes.c;$(PySourcePath)Python\optimizer_bytecodes.c;"/> <_CasesOutputs Include="$(PySourcePath)Python\generated_cases.c.h;$(PySourcePath)Include\opcode_ids.h;$(PySourcePath)Include\internal\pycore_uop_ids.h;$(PySourcePath)Python\opcode_targets.h;$(PySourcePath)Include\internal\pycore_opcode_metadata.h;$(PySourcePath)Include\internal\pycore_uop_metadata.h;$(PySourcePath)Python\optimizer_cases.c.h;$(PySourcePath)Lib\_opcode_metadata.py"/> <_SbomSources Include="$(PySourcePath)PCbuild\get_externals.bat" /> @@ -129,7 +132,7 @@ x86_64-pc-windows-msvc $(JITArgs) --debug - + diff --git a/Python/ceval.c b/Python/ceval.c index 967d92f4ea6855..506ea591c385c0 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1305,7 +1305,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int } #ifdef _Py_TIER2 #ifdef _Py_JIT -_PyJitEntryFuncPtr _Py_jit_entry = _Py_LazyJitShim; +_PyJitEntryFuncPtr _Py_jit_entry = _PyJIT; #else _PyJitEntryFuncPtr _Py_jit_entry = _PyTier2Interpreter; #endif diff --git a/Python/jit.c b/Python/jit.c index af75acf1ff2bb3..26e01b25d48c04 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -60,8 +60,6 @@ jit_error(const char *message) PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); } -static size_t _Py_jit_shim_size = 0; - static int address_in_executor_array(_PyExecutorObject **ptrs, size_t count, uintptr_t addr) { @@ -104,13 +102,6 @@ _PyJIT_AddressInJitCode(PyInterpreterState *interp, uintptr_t addr) if (interp == NULL) { return 0; } - if (_Py_jit_entry != _Py_LazyJitShim && _Py_jit_shim_size != 0) { - uintptr_t start = (uintptr_t)_Py_jit_entry; - uintptr_t end = start + _Py_jit_shim_size; - if (addr >= start && addr < end) { - return 1; - } - } if (address_in_executor_array(interp->executor_ptrs, interp->executor_count, addr)) { return 1; } @@ -727,75 +718,6 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz return 0; } -/* One-off compilation of the jit entry shim - * We compile this once only as it effectively a normal - * function, but we need to use the JIT because it needs - * to understand the jit-specific calling convention. - * Don't forget to call _PyJIT_Fini later! - */ -static _PyJitEntryFuncPtr -compile_shim(void) -{ - _PyExecutorObject dummy; - const StencilGroup *group; - size_t code_size = 0; - size_t data_size = 0; - jit_state state = {0}; - group = &shim; - code_size += group->code_size; - data_size += group->data_size; - combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); - combine_symbol_mask(group->got_mask, state.got_symbols.mask); - // Round up to the nearest page: - size_t page_size = get_page_size(); - assert((page_size & (page_size - 1)) == 0); - size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1)); - size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size + state.got_symbols.size) & (page_size - 1)); - size_t total_size = code_size + state.trampolines.size + code_padding + data_size + state.got_symbols.size + padding; - unsigned char *memory = jit_alloc(total_size); - if (memory == NULL) { - return NULL; - } - unsigned char *code = memory; - state.trampolines.mem = memory + code_size; - unsigned char *data = memory + code_size + state.trampolines.size + code_padding; - state.got_symbols.mem = data + data_size; - // Compile the shim, which handles converting between the native - // calling convention and the calling convention used by jitted code - // (which may be different for efficiency reasons). - group = &shim; - group->emit(code, data, &dummy, NULL, &state); - code += group->code_size; - data += group->data_size; - assert(code == memory + code_size); - assert(data == memory + code_size + state.trampolines.size + code_padding + data_size); - if (mark_executable(memory, total_size)) { - jit_free(memory, total_size); - return NULL; - } - _Py_jit_shim_size = total_size; - return (_PyJitEntryFuncPtr)memory; -} - -static PyMutex lazy_jit_mutex = { 0 }; - -_Py_CODEUNIT * -_Py_LazyJitShim( - _PyExecutorObject *executor, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate -) { - PyMutex_Lock(&lazy_jit_mutex); - if (_Py_jit_entry == _Py_LazyJitShim) { - _PyJitEntryFuncPtr shim = compile_shim(); - if (shim == NULL) { - PyMutex_Unlock(&lazy_jit_mutex); - Py_FatalError("Cannot allocate core JIT code"); - } - _Py_jit_entry = shim; - } - PyMutex_Unlock(&lazy_jit_mutex); - return _Py_jit_entry(executor, frame, stack_pointer, tstate); -} - // Free executor's memory allocated with _PyJIT_Compile void _PyJIT_Free(_PyExecutorObject *executor) @@ -812,22 +734,4 @@ _PyJIT_Free(_PyExecutorObject *executor) } } -// Free shim memory allocated with compile_shim -void -_PyJIT_Fini(void) -{ - PyMutex_Lock(&lazy_jit_mutex); - unsigned char *memory = (unsigned char *)_Py_jit_entry; - size_t size = _Py_jit_shim_size; - if (size) { - _Py_jit_entry = _Py_LazyJitShim; - _Py_jit_shim_size = 0; - if (jit_free(memory, size)) { - PyErr_FormatUnraisable("Exception ignored while " - "freeing JIT entry code"); - } - } - PyMutex_Unlock(&lazy_jit_mutex); -} - #endif // _Py_JIT diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 0232ed6c382c61..0a88e32bb6b65e 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -37,9 +37,6 @@ #include "pycore_uniqueid.h" // _PyObject_FinalizeUniqueIdPool() #include "pycore_warnings.h" // _PyWarnings_InitState() #include "pycore_weakref.h" // _PyWeakref_GET_REF() -#ifdef _Py_JIT -#include "pycore_jit.h" // _PyJIT_Fini() -#endif #if defined(PYMALLOC_USE_HUGEPAGES) && defined(MS_WINDOWS) #include @@ -2531,11 +2528,6 @@ _Py_Finalize(_PyRuntimeState *runtime) finalize_interp_clear(tstate); -#ifdef _Py_JIT - /* Free JIT shim memory */ - _PyJIT_Fini(); -#endif - #ifdef Py_TRACE_REFS /* Display addresses (& refcnts) of all objects still alive. * An address can be used to find the repr of the object, printed diff --git a/Python/pystate.c b/Python/pystate.c index d6a26f3339b863..b7c838a1c156ae 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -489,11 +489,6 @@ free_interpreter(PyInterpreterState *interp) static inline int check_interpreter_whence(long); #endif -extern _Py_CODEUNIT * -_Py_LazyJitShim( - struct _PyExecutorObject *exec, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate -); - /* Get the interpreter state to a minimal consistent state. Further init happens in pylifecycle.c before it can be used. All fields not initialized here are expected to be zeroed out, diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index f78e80db165fc8..15cac3de3fe11f 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -57,6 +57,12 @@ class _Target(typing.Generic[_S, _R]): known_symbols: dict[str, int] = dataclasses.field(default_factory=dict) pyconfig_dir: pathlib.Path = pathlib.Path.cwd().resolve() + def _compile_args(self) -> list[str]: + return list(self.args) + + def _shim_compile_args(self) -> list[str]: + return [] + def _get_nop(self) -> bytes: if re.fullmatch(r"aarch64-.*", self.triple): nop = b"\x1f\x20\x03\xd5" @@ -139,12 +145,8 @@ def _handle_relocation( ) -> _stencils.Hole: raise NotImplementedError(type(self)) - async def _compile( - self, opname: str, c: pathlib.Path, tempdir: pathlib.Path - ) -> _stencils.StencilGroup: - s = tempdir / f"{opname}.s" - o = tempdir / f"{opname}.o" - args_s = [ + def _base_clang_args(self, opname: str, tempdir: pathlib.Path) -> list[str]: + return [ f"--target={self.triple}", "-DPy_BUILD_CORE_MODULE", "-D_DEBUG" if self.debug else "-DNDEBUG", @@ -167,29 +169,38 @@ async def _compile( # generates better code than -O2 (and -O2 usually generates better # code than -O3). As a nice benefit, it uses less memory too: "-Os", - "-S", # Shorten full absolute file paths in the generated code (like the # __FILE__ macro and assert failure messages) for reproducibility: f"-ffile-prefix-map={CPYTHON}=.", f"-ffile-prefix-map={tempdir}=.", - # This debug info isn't necessary, and bloats out the JIT'ed code. - # We *may* be able to re-enable this, process it, and JIT it for a - # nicer debugging experience... but that needs a lot more research: - "-fno-asynchronous-unwind-tables", # Don't call built-in functions that we can't find or patch: "-fno-builtin", # Don't call stack-smashing canaries that we can't find or patch: "-fno-stack-protector", "-std=c11", + ] + + async def _build_stencil_group( + self, opname: str, c: pathlib.Path, tempdir: pathlib.Path + ) -> _stencils.StencilGroup: + s = tempdir / f"{opname}.s" + o = tempdir / f"{opname}.o" + args_s = self._base_clang_args(opname, tempdir) + args_s += [ + "-S", + # Stencils do not need unwind info, and the optimizer does not + # preserve .cfi_* directives correctly. On Darwin, + # -fno-asynchronous-unwind-tables alone still leaves synchronous + # unwind directives in the assembly, so disable both forms here. + "-fno-unwind-tables", + "-fno-asynchronous-unwind-tables", "-o", f"{s}", f"{c}", ] - is_shim = opname == "shim" if self.frame_pointers: - frame_pointer = "all" if is_shim else "reserved" - args_s += ["-Xclang", f"-mframe-pointer={frame_pointer}"] - args_s += self.args + args_s += ["-Xclang", "-mframe-pointer=reserved"] + args_s += self._compile_args() # Allow user-provided CFLAGS to override any defaults args_s += shlex.split(self.cflags) await _llvm.run( @@ -199,14 +210,13 @@ async def _compile( llvm_version=self.llvm_version, llvm_tools_install_dir=self.llvm_tools_install_dir, ) - if not is_shim: - self.optimizer( - s, - label_prefix=self.label_prefix, - symbol_prefix=self.symbol_prefix, - re_global=self.re_global, - frame_pointers=self.frame_pointers, - ).run() + self.optimizer( + s, + label_prefix=self.label_prefix, + symbol_prefix=self.symbol_prefix, + re_global=self.re_global, + frame_pointers=self.frame_pointers, + ).run() args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"] await _llvm.run( "clang", @@ -217,6 +227,30 @@ async def _compile( ) return await self._parse(o) + async def _build_shim_object(self, output: pathlib.Path) -> None: + with tempfile.TemporaryDirectory() as tempdir: + work = pathlib.Path(tempdir).resolve() + args_o = self._base_clang_args("shim", work) + args_o += self._shim_compile_args() + args_o += [ + "-c", + # The linked shim is a real function in the final binary, so + # keep unwind info for debuggers and stack walkers. + "-fasynchronous-unwind-tables", + ] + if self.frame_pointers: + args_o += ["-Xclang", "-mframe-pointer=all"] + args_o += self._compile_args() + args_o += shlex.split(self.cflags) + args_o += ["-o", f"{output}", f"{TOOLS_JIT / 'shim.c'}"] + await _llvm.run( + "clang", + args_o, + echo=self.verbose, + llvm_version=self.llvm_version, + llvm_tools_install_dir=self.llvm_tools_install_dir, + ) + async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: generated_cases = PYTHON_EXECUTOR_CASES_C_H.read_text() cases_and_opnames = sorted( @@ -231,8 +265,6 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: with tempfile.TemporaryDirectory() as tempdir: work = pathlib.Path(tempdir).resolve() async with asyncio.TaskGroup() as group: - coro = self._compile("shim", TOOLS_JIT / "shim.c", work) - tasks.append(group.create_task(coro, name="shim")) template = TOOLS_JIT_TEMPLATE_C.read_text() for case, opname in cases_and_opnames: # Write out a copy of the template with *only* this case @@ -242,7 +274,7 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: # all of the other cases): c = work / f"{opname}.c" c.write_text(template.replace("CASE", case)) - coro = self._compile(opname, c, work) + coro = self._build_stencil_group(opname, c, work) tasks.append(group.create_task(coro, name=opname)) stencil_groups = {task.get_name(): task.result() for task in tasks} for stencil_group in stencil_groups.values(): @@ -256,8 +288,9 @@ def build( comment: str = "", force: bool = False, jit_stencils: pathlib.Path, + jit_shim_object: pathlib.Path, ) -> None: - """Build jit_stencils.h in the given directory.""" + """Build jit_stencils.h and the shim object in the given directory.""" jit_stencils.parent.mkdir(parents=True, exist_ok=True) if not self.stable: warning = f"JIT support for {self.triple} is still experimental!" @@ -271,8 +304,10 @@ def build( not force and jit_stencils.exists() and jit_stencils.read_text().startswith(digest) + and jit_shim_object.exists() ): return + ASYNCIO_RUNNER.run(self._build_shim_object(jit_shim_object)) stencil_groups = ASYNCIO_RUNNER.run(self._build_stencils()) jit_stencils_new = jit_stencils.parent / "jit_stencils.h.new" try: @@ -296,6 +331,13 @@ def build( class _COFF( _Target[_schema.COFFSection, _schema.COFFRelocation] ): # pylint: disable = too-few-public-methods + def _shim_compile_args(self) -> list[str]: + # The linked shim is part of pythoncore, not a shared extension. + # On Windows, Py_BUILD_CORE_MODULE makes public APIs import from + # pythonXY.lib, which creates a self-dependency when linking + # pythoncore.dll. Build the shim with builtin/core semantics. + return ["-UPy_BUILD_CORE_MODULE", "-DPy_BUILD_CORE_BUILTIN"] + def _handle_section( self, section: _schema.COFFSection, group: _stencils.StencilGroup ) -> None: @@ -396,6 +438,10 @@ class _COFF64(_COFF): symbol_prefix = "" re_global = re.compile(r'\s*\.def\s+(?P