deepmodeling · wanghan-iapcm · Jun 10, 2026 · Jun 3, 2026 · Jun 6, 2026 · Jun 7, 2026
diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py
@@ -142,6 +142,13 @@ def get_pt_requirement(pt_version: str = "") -> dict:
             # under the torch extra rather than the core deps (conda-forge has
             # vesin but not vesin-torch).
             "vesin[torch]",
+            # GPU O(N) cell-list neighbor list for large systems. Restricted to
+            # Python >= 3.11 (the package requires it while deepmd-kit still
+            # supports 3.10) and to Linux: it is a CUDA package, and its
+            # dependency warp-lang ships no macosx_x86_64 wheel, which otherwise
+            # makes the macOS x86_64 wheel build's dependency resolution
+            # unsatisfiable.
+            "nvalchemi-toolkit-ops>=0.3.1; python_version >= '3.11' and platform_system == 'Linux'",
             *mpi_requirement,
             *cibw_requirement,
         ],

diff --git a/deepmd/pt/entrypoints/freeze_pt2.py b/deepmd/pt/entrypoints/freeze_pt2.py
@@ -43,12 +43,18 @@
 from deepmd.dpmodel.utils.region import (
     normalize_coord,
 )
+from deepmd.pt.model.descriptor.sezm_nn.so2 import (
+    SO2Linear,
+)
 from deepmd.pt.model.model import (
     get_model,
 )
 from deepmd.pt.train.wrapper import (
     ModelWrapper,
 )
+from deepmd.pt.utils.compile_compat import (
+    build_inductor_compile_options,
+)
 from deepmd.pt.utils.env import (
     DEVICE,
 )
@@ -219,6 +225,7 @@ def _collect_metadata(
     model: torch.nn.Module,
     output_keys: list[str],
     is_spin: bool | None = None,
+    do_atomic_virial: bool = False,
 ) -> dict:
     """Assemble the flat metadata dict expected by :class:`DeepPotPTExpt`.
 
@@ -261,6 +268,8 @@ def _collect_metadata(
         "mixed_types": bool(model.mixed_types()),
         "has_message_passing": _model_has_message_passing(model),
         "has_comm_artifact": False,
+        "do_atomic_virial": bool(do_atomic_virial),
+        "nnei": int(sum(model.get_sel())),
         "has_default_fparam": bool(model.has_default_fparam()),
         "default_fparam": _to_py_list(model.get_default_fparam()),
         "default_chg_spin": _to_py_list(model.get_default_chg_spin()),
@@ -468,6 +477,7 @@ def freeze_sezm_to_pt2(
     *,
     device: torch.device | None = None,
     head: str | None = None,
+    atomic_virial: bool = False,
 ) -> None:
     """Freeze a SeZM checkpoint into an AOTInductor ``.pt2`` archive.
 
@@ -484,10 +494,14 @@ def freeze_sezm_to_pt2(
         Model head to export from a multi-task checkpoint. If omitted, the
         ``Default`` head is used when present; otherwise multi-task checkpoints
         must pass an explicit head. Single-task checkpoints must pass ``None``.
+    atomic_virial
+        Whether to include per-atom virial outputs in the exported graph. Keep
+        it off (the default) for fastest LAMMPS force/energy/total-virial inference.
     """
     from torch._inductor import (
         aoti_compile_and_package,
     )
+    from torch._inductor import config as inductor_config
 
     target_device = device if device is not None else DEVICE
 
@@ -507,16 +521,24 @@ def freeze_sezm_to_pt2(
     model.eval()
     model.to("cpu")
 
+    # The SO(2) linear mixer selects its block-diagonal vs dense matmul from a
+    # Python device branch that make_fx resolves at trace time. Since tracing
+    # always runs on CPU, pin the choice to the AOTI target device: non-CPU
+    # targets bake the block-diagonal contraction (which skips the structural
+    # off-|m| zeros); CPU targets keep the dense einsum that dodges the Inductor
+    # AVX2 codegen bug.
+    force_block_diag = target_device.type != "cpu"
+    for module in model.modules():
+        if isinstance(module, SO2Linear):
+            module._force_block_diag_matmul = force_block_diag
+
     _, sample_inputs_cpu = _resolve_nframes(
         model,
         nloc=7,
         device=torch.device("cpu"),
         has_spin=is_spin,
     )
 
-    # do_atomic_virial=True pulls every key that DeepPotPTExpt may read
-    # (energy, energy_redu, energy_derv_r, energy_derv_c, energy_derv_c_redu)
-    # into the traced graph.
     if is_spin:
         (
             ext_coord,
@@ -537,7 +559,7 @@ def freeze_sezm_to_pt2(
             fparam=fparam,
             aparam=aparam,
             charge_spin=charge_spin,
-            do_atomic_virial=True,
+            do_atomic_virial=atomic_virial,
         )
     else:
         (
@@ -557,7 +579,7 @@ def freeze_sezm_to_pt2(
             fparam=fparam,
             aparam=aparam,
             charge_spin=charge_spin,
-            do_atomic_virial=True,
+            do_atomic_virial=atomic_virial,
         )
 
     # Output key order is taken from a concrete run; Python dict order
@@ -587,9 +609,19 @@ def freeze_sezm_to_pt2(
         exported = move_to_device_pass(exported, target_device)
 
     out_path_str = str(out_path)
-    aoti_compile_and_package(exported, package_path=out_path_str)
-
-    metadata = _collect_metadata(model, output_keys=output_keys, is_spin=is_spin)
+    compile_options = build_inductor_compile_options()
+    # Keep AOTInductor aligned with the eval compile path.  ``triton.max_tiles=1``
+    # keeps data-dependent edge axes on Triton's x grid, whose bound is large
+    # enough for production-scale neighbor lists.
+    with inductor_config.patch({**compile_options, "triton.max_tiles": 1}):
+        aoti_compile_and_package(exported, package_path=out_path_str)
+
+    metadata = _collect_metadata(
+        model,
+        output_keys=output_keys,
+        is_spin=is_spin,
+        do_atomic_virial=atomic_virial,
+    )
     with zipfile.ZipFile(out_path_str, "a") as zf:
         zf.writestr("model/extra/metadata.json", json.dumps(metadata))
         # The raw training params are preserved so `dp change-bias` and

diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py
@@ -66,6 +66,10 @@
     GLOBAL_PT_FLOAT_PRECISION,
     RESERVED_PRECISION_DICT,
 )
+from deepmd.pt.utils.nv_nlist import (
+    NvNeighborList,
+    is_nv_available,
+)
 from deepmd.pt.utils.utils import (
     to_numpy_array,
     to_torch_tensor,
@@ -255,63 +259,94 @@ def __init__(
     def _setup_nlist_backend(self, nlist_backend: str) -> None:
         """Resolve the neighbor-list construction strategy from a user choice.
 
-        ``"native"`` uses the dense all-pairs builder; ``"vesin"`` forces the
-        O(N) ``vesin.torch`` cell list (raising if it is unavailable or the
-        model/inputs are unsupported); ``"auto"`` uses vesin when applicable and
-        silently falls back to the native builder otherwise.  Results are
-        unchanged either way -- only the neighbor-search cost differs.
+        ``"native"`` uses the dense all-pairs builder; ``"vesin"`` / ``"nv"``
+        force the O(N) ``vesin.torch`` / ``nvalchemiops`` cell list (raising if
+        unavailable or the model/inputs are unsupported; ``"nv"`` also needs CUDA);
+        ``"auto"`` picks the first available O(N) builder (vesin, then nv) or falls
+        back to native.  Models that build their own neighbor list always use native.
+        Results are unchanged either way -- only the neighbor-search cost differs.
         """
-        if nlist_backend not in ("auto", "vesin", "native"):
+        inner = self.dp.model["Default"]
+        self_built = getattr(inner, "use_self_built_nlist", None)
+        if callable(self_built) and self_built():
+            # The model builds its own neighbor list and runs the native path;
+            # an external strategy would bypass it, so always use native.
+            log.info(
+                "Ignoring nlist_backend=%r: %s uses its own built-in neighbor list.",
+                nlist_backend,
+                type(inner).__name__,
+            )
+            self._nlist_builder = None
+            return
+        if nlist_backend not in ("auto", "vesin", "nv", "native"):
             raise ValueError(
                 f"Unknown nlist_backend '{nlist_backend}'; "
-                "expected 'auto', 'vesin', or 'native'."
+                "expected 'auto', 'vesin', 'nv', or 'native'."
             )
-        # reason vesin cannot be used (None means it can)
+
+        # reason an external strategy cannot be used (None means it can)
         unsupported = None
         if self._has_spin:
             unsupported = "spin models"
         elif self._has_hessian:
             unsupported = "hessian models"
         elif self.modifier is not None:
-            # the vesin path runs forward_common_lower directly, bypassing
+            # the strategy path runs forward_common_lower directly, bypassing
             # ModelWrapper.forward (which applies the data modifier); fall back
             # to the native path so the modifier is still applied.
             unsupported = "models with a data modifier"
-        elif "energy" not in self.dp.model["Default"].model_output_type():
-            # _eval_lower_vesin reconstructs the backend output from the
+        elif "energy" not in inner.model_output_type():
+            # _eval_lower_strategy reconstructs the backend output from the
             # forward_common_lower / communicate keys via _OUTDEF_DP2BACKEND,
             # which matches the model's own translation only for the energy
             # model (e.g. the polar fitting key is "polarizability" but the
-            # backend output is "polar").  Restrict vesin to energy models --
-            # the large-system inference target -- and fall back to native
-            # for the other fitting types.
+            # backend output is "polar").  Restrict strategies to energy models
+            # and fall back to native for the other fitting types.
             unsupported = "non-energy models"
         ase_provided = self.neighbor_list is not None
-        if nlist_backend == "native":
-            self._use_vesin = False
-        elif nlist_backend == "vesin":
-            if not is_vesin_torch_available():
-                raise ImportError(
-                    "nlist_backend='vesin' was requested but 'vesin.torch' is "
-                    "not installed. Install it (`pip install vesin[torch]`) or "
-                    "use nlist_backend='native' (or 'auto')."
-                )
+
+        builder = None
+        if nlist_backend in ("vesin", "nv"):
             if unsupported is not None:
                 raise ValueError(
-                    f"nlist_backend='vesin' is not supported for {unsupported}; "
-                    "use nlist_backend='native' (or 'auto')."
+                    f"nlist_backend='{nlist_backend}' is not supported for "
+                    f"{unsupported}; use nlist_backend='native' (or 'auto')."
                 )
             if ase_provided:
                 raise ValueError(
-                    "nlist_backend='vesin' conflicts with an explicitly "
-                    "supplied ASE neighbor_list; pass only one."
+                    f"nlist_backend='{nlist_backend}' conflicts with an "
+                    "explicitly supplied ASE neighbor_list; pass only one."
                 )
-            self._use_vesin = True
-        else:  # auto: use vesin when possible, otherwise fall back silently
-            self._use_vesin = (
-                is_vesin_torch_available() and unsupported is None and not ase_provided
-            )
-        self._nlist_builder = VesinNeighborList() if self._use_vesin else None
+            if nlist_backend == "vesin":
+                if not is_vesin_torch_available():
+                    raise ImportError(
+                        "nlist_backend='vesin' was requested but 'vesin.torch' "
+                        "is not installed. Install it (`pip install "
+                        "vesin[torch]`) or use nlist_backend='native' (or 'auto')."
+                    )
+                builder = VesinNeighborList()
+            elif DEVICE.type != "cuda":
+                raise ValueError(
+                    "nlist_backend='nv' requires CUDA inference tensors; "
+                    f"current DEVICE is {DEVICE!s}. Use nlist_backend='native' "
+                    "(or 'auto') for CPU inference."
+                )
+            elif not is_nv_available():
+                raise ImportError(
+                    "nlist_backend='nv' was requested but 'nvalchemi-toolkit-ops'"
+                    " is not installed. Install it (`pip install "
+                    "nvalchemi-toolkit-ops`) or use nlist_backend='native' "
+                    "(or 'auto')."
+                )
+            else:
+                builder = NvNeighborList()
+        elif nlist_backend == "auto" and unsupported is None and not ase_provided:
+            # Pick the first available O(N) builder; nv is GPU-only.
+            if is_vesin_torch_available():
+                builder = VesinNeighborList()
+            elif is_nv_available() and DEVICE.type == "cuda":
+                builder = NvNeighborList()
+        self._nlist_builder = builder
 
     def get_rcut(self) -> float:
         """Get the cutoff radius of this model."""
@@ -659,8 +694,8 @@ def _eval_model(
         do_atomic_virial = any(
             x.category == OutputVariableCategory.DERV_C for x in request_defs
         )
-        if self._use_vesin:
-            batch_output = self._eval_lower_vesin(
+        if self._nlist_builder is not None:
+            batch_output = self._eval_lower_strategy(
                 coord_input,
                 type_input,
                 box_input,
@@ -696,7 +731,7 @@ def _eval_model(
                 )  # this is kinda hacky
         return tuple(results)
 
-    def _eval_lower_vesin(
+    def _eval_lower_strategy(
         self,
         coord: torch.Tensor,
         atype: torch.Tensor,
@@ -706,15 +741,15 @@ def _eval_lower_vesin(
         charge_spin: torch.Tensor | None,
         do_atomic_virial: bool,
     ) -> dict[str, torch.Tensor]:
-        """Evaluate via the O(N) vesin-built ``(i,j,S)`` extended neighbor list.
-
-        Builds the extended representation with the vesin cell list, runs the
-        model's ``forward_common_lower``, and maps the extended outputs back to
-        local atoms with ``communicate_extended_output``.  Returns a dict keyed
-        by backend names, matching the normal ``model()`` output so the caller's
-        extraction is unchanged.  ``forward_common_atomic`` sets
-        ``requires_grad`` on the extended coordinates internally, exactly as on
-        the native path, so forces/virials are produced identically.
+        """Evaluate via the selected O(N) ``NeighborList`` strategy.
+
+        Builds the extended representation with ``self._nlist_builder`` (vesin or
+        nv), runs the model's ``forward_common_lower``, and maps the extended
+        outputs back to local atoms with ``communicate_extended_output``.
+        Returns a dict keyed by backend names, matching the normal ``model()``
+        output so the caller's extraction is unchanged.  ``requires_grad`` is set
+        on the extended coordinates internally, exactly as on the native path, so
+        forces/virials are produced identically.
         """
         inner = self.dp.model["Default"]
         ext_coord, ext_atype, nlist, mapping = self._nlist_builder.build(

diff --git a/deepmd/pt/model/atomic_model/sezm_atomic_model.py b/deepmd/pt/model/atomic_model/sezm_atomic_model.py
@@ -727,8 +727,9 @@ def _build_dens_fitting_kwargs(self) -> dict[str, Any]:
         """Reconstruct SeZM `dens`-head kwargs from energy head and descriptor."""
         descriptor = self.descriptor
         kwargs = self._build_ener_fitting_kwargs()
-        kwargs["condition_lmax"] = int(descriptor.l_schedule[0])
-        kwargs["latent_lmax"] = int(descriptor.l_schedule[-1])
+        node_l_schedule = getattr(descriptor, "node_l_schedule", descriptor.l_schedule)
+        kwargs["condition_lmax"] = int(node_l_schedule[0])
+        kwargs["latent_lmax"] = int(node_l_schedule[-1])
         kwargs["channels"] = int(descriptor.channels)
         return kwargs