From 9a83e1dec9de4ae67cb144bedf1ae22cc2d9bfde Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 11:11:16 -0500 Subject: [PATCH 01/10] Migrate to /benchmarks directory --- benchmarks/cuda_bindings/.gitignore | 16 + benchmarks/cuda_bindings/AGENTS.md | 4 + benchmarks/cuda_bindings/README.md | 74 + .../benchmarks/bench_ctx_device.py | 62 + .../cuda_bindings/benchmarks/bench_event.py | 62 + .../cuda_bindings/benchmarks/bench_launch.py | 133 ++ .../benchmarks/bench_pointer_attributes.py | 25 + .../cuda_bindings/benchmarks/bench_stream.py | 45 + .../benchmarks/cpp/CMakeLists.txt | 92 + .../benchmarks/cpp/bench_ctx_device.cpp | 87 + .../benchmarks/cpp/bench_event.cpp | 90 + .../benchmarks/cpp/bench_launch.cpp | 216 ++ .../cpp/bench_pointer_attributes.cpp | 59 + .../benchmarks/cpp/bench_stream.cpp | 74 + .../benchmarks/cpp/bench_support.hpp | 309 +++ benchmarks/cuda_bindings/compare.py | 118 ++ benchmarks/cuda_bindings/pixi.lock | 1767 +++++++++++++++++ benchmarks/cuda_bindings/pixi.toml | 87 + .../cuda_bindings/pytest-legacy/conftest.py | 93 + .../cuda_bindings/pytest-legacy/kernels.py | 159 ++ .../cuda_bindings/pytest-legacy/test_cupy.py | 199 ++ .../pytest-legacy/test_launch_latency.py | 336 ++++ .../cuda_bindings/pytest-legacy/test_numba.py | 52 + .../pytest-legacy/test_pointer_attributes.py | 112 ++ benchmarks/cuda_bindings/run_cpp.py | 8 + benchmarks/cuda_bindings/run_pyperf.py | 8 + benchmarks/cuda_bindings/runner/__init__.py | 3 + benchmarks/cuda_bindings/runner/cpp.py | 180 ++ benchmarks/cuda_bindings/runner/main.py | 217 ++ benchmarks/cuda_bindings/runner/runtime.py | 105 + benchmarks/cuda_bindings/tests/test_runner.py | 166 ++ 31 files changed, 4958 insertions(+) create mode 100644 benchmarks/cuda_bindings/.gitignore create mode 100644 benchmarks/cuda_bindings/AGENTS.md create mode 100644 benchmarks/cuda_bindings/README.md create mode 100644 benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py create mode 100644 
benchmarks/cuda_bindings/benchmarks/bench_event.py create mode 100644 benchmarks/cuda_bindings/benchmarks/bench_launch.py create mode 100644 benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py create mode 100644 benchmarks/cuda_bindings/benchmarks/bench_stream.py create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp create mode 100644 benchmarks/cuda_bindings/compare.py create mode 100644 benchmarks/cuda_bindings/pixi.lock create mode 100644 benchmarks/cuda_bindings/pixi.toml create mode 100644 benchmarks/cuda_bindings/pytest-legacy/conftest.py create mode 100644 benchmarks/cuda_bindings/pytest-legacy/kernels.py create mode 100644 benchmarks/cuda_bindings/pytest-legacy/test_cupy.py create mode 100755 benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py create mode 100644 benchmarks/cuda_bindings/pytest-legacy/test_numba.py create mode 100644 benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py create mode 100644 benchmarks/cuda_bindings/run_cpp.py create mode 100644 benchmarks/cuda_bindings/run_pyperf.py create mode 100644 benchmarks/cuda_bindings/runner/__init__.py create mode 100644 benchmarks/cuda_bindings/runner/cpp.py create mode 100644 benchmarks/cuda_bindings/runner/main.py create mode 100644 benchmarks/cuda_bindings/runner/runtime.py create mode 100644 benchmarks/cuda_bindings/tests/test_runner.py diff --git a/benchmarks/cuda_bindings/.gitignore b/benchmarks/cuda_bindings/.gitignore new file mode 100644 index 0000000000..b795782a32 --- 
/dev/null +++ b/benchmarks/cuda_bindings/.gitignore @@ -0,0 +1,16 @@ +# Build artifacts +.build/ +__pycache__/ + +# Benchmark results +*.json +.benchmarks/ + +# Pixi environments +.pixi/ + +# Override root .gitignore *.cpp rule (which targets Cython-generated files) +!benchmarks/cpp/*.cpp + +results-python.json +results-cpp.json diff --git a/benchmarks/cuda_bindings/AGENTS.md b/benchmarks/cuda_bindings/AGENTS.md new file mode 100644 index 0000000000..04f2f713fa --- /dev/null +++ b/benchmarks/cuda_bindings/AGENTS.md @@ -0,0 +1,4 @@ +# cuda.bindings benchmarks + +When generating code verify that the code is correct based on the source for cuda-bindings +that can be found in ../cuda_bindings diff --git a/benchmarks/cuda_bindings/README.md b/benchmarks/cuda_bindings/README.md new file mode 100644 index 0000000000..75e16db031 --- /dev/null +++ b/benchmarks/cuda_bindings/README.md @@ -0,0 +1,74 @@ +# cuda.bindings benchmarks + +These benchmarks are intended to measure the latency overhead of calling CUDA +Driver APIs through cuda.bindings, relative to a similar C++ baseline. + +The goal is to measure how much overhead the Python layer adds to calling +CUDA APIs, and to identify which operations do not meet our target of less than 1us of overhead. + +Each Python benchmark has a C++ counterpart, which is used to compare the +operations. We try to make each implementation perform small operations, +do nearly the same work, and run under similar conditions. + +These are **not** throughput benchmarks to measure the overall performance +of kernels and applications. + +## Usage + +Requires pixi.
+ +There are a couple of environments defined based on how `cuda.bindings` is installed: + +- `wheel`: Installs from conda packages +- `source`: Installs from source + +There are a couple of tasks defined: + +- `bench`: Runs the Python benchmarks +- `bench-cpp`: Runs the C++ benchmarks + +### System tuning + +For more stable results on Linux, tune the system before running benchmarks. +See: https://pyperf.readthedocs.io/en/latest/system.html#system + +```bash +# Show current system state +pixi run -e wheel -- python -m pyperf system show + +# Apply tuning (may require root) +sudo $(pixi run -e wheel -- which python) -m pyperf system tune +``` + +### Running benchmarks + +To run the benchmarks combine the environment and task: + +```bash +# Run the Python benchmarks in the wheel environment +pixi run -e wheel bench + +# Run the Python benchmarks in the source environment +pixi run -e source bench + +# Run the C++ benchmarks +pixi run -e wheel bench-cpp +``` + +Both runners automatically save results to JSON files in the benchmarks +directory: `results-python.json` and `results-cpp.json`. + +## Output JSON and analysis + +The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/). +Both Python and C++ results are saved in pyperf-compatible JSON format, +which can be analyzed with pyperf commands: + +```bash +# Show results and statistics +pixi run -e wheel -- python -m pyperf stats results-python.json +pixi run -e wheel -- python -m pyperf stats results-cpp.json + +# Compare C++ vs Python results +pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json +``` diff --git a/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py new file mode 100644 index 0000000000..1c82cd4046 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import ensure_context + +from cuda.bindings import driver as cuda + +CTX = ensure_context() + +_, DEVICE = cuda.cuDeviceGet(0) +ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR + + +def bench_ctx_get_current(loops: int) -> float: + _cuCtxGetCurrent = cuda.cuCtxGetCurrent + + t0 = time.perf_counter() + for _ in range(loops): + _cuCtxGetCurrent() + return time.perf_counter() - t0 + + +def bench_ctx_set_current(loops: int) -> float: + _cuCtxSetCurrent = cuda.cuCtxSetCurrent + _ctx = CTX + + t0 = time.perf_counter() + for _ in range(loops): + _cuCtxSetCurrent(_ctx) + return time.perf_counter() - t0 + + +def bench_ctx_get_device(loops: int) -> float: + _cuCtxGetDevice = cuda.cuCtxGetDevice + + t0 = time.perf_counter() + for _ in range(loops): + _cuCtxGetDevice() + return time.perf_counter() - t0 + + +def bench_device_get(loops: int) -> float: + _cuDeviceGet = cuda.cuDeviceGet + + t0 = time.perf_counter() + for _ in range(loops): + _cuDeviceGet(0) + return time.perf_counter() - t0 + + +def bench_device_get_attribute(loops: int) -> float: + _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute + _attr = ATTRIBUTE + _dev = DEVICE + + t0 = time.perf_counter() + for _ in range(loops): + _cuDeviceGetAttribute(_attr, _dev) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_event.py b/benchmarks/cuda_bindings/benchmarks/bench_event.py new file mode 100644 index 0000000000..e8e319115d --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_event.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + +_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) +_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value) + +cuda.cuEventRecord(EVENT, STREAM) +cuda.cuStreamSynchronize(STREAM) + +EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value + + +def bench_event_create_destroy(loops: int) -> float: + _cuEventCreate = cuda.cuEventCreate + _cuEventDestroy = cuda.cuEventDestroy + _flags = EVENT_FLAGS + + t0 = time.perf_counter() + for _ in range(loops): + _, e = _cuEventCreate(_flags) + _cuEventDestroy(e) + return time.perf_counter() - t0 + + +def bench_event_record(loops: int) -> float: + _cuEventRecord = cuda.cuEventRecord + _event = EVENT + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuEventRecord(_event, _stream) + return time.perf_counter() - t0 + + +def bench_event_query(loops: int) -> float: + _cuEventQuery = cuda.cuEventQuery + _event = EVENT + + t0 = time.perf_counter() + for _ in range(loops): + _cuEventQuery(_event) + return time.perf_counter() - t0 + + +def bench_event_synchronize(loops: int) -> float: + _cuEventSynchronize = cuda.cuEventSynchronize + _event = EVENT + + t0 = time.perf_counter() + for _ in range(loops): + _cuEventSynchronize(_event) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_launch.py b/benchmarks/cuda_bindings/benchmarks/bench_launch.py new file mode 100644 index 0000000000..931194fbd3 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_launch.py @@ -0,0 +1,133 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import ctypes +import time + +from runner.runtime import alloc_persistent, assert_drv, compile_and_load + +from cuda.bindings import driver as cuda + +# Compile kernels lazily so benchmark discovery does not need NVRTC. +KERNEL_SOURCE = """\ +extern "C" __global__ void empty_kernel() { return; } +extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } + +#define ITEM_PARAM(x, T) T x +#define REP1(x, T) , ITEM_PARAM(x, T) +#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) +#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) +#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) +#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) + +extern "C" __global__ +void small_kernel_16_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*)) +{ *F = 0; } +""" + +MODULE = None +EMPTY_KERNEL = None +SMALL_KERNEL = None +KERNEL_16_ARGS = None +STREAM = None +FLOAT_PTR = None +INT_PTRS = None +_VAL_PS = None +PACKED_16 = None + + +def _ensure_launch_state() -> None: + global MODULE, EMPTY_KERNEL, SMALL_KERNEL, KERNEL_16_ARGS, STREAM + global FLOAT_PTR, INT_PTRS, _VAL_PS, PACKED_16 + + if EMPTY_KERNEL is not None: + return + + module = compile_and_load(KERNEL_SOURCE) + + err, empty_kernel = cuda.cuModuleGetFunction(module, b"empty_kernel") + assert_drv(err) + err, small_kernel = cuda.cuModuleGetFunction(module, b"small_kernel") + assert_drv(err) + err, kernel_16_args = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") + assert_drv(err) + + err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + assert_drv(err) + + float_ptr = alloc_persistent(ctypes.sizeof(ctypes.c_float)) + int_ptrs = tuple(alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16)) + + val_ps = [ctypes.c_void_p(int(ptr)) for ptr in int_ptrs] + packed_16 = (ctypes.c_void_p * 16)() + for index, value_ptr in enumerate(val_ps): + packed_16[index] = ctypes.addressof(value_ptr) + + MODULE = module + 
EMPTY_KERNEL = empty_kernel + SMALL_KERNEL = small_kernel + KERNEL_16_ARGS = kernel_16_args + STREAM = stream + FLOAT_PTR = float_ptr + INT_PTRS = int_ptrs + _VAL_PS = val_ps + PACKED_16 = packed_16 + + +def bench_launch_empty_kernel(loops: int) -> float: + _ensure_launch_state() + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = EMPTY_KERNEL + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) + return time.perf_counter() - t0 + + +def bench_launch_small_kernel(loops: int) -> float: + _ensure_launch_state() + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = SMALL_KERNEL + _stream = STREAM + _args = (FLOAT_PTR,) + _arg_types = (None,) + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) + return time.perf_counter() - t0 + + +def bench_launch_16_args(loops: int) -> float: + _ensure_launch_state() + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = KERNEL_16_ARGS + _stream = STREAM + _args = INT_PTRS + _arg_types = (None,) * 16 + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) + return time.perf_counter() - t0 + + +def bench_launch_16_args_pre_packed(loops: int) -> float: + _ensure_launch_state() + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = KERNEL_16_ARGS + _stream = STREAM + _packed = PACKED_16 + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py new file mode 100644 index 0000000000..a02b82c399 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import alloc_persistent + +from cuda.bindings import driver as cuda + +# Allocate memory used by the tests +PTR = alloc_persistent(1 << 18) +ATTRIBUTE = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE + + +def bench_pointer_get_attribute(loops: int) -> float: + # Local references to avoid global lookups in the hot loop + _cuPointerGetAttribute = cuda.cuPointerGetAttribute + _attr = ATTRIBUTE + _ptr = PTR + + t0 = time.perf_counter() + for _ in range(loops): + _cuPointerGetAttribute(_attr, _ptr) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_stream.py b/benchmarks/cuda_bindings/benchmarks/bench_stream.py new file mode 100644 index 0000000000..d816099ed5 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_stream.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + +_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + + +def bench_stream_create_destroy(loops: int) -> float: + _cuStreamCreate = cuda.cuStreamCreate + _cuStreamDestroy = cuda.cuStreamDestroy + _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value + + t0 = time.perf_counter() + for _ in range(loops): + _, s = _cuStreamCreate(_flags) + _cuStreamDestroy(s) + return time.perf_counter() - t0 + + +def bench_stream_query(loops: int) -> float: + _cuStreamQuery = cuda.cuStreamQuery + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuStreamQuery(_stream) + return time.perf_counter() - t0 + + +def bench_stream_synchronize(loops: int) -> float: + _cuStreamSynchronize = cuda.cuStreamSynchronize + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuStreamSynchronize(_stream) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt b/benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt new file mode 100644 index 0000000000..83326911af --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.24) +project(cuda_bindings_cpp_benchmarks LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +set(CUDA_HOME_HINT "$ENV{CUDA_HOME}") +set(CONDA_PREFIX_HINT "$ENV{CONDA_PREFIX}") + +# Find cuda.h (driver API header) +find_path( + CUDA_DRIVER_INCLUDE_DIR + cuda.h + HINTS + "${CUDA_HOME_HINT}/include" + "${CONDA_PREFIX_HINT}/targets/x86_64-linux/include" + "${CONDA_PREFIX_HINT}/include" +) + +# Find libcuda (driver API library) — lives on the system, not in toolkit +find_library( + CUDA_DRIVER_LIBRARY + NAMES cuda + HINTS + "/usr/lib/x86_64-linux-gnu" + "/usr/lib64" + "${CUDA_HOME_HINT}/lib64/stubs" + "${CUDA_HOME_HINT}/lib/stubs" + "${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib/stubs" + "${CONDA_PREFIX_HINT}/lib/stubs" +) + +# Find nvrtc.h and libnvrtc (for runtime compilation benchmarks) +find_path( + NVRTC_INCLUDE_DIR + nvrtc.h + HINTS + "${CUDA_HOME_HINT}/include" + "${CONDA_PREFIX_HINT}/targets/x86_64-linux/include" + "${CONDA_PREFIX_HINT}/include" +) + +find_library( + NVRTC_LIBRARY + NAMES nvrtc + HINTS + "${CUDA_HOME_HINT}/lib64" + "${CUDA_HOME_HINT}/lib" + "${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib" + "${CONDA_PREFIX_HINT}/lib" +) + +if(NOT CUDA_DRIVER_INCLUDE_DIR) + message(FATAL_ERROR "Could not find cuda.h. Ensure CUDA_HOME is set or install cuda-crt-dev.") +endif() + +if(NOT CUDA_DRIVER_LIBRARY) + message(FATAL_ERROR "Could not find libcuda. 
Ensure the NVIDIA driver is installed.") endif() + +# Helper: add a benchmark that only needs the driver API +function(add_driver_benchmark name) + add_executable(${name}_cpp ${name}.cpp) + target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}") + target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}") +endfunction() + +# Helper: add a benchmark that needs driver API + NVRTC +function(add_nvrtc_benchmark name) + add_executable(${name}_cpp ${name}.cpp) + target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}" "${NVRTC_INCLUDE_DIR}") + target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}" "${NVRTC_LIBRARY}") +endfunction() + +# Driver-only benchmarks +add_driver_benchmark(bench_pointer_attributes) +add_driver_benchmark(bench_ctx_device) +add_driver_benchmark(bench_stream) +add_driver_benchmark(bench_event) +# add_driver_benchmark(bench_memory)  # bench_memory.cpp is not created by this patch; registering it would fail CMake configure + +# NVRTC benchmarks (require nvrtc for kernel compilation) +if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY) + add_nvrtc_benchmark(bench_launch) +else() + message(WARNING "NVRTC not found — skipping bench_launch. Install cuda-nvrtc-dev.") +endif() diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp new file mode 100644 index 0000000000..052df9cc1d --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp @@ -0,0 +1,87 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ?
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup: init CUDA and create a context + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + bench::BenchmarkSuite suite(options); + + // --- ctx_get_current --- + { + CUcontext current_ctx = nullptr; + suite.run("ctx_device.ctx_get_current", [&]() { + check_cu(cuCtxGetCurrent(¤t_ctx), "cuCtxGetCurrent failed"); + }); + } + + // --- ctx_set_current --- + { + suite.run("ctx_device.ctx_set_current", [&]() { + check_cu(cuCtxSetCurrent(ctx), "cuCtxSetCurrent failed"); + }); + } + + // --- ctx_get_device --- + { + CUdevice dev; + suite.run("ctx_device.ctx_get_device", [&]() { + check_cu(cuCtxGetDevice(&dev), "cuCtxGetDevice failed"); + }); + } + + // --- device_get --- + { + CUdevice dev; + suite.run("ctx_device.device_get", [&]() { + check_cu(cuDeviceGet(&dev, 0), "cuDeviceGet failed"); + }); + } + + // --- device_get_attribute --- + { + int value = 0; + suite.run("ctx_device.device_get_attribute", [&]() { + check_cu( + cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), + "cuDeviceGetAttribute failed" + ); + }); + } + + // Cleanup + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + // Write all results + suite.write(); + + return 0; +} diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp new file mode 100644 index 0000000000..44cd617778 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp @@ -0,0 +1,90 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + // Persistent event for query/synchronize/record benchmarks + CUevent event; + check_cu(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "cuEventCreate failed"); + + // Record and sync so the event starts in a completed state + check_cu(cuEventRecord(event, stream), "cuEventRecord failed"); + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + bench::BenchmarkSuite suite(options); + + // --- event_create_destroy --- + { + CUevent e; + suite.run("event.event_create_destroy", [&]() { + check_cu(cuEventCreate(&e, CU_EVENT_DISABLE_TIMING), "cuEventCreate failed"); + check_cu(cuEventDestroy(e), "cuEventDestroy failed"); + }); + } + + // --- event_record --- + { + suite.run("event.event_record", [&]() { + check_cu(cuEventRecord(event, stream), "cuEventRecord failed"); + }); + } + + // Re-sync so event is in a known completed state after the record benchmark + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + { + suite.run("event.event_query", [&]() { + // Returns CUDA_SUCCESS if complete, CUDA_ERROR_NOT_READY if not + cuEventQuery(event); + }); + } + + // --- event_synchronize --- + { + 
suite.run("event.event_synchronize", [&]() { + check_cu(cuEventSynchronize(event), "cuEventSynchronize failed"); + }); + } + + // Cleanup + check_cu(cuEventDestroy(event), "cuEventDestroy failed"); + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp new file mode 100644 index 0000000000..fb65da6d74 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp @@ -0,0 +1,216 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "bench_support.hpp" + +#include +#include +#include +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; + std::exit(1); + } +} + +static void check_nvrtc(nvrtcResult status, const char* message) { + if (status != NVRTC_SUCCESS) { + std::cerr << message << ": " << nvrtcGetErrorString(status) << '\n'; + std::exit(1); + } +} + +static CUmodule compile_and_load(const char* source, CUdevice device) { + int major = 0, minor = 0; + check_cu(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), + "cuDeviceGetAttribute failed"); + check_cu(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), + "cuDeviceGetAttribute failed"); + + nvrtcProgram prog; + check_nvrtc(nvrtcCreateProgram(&prog, source, "benchmark_kernel.cu", 0, nullptr, nullptr), + "nvrtcCreateProgram failed"); + + std::string arch = "--gpu-architecture=sm_" + std::to_string(major) + std::to_string(minor); + const char* opts[] = {"--fmad=false", arch.c_str()}; + nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, opts); + + // Print log on failure + if (compile_result != NVRTC_SUCCESS) { + size_t log_size = 0; + nvrtcGetProgramLogSize(prog, &log_size); + std::vector log(log_size); + nvrtcGetProgramLog(prog, log.data()); + std::cerr << "NVRTC compile failed:\n" << log.data() << '\n'; + std::exit(1); + } + + size_t cubin_size = 0; + check_nvrtc(nvrtcGetCUBINSize(prog, &cubin_size), "nvrtcGetCUBINSize failed"); + std::vector cubin(cubin_size); + check_nvrtc(nvrtcGetCUBIN(prog, cubin.data()), "nvrtcGetCUBIN failed"); + nvrtcDestroyProgram(&prog); + + CUmodule module; + check_cu(cuModuleLoadData(&module, cubin.data()), "cuModuleLoadData failed"); + return module; +} + + +static const char* KERNEL_SOURCE = R"( +extern "C" __global__ void empty_kernel() { return; } +extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } + +extern "C" __global__ +void small_kernel_16_args( + int* a0, int* a1, int* a2, int* a3, + int* a4, int* a5, int* a6, int* a7, + int* a8, int* a9, int* a10, int* a11, + int* a12, int* a13, int* 
a14, int* a15) +{ *a0 = 0; } +)"; + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUmodule module = compile_and_load(KERNEL_SOURCE, device); + + CUfunction empty_kernel, small_kernel, kernel_16_args; + check_cu(cuModuleGetFunction(&empty_kernel, module, "empty_kernel"), "GetFunction failed"); + check_cu(cuModuleGetFunction(&small_kernel, module, "small_kernel"), "GetFunction failed"); + check_cu(cuModuleGetFunction(&kernel_16_args, module, "small_kernel_16_args"), "GetFunction failed"); + + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + // Allocate device memory for arguments + CUdeviceptr float_ptr; + check_cu(cuMemAlloc(&float_ptr, sizeof(float)), "cuMemAlloc failed"); + + CUdeviceptr int_ptrs[16]; + for (int i = 0; i < 16; ++i) { + check_cu(cuMemAlloc(&int_ptrs[i], sizeof(int)), "cuMemAlloc failed"); + } + + // Pre-pack kernel params for the pre-packed benchmark + void* packed_16[16]; + for (int i = 0; i < 16; ++i) { + packed_16[i] = &int_ptrs[i]; + } + + bench::BenchmarkSuite suite(options); + + // --- launch_empty_kernel --- + { + suite.run("launch.launch_empty_kernel", [&]() { + check_cu( + cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + // Drain the stream between benchmarks so each starts with a clean queue + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + { + void* params[] = {&float_ptr}; + suite.run("launch.launch_small_kernel", [&]() { + check_cu( + cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + 
check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + // --- launch_16_args --- + { + suite.run("launch.launch_16_args", [&]() { + check_cu( + cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) --- + // In C++ the params are always pre-packed, so this is identical to launch_16_args. + // We include it for naming parity with the Python benchmark. + { + suite.run("launch.launch_16_args_pre_packed", [&]() { + check_cu( + cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + // Cleanup + for (int i = 0; i < 16; ++i) { + check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed"); + } + check_cu(cuMemFree(float_ptr), "cuMemFree failed"); + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuModuleUnload(module), "cuModuleUnload failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp new file mode 100644 index 0000000000..4d9afc6566 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ?
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup: init CUDA, allocate memory + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUdeviceptr ptr; + check_cu(cuMemAlloc(&ptr, 1 << 18), "cuMemAlloc failed"); + + bench::BenchmarkSuite suite(options); + + // --- pointer_get_attribute --- + { + unsigned int memory_type = 0; + suite.run("pointer_attributes.pointer_get_attribute", [&]() { + check_cu( + cuPointerGetAttribute(&memory_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr), + "cuPointerGetAttribute failed" + ); + }); + } + + // Cleanup + check_cu(cuMemFree(ptr), "cuMemFree failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp new file mode 100644 index 0000000000..702e86aef0 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + // Persistent stream for query/synchronize benchmarks + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + bench::BenchmarkSuite suite(options); + + // --- stream_create_destroy --- + { + CUstream s; + suite.run("stream.stream_create_destroy", [&]() { + check_cu(cuStreamCreate(&s, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + check_cu(cuStreamDestroy(s), "cuStreamDestroy failed"); + }); + } + + // --- stream_query --- + { + suite.run("stream.stream_query", [&]() { + // cuStreamQuery returns CUDA_SUCCESS if stream is idle, + // CUDA_ERROR_NOT_READY if busy — both are valid here. + cuStreamQuery(stream); + }); + } + + // --- stream_synchronize --- + { + suite.run("stream.stream_synchronize", [&]() { + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + }); + } + + // Cleanup + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp new file mode 100644 index 0000000000..837c15a9d1 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp @@ -0,0 +1,309 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
namespace bench {

// Harness configuration shared by every C++ micro-benchmark binary.
struct Options {
    std::uint64_t loops = 1000;   // fn() calls per timed value
    std::uint64_t warmups = 5;    // untimed warmup values per run
    std::uint64_t values = 20;    // timed values per run
    std::uint64_t runs = 20;      // independent runs
    std::string output_path;      // pyperf JSON destination ("" = none)
    std::string benchmark_name;   // optional benchmark-name override
};

// A single run result: warmup values and timed values (seconds per loop).
struct RunResult {
    std::string date;                   // UTC timestamp at run start
    double duration_sec;                // wall-clock duration of the run
    std::vector<double> warmup_values;  // seconds per loop, warmup phase
    std::vector<double> values;         // seconds per loop, timed phase
};

// Parse one numeric flag value; exits with status 2 on malformed input.
//
// The previous bare std::strtoull call silently mapped garbage ("abc") to 0
// and ignored trailing junk ("100x" -> 100); a zero --loops then divides by
// zero when run_benchmark converts elapsed time to seconds-per-loop.
inline std::uint64_t parse_count(const char* text, const char* flag, bool allow_zero) {
    char* end = nullptr;
    const std::uint64_t value = std::strtoull(text, &end, 10);
    if (end == text || *end != '\0' || (value == 0 && !allow_zero)) {
        std::cerr << "Invalid value for " << flag << ": " << text << '\n';
        std::exit(2);
    }
    return value;
}

// Parse harness command-line flags into Options.
//
// Recognized: --loops/--warmups/--values/--runs N, -o/--output FILE,
// --name NAME, -h/--help. Exits 0 on --help; exits 2 on an unknown flag,
// a flag missing its value, or a malformed count. Valid inputs behave
// exactly as before.
inline Options parse_args(int argc, char** argv) {
    Options options;
    for (int i = 1; i < argc; ++i) {
        const std::string arg(argv[i]);
        if (arg == "--loops" && i + 1 < argc) {
            // loops must be >= 1: it is the divisor for seconds-per-loop.
            options.loops = parse_count(argv[++i], "--loops", /*allow_zero=*/false);
            continue;
        }
        if (arg == "--warmups" && i + 1 < argc) {
            options.warmups = parse_count(argv[++i], "--warmups", /*allow_zero=*/true);
            continue;
        }
        if (arg == "--values" && i + 1 < argc) {
            options.values = parse_count(argv[++i], "--values", /*allow_zero=*/true);
            continue;
        }
        if (arg == "--runs" && i + 1 < argc) {
            options.runs = parse_count(argv[++i], "--runs", /*allow_zero=*/true);
            continue;
        }
        if ((arg == "-o" || arg == "--output") && i + 1 < argc) {
            options.output_path = argv[++i];
            continue;
        }
        if (arg == "--name" && i + 1 < argc) {
            options.benchmark_name = argv[++i];
            continue;
        }
        if (arg == "--help" || arg == "-h") {
            std::cout << "Usage: benchmark [options]\n"
                      << "  --loops N      Loop iterations per value (default: 1000)\n"
                      << "  --warmups N    Warmup values per run (default: 5)\n"
                      << "  --values N     Timed values per run (default: 20)\n"
                      << "  --runs N       Number of runs (default: 20)\n"
                      << "  -o, --output F Write pyperf-compatible JSON to file\n"
                      << "  --name S       Benchmark name (overrides default)\n";
            std::exit(0);
        }

        std::cerr << "Unknown argument: " << arg << '\n';
        std::exit(2);
    }
    return options;
}

}  // namespace bench
{ + const auto now = std::chrono::system_clock::now(); + const std::time_t t = std::chrono::system_clock::to_time_t(now); + std::tm tm{}; +#ifdef _WIN32 + gmtime_s(&tm, &t); +#else + gmtime_r(&t, &tm); +#endif + char buf[64]; + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &tm); + return std::string(buf); +} + +// Run a benchmark function. The function signature is: void fn() — one call = one operation. +// The harness calls fn() in a tight loop `loops` times per value. +template +std::vector run_benchmark(const Options& options, Fn&& fn) { + std::vector results; + results.reserve(options.runs); + + for (std::uint64_t r = 0; r < options.runs; ++r) { + RunResult run; + run.date = iso_now(); + const auto run_start = std::chrono::steady_clock::now(); + + // Warmups + for (std::uint64_t w = 0; w < options.warmups; ++w) { + const auto t0 = std::chrono::steady_clock::now(); + for (std::uint64_t i = 0; i < options.loops; ++i) { + fn(); + } + const auto t1 = std::chrono::steady_clock::now(); + const double elapsed = std::chrono::duration(t1 - t0).count(); + run.warmup_values.push_back(elapsed / static_cast(options.loops)); + } + + // Timed values + for (std::uint64_t v = 0; v < options.values; ++v) { + const auto t0 = std::chrono::steady_clock::now(); + for (std::uint64_t i = 0; i < options.loops; ++i) { + fn(); + } + const auto t1 = std::chrono::steady_clock::now(); + const double elapsed = std::chrono::duration(t1 - t0).count(); + run.values.push_back(elapsed / static_cast(options.loops)); + } + + const auto run_end = std::chrono::steady_clock::now(); + run.duration_sec = std::chrono::duration(run_end - run_start).count(); + results.push_back(std::move(run)); + } + + return results; +} + +inline void print_summary(const std::string& name, const std::vector& results) { + // Collect all timed values + std::vector all_values; + for (const auto& run : results) { + for (double v : run.values) { + all_values.push_back(v); + } + } + if (all_values.empty()) + return; + + 
// Escape a string for embedding in JSON output.
//
// The previous version returned the input verbatim between quotes, which
// produced invalid JSON whenever the string contained '"', '\\', or a
// control character (e.g. in a benchmark name). Escape per RFC 8259;
// ordinary names pass through unchanged, so existing output is unaffected.
inline std::string json_str(const std::string& s) {
    std::string out;
    out.reserve(s.size() + 2);
    out.push_back('"');
    for (const char c : s) {
        switch (c) {
            case '"':  out += "\\\""; break;
            case '\\': out += "\\\\"; break;
            case '\b': out += "\\b"; break;
            case '\f': out += "\\f"; break;
            case '\n': out += "\\n"; break;
            case '\r': out += "\\r"; break;
            case '\t': out += "\\t"; break;
            default:
                if (static_cast<unsigned char>(c) < 0x20) {
                    // Remaining control characters as \u00XX.
                    char buf[8];
                    std::snprintf(buf, sizeof(buf), "\\u%04x",
                                  static_cast<unsigned char>(c));
                    out += buf;
                } else {
                    out.push_back(c);
                }
                break;
        }
    }
    out.push_back('"');
    return out;
}
std::vector results; +}; + +// Collect multiple benchmarks from a single binary and write them all +// to one pyperf-compatible JSON file. +class BenchmarkSuite { +public: + explicit BenchmarkSuite(Options options) : options_(std::move(options)) {} + + // Run a benchmark and record it. The name is used as the benchmark ID. + template + void run(const std::string& name, Fn&& fn) { + auto results = run_benchmark(options_, std::forward(fn)); + print_summary(name, results); + entries_.push_back({name, options_.loops, std::move(results)}); + } + + // Write all collected benchmarks to the output file (if -o was given). + void write() const { + if (options_.output_path.empty() || entries_.empty()) + return; + write_multi_pyperf_json(options_.output_path, entries_); + } + +private: + Options options_; + std::vector entries_; + + static void write_multi_pyperf_json( + const std::string& output_path, + const std::vector& entries + ) { + std::ofstream out(output_path); + if (!out) { + std::cerr << "Failed to open output file: " << output_path << '\n'; + std::exit(3); + } + + out << std::setprecision(17); + out << "{\"version\": \"1.0\", \"benchmarks\": ["; + + for (std::size_t e = 0; e < entries.size(); ++e) { + const auto& entry = entries[e]; + if (e > 0) out << ", "; + + out << "{\"metadata\": {"; + out << "\"name\": " << json_str(entry.name) << ", "; + out << "\"loops\": " << entry.loops << ", "; + out << "\"unit\": \"second\""; + out << "}, \"runs\": ["; + + for (std::size_t r = 0; r < entry.results.size(); ++r) { + const auto& run = entry.results[r]; + if (r > 0) out << ", "; + + out << "{\"metadata\": {"; + out << "\"date\": " << json_str(run.date) << ", "; + out << "\"duration\": " << run.duration_sec; + out << "}, "; + + out << "\"warmups\": ["; + for (std::size_t w = 0; w < run.warmup_values.size(); ++w) { + if (w > 0) out << ", "; + out << "[" << entry.loops << ", " << run.warmup_values[w] << "]"; + } + out << "], "; + + out << "\"values\": ["; + for (std::size_t v 
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Compare Python and C++ benchmark results in a summary table."""

import argparse
import json
import statistics
import sys
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent
DEFAULT_PYTHON = PROJECT_ROOT / "results-python.json"
DEFAULT_CPP = PROJECT_ROOT / "results-cpp.json"


def load_benchmarks(path: Path) -> dict[str, list[float]]:
    """Load a pyperf JSON file and return {name: [values]}."""
    data = json.loads(Path(path).read_text())

    results: dict[str, list[float]] = {}
    for bench in data.get("benchmarks", []):
        name = bench.get("metadata", {}).get("name", "")
        if not name:
            # Fall back to the first run that carries a name in its metadata.
            for run in bench.get("runs", []):
                name = run.get("metadata", {}).get("name", "")
                if name:
                    break
        values = [v for run in bench.get("runs", []) for v in run.get("values", [])]
        if name and values:
            results[name] = values
    return results


def fmt_ns(seconds: float) -> str:
    """Format a duration (seconds) as "N ns" or, from 1000 ns up, "N.NN us"."""
    nanos = seconds * 1e9
    return f"{nanos / 1000:.2f} us" if nanos >= 1000 else f"{nanos:.0f} ns"


def main() -> None:
    """Print a comparison table of Python vs C++ benchmark means."""
    parser = argparse.ArgumentParser(description="Compare Python vs C++ benchmark results")
    parser.add_argument(
        "--python",
        type=Path,
        default=DEFAULT_PYTHON,
        help=f"Python results JSON (default: {DEFAULT_PYTHON.name})",
    )
    parser.add_argument(
        "--cpp",
        type=Path,
        default=DEFAULT_CPP,
        help=f"C++ results JSON (default: {DEFAULT_CPP.name})",
    )
    args = parser.parse_args()

    # Python results are mandatory; C++ results are optional.
    if not args.python.exists():
        print(f"Python results not found: {args.python}", file=sys.stderr)
        print("Run: pixi run -e wheel bench", file=sys.stderr)
        sys.exit(1)

    py_benchmarks = load_benchmarks(args.python)
    cpp_benchmarks = load_benchmarks(args.cpp) if args.cpp.exists() else {}

    if not py_benchmarks:
        print("No benchmarks found in Python results.", file=sys.stderr)
        sys.exit(1)

    # The name column must fit every benchmark name and the header label.
    all_names = sorted(set(py_benchmarks) | set(cpp_benchmarks))
    name_width = max(len("Benchmark"), *(len(n) for n in all_names))

    # Header layout depends on whether C++ results are present.
    if cpp_benchmarks:
        header = (
            f"{'Benchmark':<{name_width}} {'C++ (mean)':>12}"
            f" {'Python (mean)':>14} {'Overhead':>10}"
        )
    else:
        header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}"
    sep = "-" * len(header)
    print(sep)
    print(header)
    print(sep)

    for name in all_names:
        py_vals = py_benchmarks.get(name)
        cpp_vals = cpp_benchmarks.get(name)

        py_col = fmt_ns(statistics.mean(py_vals)) if py_vals else "-"
        cpp_col = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-"

        # Overhead is meaningful only when the benchmark exists on both sides.
        if py_vals and cpp_vals:
            delta_ns = (statistics.mean(py_vals) - statistics.mean(cpp_vals)) * 1e9
            overhead = f"+{delta_ns:.0f} ns"
        else:
            overhead = "-"

        if cpp_benchmarks:
            print(f"{name:<{name_width}} {cpp_col:>12} {py_col:>14} {overhead:>10}")
        else:
            print(f"{name:<{name_width}} {py_col:>14}")

    print(sep)


if __name__ == "__main__":
    main()
if-necessary-or-explicit + packages: {} + source: + channels: + - url: https://conda.anaconda.org/conda-forge/ + options: + channel-priority: disabled + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + - conda: .. 
+ - conda: ../../cuda_pathfinder + wheel: + channels: + - url: https://conda.anaconda.org/conda-forge/ + options: + channel-priority: disabled + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.1.0-py314ha160325_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.1.115-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.1.115-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.1.80-hecca717_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.1.80-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.1.80-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.1.80-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.1.115-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + build_number: 20 + sha256: 1dd3fffd892081df9726d7eb7e0dea6198962ba775bd88842135a4ddb4deb3c9 + md5: a9f577daf3de00bca7c3c76c0ecbd1de + depends: + - __glibc >=2.17,<3.0.a0 + - libgomp >=7.5.0 + constrains: + - openmp_impl <0.0a0 + license: BSD-3-Clause + license_family: BSD + size: 28948 + timestamp: 1770939786096 +- conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda + sha256: a9c114cbfeda42a226e2db1809a538929d2f118ef855372293bd188f71711c48 + md5: 791365c5f65975051e4e017b5da3abf5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: GPL-2.0-or-later + license_family: GPL + size: 68072 + timestamp: 1756738968573 +- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda + sha256: 
2851d34944b056d028543f0440fb631aeeff204151ea09589d8d9c13882395de + md5: 9902aeb08445c03fb31e01beeb173988 + depends: + - binutils_impl_linux-64 >=2.45.1,<2.45.2.0a0 + license: GPL-3.0-only + license_family: GPL + size: 35128 + timestamp: 1770267175160 +- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda + sha256: 74341b26a2b9475dc14ba3cf12432fcd10a23af285101883e720216d81d44676 + md5: 83aa53cb3f5fc849851a84d777a60551 + depends: + - ld_impl_linux-64 2.45.1 default_hbd61a6d_101 + - sysroot_linux-64 + - zstd >=1.5.7,<1.6.0a0 + license: GPL-3.0-only + license_family: GPL + size: 3744895 + timestamp: 1770267152681 +- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda + sha256: 4826f97d33cbe54459970a1e84500dbe0cccf8326aaf370e707372ae20ec5a47 + md5: dec96579f9a7035a59492bf6ee613b53 + depends: + - binutils_impl_linux-64 2.45.1 default_hfdba357_101 + license: GPL-3.0-only + license_family: GPL + size: 36060 + timestamp: 1770267177798 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + sha256: 0b75d45f0bba3e95dc693336fa51f40ea28c980131fec438afb7ce6118ed05f6 + md5: d2ffd7602c02f2b316fd921d39876885 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: bzip2-1.0.6 + license_family: BSD + size: 260182 + timestamp: 1771350215188 +- conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda + sha256: cc9accf72fa028d31c2a038460787751127317dcfa991f8d1f1babf216bb454e + md5: 920bb03579f15389b9e512095ad995b7 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + size: 207882 + timestamp: 1765214722852 +- conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda + sha256: 8e7a40f16400d7839c82581410aa05c1f8324a693c9d50079f8c50dc9fb241f0 + md5: abd85120de1187b0d1ec305c2173c71b + depends: + - binutils + - gcc + - gcc_linux-64 14.* + 
license: BSD-3-Clause + license_family: BSD + size: 6693 + timestamp: 1753098721814 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda + sha256: 67cc7101b36421c5913a1687ef1b99f85b5d6868da3abbf6ec1a4181e79782fc + md5: 4492fd26db29495f0ba23f146cd5638d + depends: + - __unix + license: ISC + size: 147413 + timestamp: 1772006283803 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + sha256: c6339858a0aaf5d939e00d345c98b99e4558f285942b27232ac098ad17ac7f8e + md5: cf45f4278afd6f4e6d03eda0f435d527 + depends: + - __glibc >=2.17,<3.0.a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - pycparser + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + size: 300271 + timestamp: 1761203085220 +- conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + sha256: aa589352e61bb221351a79e5946d56916e3c595783994884accdb3b97fe9d449 + md5: 381bd45fb7aa032691f3063aff47e3a1 + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 13589 + timestamp: 1763607964133 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda + sha256: 5ece78754577b8d9030ec1f09ce1cd481125f27d8d6fcdcfe2c1017661830c61 + md5: 51d37989c1758b5edfe98518088bf700 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - libcurl >=8.18.0,<9.0a0 + - libexpat >=2.7.4,<3.0a0 + - libgcc >=14 + - liblzma >=5.8.2,<6.0a0 + - libstdcxx >=14 + - libuv >=1.51.0,<2.0a0 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - rhash >=1.4.6,<2.0a0 + - zstd >=1.5.7,<1.6.0a0 + license: BSD-3-Clause + license_family: BSD + size: 22330508 + timestamp: 1771383666798 +- conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + sha256: ab29d57dc70786c1269633ba3dff20288b81664d3ff8d21af995742e2bb03287 + md5: 962b9857ee8e7018c22f2776ffa0b2d7 + depends: + - python >=3.9 + license: BSD-3-Clause + 
license_family: BSD + size: 27011 + timestamp: 1733218222191 +- conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda + sha256: b90ec0e6a9eb22f7240b3584fe785457cff961fec68d40e6aece5d596f9bbd9a + md5: 0e3e144115c43c9150d18fa20db5f31c + depends: + - gcc_impl_linux-64 >=14.3.0,<14.3.1.0a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 31705 + timestamp: 1771378159534 +- conda: .. + name: cuda-bindings + version: 13.2.0 + build: hb0f4dca_0 + subdir: linux-64 + variants: + target_platform: linux-64 + depends: + - python + - cuda-version + - cuda-pathfinder + - libnvjitlink + - cuda-nvrtc + - cuda-nvrtc >=13.2.51,<14.0a0 + - cuda-nvvm + - libnvfatbin + - libcufile + - libcufile >=1.17.0.44,<2.0a0 + - libgcc >=15 + - libgcc >=15 + - libstdcxx >=15 + - python_abi 3.14.* *_cp314 + license: LicenseRef-NVIDIA-SOFTWARE-LICENSE + sources: + cuda-pathfinder: + path: ../cuda_pathfinder +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.1.0-py314ha160325_1.conda + sha256: aecfbbc9a687e5daba66b896613a00c617e3eadc21a31b19e53e8e642e83d7a7 + md5: 3bd3abdf71e1b8c53310195677bf00be + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc >=13,<14.0a0 + - cuda-nvvm-impl >=13,<14.0a0 + - cuda-pathfinder >=1.1.0,<2 + - cuda-version >=13,<14.0a0 + - libcufile >=1,<2.0a0 + - libgcc >=14 + - libnvjitlink >=13.0,<14.0a0 + - libstdcxx >=14 + - numpy + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + constrains: + - cuda-python >=13.1.0,<13.2.0a0 + - cuda-cudart >=13,<14.0a0 + license: LicenseRef-NVIDIA-SOFTWARE-LICENSE + size: 7267159 + timestamp: 1764919647948 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.1.115-ha770c72_0.conda + sha256: 0715f15da71587238600f0584bc8d243d8fde602c3d8856f421b58dff3fb9422 + md5: a179486129ff28d053bb16fdb533568e + depends: + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1277295 + 
timestamp: 1768272295906 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda + sha256: e539baa32e3be63f89bd11d421911363faac322903caf58a15a46ba68ae29867 + md5: 4910b7b709f1168baffc2a742b39a222 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1415308 + timestamp: 1773098874302 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.1.115-ha770c72_0.conda + sha256: 82ae1f3e492146722e258e237daa537f4d4df8157b2dfa49a0869eb41a11d284 + md5: 3723bca2a84e6cc0f0a98427b71bec73 + depends: + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 96480 + timestamp: 1768280269206 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda + sha256: dd9a74a40b196b1ea150b17ca8fb539dd8f75edd349af354a7bae6dbb43e43b4 + md5: 6f4a609f3d142d4b22728823955249e9 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 97122 + timestamp: 1773115163637 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.1.80-hecca717_0.conda + sha256: 00acb7564e7c7dd60be431bd2a1a937856e38a86535d72281461cd193500a0a4 + md5: 2e2b71c8d67f6ceb1d3820aa438f3580 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart_linux-64 13.1.80 h376f20c_0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24159 + timestamp: 1764883525821 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda + sha256: 9cc44fd4914738a32cf5c801925a08c61ce45b5534833cf1df1621236a9a321d + md5: 29f5b46965bd82b0e9cc27a96d13f2bd + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart_linux-64 13.2.51 h376f20c_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24534 + 
timestamp: 1773104357094 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.1.80-hecca717_0.conda + sha256: 12aa5dcf82cdf863be18a48a9ad4d271aa864ef985752bc9707371b84085f0c8 + md5: e3cbe24bf8ae135e9f82450be520e886 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart 13.1.80 hecca717_0 + - cuda-cudart-dev_linux-64 13.1.80 h376f20c_0 + - cuda-cudart-static 13.1.80 hecca717_0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24597 + timestamp: 1764883573873 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda + sha256: f6d81c961b6212389c07ffc9dc1268966db63aa351d46875effee40447eb9dd8 + md5: 9b35a56418b6cbbde5ea5f7d84c26317 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart 13.2.51 hecca717_0 + - cuda-cudart-dev_linux-64 13.2.51 h376f20c_0 + - cuda-cudart-static 13.2.51 hecca717_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24961 + timestamp: 1773104406956 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.1.80-h376f20c_0.conda + sha256: 41a1cc86f2759ef6ae47cc68e2180baaeb4b989709931366ee0cdc90f8e10f5f + md5: a36776a49ae0e47a26e129bdc82aeb3e + depends: + - cuda-cccl_linux-64 + - cuda-cudart-static_linux-64 + - cuda-cudart_linux-64 + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 392459 + timestamp: 1764883538793 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda + sha256: 86dd0dc301bab5263d63f13d47b02507e0cf2fd22ff9aefa37dea2dd03c6df83 + md5: 7e5cf4b991525b7b1a2cfa3f1c81462e + depends: + - cuda-cccl_linux-64 + - cuda-cudart-static_linux-64 + - cuda-cudart_linux-64 + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 399921 + timestamp: 1773104368666 
+- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda + sha256: 7cbf145b3e59d360052556bfe9425753b119c33cbba0c1f20f0191a7330ced5c + md5: 0e5edde73725a13f7d62ddf96b7656b9 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart-static_linux-64 13.1.80 h376f20c_0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24119 + timestamp: 1764883551735 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda + sha256: d4a316038b02161e04a864c8cd146d2ec62cbd114eb951197c6ef6042d3c46c4 + md5: daec4c4dc0355adcdf009dceb3b94259 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart-static_linux-64 13.2.51 h376f20c_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24494 + timestamp: 1773104383494 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda + sha256: 2252e12fa9a806f685684b6395a660d845dc95bdc95e52a6bc09dba8a9eccec3 + md5: be9f8ef5a01fca1f28c8d523f8501771 + depends: + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1121385 + timestamp: 1764883490595 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda + sha256: e3cc51809bd8be0a96bbe01a668f08e6e611c8fba60426c4d9f10926f3159456 + md5: aa9c7d5cd427042ffbd59c9ef6014f98 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1103784 + timestamp: 1773104321614 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda + sha256: fca2951815564c36cf5a4e0f7ed0222429d206fda3d4e1aa3d52a969a293b868 + md5: 4dc4c3a1e010e06035f01d661c1b70bd + depends: + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + 
size: 199654 + timestamp: 1764883502803 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda + sha256: e1d943a5582c8e171c9dcf2c0c72ddd5bf0a2ac9acd6ed15898d69d618cf53c6 + md5: 51a1624c7e26d8821b5d959ee7ecb517 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 203460 + timestamp: 1773104333900 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.1.80-h376f20c_0.conda + sha256: 83bf37d5a3b4a85853cded6a8b90db302b014845b7d9461ccdb84db8c2abfbc3 + md5: 1d7073905d0359ff234545494a933d59 + depends: + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 38992 + timestamp: 1764883514338 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda + sha256: 1b372b7af937a3a2fdb1cbd5356e6b365f3495d899a413ebf98369ab0c5c0c79 + md5: 970891239574056829fc1cfc208278a7 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 39485 + timestamp: 1773104345638 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + sha256: 9cc4f9df70c02eea5121cdb0e865207b04cd52591f57ebcac2ba44fada10eb5b + md5: df16c9049d882cdaf4f83a5b90079589 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 35339417 + timestamp: 1768272955912 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda + sha256: 9de235d328b7124f715805715e9918eb7f8aa5b9c56a2afa62b84f84f98077a5 + md5: 0413baaa73be1a39d5d8e442184acc78 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 35736655 + timestamp: 1773100338749 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda + sha256: 2c929c592ca1909e3944edec62b77403d256156a4010bfa17fb0b948d33e54d3 + md5: 1096fce4abad7dd975ce6d9953fceb6a + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc 13.1.115 hecca717_0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + constrains: + - cuda-nvrtc-static >=13.1.115 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 35845 + timestamp: 1768273073971 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda + sha256: be60eb4e84ff4846b27b323eca402b075f52caf6c138ebb06268fbaa26ef1879 + md5: 83535200a9e77165d5291b4ac82ebf6a + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc 13.2.51 hecca717_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + constrains: + - cuda-nvrtc-static >=13.2.51 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 36305 + timestamp: 1773100458841 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda + sha256: d0111ba8fa12b96d38989d2016ecec0c11410c0e566d839ed54f3925591efb0b + md5: 03cd3639b8e13623c7b91b1cb0136402 + depends: + - cuda-nvvm-dev_linux-64 13.2.51.* + - cuda-nvvm-impl 13.2.51.* + - cuda-nvvm-tools 13.2.51.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 25494 + timestamp: 1773157399568 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda + sha256: f00fce92bf7f1da314654f7693f571a014aaa2ba1fae3762634f3e5be254da83 + md5: 57724ac113f7435762d0c39e1b1ad341 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 28399 + timestamp: 1773115185916 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda + sha256: 12d84615684f1279799c023ce4ccc7c34f151bec2a90e0c8d04798a8c8af437c + md5: bf76661bc0de83a60537c4913f339fb3 + depends: + - __glibc 
>=2.17,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=12 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21873791 + timestamp: 1768280315627 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda + sha256: bea7cbd2ff0f8bf07e0b90d522b4834533b4024237322c09f1b3875970c4abc9 + md5: 3c3872ff2bd6cc6368dcd4b35bb995f2 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=12 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 22202489 + timestamp: 1773115209641 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda + sha256: da5fd2dc57df2047215ff76f295685b1e1e586a46c2e46214120458cee18ee80 + md5: 2df6cd3b3d6d1365a2979285703056f9 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=12 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 25988523 + timestamp: 1773115248060 +- conda: ../../cuda_pathfinder + name: cuda-pathfinder + version: 1.3.4a0 + build: pyh4616a5c_0 + subdir: noarch + variants: + target_platform: noarch + depends: + - python >=3.10 + - python * + license: Apache-2.0 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda + sha256: edf16fdfbcce5bbb445118fd8d070dda8afe36b4b437a94f472fde153bc38151 + md5: 2d13e524da66b60e6e7d5c6585729ea8 + depends: + - python >=3.10 + - cuda-version >=12.0,<14 + - python + license: Apache-2.0 + license_family: APACHE + size: 39327 + timestamp: 1772059437166 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda + sha256: 176ac20fdb95611af8fb2bf0d3d16fee998019b1d0f12fc9ddd5fa0df4553992 + md5: d85448460c25ee43ff2f8346bb9ad52b + constrains: + - cudatoolkit 13.1|13.1.* + - __cuda >=13 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21511 + timestamp: 1757017115788 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + sha256: 64aebe8ccb3a2c3ff446d3c0c0e88ef4fdb069a5732c03539bf3a37243c4c679 + md5: 45676e3dd76b30ec613f1f822d450eff + constrains: + - __cuda >=13 + - cudatoolkit 13.2|13.2.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21908 + timestamp: 1773093709154 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda + sha256: 3fcc97ae3e89c150401a50a4de58794ffc67b1ed0e1851468fcc376980201e25 + md5: 5da8c935dca9186673987f79cef0b2a5 + depends: + - c-compiler 1.11.0 h4d9bdce_0 + - gxx + - gxx_linux-64 14.* + license: BSD-3-Clause + license_family: BSD + size: 6635 + timestamp: 1753098722177 +- conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + sha256: 6d977f0b2fc24fee21a9554389ab83070db341af6d6f09285360b2e09ef8b26e + md5: 003b8ba0a94e2f1e117d0bd46aebc901 + depends: + - python >=3.9 + license: Apache-2.0 + license_family: APACHE + size: 275642 + timestamp: 1752823081585 +- conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + sha256: ee6cf346d017d954255bbcbdb424cddea4d14e4ed7e9813e429db1d795d01144 + md5: 8e662bd460bda79b1ea39194e3c4c9ab + depends: + - python >=3.10 + - typing_extensions >=4.6.0 + license: MIT and PSF-2.0 + size: 21333 + timestamp: 1763918099466 +- conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda + sha256: dddea9ec53d5e179de82c24569d41198f98db93314f0adae6b15195085d5567f + md5: f58064cec97b12a7136ebb8a6f8a129b + depends: + - python >=3.10 + license: Unlicense + size: 25845 + timestamp: 1773314012590 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda + sha256: 9b34b57b06b485e33a40d430f71ac88c8f381673592507cf7161c50ff0832772 + md5: 52d6457abc42e320787ada5f9033fa99 + depends: + - conda-gcc-specs + - gcc_impl_linux-64 14.3.0 hbdf3cc3_18 + license: BSD-3-Clause + license_family: BSD + 
size: 29506 + timestamp: 1771378321585 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda + sha256: 3b31a273b806c6851e16e9cf63ef87cae28d19be0df148433f3948e7da795592 + md5: 30bb690150536f622873758b0e8d6712 + depends: + - binutils_impl_linux-64 >=2.45 + - libgcc >=14.3.0 + - libgcc-devel_linux-64 14.3.0 hf649bbc_118 + - libgomp >=14.3.0 + - libsanitizer 14.3.0 h8f1669f_18 + - libstdcxx >=14.3.0 + - libstdcxx-devel_linux-64 14.3.0 h9f08a49_118 + - sysroot_linux-64 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 76302378 + timestamp: 1771378056505 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda + sha256: 27ad0cd10dccffca74e20fb38c9f8643ff8fce56eee260bf89fa257d5ab0c90a + md5: 1403ed5fe091bd7442e4e8a229d14030 + depends: + - gcc_impl_linux-64 14.3.0.* + - binutils_linux-64 + - sysroot_linux-64 + license: BSD-3-Clause + license_family: BSD + size: 28946 + timestamp: 1770908213807 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda + sha256: 1b490c9be9669f9c559db7b2a1f7d8b973c58ca0c6f21a5d2ba3f0ab2da63362 + md5: 19189121d644d4ef75fed05383bc75f5 + depends: + - gcc 14.3.0 h0dff253_18 + - gxx_impl_linux-64 14.3.0 h2185e75_18 + license: BSD-3-Clause + license_family: BSD + size: 28883 + timestamp: 1771378355605 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda + sha256: 38ffca57cc9c264d461ac2ce9464a9d605e0f606d92d831de9075cb0d95fc68a + md5: 6514b3a10e84b6a849e1b15d3753eb22 + depends: + - gcc_impl_linux-64 14.3.0 hbdf3cc3_18 + - libstdcxx-devel_linux-64 14.3.0 h9f08a49_118 + - sysroot_linux-64 + - tzdata + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 14566100 + timestamp: 1771378271421 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda + sha256: 
1e07c197e0779fa9105e59cd55a835ded96bfde59eb169439736a89b27b48e5d + md5: 7b51f4ff82eeb1f386bfee20a7bed3ed + depends: + - gxx_impl_linux-64 14.3.0.* + - gcc_linux-64 ==14.3.0 h298d278_21 + - binutils_linux-64 + - sysroot_linux-64 + license: BSD-3-Clause + license_family: BSD + size: 27503 + timestamp: 1770908213813 +- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda + sha256: 142a722072fa96cf16ff98eaaf641f54ab84744af81754c292cb81e0881c0329 + md5: 186a18e3ba246eccfc7cff00cd19a870 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + size: 12728445 + timestamp: 1767969922681 +- conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda + sha256: 7cd5eccdb171a0adbf83a1ad8fc4e17822f4fc3f5518da9040de64e88bc07343 + md5: 5b7ae2ec4e0750e094f804a6cf1b2a37 + depends: + - python >=3.10 + - ukkonen + license: MIT + license_family: MIT + size: 79520 + timestamp: 1772402363021 +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda + sha256: c18ab120a0613ada4391b15981d86ff777b5690ca461ea7e9e49531e8f374745 + md5: 63ccfdc3a3ce25b027b8767eb722fca8 + depends: + - python >=3.9 + - zipp >=3.20 + - python + license: Apache-2.0 + license_family: APACHE + size: 34641 + timestamp: 1747934053147 +- conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + sha256: e1a9e3b1c8fe62dc3932a616c284b5d8cbe3124bbfbedcf4ce5c828cb166ee19 + md5: 9614359868482abba1bd15ce465e3c42 + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 13387 + timestamp: 1760831448842 +- conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda + sha256: 41557eeadf641de6aeae49486cef30d02a6912d8da98585d687894afd65b356a + md5: 86d9cba083cd041bfbf242a01a7a1999 + constrains: + - sysroot_linux-64 ==2.28 + license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND 
GPL-2.0-or-later + license_family: GPL + size: 1278712 + timestamp: 1765578681495 +- conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda + sha256: 0960d06048a7185d3542d850986d807c6e37ca2e644342dd0c72feefcf26c2a4 + md5: b38117a3c920364aff79f870c984b4a3 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: LGPL-2.1-or-later + size: 134088 + timestamp: 1754905959823 +- conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda + sha256: 3e307628ca3527448dd1cb14ad7bb9d04d1d28c7d4c5f97ba196ae984571dd25 + md5: fb53fb07ce46a575c5d004bbc96032c2 + depends: + - __glibc >=2.17,<3.0.a0 + - keyutils >=1.6.3,<2.0a0 + - libedit >=3.1.20250104,<3.2.0a0 + - libedit >=3.1.20250104,<4.0a0 + - libgcc >=14 + - libstdcxx >=14 + - openssl >=3.5.5,<4.0a0 + license: MIT + license_family: MIT + size: 1386730 + timestamp: 1769769569681 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + sha256: 565941ac1f8b0d2f2e8f02827cbca648f4d18cd461afc31f15604cd291b5c5f3 + md5: 12bd9a3f089ee6c9266a37dab82afabd + depends: + - __glibc >=2.17,<3.0.a0 + - zstd >=1.5.7,<1.6.0a0 + constrains: + - binutils_impl_linux-64 2.45.1 + license: GPL-3.0-only + license_family: GPL + size: 725507 + timestamp: 1770267139900 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + build_number: 5 + sha256: 18c72545080b86739352482ba14ba2c4815e19e26a7417ca21a95b76ec8da24c + md5: c160954f7418d7b6e87eaf05a8913fa9 + depends: + - libopenblas >=0.3.30,<0.3.31.0a0 + - libopenblas >=0.3.30,<1.0a0 + constrains: + - mkl <2026 + - liblapack 3.11.0 5*_openblas + - libcblas 3.11.0 5*_openblas + - blas 2.305 openblas + - liblapacke 3.11.0 5*_openblas + license: BSD-3-Clause + license_family: BSD + size: 18213 + timestamp: 1765818813880 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda + sha256: 
9517cce5193144af0fcbf19b7bd67db0a329c2cc2618f28ffecaa921a1cbe9d3 + md5: 09c264d40c67b82b49a3f3b89037bd2e + depends: + - __glibc >=2.17,<3.0.a0 + - attr >=2.5.2,<2.6.0a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + size: 121429 + timestamp: 1762349484074 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + build_number: 5 + sha256: 0cbdcc67901e02dc17f1d19e1f9170610bd828100dc207de4d5b6b8ad1ae7ad8 + md5: 6636a2b6f1a87572df2970d3ebc87cc0 + depends: + - libblas 3.11.0 5_h4a7cf45_openblas + constrains: + - liblapacke 3.11.0 5*_openblas + - blas 2.305 openblas + - liblapack 3.11.0 5*_openblas + license: BSD-3-Clause + license_family: BSD + size: 18194 + timestamp: 1765818837135 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + sha256: 8c44b5bf947afad827df0df49fe7483cf1b2916694081b2db4fecdfd6a2bacd1 + md5: 48418c48dac04671fa46cb446122b8a5 + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + - rdma-core >=60.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 990938 + timestamp: 1768273732081 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + sha256: dc2b0c43aeacbaa686061353807e718236d8c5b346f624e76fed98b066898e19 + md5: 6d8ed8335d144ec7303b8d3587b2205c + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + - rdma-core >=61.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1085341 + timestamp: 1773100191342 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda + sha256: c84e8dccb65ad5149c0121e4b54bdc47fa39303fd5f4979b8c44bb51b39a369b + md5: 1707cdd636af2ff697b53186572c9f77 + depends: + - __glibc >=2.17,<3.0.a0 + - krb5 >=1.22.2,<1.23.0a0 + - libgcc >=14 + - libnghttp2 >=1.67.0,<2.0a0 + - libssh2 >=1.11.1,<2.0a0 + - libzlib >=1.3.1,<2.0a0 + - 
openssl >=3.5.5,<4.0a0 + - zstd >=1.5.7,<1.6.0a0 + license: curl + license_family: MIT + size: 463621 + timestamp: 1770892808818 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + sha256: d789471216e7aba3c184cd054ed61ce3f6dac6f87a50ec69291b9297f8c18724 + md5: c277e0a4d549b03ac1e9d6cbbe3d017b + depends: + - ncurses + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - ncurses >=6.5,<7.0a0 + license: BSD-2-Clause + license_family: BSD + size: 134676 + timestamp: 1738479519902 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda + sha256: 1cd6048169fa0395af74ed5d8f1716e22c19a81a8a36f934c110ca3ad4dd27b4 + md5: 172bf1cd1ff8629f2b1179945ed45055 + depends: + - libgcc-ng >=12 + license: BSD-2-Clause + license_family: BSD + size: 112766 + timestamp: 1702146165126 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda + sha256: d78f1d3bea8c031d2f032b760f36676d87929b18146351c4464c66b0869df3f5 + md5: e7f7ce06ec24cfcfb9e36d28cf82ba57 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - expat 2.7.4.* + license: MIT + license_family: MIT + size: 76798 + timestamp: 1771259418166 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 + md5: a360c33a5abe61c07959e449fa1453eb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + size: 58592 + timestamp: 1769456073053 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + sha256: faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 + md5: 0aa00f03f9e39fb9876085dee11a85d4 + depends: + - __glibc >=2.17,<3.0.a0 + - _openmp_mutex >=4.5 + constrains: + - libgcc-ng ==15.2.0=*_18 + - libgomp 15.2.0 he0feb66_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 1041788 + timestamp: 1771378212382 
+- conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda + sha256: 1abc6a81ee66e8ac9ac09a26e2d6ad7bba23f0a0cc3a6118654f036f9c0e1854 + md5: 06901733131833f5edd68cf3d9679798 + depends: + - __unix + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 3084533 + timestamp: 1771377786730 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda + sha256: e318a711400f536c81123e753d4c797a821021fb38970cebfb3f454126016893 + md5: d5e96b1ed75ca01906b3d2469b4ce493 + depends: + - libgcc 15.2.0 he0feb66_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 27526 + timestamp: 1771378224552 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + sha256: d2c9fad338fd85e4487424865da8e74006ab2e2475bd788f624d7a39b2a72aee + md5: 9063115da5bc35fdc3e1002e69b9ef6e + depends: + - libgfortran5 15.2.0 h68bc16d_18 + constrains: + - libgfortran-ng ==15.2.0=*_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 27523 + timestamp: 1771378269450 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + sha256: 539b57cf50ec85509a94ba9949b7e30717839e4d694bc94f30d41c9d34de2d12 + md5: 646855f357199a12f02a87382d429b75 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=15.2.0 + constrains: + - libgfortran 15.2.0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 2482475 + timestamp: 1771378241063 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 + md5: 239c5e9546c38a1e884d69effcf4c882 + depends: + - __glibc >=2.17,<3.0.a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 603262 + timestamp: 1771378117851 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda 
+ build_number: 5 + sha256: c723b6599fcd4c6c75dee728359ef418307280fa3e2ee376e14e85e5bbdda053 + md5: b38076eb5c8e40d0106beda6f95d7609 + depends: + - libblas 3.11.0 5_h4a7cf45_openblas + constrains: + - blas 2.305 openblas + - liblapacke 3.11.0 5*_openblas + - libcblas 3.11.0 5*_openblas + license: BSD-3-Clause + license_family: BSD + size: 18200 + timestamp: 1765818857876 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda + sha256: 755c55ebab181d678c12e49cced893598f2bab22d582fbbf4d8b83c18be207eb + md5: c7c83eecbb72d88b940c249af56c8b17 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - xz 5.8.2.* + license: 0BSD + size: 113207 + timestamp: 1768752626120 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 + md5: 2c21e66f50753a083cbe6b80f38268fa + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-2-Clause + license_family: BSD + size: 92400 + timestamp: 1769482286018 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda + sha256: a4a7dab8db4dc81c736e9a9b42bdfd97b087816e029e221380511960ac46c690 + md5: b499ce4b026493a13774bcf0f4c33849 + depends: + - __glibc >=2.17,<3.0.a0 + - c-ares >=1.34.5,<2.0a0 + - libev >=4.33,<4.34.0a0 + - libev >=4.33,<5.0a0 + - libgcc >=14 + - libstdcxx >=14 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.2,<4.0a0 + license: MIT + license_family: MIT + size: 666600 + timestamp: 1756834976695 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + sha256: ba7c5d294e3d80f08ac5a39564217702d1a752e352e486210faff794ac5001b4 + md5: db63358239cbe1ff86242406d440e44a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: LGPL-2.1-or-later + license_family: LGPL + size: 741323 + timestamp: 1731846827427 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda + sha256: 66b7bbe40d259e4927b9c264569afd49d0e31a3813c585beea63f3415577f1b3 + md5: 7e6534bce7252c84efdedae1fae2148e + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 471076 + timestamp: 1773100181931 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.1.115-hecca717_1.conda + sha256: 6b5300bf9952da4bfdbfb45c13b042d786a0daffb1bd2fa45ea9ad971703fe96 + md5: 851acc1af02d31c732b931b9ffddc2d9 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 31328660 + timestamp: 1771443943495 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda + sha256: 2ca45a2c9e6cc307cea3c8a1bf27bceb745fa5e1150d7b768b63a781eeaee7a2 + md5: 20a82402e6851e5d4e0b13ee1083d370 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 31691081 + timestamp: 1773100788615 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + sha256: 199d79c237afb0d4780ccd2fbf829cea80743df60df4705202558675e07dd2c5 + md5: be43915efc66345cccb3c310b6ed0374 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libgfortran + - libgfortran5 >=14.3.0 + constrains: + - openblas >=0.3.30,<0.3.31.0a0 + license: BSD-3-Clause + license_family: BSD + size: 5927939 + timestamp: 1763114673331 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda + sha256: e03ed186eefb46d7800224ad34bad1268c9d19ecb8f621380a50601c6221a4a7 + md5: ad3a0e2dc4cce549b2860e2ef0e6d75b + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14.3.0 + - libstdcxx >=14.3.0 + license: 
GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 7949259 + timestamp: 1771377982207 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + sha256: 04596fcee262a870e4b7c9807224680ff48d4d0cc0dac076a602503d3dc6d217 + md5: da5be73701eecd0e8454423fd6ffcf30 + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.2,<79.0a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + license: blessing + size: 942808 + timestamp: 1768147973361 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda + sha256: fa39bfd69228a13e553bd24601332b7cfeb30ca11a3ca50bb028108fe90a7661 + md5: eecce068c7e4eddeb169591baac20ac4 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.0,<4.0a0 + license: BSD-3-Clause + license_family: BSD + size: 304790 + timestamp: 1745608545575 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + sha256: 78668020064fdaa27e9ab65cd2997e2c837b564ab26ce3bf0e58a2ce1a525c6e + md5: 1b08cd684f34175e4514474793d44bcb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc 15.2.0 he0feb66_18 + constrains: + - libstdcxx-ng ==15.2.0=*_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 5852330 + timestamp: 1771378262446 +- conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda + sha256: b1c3824769b92a1486bf3e2cc5f13304d83ae613ea061b7bc47bb6080d6dfdba + md5: 865a399bce236119301ebd1532fced8d + depends: + - __unix + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 20171098 + timestamp: 1771377827750 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda + sha256: f0356bb344a684e7616fc84675cfca6401140320594e8686be30e8ac7547aed2 + md5: 1d4c18d75c51ed9d00092a891a547a7d + depends: + - __glibc >=2.17,<3.0.a0 + - libcap >=2.77,<2.78.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + size: 491953 
+ timestamp: 1770738638119 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda + sha256: ed4d2c01fbeb1330f112f7e399408634db277d3dfb2dec1d0395f56feaa24351 + md5: 6c74fba677b61a0842cbf0f63eee683b + depends: + - __glibc >=2.17,<3.0.a0 + - libcap >=2.77,<2.78.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + size: 144654 + timestamp: 1770738650966 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + sha256: 1a7539cfa7df00714e8943e18de0b06cceef6778e420a5ee3a2a145773758aee + md5: db409b7c1720428638e7c0d509d3e1b5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + size: 40311 + timestamp: 1766271528534 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda + sha256: c180f4124a889ac343fc59d15558e93667d894a966ec6fdb61da1604481be26b + md5: 0f03292cc56bf91a077a134ea8747118 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + size: 895108 + timestamp: 1753948278280 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 + md5: edb0dca6bc32e4f4789199455a1dbeb8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + constrains: + - zlib 1.3.1 *_2 + license: Zlib + license_family: Other + size: 60963 + timestamp: 1727963148474 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + sha256: 3fde293232fa3fca98635e1167de6b7c7fda83caf24b9d6c91ec9eefb4f4d586 + md5: 47e340acb35de30501a76c7c799c41d7 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: X11 AND BSD-3-Clause + size: 891641 + timestamp: 1738195959188 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda + sha256: 6f7d59dbec0a7b00bf5d103a4306e8886678b796ff2151b62452d4582b2a53fb + md5: b518e9e92493721281a60fa975bddc65 + depends: + - libstdcxx 
>=14 + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + license: Apache-2.0 + license_family: APACHE + size: 186323 + timestamp: 1763688260928 +- conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + sha256: 4fa40e3e13fc6ea0a93f67dfc76c96190afd7ea4ffc1bac2612d954b42cdc3ee + md5: eb52d14a901e23c39e9e7b4a1a5c015f + depends: + - python >=3.10 + - setuptools + license: BSD-3-Clause + license_family: BSD + size: 40866 + timestamp: 1766261270149 +- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda + sha256: 1d8377c8001c15ed12c2713b723213474b435706ab9d34ede69795d64af9e94d + md5: 4ea6b620fdf24a1a0bc4f1c7134dfafb + depends: + - python + - libstdcxx >=14 + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - libcblas >=3.9.0,<4.0a0 + - python_abi 3.14.* *_cp314 + - libblas >=3.9.0,<4.0a0 + - liblapack >=3.9.0,<4.0a0 + constrains: + - numpy-base <0a0 + license: BSD-3-Clause + license_family: BSD + size: 8926994 + timestamp: 1770098474394 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + sha256: 44c877f8af015332a5d12f5ff0fb20ca32f896526a7d0cdb30c769df1144fb5c + md5: f61eb8cd60ff9057122a3d338b99c00f + depends: + - __glibc >=2.17,<3.0.a0 + - ca-certificates + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + size: 3164551 + timestamp: 1769555830639 +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda + sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 + md5: b76541e68fea4d511b1ac46a28dcd2c6 + depends: + - python >=3.8 + - python + license: Apache-2.0 + license_family: APACHE + size: 72010 + timestamp: 1769093650580 +- conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda + sha256: 0289f0a38337ee201d984f8f31f11f6ef076cfbbfd0ab9181d12d9d1d099bf46 + md5: 82c1787f2a65c0155ef9652466ee98d6 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 
25646 + timestamp: 1773199142345 +- conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + sha256: e14aafa63efa0528ca99ba568eaf506eb55a0371d12e6250aaaa61718d2eb62e + md5: d7585b6550ad04c8c5e21097ada2888e + depends: + - python >=3.9 + - python + license: MIT + license_family: MIT + size: 25877 + timestamp: 1764896838868 +- conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda + sha256: 5b81b7516d4baf43d0c185896b245fa7384b25dc5615e7baa504b7fa4e07b706 + md5: 7f3ac694319c7eaf81a0325d6405e974 + depends: + - cfgv >=2.0.0 + - identify >=1.0.0 + - nodeenv >=0.11.1 + - python >=3.10 + - pyyaml >=5.1 + - virtualenv >=20.10.0 + license: MIT + license_family: MIT + size: 200827 + timestamp: 1765937577534 +- conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda + sha256: 6d8f03c13d085a569fde931892cded813474acbef2e03381a1a87f420c7da035 + md5: 46830ee16925d5ed250850503b5dc3a8 + depends: + - python >=3.9 + license: MIT + license_family: MIT + size: 25766 + timestamp: 1733236452235 +- conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + sha256: 79db7928d13fab2d892592223d7570f5061c192f27b9febd1a418427b719acc6 + md5: 12c566707c80111f9799308d9e265aef + depends: + - python >=3.9 + - python + license: BSD-3-Clause + license_family: BSD + size: 110100 + timestamp: 1733195786147 +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a + md5: 6b6ece66ebcae2d5f326c77ef2c5a066 + depends: + - python >=3.9 + license: BSD-2-Clause + license_family: BSD + size: 889287 + timestamp: 1750615908735 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + sha256: 438c41b42530874928733299ca815f5994f36996c86024f3f37ca220ed910a07 + md5: ed166875b3876d5d7e6e39d2e8d1c6e3 + depends: + - python >=3.14,<3.15.0a0 + - python_abi 
3.14.* *_cp314 + - six + license: MIT + license_family: MIT + size: 273897 + timestamp: 1765980972868 +- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda + sha256: 9e749fb465a8bedf0184d8b8996992a38de351f7c64e967031944978de03a520 + md5: 2b694bad8a50dc2f712f5368de866480 + depends: + - pygments >=2.7.2 + - python >=3.10 + - iniconfig >=1.0.1 + - packaging >=22 + - pluggy >=1.5,<2 + - tomli >=1 + - colorama >=0.4 + - exceptiongroup >=1 + - python + constrains: + - pytest-faulthandler >=2 + license: MIT + license_family: MIT + size: 299581 + timestamp: 1765062031645 +- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda + sha256: 2f2229415a6e5387c1faaedf442ea8c07471cb2bf5ad1007b9cfb83ea85ca29a + md5: 0e7294ed4af8b833fcd2c101d647c3da + depends: + - py-cpuinfo + - pytest >=8.1 + - python >=3.10 + license: BSD-2-Clause + license_family: BSD + size: 43976 + timestamp: 1762716480208 +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda + build_number: 101 + sha256: cb0628c5f1732f889f53a877484da98f5a0e0f47326622671396fb4f2b0cd6bd + md5: c014ad06e60441661737121d3eae8a60 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 + - libexpat >=2.7.3,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - liblzma >=5.8.2,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.51.2,<4.0a0 + - libuuid >=2.41.3,<3.0a0 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.5,<4.0a0 + - python_abi 3.14.* *_cp314 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - zstd >=1.5.7,<1.6.0a0 + license: Python-2.0 + size: 36702440 + timestamp: 1770675584356 + python_site_packages_path: lib/python3.14/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda + sha256: 36429765f626c345710fbae14aeeda676c1745427667eb480bb855b7089affba + md5: 
69fc0a99fc21b26b81026c72e00f83df + depends: + - python >=3.10 + - filelock >=3.15.4 + - platformdirs <5,>=4.3.6 + - python + license: MIT + license_family: MIT + size: 33996 + timestamp: 1773161039118 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + build_number: 8 + sha256: ad6d2e9ac39751cc0529dd1566a26751a0bf2542adb0c232533d32e176e21db5 + md5: 0539938c55b6b1a59b560e843ad864a4 + constrains: + - python 3.14.* *_cp314 + license: BSD-3-Clause + license_family: BSD + size: 6989 + timestamp: 1752805904792 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + sha256: b318fb070c7a1f89980ef124b80a0b5ccf3928143708a85e0053cde0169c699d + md5: 2035f68f96be30dc60a5dfd7452c7941 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + - yaml >=0.2.5,<0.3.0a0 + license: MIT + license_family: MIT + size: 202391 + timestamp: 1770223462836 +- conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + sha256: 8e0b7962cf8bec9a016cd91a6c6dc1f9ebc8e7e316b1d572f7b9047d0de54717 + md5: d487d93d170e332ab39803e05912a762 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libnl >=3.11.0,<4.0a0 + - libstdcxx >=14 + - libsystemd0 >=257.10 + - libudev1 >=257.10 + license: Linux-OpenIB + license_family: BSD + size: 1268666 + timestamp: 1769154883613 +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 + md5: d7d95fc8287ea7bf33e0e7116d2b95ec + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + size: 345073 + timestamp: 1765813471974 +- conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda + sha256: d5c73079c1dd2c2a313c3bfd81c73dbd066b7eb08d213778c8bff520091ae894 + md5: c1c9b02933fdb2cfb791d936c20e887e + depends: + - 
__glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: MIT + license_family: MIT + size: 193775 + timestamp: 1748644872902 +- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + sha256: 82088a6e4daa33329a30bc26dc19a98c7c1d3f05c0f73ce9845d4eab4924e9e1 + md5: 8e194e7b992f99a5015edbd4ebd38efd + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 639697 + timestamp: 1773074868565 +- conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + sha256: 458227f759d5e3fcec5d9b7acce54e10c9e1f4f4b7ec978f3bfd54ce4ee9853d + md5: 3339e3b65d58accf4ca4fb8748ab16b3 + depends: + - python >=3.9 + - python + license: MIT + license_family: MIT + size: 18455 + timestamp: 1753199211006 +- conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda + sha256: c47299fe37aebb0fcf674b3be588e67e4afb86225be4b0d452c7eb75c086b851 + md5: 13dc3adbc692664cd3beabd216434749 + depends: + - __glibc >=2.28 + - kernel-headers_linux-64 4.18.0 he073ed8_9 + - tzdata + license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND GPL-2.0-or-later + license_family: GPL + size: 24008591 + timestamp: 1765578833462 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac + md5: cffd3bdd58090148f4cfcd831f4b26ab + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + constrains: + - xorg-libx11 >=1.8.12,<2.0a0 + license: TCL + license_family: BSD + size: 3301196 + timestamp: 1769460227866 +- conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda + sha256: 62940c563de45790ba0f076b9f2085a842a65662268b02dd136a8e9b1eaf47a8 + md5: 72e780e9aa2d0a3295f59b1874e3768b + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 21453 + timestamp: 1768146676791 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 + md5: 0caa1af407ecff61170c9437a808404d + depends: + - python >=3.10 + - python + license: PSF-2.0 + license_family: PSF + size: 51692 + timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c + md5: ad659d0a2b3e47e38d829aa8cad2d610 + license: LicenseRef-Public-Domain + size: 119135 + timestamp: 1767016325805 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + sha256: c84034056dc938c853e4f61e72e5bd37e2ec91927a661fb9762f678cbea52d43 + md5: 5d3c008e54c7f49592fca9c32896a76f + depends: + - __glibc >=2.17,<3.0.a0 + - cffi + - libgcc >=14 + - libstdcxx >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + size: 15004 + timestamp: 1769438727085 +- conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda + sha256: b83246d145ba0e6814d2ed0b616293e56924e6c7d6649101f5a4f97f9e757ed1 + md5: 704c22301912f7e37d0a92b2e7d5942d + depends: + - python >=3.10 + - distlib >=0.3.7,<1 + - filelock <4,>=3.24.2 + - importlib-metadata >=6.6 + - platformdirs >=3.9.1,<5 + - python-discovery >=1 + - typing_extensions >=4.13.2 + - python + license: MIT + license_family: MIT + size: 4647775 + timestamp: 1773133660203 +- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + sha256: 6d9ea2f731e284e9316d95fa61869fe7bbba33df7929f82693c121022810f4ad + md5: a77f85f77be52ff59391544bfe73390a + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + license: MIT + license_family: MIT + size: 85189 + timestamp: 1753484064210 +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + sha256: 
b4533f7d9efc976511a73ef7d4a2473406d7f4c750884be8e8620b0ce70f4dae + md5: 30cd29cb87d819caead4d55184c1d115 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 24194 + timestamp: 1764460141901 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 + md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 + depends: + - __glibc >=2.17,<3.0.a0 + - libzlib >=1.3.1,<2.0a0 + license: BSD-3-Clause + license_family: BSD + size: 601375 + timestamp: 1764777111296 diff --git a/benchmarks/cuda_bindings/pixi.toml b/benchmarks/cuda_bindings/pixi.toml new file mode 100644 index 0000000000..a448e8d3e4 --- /dev/null +++ b/benchmarks/cuda_bindings/pixi.toml @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +[workspace] +channels = ["conda-forge"] +platforms = ["linux-64"] +preview = ["pixi-build"] +channel-priority = "disabled" + +[feature.cu13.system-requirements] +cuda = "13" + +[feature.cu13-pinned.dependencies] +cuda-version = "13.1.*" + +[feature.cu13-source.dependencies] +cuda-version = "13.*" + +[feature.bench.dependencies] +python = "3.14.*" +pyperf = "*" +pytest = "*" +pytest-benchmark = "*" +numpy = "*" + +[feature.cpp-bench.dependencies] +cmake = "*" +ninja = "*" +cxx-compiler = "*" +cuda-cudart-dev = "*" +cuda-nvrtc-dev = "*" + +[feature.cpp-bench.target.linux-64.dependencies] +cuda-crt-dev_linux-64 = "*" +cuda-driver-dev_linux-64 = "*" + +[feature.cpp-bench.target.linux-64.activation.env] +CUDA_HOME = "$CONDA_PREFIX/targets/x86_64-linux" + +[feature.dev.dependencies] +pre-commit = "*" + +[feature.bindings-wheel.dependencies] +cuda-bindings = "==13.1.0" + +[feature.bindings-source.dependencies] +cuda-bindings = { path = ".." 
} + +[environments] +wheel = { features = ["cu13", "cu13-pinned", "bench", "cpp-bench", "dev", "bindings-wheel"] } +source = { features = ["cu13", "cu13-source", "bench", "cpp-bench", "dev", "bindings-source"] } + +[target.linux.tasks.bench] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py"] + +[target.linux.tasks.bench-smoke-test] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py", "--fast", "--min-time", "1" +] + +[target.linux.tasks.bench-legacy] +cmd = "pytest --benchmark-only --override-ini 'addopts=' $PIXI_PROJECT_ROOT/pytest-legacy/" + +[target.linux.tasks.bench-cpp-configure] +cmd = [ + "cmake", + "-S", + "$PIXI_PROJECT_ROOT/benchmarks/cpp", + "-B", + "$PIXI_PROJECT_ROOT/.build/cpp", + "-G", + "Ninja", +] + +[target.linux.tasks.bench-cpp-build] +cmd = ["cmake", "--build", "$PIXI_PROJECT_ROOT/.build/cpp"] +depends-on = [{ task = "bench-cpp-configure" }] + +[target.linux.tasks.bench-cpp] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_cpp.py"] +depends-on = [{ task = "bench-cpp-build" }] + +[target.linux.tasks.bench-compare] +cmd = ["python", "$PIXI_PROJECT_ROOT/compare.py"] + +[target.linux.tasks.lint] +cmd = ["pre-commit", "run", "--all-files"] diff --git a/benchmarks/cuda_bindings/pytest-legacy/conftest.py b/benchmarks/cuda_bindings/pytest-legacy/conftest.py new file mode 100644 index 0000000000..0ea7b1d772 --- /dev/null +++ b/benchmarks/cuda_bindings/pytest-legacy/conftest.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import numpy as np +import pytest + +from cuda.bindings import driver as cuda +from cuda.bindings import nvrtc +from cuda.bindings import runtime as cudart + + +def ASSERT_DRV(err): + if isinstance(err, cuda.CUresult): + if err != cuda.CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Cuda Error: {err}") + elif isinstance(err, cudart.cudaError_t): + if err != cudart.cudaError_t.cudaSuccess: + raise RuntimeError(f"Cudart Error: {err}") + elif isinstance(err, nvrtc.nvrtcResult): + if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: + raise RuntimeError(f"Nvrtc Error: {err}") + else: + raise RuntimeError(f"Unknown error type: {err}") + + +@pytest.fixture +def init_cuda(): + # Initialize + (err,) = cuda.cuInit(0) + ASSERT_DRV(err) + err, device = cuda.cuDeviceGet(0) + ASSERT_DRV(err) + err, ctx = cuda.cuCtxCreate(None, 0, device) + ASSERT_DRV(err) + + # create stream + err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + ASSERT_DRV(err) + + yield device, ctx, stream + + (err,) = cuda.cuStreamDestroy(stream) + ASSERT_DRV(err) + (err,) = cuda.cuCtxDestroy(ctx) + ASSERT_DRV(err) + + +@pytest.fixture +def load_module(): + module = None + + def _load_module(kernel_string, device): + nonlocal module + # Get module + err, major = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device + ) + ASSERT_DRV(err) + err, minor = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device + ) + ASSERT_DRV(err) + + err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"kernelString.cu", 0, [], []) + ASSERT_DRV(err) + opts = [b"--fmad=false", bytes("--gpu-architecture=sm_" + str(major) + str(minor), "ascii")] + (err,) = nvrtc.nvrtcCompileProgram(prog, 2, opts) + + err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog) + ASSERT_DRV(err_log) + log = b" " * logSize + (err_log,) = 
nvrtc.nvrtcGetProgramLog(prog, log) + ASSERT_DRV(err_log) + result = log.decode() + if len(result) > 1: + print(result) + + ASSERT_DRV(err) + err, cubinSize = nvrtc.nvrtcGetCUBINSize(prog) + ASSERT_DRV(err) + cubin = b" " * cubinSize + (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin) + ASSERT_DRV(err) + cubin = np.char.array(cubin) + err, module = cuda.cuModuleLoadData(cubin) + ASSERT_DRV(err) + + return module + + yield _load_module + + (err,) = cuda.cuModuleUnload(module) + ASSERT_DRV(err) diff --git a/benchmarks/cuda_bindings/pytest-legacy/kernels.py b/benchmarks/cuda_bindings/pytest-legacy/kernels.py new file mode 100644 index 0000000000..36646fba00 --- /dev/null +++ b/benchmarks/cuda_bindings/pytest-legacy/kernels.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +kernel_string = """\ +#define ITEM_PARAM(x, T) T x +#define REP1(x, T) , ITEM_PARAM(x, T) +#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) +#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) +#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) +#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) +#define REP32(x, T) REP16(x##0, T) REP16(x##1, T) +#define REP64(x, T) REP32(x##0, T) REP32(x##1, T) +#define REP128(x, T) REP64(x##0, T) REP64(x##1, T) +#define REP256(x, T) REP128(x##0, T) REP128(x##1, T) + +template +struct KernelFunctionParam +{ + unsigned char p[maxBytes]; +}; + +extern "C" __global__ void small_kernel(float *f) +{ + *f = 0.0f; +} + +extern "C" __global__ void empty_kernel() +{ + return; +} + +extern "C" __global__ +void small_kernel_512_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*) + REP16(A, int*) + REP32(A, int*) + REP64(A, int*) + REP128(A, int*) + REP256(A, int*)) +{ + *F = 0; +} + +extern "C" __global__ +void small_kernel_512_bools( + ITEM_PARAM(F, bool) + REP1(A, bool) + REP2(A, bool) + REP4(A, bool) + REP8(A, bool) 
+ REP16(A, bool) + REP32(A, bool) + REP64(A, bool) + REP128(A, bool) + REP256(A, bool)) +{ + return; +} + +extern "C" __global__ +void small_kernel_512_ints( + ITEM_PARAM(F, int) + REP1(A, int) + REP2(A, int) + REP4(A, int) + REP8(A, int) + REP16(A, int) + REP32(A, int) + REP64(A, int) + REP128(A, int) + REP256(A, int)) +{ + return; +} + +extern "C" __global__ +void small_kernel_512_doubles( + ITEM_PARAM(F, double) + REP1(A, double) + REP2(A, double) + REP4(A, double) + REP8(A, double) + REP16(A, double) + REP32(A, double) + REP64(A, double) + REP128(A, double) + REP256(A, double)) +{ + return; +} + +extern "C" __global__ +void small_kernel_512_chars( + ITEM_PARAM(F, char) + REP1(A, char) + REP2(A, char) + REP4(A, char) + REP8(A, char) + REP16(A, char) + REP32(A, char) + REP64(A, char) + REP128(A, char) + REP256(A, char)) +{ + return; +} + +extern "C" __global__ +void small_kernel_512_longlongs( + ITEM_PARAM(F, long long) + REP1(A, long long) + REP2(A, long long) + REP4(A, long long) + REP8(A, long long) + REP16(A, long long) + REP32(A, long long) + REP64(A, long long) + REP128(A, long long) + REP256(A, long long)) +{ + return; +} + +extern "C" __global__ +void small_kernel_256_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*) + REP16(A, int*) + REP32(A, int*) + REP64(A, int*) + REP128(A, int*)) +{ + *F = 0; +} + +extern "C" __global__ +void small_kernel_16_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*)) +{ + *F = 0; +} + +extern "C" __global__ void small_kernel_2048B(KernelFunctionParam<2048> param) +{ + // Do not touch param to prevent compiler from copying + // the whole structure from const bank to lmem. 
+} +""" diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py new file mode 100644 index 0000000000..76dd6e6a45 --- /dev/null +++ b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py @@ -0,0 +1,199 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import ctypes + +import pytest + +try: + import cupy + + skip_tests = False +except ImportError: + skip_tests = True + +from kernels import kernel_string + + +def launch(kernel, args=()): + kernel((1,), (1,), args) + + +# Measure launch latency with no parmaeters +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_empty_kernel(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("empty_kernel") + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel) + stream.synchronize() + + +# Measure launch latency with a single parameter +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel") + cupy.cuda.set_allocator() + arg = cupy.cuda.alloc(ctypes.sizeof(ctypes.c_float)) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, (arg,)) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_args(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_args") + cupy.cuda.set_allocator() + + args = [] + for _ in range(512): 
+ args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_bools(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_bools") + cupy.cuda.set_allocator() + + args = [True] * 512 + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_doubles(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_doubles") + cupy.cuda.set_allocator() + + args = [1.2345] * 512 + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_ints(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_ints") + cupy.cuda.set_allocator() + + args = [123] * 512 + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy 
is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_bytes(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_chars") + cupy.cuda.set_allocator() + + args = [127] * 512 + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_longlongs(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_longlongs") + cupy.cuda.set_allocator() + + args = [9223372036854775806] * 512 + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_256_args(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_256_args") + cupy.cuda.set_allocator() + + args = [] + for _ in range(256): + args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_16_args(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_16_args") + cupy.cuda.set_allocator() + + 
args = [] + for _ in range(16): + args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py new file mode 100755 index 0000000000..dd994081a0 --- /dev/null +++ b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py @@ -0,0 +1,336 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import ctypes + +import pytest +from kernels import kernel_string + +from conftest import ASSERT_DRV +from cuda.bindings import driver as cuda + + +def launch(kernel, stream, args=(), arg_types=()): + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + (args, arg_types), + 0, + ) # arguments + + +def launch_packed(kernel, stream, params): + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + params, + 0, + ) # arguments + + +# Measure launch latency with no parmaeters +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") + ASSERT_DRV(err) + + benchmark(launch, func, stream) + + cuda.cuCtxSynchronize() + + +# Measure launch latency with a single parameter +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel") + 
ASSERT_DRV(err) + + err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) + ASSERT_DRV(err) + + benchmark(launch, func, stream, args=(f,), arg_types=(None,)) + + cuda.cuCtxSynchronize() + + (err,) = cuda.cuMemFree(f) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 512 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") + ASSERT_DRV(err) + + args = [True] * 512 + arg_types = [ctypes.c_bool] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") + ASSERT_DRV(err) + + args = [1.2345] * 512 + arg_types = [ctypes.c_double] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, 
arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") + ASSERT_DRV(err) + + args = [123] * 512 + arg_types = [ctypes.c_int] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") + ASSERT_DRV(err) + + args = [127] * 512 + arg_types = [ctypes.c_byte] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") + ASSERT_DRV(err) + + args = [9223372036854775806] * 512 + arg_types = [ctypes.c_longlong] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args") 
+ ASSERT_DRV(err) + + args = [] + arg_types = [None] * 256 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 16 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters, excluding parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + vals = [] + val_ps = [] + for i in range(512): + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + vals.append(p) + val_ps.append(ctypes.c_void_p(int(vals[i]))) + + packagedParams = (ctypes.c_void_p * 512)() + for i in range(512): + packagedParams[i] = ctypes.addressof(val_ps[i]) + + benchmark(launch_packed, func, stream, packagedParams) + + cuda.cuCtxSynchronize() + + for p in vals: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + 
+def pack_and_launch(kernel, stream, params): + packed_params = (ctypes.c_void_p * len(params))() + ptrs = [0] * len(params) + for i in range(len(params)): + ptrs[i] = ctypes.c_void_p(int(params[i])) + packed_params[i] = ctypes.addressof(ptrs[i]) + + cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) + + +# Measure launch latency plus parameter packing using ctypes +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + vals = [] + for i in range(512): + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + vals.append(p) + + benchmark(pack_and_launch, func, stream, vals) + + cuda.cuCtxSynchronize() + + for p in vals: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with a single large struct parameter +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") + ASSERT_DRV(err) + + class struct_2048B(ctypes.Structure): + _fields_ = [("values", ctypes.c_uint8 * 2048)] + + benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) + + cuda.cuCtxSynchronize() diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_numba.py b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py new file mode 100644 index 0000000000..dfe084c6b1 --- /dev/null +++ b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import numpy as np
import pytest

try:
    from numba import cuda

    skip_tests = False
except ImportError:
    # Numba is optional; the benchmarks below are skipped when it is absent.
    skip_tests = True


def launch_empty(kernel, stream):
    """Launch a zero-argument kernel on a 1x1 grid/block on `stream`."""
    kernel[1, 1, stream]()


def launch(kernel, stream, arg):
    """Launch a one-argument kernel on a 1x1 grid/block on `stream`."""
    kernel[1, 1, stream](arg)


# Measure launch latency with no parameters
@pytest.mark.skipif(skip_tests, reason="Numba is not installed")
@pytest.mark.benchmark(group="numba", min_rounds=1000)
def test_launch_latency_empty_kernel(benchmark):
    stream = cuda.stream()

    @cuda.jit
    def empty_kernel():
        return

    benchmark(launch_empty, empty_kernel, stream)

    cuda.synchronize()


# Measure launch latency with a single parameter
@pytest.mark.skipif(skip_tests, reason="Numba is not installed")
@pytest.mark.benchmark(group="numba", min_rounds=1000)
def test_launch_latency_small_kernel(benchmark):
    stream = cuda.stream()

    arg = cuda.device_array(1, dtype=np.float32, stream=stream)

    @cuda.jit
    def small_kernel(array):
        array[0] = 0.0

    benchmark(launch, small_kernel, stream, arg)

    cuda.synchronize()
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import random

import pytest

from conftest import ASSERT_DRV
from cuda.bindings import driver as cuda

# Deterministic shuffle order across runs.
random.seed(0)

# Round-robin cursor shared by the query helpers below.
idx = 0


def query_attribute(attribute, ptrs):
    """Query one attribute of the next pointer, cycling through `ptrs` round-robin."""
    global idx
    target = ptrs[idx]
    idx = (idx + 1) % len(ptrs)

    cuda.cuPointerGetAttribute(attribute, target)


def query_attributes(attributes, ptrs):
    """Query a batch of attributes of the next pointer, cycling round-robin."""
    global idx
    target = ptrs[idx]
    idx = (idx + 1) % len(ptrs)

    cuda.cuPointerGetAttributes(len(attributes), attributes, target)


def _allocate_shuffled_ptrs():
    """Allocate 500 device buffers of 1<<18 bytes and return them shuffled."""
    allocations = []
    for _ in range(500):
        err, ptr = cuda.cuMemAlloc(1 << 18)
        ASSERT_DRV(err)
        allocations.append(ptr)
    random.shuffle(allocations)
    return allocations


def _release_ptrs(ptrs):
    """Free every device allocation in `ptrs`."""
    for ptr in ptrs:
        (err,) = cuda.cuMemFree(ptr)
        ASSERT_DRV(err)


# Measure cuPointerGetAttribute in the same way as C benchmarks
@pytest.mark.benchmark(group="pointer-attributes")
def test_pointer_get_attribute(benchmark, init_cuda):
    _ = init_cuda

    ptrs = _allocate_shuffled_ptrs()

    benchmark(query_attribute, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptrs)

    _release_ptrs(ptrs)


# Measure cuPointerGetAttributes with all attributes
@pytest.mark.benchmark(group="pointer-attributes")
def test_pointer_get_attributes_all(benchmark, init_cuda):
    _ = init_cuda

    ptrs = _allocate_shuffled_ptrs()

    attributes = [
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE,
    ]

    benchmark(query_attributes, attributes, ptrs)

    _release_ptrs(ptrs)


# Measure cuPointerGetAttributes with a single attribute
@pytest.mark.benchmark(group="pointer-attributes")
def test_pointer_get_attributes_single(benchmark, init_cuda):
    _ = init_cuda

    ptrs = _allocate_shuffled_ptrs()

    attributes = [
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
    ]

    benchmark(query_attributes, attributes, ptrs)

    _release_ptrs(ptrs)
# SPDX-License-Identifier: Apache-2.0

import argparse
import json
import subprocess
import sys
import tempfile
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent
BUILD_DIR = PROJECT_ROOT / ".build" / "cpp"
DEFAULT_OUTPUT = PROJECT_ROOT / "results-cpp.json"

# C++ benchmark executables are named bench_<name>_cpp.
BINARY_PREFIX = "bench_"
BINARY_SUFFIX = "_cpp"


def discover_binaries() -> dict[str, Path]:
    """Discover C++ benchmark binaries in the build directory"""
    if not BUILD_DIR.is_dir():
        return {}

    found: dict[str, Path] = {}
    for candidate in sorted(BUILD_DIR.iterdir()):
        fname = candidate.name
        if not (candidate.is_file() and fname.startswith(BINARY_PREFIX) and fname.endswith(BINARY_SUFFIX)):
            continue
        key = fname.removeprefix(BINARY_PREFIX).removesuffix(BINARY_SUFFIX)
        found[key] = candidate
    return found


def strip_output_args(argv: list[str]) -> list[str]:
    """Return `argv` with every -o/--output option (and its value) removed."""
    kept: list[str] = []
    tokens = iter(argv)
    for token in tokens:
        if token in ("-o", "--output"):
            next(tokens, None)  # drop the option's value as well
            continue
        if token.startswith(("-o=", "--output=")):
            continue
        kept.append(token)
    return kept


def merge_pyperf_json(individual_files: list[Path], output_path: Path) -> int:
    """Merge individual pyperf JSON files into a single BenchmarkSuite file.

    Each C++ binary produces a file shaped like
    {"version": "1.0", "metadata": {...}, "benchmarks": [{...}]}; the file-level
    name/loops/unit metadata is pushed down onto each run (without clobbering
    anything the run already carries) and all benchmark entries are concatenated.
    Returns the number of merged benchmark entries.
    """
    collected = []

    for json_path in individual_files:
        payload = json.loads(Path(json_path).read_text())

        file_meta = payload.get("metadata", {})
        bench_name = file_meta.get("name", "")
        loops = file_meta.get("loops")
        unit = file_meta.get("unit", "second")

        for entry in payload.get("benchmarks", []):
            for run in entry.get("runs", []):
                run_meta = run.setdefault("metadata", {})
                if bench_name:
                    run_meta.setdefault("name", bench_name)
                if loops is not None:
                    run_meta.setdefault("loops", loops)
                run_meta.setdefault("unit", unit)

            collected.append(entry)

    with open(output_path, "w") as out:
        json.dump({"version": "1.0", "benchmarks": collected}, out)

    return len(collected)


def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
    """Parse runner-specific options; unknown flags are passed through to the binaries."""
    parser = argparse.ArgumentParser(
        description="Run C++ CUDA benchmarks",
        add_help=False,
    )
    parser.add_argument(
        "--benchmark",
        action="append",
        default=[],
        help="Benchmark name to run (e.g. 'ctx_device'). Repeat for multiple. Defaults to all.",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="Print discovered benchmark names and exit.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=DEFAULT_OUTPUT,
        help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})",
    )
    parsed, remaining = parser.parse_known_args(argv)
    return parsed, remaining


def main() -> None:
    """Entry point: run the selected C++ benchmark binaries and merge their results."""
    parsed, remaining_argv = parse_args(sys.argv[1:])

    registry = discover_binaries()
    if not registry:
        print(
            f"No C++ benchmark binaries found in {BUILD_DIR}.\nRun 'pixi run bench-cpp-build' first.",
            file=sys.stderr,
        )
        sys.exit(1)

    if parsed.list:
        for name in sorted(registry):
            print(name)
        return

    if parsed.benchmark:
        missing = sorted(set(parsed.benchmark) - set(registry))
        if missing:
            known = ", ".join(sorted(registry))
            unknown = ", ".join(missing)
            print(
                f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}",
                file=sys.stderr,
            )
            sys.exit(1)
        names = parsed.benchmark
    else:
        names = sorted(registry)

    # Strip any --output args to avoid conflicts with our output handling
    passthrough_argv = strip_output_args(remaining_argv)

    output_path = parsed.output.resolve()
    failed = False
    individual_files: list[Path] = []

    with tempfile.TemporaryDirectory(prefix="cuda_bench_cpp_") as tmpdir:
        scratch = Path(tmpdir)

        for name in names:
            tmp_json = scratch / f"{name}.json"
            cmd = [str(registry[name]), "-o", str(tmp_json), *passthrough_argv]
            proc = subprocess.run(cmd, check=False)  # noqa: S603
            if proc.returncode != 0:
                print(f"FAILED: {name} (exit code {proc.returncode})", file=sys.stderr)
                failed = True
            elif tmp_json.exists():
                individual_files.append(tmp_json)

        if individual_files:
            count = merge_pyperf_json(individual_files, output_path)
            print(f"\nResults saved to {output_path} ({count} benchmark(s))")

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()
# SPDX-License-Identifier: Apache-2.0
# --- benchmarks/cuda_bindings/runner/main.py ---

import argparse
import ast
import importlib.util
import os
import sys
from collections.abc import Callable
from pathlib import Path
from types import ModuleType

import pyperf

PROJECT_ROOT = Path(__file__).resolve().parent.parent
BENCH_DIR = PROJECT_ROOT / "benchmarks"
DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json"
# Environment variables pyperf worker processes must inherit for CUDA to work.
PYPERF_INHERITED_ENV_VARS = (
    "CUDA_HOME",
    "CUDA_PATH",
    "CUDA_VISIBLE_DEVICES",
    "LD_LIBRARY_PATH",
    "NVIDIA_VISIBLE_DEVICES",
)
_MODULE_CACHE: dict[Path, ModuleType] = {}


def load_module(module_path: Path) -> ModuleType:
    """Import a benchmark module from its file path, caching per resolved path."""
    module_path = module_path.resolve()
    cached = _MODULE_CACHE.get(module_path)
    if cached is not None:
        return cached

    module_name = f"cuda_bindings_bench_{module_path.stem}"
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to load benchmark module: {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    _MODULE_CACHE[module_path] = module
    return module


def benchmark_id(module_name: str, function_name: str) -> str:
    """Build the public ID '<module>.<function>' with 'bench_' prefixes stripped."""
    module_suffix = module_name.removeprefix("bench_")
    suffix = function_name.removeprefix("bench_")
    return f"{module_suffix}.{suffix}"


def _discover_module_functions(module_path: Path) -> list[str]:
    """List top-level bench_* function names via AST, without importing the module."""
    tree = ast.parse(module_path.read_text(encoding="utf-8"), filename=str(module_path))
    return [
        node.name
        for node in tree.body
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("bench_")
    ]


def _lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]:
    """Wrap a bench_ function so its module is imported only on first call."""
    loaded_function: Callable[[int], float] | None = None

    def run(loops: int) -> float:
        nonlocal loaded_function
        if loaded_function is None:
            module = load_module(module_path)
            loaded_function = getattr(module, function_name)
        return loaded_function(loops)

    run.__name__ = function_name
    return run


def discover_benchmarks() -> dict[str, Callable[[int], float]]:
    """Discover bench_ functions.

    Each bench_ function must have the signature: bench_*(loops: int) -> float
    where it calls the operation `loops` times and returns the total elapsed
    time in seconds (using time.perf_counter).
    """
    registry: dict[str, Callable[[int], float]] = {}
    for module_path in sorted(BENCH_DIR.glob("bench_*.py")):
        module_name = module_path.stem
        for function_name in _discover_module_functions(module_path):
            bench_id = benchmark_id(module_name, function_name)
            if bench_id in registry:
                raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}")
            registry[bench_id] = _lazy_benchmark(module_path, function_name)
    return registry


def strip_pyperf_output_args(argv: list[str]) -> list[str]:
    """Return `argv` with -o/--output/--append options (and their values) removed."""
    kept: list[str] = []
    tokens = iter(argv)
    for token in tokens:
        if token in ("-o", "--output", "--append"):
            next(tokens, None)  # drop the option's value as well
            continue
        if token.startswith(("-o=", "--output=", "--append=")):
            continue
        kept.append(token)
    return kept


def _split_env_vars(arg_value: str) -> list[str]:
    """Split a comma-separated --inherit-environ value, dropping empty entries."""
    return [env_var for env_var in arg_value.split(",") if env_var]


def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
    """Fold the CUDA-related env vars into a single --inherit-environ option.

    With --copy-env the whole environment is already inherited, so argv is
    returned unchanged. Otherwise any user-supplied --inherit-environ values
    are collected, merged with PYPERF_INHERITED_ENV_VARS that are actually set,
    de-duplicated (first occurrence wins), and re-appended as one option.
    """
    if "--copy-env" in argv:
        return list(argv)

    inherited_env: list[str] = []
    cleaned: list[str] = []
    skip_next = False
    for arg in argv:
        if skip_next:
            inherited_env.extend(_split_env_vars(arg))
            skip_next = False
            continue
        if arg == "--inherit-environ":
            skip_next = True
            continue
        if arg.startswith("--inherit-environ="):
            inherited_env.extend(_split_env_vars(arg.partition("=")[2]))
            continue
        cleaned.append(arg)

    if skip_next:
        raise ValueError("Missing value for --inherit-environ")

    for env_var in PYPERF_INHERITED_ENV_VARS:
        if env_var in os.environ:
            inherited_env.append(env_var)

    deduped_env: list[str] = []
    for env_var in inherited_env:
        if env_var not in deduped_env:
            deduped_env.append(env_var)

    if deduped_env:
        cleaned.extend(["--inherit-environ", ",".join(deduped_env)])

    return cleaned


def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
    """Parse runner options; everything unknown is forwarded to pyperf."""
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        "--benchmark",
        action="append",
        default=[],
        help="Benchmark ID to run. Repeat to run multiple IDs. Defaults to all.",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="Print discovered benchmark IDs and exit.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=DEFAULT_OUTPUT,
        help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})",
    )
    parsed, remaining = parser.parse_known_args(argv)
    return parsed, remaining


def main() -> None:
    """Entry point: discover benchmarks and run the requested ones via pyperf."""
    parsed, remaining_argv = parse_args(sys.argv[1:])

    registry = discover_benchmarks()
    if not registry:
        raise RuntimeError(f"No benchmark functions found in {BENCH_DIR}")

    if parsed.list:
        for bench_id in sorted(registry):
            print(bench_id)
        return

    if parsed.benchmark:
        missing = sorted(set(parsed.benchmark) - set(registry))
        if missing:
            known = ", ".join(sorted(registry))
            unknown = ", ".join(missing)
            raise ValueError(f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}")
        benchmark_ids = parsed.benchmark
    else:
        benchmark_ids = sorted(registry)

    # Strip any --output args to avoid conflicts with our output handling.
    output_path = parsed.output.resolve()
    remaining_argv = strip_pyperf_output_args(remaining_argv)
    remaining_argv = ensure_pyperf_worker_env(remaining_argv)
    is_worker = "--worker" in remaining_argv

    # Delete the file so this run starts fresh.
    if not is_worker:
        output_path.unlink(missing_ok=True)

    sys.argv = [sys.argv[0], "--append", str(output_path), *remaining_argv]

    runner = pyperf.Runner()
    for bench_id in benchmark_ids:
        runner.bench_time_func(bench_id, registry[bench_id])

    if not is_worker:
        print(f"\nResults saved to {output_path}")


if __name__ == "__main__":
    main()


# --- benchmarks/cuda_bindings/runner/runtime.py ---
# SPDX-License-Identifier: Apache-2.0

import atexit

from cuda.bindings import driver as cuda
from cuda.bindings import nvrtc

# Lazily created driver context/device shared by all benchmarks in a worker.
_ctx = None
_device = None
_persistent_ptrs: list[int] = []
_modules: list = []


def assert_drv(err) -> None:
    """Raise RuntimeError unless `err` is CUDA_SUCCESS."""
    if err != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"Cuda Error: {err}")


def ensure_context() -> int:
    """Create (once) and return the process-wide CUDA context on device 0."""
    global _ctx, _device
    if _ctx is not None:
        return _ctx

    (err,) = cuda.cuInit(0)
    assert_drv(err)

    err, device = cuda.cuDeviceGet(0)
    assert_drv(err)
    _device = device

    err, ctx = cuda.cuCtxCreate(None, 0, device)
    assert_drv(err)
    _ctx = ctx
    return ctx


def alloc_persistent(size: int) -> int:
    """Allocate `size` bytes of device memory, freed automatically at exit."""
    ensure_context()
    err, ptr = cuda.cuMemAlloc(size)
    assert_drv(err)
    _persistent_ptrs.append(ptr)
    return ptr


def compile_and_load(kernel_source: str) -> int:
    """Compile CUDA C source and returns the CUmodule handle"""
    ensure_context()

    # Target the current device's compute capability.
    err, major = cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, _device
    )
    assert_drv(err)
    err, minor = cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, _device
    )
    assert_drv(err)

    err, prog = nvrtc.nvrtcCreateProgram(kernel_source.encode(), b"benchmark_kernel.cu", 0, [], [])
    assert_drv(err)

    arch_flag = f"--gpu-architecture=sm_{major}{minor}".encode()
    (err,) = nvrtc.nvrtcCompileProgram(prog, 2, [b"--fmad=false", arch_flag])

    # check for compile errors
    err_log, log_size = nvrtc.nvrtcGetProgramLogSize(prog)
    assert_drv(err_log)
    log = b" " * log_size
    (err_log,) = nvrtc.nvrtcGetProgramLog(prog, log)
    assert_drv(err_log)
    assert_drv(err)

    err, cubin_size = nvrtc.nvrtcGetCUBINSize(prog)
    assert_drv(err)
    cubin = b" " * cubin_size
    (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin)
    assert_drv(err)

    err, module = cuda.cuModuleLoadData(cubin)
    assert_drv(err)
    _modules.append(module)
    return module


def cleanup() -> None:
    """Free persistent allocations, unload modules and destroy the context."""
    global _ctx
    for ptr in reversed(_persistent_ptrs):
        (err,) = cuda.cuMemFree(ptr)
        assert_drv(err)
    _persistent_ptrs.clear()

    for module in reversed(_modules):
        (err,) = cuda.cuModuleUnload(module)
        assert_drv(err)
    _modules.clear()

    if _ctx is None:
        return
    (err,) = cuda.cuCtxDestroy(_ctx)
    assert_drv(err)
    _ctx = None


atexit.register(cleanup)
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import importlib.util +import itertools +import sys +import types +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[3] +RUNNER_MAIN_PATH = REPO_ROOT / "cuda_bindings/benchmarks/runner/main.py" +BENCH_LAUNCH_PATH = REPO_ROOT / "cuda_bindings/benchmarks/benchmarks/bench_launch.py" + + +def load_module_from_path(module_name: str, module_path: Path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Failed to load test module: {module_path}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def load_runner_main(monkeypatch): + pyperf_module = types.ModuleType("pyperf") + + class FakeRunner: + def bench_time_func(self, *_args, **_kwargs) -> None: + raise AssertionError("FakeRunner should not be used in these tests") + + pyperf_module.Runner = FakeRunner + monkeypatch.setitem(sys.modules, "pyperf", pyperf_module) + return load_module_from_path("test_cuda_bindings_bench_runner_main", RUNNER_MAIN_PATH) + + +def load_bench_launch(monkeypatch, calls: list[tuple]): + pointer_values = itertools.count(1000) + + runtime_module = types.ModuleType("runner.runtime") + + def alloc_persistent(size: int) -> int: + calls.append(("alloc_persistent", size)) + return next(pointer_values) + + def assert_drv(err) -> None: + calls.append(("assert_drv", err)) + assert err == 0 + + def compile_and_load(source: str) -> str: + calls.append(("compile_and_load", source)) + return "module" + + runtime_module.alloc_persistent = alloc_persistent + runtime_module.assert_drv = assert_drv + runtime_module.compile_and_load = compile_and_load + + runner_module = types.ModuleType("runner") + runner_module.runtime = runtime_module + + driver_module = types.ModuleType("cuda.bindings.driver") + + class FakeCUresult: + CUDA_SUCCESS = 0 + + class FakeCUstreamFlags: + 
CU_STREAM_NON_BLOCKING = types.SimpleNamespace(value=1) + + def cuModuleGetFunction(module, name): + calls.append(("cuModuleGetFunction", module, name)) + return 0, name + + def cuStreamCreate(flags): + calls.append(("cuStreamCreate", flags)) + return 0, "stream" + + def cuLaunchKernel(*args): + calls.append(("cuLaunchKernel", args)) + return 0 + + driver_module.CUresult = FakeCUresult + driver_module.CUstream_flags = FakeCUstreamFlags + driver_module.cuModuleGetFunction = cuModuleGetFunction + driver_module.cuStreamCreate = cuStreamCreate + driver_module.cuLaunchKernel = cuLaunchKernel + + cuda_module = types.ModuleType("cuda") + bindings_module = types.ModuleType("cuda.bindings") + bindings_module.driver = driver_module + cuda_module.bindings = bindings_module + + monkeypatch.setitem(sys.modules, "runner", runner_module) + monkeypatch.setitem(sys.modules, "runner.runtime", runtime_module) + monkeypatch.setitem(sys.modules, "cuda", cuda_module) + monkeypatch.setitem(sys.modules, "cuda.bindings", bindings_module) + monkeypatch.setitem(sys.modules, "cuda.bindings.driver", driver_module) + + return load_module_from_path("test_cuda_bindings_bench_launch", BENCH_LAUNCH_PATH) + + +def test_discover_benchmarks_is_lazy(monkeypatch, tmp_path): + runner_main = load_runner_main(monkeypatch) + + marker_path = tmp_path / "imported.txt" + bench_path = tmp_path / "bench_lazy.py" + bench_path.write_text( + "\n".join( + ( + "from pathlib import Path", + f"Path({str(marker_path)!r}).write_text('imported')", + "", + "def helper() -> float:", + " return 0.0", + "", + "def bench_visible(loops: int) -> float:", + " return loops + 0.5", + "", + ) + ), + encoding="utf-8", + ) + + monkeypatch.setattr(runner_main, "BENCH_DIR", tmp_path) + runner_main._MODULE_CACHE.clear() + + registry = runner_main.discover_benchmarks() + + assert sorted(registry) == ["lazy.visible"] + assert not marker_path.exists() + assert registry["lazy.visible"](3) == 3.5 + assert 
marker_path.read_text(encoding="utf-8") == "imported" + + +def test_ensure_pyperf_worker_env_preserves_existing_args(monkeypatch): + runner_main = load_runner_main(monkeypatch) + + for env_var in runner_main.PYPERF_INHERITED_ENV_VARS: + monkeypatch.delenv(env_var, raising=False) + monkeypatch.setenv("CUDA_PATH", "/opt/cuda") + monkeypatch.setenv("LD_LIBRARY_PATH", "/opt/cuda/lib64") + + argv = runner_main.ensure_pyperf_worker_env(["--fast", "--inherit-environ=FOO,BAR"]) + + assert argv == ["--fast", "--inherit-environ", "FOO,BAR,CUDA_PATH,LD_LIBRARY_PATH"] + + +def test_bench_launch_initializes_on_first_use(monkeypatch): + calls: list[tuple] = [] + bench_launch = load_bench_launch(monkeypatch, calls) + + assert calls == [] + + bench_launch.bench_launch_empty_kernel(1) + compile_calls = [call for call in calls if call[0] == "compile_and_load"] + launch_calls = [call for call in calls if call[0] == "cuLaunchKernel"] + + assert len(compile_calls) == 1 + assert len(launch_calls) == 1 + + bench_launch.bench_launch_16_args_pre_packed(1) + compile_calls = [call for call in calls if call[0] == "compile_and_load"] + launch_calls = [call for call in calls if call[0] == "cuLaunchKernel"] + + assert len(compile_calls) == 1 + assert len(launch_calls) == 2 From dcf93f87e2b46934afa1c2c9f74f7be1b414a489 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 11:11:25 -0500 Subject: [PATCH 02/10] Add Memory benchmarks --- .../cuda_bindings/benchmarks/bench_memory.py | 90 +++++++++++++++ .../benchmarks/cpp/bench_memory.cpp | 106 ++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 benchmarks/cuda_bindings/benchmarks/bench_memory.py create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp diff --git a/benchmarks/cuda_bindings/benchmarks/bench_memory.py b/benchmarks/cuda_bindings/benchmarks/bench_memory.py new file mode 100644 index 0000000000..faa4795580 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_memory.py @@ 
-0,0 +1,90 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import ctypes +import time + +import numpy as np + +from runner.runtime import alloc_persistent, ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + +# Allocation size for alloc/free benchmarks +ALLOC_SIZE = 1024 + +# Small transfer size (8 bytes) to measure call overhead, not bandwidth +COPY_SIZE = 8 + +# Pre-allocate device memory and host buffers for memcpy benchmarks +DST_DPTR = alloc_persistent(COPY_SIZE) +SRC_DPTR = alloc_persistent(COPY_SIZE) +HOST_SRC = np.zeros(COPY_SIZE, dtype=np.uint8) +HOST_DST = np.zeros(COPY_SIZE, dtype=np.uint8) + +# Stream for async operations +_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + + +def bench_mem_alloc_free(loops: int) -> float: + _cuMemAlloc = cuda.cuMemAlloc + _cuMemFree = cuda.cuMemFree + _size = ALLOC_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _, ptr = _cuMemAlloc(_size) + _cuMemFree(ptr) + return time.perf_counter() - t0 + + +def bench_mem_alloc_async_free_async(loops: int) -> float: + _cuMemAllocAsync = cuda.cuMemAllocAsync + _cuMemFreeAsync = cuda.cuMemFreeAsync + _size = ALLOC_SIZE + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _, ptr = _cuMemAllocAsync(_size, _stream) + _cuMemFreeAsync(ptr, _stream) + return time.perf_counter() - t0 + + +def bench_memcpy_htod(loops: int) -> float: + _cuMemcpyHtoD = cuda.cuMemcpyHtoD + _dst = DST_DPTR + _src = HOST_SRC + _size = COPY_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _cuMemcpyHtoD(_dst, _src, _size) + return time.perf_counter() - t0 + + +def bench_memcpy_dtoh(loops: int) -> float: + _cuMemcpyDtoH = cuda.cuMemcpyDtoH + _dst = HOST_DST + _src = SRC_DPTR + _size = COPY_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _cuMemcpyDtoH(_dst, _src, _size) + return 
time.perf_counter() - t0 + + +def bench_memcpy_dtod(loops: int) -> float: + _cuMemcpyDtoD = cuda.cuMemcpyDtoD + _dst = DST_DPTR + _src = SRC_DPTR + _size = COPY_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _cuMemcpyDtoD(_dst, _src, _size) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp new file mode 100644 index 0000000000..4e71b73fb5 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp @@ -0,0 +1,106 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +static constexpr size_t ALLOC_SIZE = 1024; +static constexpr size_t COPY_SIZE = 8; + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + // Pre-allocate device memory for memcpy benchmarks + CUdeviceptr dst_dptr, src_dptr; + check_cu(cuMemAlloc(&dst_dptr, COPY_SIZE), "cuMemAlloc failed"); + check_cu(cuMemAlloc(&src_dptr, COPY_SIZE), "cuMemAlloc failed"); + + // Host buffers for memcpy + uint8_t host_src[COPY_SIZE] = {}; + uint8_t host_dst[COPY_SIZE] = {}; + + bench::BenchmarkSuite suite(options); + + // --- mem_alloc_free --- + { + CUdeviceptr ptr; + suite.run("memory.mem_alloc_free", [&]() { + check_cu(cuMemAlloc(&ptr, ALLOC_SIZE), "cuMemAlloc failed"); + check_cu(cuMemFree(ptr), "cuMemFree failed"); + }); + } + + // --- mem_alloc_async_free_async --- + { + CUdeviceptr ptr; + suite.run("memory.mem_alloc_async_free_async", [&]() { + check_cu(cuMemAllocAsync(&ptr, ALLOC_SIZE, stream), "cuMemAllocAsync failed"); + check_cu(cuMemFreeAsync(ptr, stream), "cuMemFreeAsync failed"); + }); + } + + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + // --- memcpy_htod --- + { + suite.run("memory.memcpy_htod", [&]() { + check_cu(cuMemcpyHtoD(dst_dptr, host_src, COPY_SIZE), "cuMemcpyHtoD failed"); + }); + } + + // --- memcpy_dtoh --- + { + suite.run("memory.memcpy_dtoh", [&]() { + check_cu(cuMemcpyDtoH(host_dst, src_dptr, COPY_SIZE), "cuMemcpyDtoH failed"); + }); + } + + // --- memcpy_dtod --- + { + suite.run("memory.memcpy_dtod", [&]() { + check_cu(cuMemcpyDtoD(dst_dptr, src_dptr, 
COPY_SIZE), "cuMemcpyDtoD failed"); + }); + } + + // Cleanup + check_cu(cuMemFree(dst_dptr), "cuMemFree failed"); + check_cu(cuMemFree(src_dptr), "cuMemFree failed"); + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} From 4097d747c42213ba32dd406d67e39cdfac1b7e6c Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 13:54:40 -0500 Subject: [PATCH 03/10] Add memory benchmarks --- benchmarks/cuda_bindings/AGENTS.md | 4 +- benchmarks/cuda_bindings/README.md | 2 +- .../benchmarks/cpp/bench_launch.cpp | 33 ------------- benchmarks/cuda_bindings/pixi.lock | 48 +++++++++---------- benchmarks/cuda_bindings/pixi.toml | 2 +- 5 files changed, 29 insertions(+), 60 deletions(-) diff --git a/benchmarks/cuda_bindings/AGENTS.md b/benchmarks/cuda_bindings/AGENTS.md index 04f2f713fa..b9096a737f 100644 --- a/benchmarks/cuda_bindings/AGENTS.md +++ b/benchmarks/cuda_bindings/AGENTS.md @@ -1,4 +1,6 @@ # cuda.bindings benchmarks +Read the README.md in this directory for more details about the benchmarks. 
+ When generating code verify that that the code is correct based on the source for cuda-bindings -that can be found in ../cuda_bindings +that can be found in ../../cuda_bindings diff --git a/benchmarks/cuda_bindings/README.md b/benchmarks/cuda_bindings/README.md index 75e16db031..f8d5ccf043 100644 --- a/benchmarks/cuda_bindings/README.md +++ b/benchmarks/cuda_bindings/README.md @@ -37,7 +37,7 @@ See: https://pyperf.readthedocs.io/en/latest/system.html#system pixi run -e wheel -- python -m pyperf system show # Apply tuning (may require root) -sudo $(pixi run -e wheel -- which python) -m pyperf system tune +$(pixi run -e wheel -- which python) -m pyperf system tune ``` ### Running benchmarks diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp index fb65da6d74..a249426963 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp @@ -168,39 +168,6 @@ int main(int argc, char** argv) { }); } - // --- launch_small_kernel --- - { - void* params[] = {&float_ptr}; - suite.run("launch.launch_small_kernel", [&]() { - check_cu( - cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_16_args --- - { - suite.run("launch.launch_16_args", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) --- - // In C++ the params are always pre-packed, so this is identical to launch_16_args. - // We include it for naming parity with the Python benchmark. 
- { - suite.run("launch.launch_16_args_pre_packed", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - // Cleanup for (int i = 0; i < 16; ++i) { check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed"); diff --git a/benchmarks/cuda_bindings/pixi.lock b/benchmarks/cuda_bindings/pixi.lock index c610db2f45..c571d4756c 100644 --- a/benchmarks/cuda_bindings/pixi.lock +++ b/benchmarks/cuda_bindings/pixi.lock @@ -38,8 +38,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.78-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda @@ -66,7 +66,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda @@ -130,7 +130,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - - conda: .. + - conda: ../../cuda_bindings - conda: ../../cuda_pathfinder wheel: channels: @@ -406,7 +406,7 @@ packages: license_family: GPL size: 31705 timestamp: 1771378159534 -- conda: .. +- conda: ../../cuda_bindings name: cuda-bindings version: 13.2.0 build: hb0f4dca_0 @@ -419,11 +419,11 @@ packages: - cuda-pathfinder - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.2.51,<14.0a0 + - cuda-nvrtc >=13.2.78,<14.0a0 - cuda-nvvm - libnvfatbin - libcufile - - libcufile >=1.17.0.44,<2.0a0 + - libcufile >=1.17.1.22,<2.0a0 - libgcc >=15 - libgcc >=15 - libstdcxx >=15 @@ -643,17 +643,17 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 35339417 timestamp: 1768272955912 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - sha256: 9de235d328b7124f715805715e9918eb7f8aa5b9c56a2afa62b84f84f98077a5 - md5: 0413baaa73be1a39d5d8e442184acc78 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda + sha256: 73fbc9d15c062c3ea60891e8183002f6b055fa6638402d17581677af0aaa20d8 + md5: 66623d882c42506fa3f1780b90841400 depends: - __glibc >=2.17,<3.0.a0 - cuda-version >=13.2,<13.3.0a0 - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35736655 - timestamp: 1773100338749 + size: 35670504 + timestamp: 
1776109867257 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda sha256: 2c929c592ca1909e3944edec62b77403d256156a4010bfa17fb0b948d33e54d3 md5: 1096fce4abad7dd975ce6d9953fceb6a @@ -668,20 +668,20 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 35845 timestamp: 1768273073971 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda - sha256: be60eb4e84ff4846b27b323eca402b075f52caf6c138ebb06268fbaa26ef1879 - md5: 83535200a9e77165d5291b4ac82ebf6a +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.78-hecca717_0.conda + sha256: 12505f1bbc222acf2a63da5c84e4176d2f9c18b458e2bde28939fdf326b6d292 + md5: cc313f0ea18ebc6e713a8980611431f5 depends: - __glibc >=2.17,<3.0.a0 - - cuda-nvrtc 13.2.51 hecca717_0 + - cuda-nvrtc 13.2.78 hecca717_0 - cuda-version >=13.2,<13.3.0a0 - libgcc >=14 - libstdcxx >=14 constrains: - - cuda-nvrtc-static >=13.2.51 + - cuda-nvrtc-static >=13.2.78 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 36305 - timestamp: 1773100458841 + size: 36312 + timestamp: 1776109983818 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda sha256: d0111ba8fa12b96d38989d2016ecec0c11410c0e566d839ed54f3925591efb0b md5: 03cd3639b8e13623c7b91b1cb0136402 @@ -1018,9 +1018,9 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 990938 timestamp: 1768273732081 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda - sha256: dc2b0c43aeacbaa686061353807e718236d8c5b346f624e76fed98b066898e19 - md5: 6d8ed8335d144ec7303b8d3587b2205c +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda + sha256: a24ad0ca488aa3e237049cd5b5c6d7fe3d2d4330682ed329203064e332ea1d74 + md5: 056a67706108efd1f9c24682ba8d3685 depends: - __glibc >=2.28,<3.0.a0 - cuda-version >=13.2,<13.3.0a0 @@ -1028,8 +1028,8 @@ packages: - 
libstdcxx >=14 - rdma-core >=61.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1085341 - timestamp: 1773100191342 + size: 1082447 + timestamp: 1776110053053 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda sha256: c84e8dccb65ad5149c0121e4b54bdc47fa39303fd5f4979b8c44bb51b39a369b md5: 1707cdd636af2ff697b53186572c9f77 diff --git a/benchmarks/cuda_bindings/pixi.toml b/benchmarks/cuda_bindings/pixi.toml index a448e8d3e4..dbbddcd939 100644 --- a/benchmarks/cuda_bindings/pixi.toml +++ b/benchmarks/cuda_bindings/pixi.toml @@ -45,7 +45,7 @@ pre-commit = "*" cuda-bindings = "==13.1.0" [feature.bindings-source.dependencies] -cuda-bindings = { path = ".." } +cuda-bindings = { path = "../../cuda_bindings" } [environments] wheel = { features = ["cu13", "cu13-pinned", "bench", "cpp-bench", "dev", "bindings-wheel"] } From a368a48e07969e911f62d35589c2bc8328da4b1e Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 13:59:42 -0500 Subject: [PATCH 04/10] Move to top level of the repo --- cuda_bindings/benchmarks/.gitignore | 16 - cuda_bindings/benchmarks/README.md | 74 - .../benchmarks/benchmarks/bench_ctx_device.py | 62 - .../benchmarks/benchmarks/bench_event.py | 62 - .../benchmarks/benchmarks/bench_launch.py | 133 -- .../benchmarks/bench_pointer_attributes.py | 25 - .../benchmarks/benchmarks/bench_stream.py | 45 - .../benchmarks/benchmarks/cpp/CMakeLists.txt | 91 - .../benchmarks/cpp/bench_ctx_device.cpp | 87 - .../benchmarks/benchmarks/cpp/bench_event.cpp | 90 - .../benchmarks/cpp/bench_launch.cpp | 216 -- .../cpp/bench_pointer_attributes.cpp | 59 - .../benchmarks/cpp/bench_stream.cpp | 74 - .../benchmarks/cpp/bench_support.hpp | 309 --- cuda_bindings/benchmarks/compare.py | 118 -- cuda_bindings/benchmarks/pixi.lock | 1767 ----------------- cuda_bindings/benchmarks/pixi.toml | 87 - .../benchmarks/pytest-legacy/conftest.py | 93 - .../benchmarks/pytest-legacy/kernels.py | 159 -- 
.../benchmarks/pytest-legacy/test_cupy.py | 199 -- .../pytest-legacy/test_launch_latency.py | 336 ---- .../benchmarks/pytest-legacy/test_numba.py | 52 - .../pytest-legacy/test_pointer_attributes.py | 112 -- cuda_bindings/benchmarks/run_cpp.py | 8 - cuda_bindings/benchmarks/run_pyperf.py | 8 - cuda_bindings/benchmarks/runner/__init__.py | 3 - cuda_bindings/benchmarks/runner/cpp.py | 180 -- cuda_bindings/benchmarks/runner/main.py | 217 -- cuda_bindings/benchmarks/runner/runtime.py | 105 - cuda_bindings/benchmarks/tests/test_runner.py | 166 -- 30 files changed, 4953 deletions(-) delete mode 100644 cuda_bindings/benchmarks/.gitignore delete mode 100644 cuda_bindings/benchmarks/README.md delete mode 100644 cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py delete mode 100644 cuda_bindings/benchmarks/benchmarks/bench_event.py delete mode 100644 cuda_bindings/benchmarks/benchmarks/bench_launch.py delete mode 100644 cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py delete mode 100644 cuda_bindings/benchmarks/benchmarks/bench_stream.py delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp delete mode 100644 cuda_bindings/benchmarks/compare.py delete mode 100644 cuda_bindings/benchmarks/pixi.lock delete mode 100644 cuda_bindings/benchmarks/pixi.toml delete mode 100644 cuda_bindings/benchmarks/pytest-legacy/conftest.py delete mode 100644 cuda_bindings/benchmarks/pytest-legacy/kernels.py delete mode 100644 cuda_bindings/benchmarks/pytest-legacy/test_cupy.py delete mode 100755 
cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py delete mode 100644 cuda_bindings/benchmarks/pytest-legacy/test_numba.py delete mode 100644 cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py delete mode 100644 cuda_bindings/benchmarks/run_cpp.py delete mode 100644 cuda_bindings/benchmarks/run_pyperf.py delete mode 100644 cuda_bindings/benchmarks/runner/__init__.py delete mode 100644 cuda_bindings/benchmarks/runner/cpp.py delete mode 100644 cuda_bindings/benchmarks/runner/main.py delete mode 100644 cuda_bindings/benchmarks/runner/runtime.py delete mode 100644 cuda_bindings/benchmarks/tests/test_runner.py diff --git a/cuda_bindings/benchmarks/.gitignore b/cuda_bindings/benchmarks/.gitignore deleted file mode 100644 index b795782a32..0000000000 --- a/cuda_bindings/benchmarks/.gitignore +++ /dev/null @@ -1,16 +0,0 @@ -# Build artifacts -.build/ -__pycache__/ - -# Benchmark results -*.json -.benchmarks/ - -# Pixi environments -.pixi/ - -# Override root .gitignore *.cpp rule (which targets Cython-generated files) -!benchmarks/cpp/*.cpp - -results-python.json -results-cpp.json diff --git a/cuda_bindings/benchmarks/README.md b/cuda_bindings/benchmarks/README.md deleted file mode 100644 index 75e16db031..0000000000 --- a/cuda_bindings/benchmarks/README.md +++ /dev/null @@ -1,74 +0,0 @@ -# cuda.bindings benchmarks - -These benchmarks are intended to measure the latency overhead of calling CUDA -Driver APIs through cuda.bindings, relative to a similar C++ baseline. - -The goal is to benchmark how much overhead does the Python layer adds to calling -CUDA APIs and what operations are not in our target of less than 1us of overhead. - -Each Python benchmark has a C++ counterpart, which is used to compare the -operations. We try to make each implementation perform small operations -and nearly the same work as possible and are run under similar conditions. 
- -These are **not** throughput benchmarks to measure the overall performance -of kernels and applications. - -## Usage - -Requires pixi. - -There are a couple of environments defined based on how `cuda.bindings` is installed: - -- `wheel`: Installs from conda packages -- `source`: Installs from source - -There are a couple of tasks defined: - -- `bench`: Runs the Python benchmarks -- `bench-cpp`: Runs the C++ benchmarks - -### System tuning - -For more stable results on Linux, tune the system before running benchmarks. -See: https://pyperf.readthedocs.io/en/latest/system.html#system - -```bash -# Show current system state -pixi run -e wheel -- python -m pyperf system show - -# Apply tuning (may require root) -sudo $(pixi run -e wheel -- which python) -m pyperf system tune -``` - -### Running benchmarks - -To run the benchmarks combine the environment and task: - -```bash -# Run the Python benchmarks in the wheel environment -pixi run -e wheel bench - -# Run the Python benchmarks in the source environment -pixi run -e source bench - -# Run the C++ benchmarks -pixi run -e wheel bench-cpp -``` - -Both runners automatically save results to JSON files in the benchmarks -directory: `results-python.json` and `results-cpp.json`. - -## Output JSON and analysis - -The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/). 
-Both Python and C++ results are saved in pyperf-compatible JSON format, -which can be analyzed with pyperf commands: - -```bash -# Show results and statistics -pixi run -e wheel -- python -m pyperf stats results-python.json -pixi run -e wheel -- python -m pyperf stats results-cpp.json - -# Compare C++ vs Python results -pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json -``` diff --git a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py b/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py deleted file mode 100644 index 1c82cd4046..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import time - -from runner.runtime import ensure_context - -from cuda.bindings import driver as cuda - -CTX = ensure_context() - -_, DEVICE = cuda.cuDeviceGet(0) -ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR - - -def bench_ctx_get_current(loops: int) -> float: - _cuCtxGetCurrent = cuda.cuCtxGetCurrent - - t0 = time.perf_counter() - for _ in range(loops): - _cuCtxGetCurrent() - return time.perf_counter() - t0 - - -def bench_ctx_set_current(loops: int) -> float: - _cuCtxSetCurrent = cuda.cuCtxSetCurrent - _ctx = CTX - - t0 = time.perf_counter() - for _ in range(loops): - _cuCtxSetCurrent(_ctx) - return time.perf_counter() - t0 - - -def bench_ctx_get_device(loops: int) -> float: - _cuCtxGetDevice = cuda.cuCtxGetDevice - - t0 = time.perf_counter() - for _ in range(loops): - _cuCtxGetDevice() - return time.perf_counter() - t0 - - -def bench_device_get(loops: int) -> float: - _cuDeviceGet = cuda.cuDeviceGet - - t0 = time.perf_counter() - for _ in range(loops): - _cuDeviceGet(0) - return time.perf_counter() - t0 - - -def bench_device_get_attribute(loops: int) -> float: - _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute 
- _attr = ATTRIBUTE - _dev = DEVICE - - t0 = time.perf_counter() - for _ in range(loops): - _cuDeviceGetAttribute(_attr, _dev) - return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/bench_event.py b/cuda_bindings/benchmarks/benchmarks/bench_event.py deleted file mode 100644 index e8e319115d..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/bench_event.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import time - -from runner.runtime import ensure_context - -from cuda.bindings import driver as cuda - -ensure_context() - -_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) -_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value) - -cuda.cuEventRecord(EVENT, STREAM) -cuda.cuStreamSynchronize(STREAM) - -EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value - - -def bench_event_create_destroy(loops: int) -> float: - _cuEventCreate = cuda.cuEventCreate - _cuEventDestroy = cuda.cuEventDestroy - _flags = EVENT_FLAGS - - t0 = time.perf_counter() - for _ in range(loops): - _, e = _cuEventCreate(_flags) - _cuEventDestroy(e) - return time.perf_counter() - t0 - - -def bench_event_record(loops: int) -> float: - _cuEventRecord = cuda.cuEventRecord - _event = EVENT - _stream = STREAM - - t0 = time.perf_counter() - for _ in range(loops): - _cuEventRecord(_event, _stream) - return time.perf_counter() - t0 - - -def bench_event_query(loops: int) -> float: - _cuEventQuery = cuda.cuEventQuery - _event = EVENT - - t0 = time.perf_counter() - for _ in range(loops): - _cuEventQuery(_event) - return time.perf_counter() - t0 - - -def bench_event_synchronize(loops: int) -> float: - _cuEventSynchronize = cuda.cuEventSynchronize - _event = EVENT - - t0 = time.perf_counter() - for _ in range(loops): - _cuEventSynchronize(_event) - return time.perf_counter() - t0 diff 
--git a/cuda_bindings/benchmarks/benchmarks/bench_launch.py b/cuda_bindings/benchmarks/benchmarks/bench_launch.py deleted file mode 100644 index 931194fbd3..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/bench_launch.py +++ /dev/null @@ -1,133 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import ctypes -import time - -from runner.runtime import alloc_persistent, assert_drv, compile_and_load - -from cuda.bindings import driver as cuda - -# Compile kernels lazily so benchmark discovery does not need NVRTC. -KERNEL_SOURCE = """\ -extern "C" __global__ void empty_kernel() { return; } -extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } - -#define ITEM_PARAM(x, T) T x -#define REP1(x, T) , ITEM_PARAM(x, T) -#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) -#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) -#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) -#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) - -extern "C" __global__ -void small_kernel_16_args( - ITEM_PARAM(F, int*) - REP1(A, int*) - REP2(A, int*) - REP4(A, int*) - REP8(A, int*)) -{ *F = 0; } -""" - -MODULE = None -EMPTY_KERNEL = None -SMALL_KERNEL = None -KERNEL_16_ARGS = None -STREAM = None -FLOAT_PTR = None -INT_PTRS = None -_VAL_PS = None -PACKED_16 = None - - -def _ensure_launch_state() -> None: - global MODULE, EMPTY_KERNEL, SMALL_KERNEL, KERNEL_16_ARGS, STREAM - global FLOAT_PTR, INT_PTRS, _VAL_PS, PACKED_16 - - if EMPTY_KERNEL is not None: - return - - module = compile_and_load(KERNEL_SOURCE) - - err, empty_kernel = cuda.cuModuleGetFunction(module, b"empty_kernel") - assert_drv(err) - err, small_kernel = cuda.cuModuleGetFunction(module, b"small_kernel") - assert_drv(err) - err, kernel_16_args = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") - assert_drv(err) - - err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) - assert_drv(err) - - 
float_ptr = alloc_persistent(ctypes.sizeof(ctypes.c_float)) - int_ptrs = tuple(alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16)) - - val_ps = [ctypes.c_void_p(int(ptr)) for ptr in int_ptrs] - packed_16 = (ctypes.c_void_p * 16)() - for index, value_ptr in enumerate(val_ps): - packed_16[index] = ctypes.addressof(value_ptr) - - MODULE = module - EMPTY_KERNEL = empty_kernel - SMALL_KERNEL = small_kernel - KERNEL_16_ARGS = kernel_16_args - STREAM = stream - FLOAT_PTR = float_ptr - INT_PTRS = int_ptrs - _VAL_PS = val_ps - PACKED_16 = packed_16 - - -def bench_launch_empty_kernel(loops: int) -> float: - _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel - _kernel = EMPTY_KERNEL - _stream = STREAM - - t0 = time.perf_counter() - for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) - return time.perf_counter() - t0 - - -def bench_launch_small_kernel(loops: int) -> float: - _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel - _kernel = SMALL_KERNEL - _stream = STREAM - _args = (FLOAT_PTR,) - _arg_types = (None,) - - t0 = time.perf_counter() - for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) - return time.perf_counter() - t0 - - -def bench_launch_16_args(loops: int) -> float: - _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel - _kernel = KERNEL_16_ARGS - _stream = STREAM - _args = INT_PTRS - _arg_types = (None,) * 16 - - t0 = time.perf_counter() - for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) - return time.perf_counter() - t0 - - -def bench_launch_16_args_pre_packed(loops: int) -> float: - _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel - _kernel = KERNEL_16_ARGS - _stream = STREAM - _packed = PACKED_16 - - t0 = time.perf_counter() - for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) - return time.perf_counter() - t0 diff 
--git a/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py b/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py deleted file mode 100644 index a02b82c399..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import time - -from runner.runtime import alloc_persistent - -from cuda.bindings import driver as cuda - -# Allocate memory used by the tests -PTR = alloc_persistent(1 << 18) -ATTRIBUTE = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE - - -def bench_pointer_get_attribute(loops: int) -> float: - # Local references to avoid global lookups in the hot loop - _cuPointerGetAttribute = cuda.cuPointerGetAttribute - _attr = ATTRIBUTE - _ptr = PTR - - t0 = time.perf_counter() - for _ in range(loops): - _cuPointerGetAttribute(_attr, _ptr) - return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/bench_stream.py b/cuda_bindings/benchmarks/benchmarks/bench_stream.py deleted file mode 100644 index d816099ed5..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/bench_stream.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# SPDX-License-Identifier: Apache-2.0 - -import time - -from runner.runtime import ensure_context - -from cuda.bindings import driver as cuda - -ensure_context() - -_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) - - -def bench_stream_create_destroy(loops: int) -> float: - _cuStreamCreate = cuda.cuStreamCreate - _cuStreamDestroy = cuda.cuStreamDestroy - _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value - - t0 = time.perf_counter() - for _ in range(loops): - _, s = _cuStreamCreate(_flags) - _cuStreamDestroy(s) - return time.perf_counter() - t0 - - -def bench_stream_query(loops: int) -> float: - _cuStreamQuery = cuda.cuStreamQuery - _stream = STREAM - - t0 = time.perf_counter() - for _ in range(loops): - _cuStreamQuery(_stream) - return time.perf_counter() - t0 - - -def bench_stream_synchronize(loops: int) -> float: - _cuStreamSynchronize = cuda.cuStreamSynchronize - _stream = STREAM - - t0 = time.perf_counter() - for _ in range(loops): - _cuStreamSynchronize(_stream) - return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt b/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt deleted file mode 100644 index b4285834aa..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.24) -project(cuda_bindings_cpp_benchmarks LANGUAGES CXX) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_EXTENSIONS OFF) - -set(CUDA_HOME_HINT "$ENV{CUDA_HOME}") -set(CONDA_PREFIX_HINT "$ENV{CONDA_PREFIX}") - -# Find cuda.h (driver API header) -find_path( - CUDA_DRIVER_INCLUDE_DIR - cuda.h - HINTS - "${CUDA_HOME_HINT}/include" - "${CONDA_PREFIX_HINT}/targets/x86_64-linux/include" - "${CONDA_PREFIX_HINT}/include" -) - -# Find libcuda (driver API library) — lives on the system, not in toolkit -find_library( - CUDA_DRIVER_LIBRARY - NAMES cuda - HINTS - "/usr/lib/x86_64-linux-gnu" - "/usr/lib64" - "${CUDA_HOME_HINT}/lib64/stubs" - "${CUDA_HOME_HINT}/lib/stubs" - "${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib/stubs" - "${CONDA_PREFIX_HINT}/lib/stubs" -) - -# Find nvrtc.h and libnvrtc (for runtime compilation benchmarks) -find_path( - NVRTC_INCLUDE_DIR - nvrtc.h - HINTS - "${CUDA_HOME_HINT}/include" - "${CONDA_PREFIX_HINT}/targets/x86_64-linux/include" - "${CONDA_PREFIX_HINT}/include" -) - -find_library( - NVRTC_LIBRARY - NAMES nvrtc - HINTS - "${CUDA_HOME_HINT}/lib64" - "${CUDA_HOME_HINT}/lib" - "${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib" - "${CONDA_PREFIX_HINT}/lib" -) - -if(NOT CUDA_DRIVER_INCLUDE_DIR) - message(FATAL_ERROR "Could not find cuda.h. Ensure CUDA_HOME is set or install cuda-crt-dev.") -endif() - -if(NOT CUDA_DRIVER_LIBRARY) - message(FATAL_ERROR "Could not find libcuda. 
Ensure the NVIDIA driver is installed.") -endif() - -# Helper: add a benchmark that only needs the driver API -function(add_driver_benchmark name) - add_executable(${name}_cpp ${name}.cpp) - target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}") - target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}") -endfunction() - -# Helper: add a benchmark that needs driver API + NVRTC -function(add_nvrtc_benchmark name) - add_executable(${name}_cpp ${name}.cpp) - target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}" "${NVRTC_INCLUDE_DIR}") - target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}" "${NVRTC_LIBRARY}") -endfunction() - -# Driver-only benchmarks -add_driver_benchmark(bench_pointer_attributes) -add_driver_benchmark(bench_ctx_device) -add_driver_benchmark(bench_stream) -add_driver_benchmark(bench_event) - -# NVRTC benchmarks (require nvrtc for kernel compilation) -if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY) - add_nvrtc_benchmark(bench_launch) -else() - message(WARNING "NVRTC not found — skipping bench_launch. Install cuda-nvrtc-dev.") -endif() diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp deleted file mode 100644 index 052df9cc1d..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "bench_support.hpp" - -#include -#include - - -static void check_cu(CUresult status, const char* message) { - if (status != CUDA_SUCCESS) { - const char* error_name = nullptr; - cuGetErrorName(status, &error_name); - std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; - std::exit(1); - } -} - - -int main(int argc, char** argv) { - bench::Options options = bench::parse_args(argc, argv); - - // Setup: init CUDA and create a context - check_cu(cuInit(0), "cuInit failed"); - - CUdevice device; - check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); - - CUcontext ctx; - CUctxCreateParams ctxParams = {}; - check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - - bench::BenchmarkSuite suite(options); - - // --- ctx_get_current --- - { - CUcontext current_ctx = nullptr; - suite.run("ctx_device.ctx_get_current", [&]() { - check_cu(cuCtxGetCurrent(¤t_ctx), "cuCtxGetCurrent failed"); - }); - } - - // --- ctx_set_current --- - { - suite.run("ctx_device.ctx_set_current", [&]() { - check_cu(cuCtxSetCurrent(ctx), "cuCtxSetCurrent failed"); - }); - } - - // --- ctx_get_device --- - { - CUdevice dev; - suite.run("ctx_device.ctx_get_device", [&]() { - check_cu(cuCtxGetDevice(&dev), "cuCtxGetDevice failed"); - }); - } - - // --- device_get --- - { - CUdevice dev; - suite.run("ctx_device.device_get", [&]() { - check_cu(cuDeviceGet(&dev, 0), "cuDeviceGet failed"); - }); - } - - // --- device_get_attribute --- - { - int value = 0; - suite.run("ctx_device.device_get_attribute", [&]() { - check_cu( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), - "cuDeviceGetAttribute failed" - ); - }); - } - - // Cleanup - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - - // Write all results - suite.write(); - - return 0; -} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp deleted file mode 100644 index 44cd617778..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp +++ /dev/null @@ -1,90 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "bench_support.hpp" - -#include -#include - - -static void check_cu(CUresult status, const char* message) { - if (status != CUDA_SUCCESS) { - const char* error_name = nullptr; - cuGetErrorName(status, &error_name); - std::cerr << message << ": " << (error_name ? error_name : "unknown") << '\n'; - std::exit(1); - } -} - - -int main(int argc, char** argv) { - bench::Options options = bench::parse_args(argc, argv); - - // Setup - check_cu(cuInit(0), "cuInit failed"); - - CUdevice device; - check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); - - CUcontext ctx; - CUctxCreateParams ctxParams = {}; - check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - - CUstream stream; - check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); - - // Persistent event for query/synchronize/record benchmarks - CUevent event; - check_cu(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "cuEventCreate failed"); - - // Record and sync so the event starts in a completed state - check_cu(cuEventRecord(event, stream), "cuEventRecord failed"); - check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - - bench::BenchmarkSuite suite(options); - - // --- event_create_destroy --- - { - CUevent e; - suite.run("event.event_create_destroy", [&]() { - check_cu(cuEventCreate(&e, CU_EVENT_DISABLE_TIMING), "cuEventCreate failed"); - check_cu(cuEventDestroy(e), "cuEventDestroy failed"); - }); - } - - // --- event_record --- - { - suite.run("event.event_record", [&]() { - check_cu(cuEventRecord(event, stream), "cuEventRecord failed"); - }); - } - - // Re-sync so event is in a known completed state after the record benchmark - check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - - { - suite.run("event.event_query", [&]() { - // Returns CUDA_SUCCESS if complete, CUDA_ERROR_NOT_READY if not - cuEventQuery(event); - }); - } - - // --- event_synchronize --- - { - 
suite.run("event.event_synchronize", [&]() { - check_cu(cuEventSynchronize(event), "cuEventSynchronize failed"); - }); - } - - // Cleanup - check_cu(cuEventDestroy(event), "cuEventDestroy failed"); - check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - - suite.write(); - - return 0; -} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp deleted file mode 100644 index fb65da6d74..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp +++ /dev/null @@ -1,216 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include "bench_support.hpp" - -#include -#include -#include -#include -#include - - -static void check_cu(CUresult status, const char* message) { - if (status != CUDA_SUCCESS) { - const char* error_name = nullptr; - cuGetErrorName(status, &error_name); - std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; - std::exit(1); - } -} - -static void check_nvrtc(nvrtcResult status, const char* message) { - if (status != NVRTC_SUCCESS) { - std::cerr << message << ": " << nvrtcGetErrorString(status) << '\n'; - std::exit(1); - } -} - -static CUmodule compile_and_load(const char* source, CUdevice device) { - int major = 0, minor = 0; - check_cu(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), - "cuDeviceGetAttribute failed"); - check_cu(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), - "cuDeviceGetAttribute failed"); - - nvrtcProgram prog; - check_nvrtc(nvrtcCreateProgram(&prog, source, "benchmark_kernel.cu", 0, nullptr, nullptr), - "nvrtcCreateProgram failed"); - - std::string arch = "--gpu-architecture=sm_" + std::to_string(major) + std::to_string(minor); - const char* opts[] = {"--fmad=false", arch.c_str()}; - nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, opts); - - // Print log on failure - if (compile_result != NVRTC_SUCCESS) { - size_t log_size = 0; - nvrtcGetProgramLogSize(prog, &log_size); - std::vector log(log_size); - nvrtcGetProgramLog(prog, log.data()); - std::cerr << "NVRTC compile failed:\n" << log.data() << '\n'; - std::exit(1); - } - - size_t cubin_size = 0; - check_nvrtc(nvrtcGetCUBINSize(prog, &cubin_size), "nvrtcGetCUBINSize failed"); - std::vector cubin(cubin_size); - check_nvrtc(nvrtcGetCUBIN(prog, cubin.data()), "nvrtcGetCUBIN failed"); - nvrtcDestroyProgram(&prog); - - CUmodule module; - check_cu(cuModuleLoadData(&module, cubin.data()), "cuModuleLoadData failed"); - return module; -} - - -static const char* KERNEL_SOURCE = R"( -extern "C" __global__ void empty_kernel() { return; } -extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } - -extern "C" __global__ -void small_kernel_16_args( - int* a0, int* a1, int* a2, int* a3, - int* a4, int* a5, int* a6, int* a7, - int* a8, int* a9, int* a10, int* a11, - int* a12, int* a13, int* 
a14, int* a15) -{ *a0 = 0; } -)"; - - -int main(int argc, char** argv) { - bench::Options options = bench::parse_args(argc, argv); - - // Setup - check_cu(cuInit(0), "cuInit failed"); - - CUdevice device; - check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); - - CUcontext ctx; - CUctxCreateParams ctxParams = {}; - check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - - CUmodule module = compile_and_load(KERNEL_SOURCE, device); - - CUfunction empty_kernel, small_kernel, kernel_16_args; - check_cu(cuModuleGetFunction(&empty_kernel, module, "empty_kernel"), "GetFunction failed"); - check_cu(cuModuleGetFunction(&small_kernel, module, "small_kernel"), "GetFunction failed"); - check_cu(cuModuleGetFunction(&kernel_16_args, module, "small_kernel_16_args"), "GetFunction failed"); - - CUstream stream; - check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); - - // Allocate device memory for arguments - CUdeviceptr float_ptr; - check_cu(cuMemAlloc(&float_ptr, sizeof(float)), "cuMemAlloc failed"); - - CUdeviceptr int_ptrs[16]; - for (int i = 0; i < 16; ++i) { - check_cu(cuMemAlloc(&int_ptrs[i], sizeof(int)), "cuMemAlloc failed"); - } - - // Pre-pack kernel params for the pre-packed benchmark - void* packed_16[16]; - for (int i = 0; i < 16; ++i) { - packed_16[i] = &int_ptrs[i]; - } - - bench::BenchmarkSuite suite(options); - - // --- launch_empty_kernel --- - { - suite.run("launch.launch_empty_kernel", [&]() { - check_cu( - cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // Drain the stream between benchmarks so each starts with a clean queue - check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - - { - void* params[] = {&float_ptr}; - suite.run("launch.launch_small_kernel", [&]() { - check_cu( - cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - 
check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - - { - suite.run("launch.launch_16_args", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - - // In C++ the params are always pre-packed, so this is identical to launch_16_args. - // We include it for naming parity with the Python benchmark. - { - suite.run("launch.launch_16_args_pre_packed", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_small_kernel --- - { - void* params[] = {&float_ptr}; - suite.run("launch.launch_small_kernel", [&]() { - check_cu( - cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_16_args --- - { - suite.run("launch.launch_16_args", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) --- - // In C++ the params are always pre-packed, so this is identical to launch_16_args. - // We include it for naming parity with the Python benchmark. 
- { - suite.run("launch.launch_16_args_pre_packed", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // Cleanup - for (int i = 0; i < 16; ++i) { - check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed"); - } - check_cu(cuMemFree(float_ptr), "cuMemFree failed"); - check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); - check_cu(cuModuleUnload(module), "cuModuleUnload failed"); - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - - suite.write(); - - return 0; -} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp deleted file mode 100644 index 4d9afc6566..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "bench_support.hpp" - -#include -#include - - -static void check_cu(CUresult status, const char* message) { - if (status != CUDA_SUCCESS) { - const char* error_name = nullptr; - cuGetErrorName(status, &error_name); - std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; - std::exit(1); - } -} - - -int main(int argc, char** argv) { - bench::Options options = bench::parse_args(argc, argv); - - // Setup: init CUDA, allocate memory - check_cu(cuInit(0), "cuInit failed"); - - CUdevice device; - check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); - - CUcontext ctx; - CUctxCreateParams ctxParams = {}; - check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - - CUdeviceptr ptr; - check_cu(cuMemAlloc(&ptr, 1 << 18), "cuMemAlloc failed"); - - bench::BenchmarkSuite suite(options); - - // --- pointer_get_attribute --- - { - unsigned int memory_type = 0; - suite.run("pointer_attributes.pointer_get_attribute", [&]() { - check_cu( - cuPointerGetAttribute(&memory_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr), - "cuPointerGetAttribute failed" - ); - }); - } - - // Cleanup - check_cu(cuMemFree(ptr), "cuMemFree failed"); - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - - suite.write(); - - return 0; -} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp deleted file mode 100644 index 702e86aef0..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "bench_support.hpp" - -#include -#include - - -static void check_cu(CUresult status, const char* message) { - if (status != CUDA_SUCCESS) { - const char* error_name = nullptr; - cuGetErrorName(status, &error_name); - std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; - std::exit(1); - } -} - - -int main(int argc, char** argv) { - bench::Options options = bench::parse_args(argc, argv); - - // Setup - check_cu(cuInit(0), "cuInit failed"); - - CUdevice device; - check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); - - CUcontext ctx; - CUctxCreateParams ctxParams = {}; - check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - - // Persistent stream for query/synchronize benchmarks - CUstream stream; - check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); - - bench::BenchmarkSuite suite(options); - - // --- stream_create_destroy --- - { - CUstream s; - suite.run("stream.stream_create_destroy", [&]() { - check_cu(cuStreamCreate(&s, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); - check_cu(cuStreamDestroy(s), "cuStreamDestroy failed"); - }); - } - - // --- stream_query --- - { - suite.run("stream.stream_query", [&]() { - // cuStreamQuery returns CUDA_SUCCESS if stream is idle, - // CUDA_ERROR_NOT_READY if busy — both are valid here. - cuStreamQuery(stream); - }); - } - - // --- stream_synchronize --- - { - suite.run("stream.stream_synchronize", [&]() { - check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - }); - } - - // Cleanup - check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - - suite.write(); - - return 0; -} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp deleted file mode 100644 index 837c15a9d1..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp +++ /dev/null @@ -1,309 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace bench { - -struct Options { - std::uint64_t loops = 1000; - std::uint64_t warmups = 5; - std::uint64_t values = 20; - std::uint64_t runs = 20; - std::string output_path; - std::string benchmark_name; -}; - -// A single run result: warmup values and timed values (seconds per loop) -struct RunResult { - std::string date; - double duration_sec; - std::vector warmup_values; // seconds per loop - std::vector values; // seconds per loop -}; - -inline Options parse_args(int argc, char** argv) { - Options options; - for (int i = 1; i < argc; ++i) { - const std::string arg(argv[i]); - if (arg == "--loops" && i + 1 < argc) { - options.loops = std::strtoull(argv[++i], nullptr, 10); - continue; - } - if (arg == "--warmups" && i + 1 < argc) { - options.warmups = std::strtoull(argv[++i], nullptr, 10); - continue; - } - if (arg == "--values" && i + 1 < argc) { - options.values = std::strtoull(argv[++i], nullptr, 10); - continue; - } - if (arg == "--runs" && i + 1 < argc) { - options.runs = std::strtoull(argv[++i], nullptr, 10); - continue; - } - if ((arg == "-o" || arg == "--output") && i + 1 < argc) { - options.output_path = argv[++i]; - continue; - } - if (arg == "--name" && i + 1 < argc) { - options.benchmark_name = argv[++i]; - continue; - } - if (arg == "--help" || arg == "-h") { - std::cout << "Usage: benchmark [options]\n" - << " --loops N Loop iterations per value (default: 1000)\n" - << " --warmups N Warmup values per run (default: 5)\n" - << " --values N Timed values per run (default: 20)\n" - << " --runs N Number of runs (default: 20)\n" - << " -o, --output F Write pyperf-compatible JSON to file\n" - << " --name S Benchmark name (overrides default)\n"; - std::exit(0); - } - - std::cerr << "Unknown argument: " << arg << '\n'; - std::exit(2); - } - return options; -} - -inline std::string iso_now() 
{ - const auto now = std::chrono::system_clock::now(); - const std::time_t t = std::chrono::system_clock::to_time_t(now); - std::tm tm{}; -#ifdef _WIN32 - gmtime_s(&tm, &t); -#else - gmtime_r(&t, &tm); -#endif - char buf[64]; - std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &tm); - return std::string(buf); -} - -// Run a benchmark function. The function signature is: void fn() — one call = one operation. -// The harness calls fn() in a tight loop `loops` times per value. -template -std::vector run_benchmark(const Options& options, Fn&& fn) { - std::vector results; - results.reserve(options.runs); - - for (std::uint64_t r = 0; r < options.runs; ++r) { - RunResult run; - run.date = iso_now(); - const auto run_start = std::chrono::steady_clock::now(); - - // Warmups - for (std::uint64_t w = 0; w < options.warmups; ++w) { - const auto t0 = std::chrono::steady_clock::now(); - for (std::uint64_t i = 0; i < options.loops; ++i) { - fn(); - } - const auto t1 = std::chrono::steady_clock::now(); - const double elapsed = std::chrono::duration(t1 - t0).count(); - run.warmup_values.push_back(elapsed / static_cast(options.loops)); - } - - // Timed values - for (std::uint64_t v = 0; v < options.values; ++v) { - const auto t0 = std::chrono::steady_clock::now(); - for (std::uint64_t i = 0; i < options.loops; ++i) { - fn(); - } - const auto t1 = std::chrono::steady_clock::now(); - const double elapsed = std::chrono::duration(t1 - t0).count(); - run.values.push_back(elapsed / static_cast(options.loops)); - } - - const auto run_end = std::chrono::steady_clock::now(); - run.duration_sec = std::chrono::duration(run_end - run_start).count(); - results.push_back(std::move(run)); - } - - return results; -} - -inline void print_summary(const std::string& name, const std::vector& results) { - // Collect all timed values - std::vector all_values; - for (const auto& run : results) { - for (double v : run.values) { - all_values.push_back(v); - } - } - if (all_values.empty()) - return; - - 
double sum = 0; - for (double v : all_values) - sum += v; - - double mean = sum / static_cast(all_values.size()); - - double sq_sum = 0; - for (double v : all_values) { - double diff = v - mean; - sq_sum += diff * diff; - } - double stdev = std::sqrt(sq_sum / static_cast(all_values.size())); - - std::cout << name << ": Mean +- std dev: " - << std::fixed << std::setprecision(0) - << (mean * 1e9) << " ns +- " - << (stdev * 1e9) << " ns\n"; -} - -// Escape a JSON string (minimal — no control chars expected) -inline std::string json_str(const std::string& s) { - return "\"" + s + "\""; -} - -inline void write_pyperf_json( - const std::string& output_path, - const std::string& name, - std::uint64_t loops, - const std::vector& results -) { - std::ofstream out(output_path); - if (!out) { - std::cerr << "Failed to open output file: " << output_path << '\n'; - std::exit(3); - } - - out << std::setprecision(17); - - out << "{\"version\": \"1.0\", "; - out << "\"metadata\": {"; - out << "\"name\": " << json_str(name) << ", "; - out << "\"loops\": " << loops << ", "; - out << "\"unit\": \"second\""; - out << "}, "; - - out << "\"benchmarks\": [{\"runs\": ["; - - for (std::size_t r = 0; r < results.size(); ++r) { - const auto& run = results[r]; - if (r > 0) out << ", "; - - out << "{\"metadata\": {"; - out << "\"date\": " << json_str(run.date) << ", "; - out << "\"duration\": " << run.duration_sec; - out << "}, "; - - // Warmups: array of [loops, value] pairs - out << "\"warmups\": ["; - for (std::size_t w = 0; w < run.warmup_values.size(); ++w) { - if (w > 0) out << ", "; - out << "[" << loops << ", " << run.warmup_values[w] << "]"; - } - out << "], "; - - // Values - out << "\"values\": ["; - for (std::size_t v = 0; v < run.values.size(); ++v) { - if (v > 0) out << ", "; - out << run.values[v]; - } - out << "]}"; - } - - out << "]}]}\n"; -} - -// A collected benchmark entry: name, loops, and run results -struct BenchmarkEntry { - std::string name; - std::uint64_t loops; - 
std::vector results; -}; - -// Collect multiple benchmarks from a single binary and write them all -// to one pyperf-compatible JSON file. -class BenchmarkSuite { -public: - explicit BenchmarkSuite(Options options) : options_(std::move(options)) {} - - // Run a benchmark and record it. The name is used as the benchmark ID. - template - void run(const std::string& name, Fn&& fn) { - auto results = run_benchmark(options_, std::forward(fn)); - print_summary(name, results); - entries_.push_back({name, options_.loops, std::move(results)}); - } - - // Write all collected benchmarks to the output file (if -o was given). - void write() const { - if (options_.output_path.empty() || entries_.empty()) - return; - write_multi_pyperf_json(options_.output_path, entries_); - } - -private: - Options options_; - std::vector entries_; - - static void write_multi_pyperf_json( - const std::string& output_path, - const std::vector& entries - ) { - std::ofstream out(output_path); - if (!out) { - std::cerr << "Failed to open output file: " << output_path << '\n'; - std::exit(3); - } - - out << std::setprecision(17); - out << "{\"version\": \"1.0\", \"benchmarks\": ["; - - for (std::size_t e = 0; e < entries.size(); ++e) { - const auto& entry = entries[e]; - if (e > 0) out << ", "; - - out << "{\"metadata\": {"; - out << "\"name\": " << json_str(entry.name) << ", "; - out << "\"loops\": " << entry.loops << ", "; - out << "\"unit\": \"second\""; - out << "}, \"runs\": ["; - - for (std::size_t r = 0; r < entry.results.size(); ++r) { - const auto& run = entry.results[r]; - if (r > 0) out << ", "; - - out << "{\"metadata\": {"; - out << "\"date\": " << json_str(run.date) << ", "; - out << "\"duration\": " << run.duration_sec; - out << "}, "; - - out << "\"warmups\": ["; - for (std::size_t w = 0; w < run.warmup_values.size(); ++w) { - if (w > 0) out << ", "; - out << "[" << entry.loops << ", " << run.warmup_values[w] << "]"; - } - out << "], "; - - out << "\"values\": ["; - for (std::size_t v 
= 0; v < run.values.size(); ++v) { - if (v > 0) out << ", "; - out << run.values[v]; - } - out << "]}"; - } - out << "]}"; - } - out << "]}\n"; - } -}; - -} // namespace bench diff --git a/cuda_bindings/benchmarks/compare.py b/cuda_bindings/benchmarks/compare.py deleted file mode 100644 index 6a3e94f344..0000000000 --- a/cuda_bindings/benchmarks/compare.py +++ /dev/null @@ -1,118 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -"""Compare Python and C++ benchmark results in a summary table.""" - -import argparse -import json -import statistics -import sys -from pathlib import Path - -PROJECT_ROOT = Path(__file__).resolve().parent -DEFAULT_PYTHON = PROJECT_ROOT / "results-python.json" -DEFAULT_CPP = PROJECT_ROOT / "results-cpp.json" - - -def load_benchmarks(path: Path) -> dict[str, list[float]]: - """Load a pyperf JSON file and return {name: [values]}.""" - with open(path) as f: - data = json.load(f) - - results: dict[str, list[float]] = {} - for bench in data.get("benchmarks", []): - name = bench.get("metadata", {}).get("name", "") - if not name: - # Try to find name in run metadata - for run in bench.get("runs", []): - name = run.get("metadata", {}).get("name", "") - if name: - break - values = [] - for run in bench.get("runs", []): - values.extend(run.get("values", [])) - if name and values: - results[name] = values - return results - - -def fmt_ns(seconds: float) -> str: - ns = seconds * 1e9 - if ns >= 1000: - return f"{ns / 1000:.2f} us" - return f"{ns:.0f} ns" - - -def main() -> None: - parser = argparse.ArgumentParser(description="Compare Python vs C++ benchmark results") - parser.add_argument( - "--python", - type=Path, - default=DEFAULT_PYTHON, - help=f"Python results JSON (default: {DEFAULT_PYTHON.name})", - ) - parser.add_argument( - "--cpp", - type=Path, - default=DEFAULT_CPP, - help=f"C++ results JSON (default: {DEFAULT_CPP.name})", - ) - args = 
parser.parse_args() - - if not args.python.exists(): - print(f"Python results not found: {args.python}", file=sys.stderr) - print("Run: pixi run -e wheel bench", file=sys.stderr) - sys.exit(1) - - py_benchmarks = load_benchmarks(args.python) - cpp_benchmarks = load_benchmarks(args.cpp) if args.cpp.exists() else {} - - if not py_benchmarks: - print("No benchmarks found in Python results.", file=sys.stderr) - sys.exit(1) - - # Column widths - all_names = sorted(set(py_benchmarks) | set(cpp_benchmarks)) - name_width = max(len(n) for n in all_names) - name_width = max(name_width, len("Benchmark")) - - # Header - if cpp_benchmarks: - header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}" - sep = "-" * len(header) - print(sep) - print(header) - print(sep) - else: - header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}" - sep = "-" * len(header) - print(sep) - print(header) - print(sep) - - for name in all_names: - py_vals = py_benchmarks.get(name) - cpp_vals = cpp_benchmarks.get(name) - - py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-" - cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-" - - if py_vals and cpp_vals: - py_mean = statistics.mean(py_vals) - cpp_mean = statistics.mean(cpp_vals) - overhead_ns = (py_mean - cpp_mean) * 1e9 - overhead_str = f"+{overhead_ns:.0f} ns" - else: - overhead_str = "-" - - if cpp_benchmarks: - print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}") - else: - print(f"{name:<{name_width}} {py_str:>14}") - - print(sep) - - -if __name__ == "__main__": - main() diff --git a/cuda_bindings/benchmarks/pixi.lock b/cuda_bindings/benchmarks/pixi.lock deleted file mode 100644 index c610db2f45..0000000000 --- a/cuda_bindings/benchmarks/pixi.lock +++ /dev/null @@ -1,1767 +0,0 @@ -version: 6 -environments: - default: - channels: - - url: https://conda.anaconda.org/conda-forge/ - options: - channel-priority: disabled - pypi-prerelease-mode: 
if-necessary-or-explicit - packages: {} - source: - channels: - - url: https://conda.anaconda.org/conda-forge/ - options: - channel-priority: disabled - pypi-prerelease-mode: if-necessary-or-explicit - packages: - linux-64: - - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda - - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - - conda: .. 
- - conda: ../../cuda_pathfinder - wheel: - channels: - - url: https://conda.anaconda.org/conda-forge/ - options: - channel-priority: disabled - pypi-prerelease-mode: if-necessary-or-explicit - packages: - linux-64: - - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.1.0-py314ha160325_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.1.115-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.1.115-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.1.80-hecca717_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.1.80-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.1.115-hecca717_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda -packages: -- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda - build_number: 20 - sha256: 1dd3fffd892081df9726d7eb7e0dea6198962ba775bd88842135a4ddb4deb3c9 - md5: a9f577daf3de00bca7c3c76c0ecbd1de - depends: - - __glibc >=2.17,<3.0.a0 - - libgomp >=7.5.0 - constrains: - - openmp_impl <0.0a0 - license: BSD-3-Clause - license_family: BSD - size: 28948 - timestamp: 1770939786096 -- conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda - sha256: a9c114cbfeda42a226e2db1809a538929d2f118ef855372293bd188f71711c48 - md5: 791365c5f65975051e4e017b5da3abf5 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: GPL-2.0-or-later - license_family: GPL - size: 68072 - timestamp: 1756738968573 -- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda - sha256: 
2851d34944b056d028543f0440fb631aeeff204151ea09589d8d9c13882395de - md5: 9902aeb08445c03fb31e01beeb173988 - depends: - - binutils_impl_linux-64 >=2.45.1,<2.45.2.0a0 - license: GPL-3.0-only - license_family: GPL - size: 35128 - timestamp: 1770267175160 -- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda - sha256: 74341b26a2b9475dc14ba3cf12432fcd10a23af285101883e720216d81d44676 - md5: 83aa53cb3f5fc849851a84d777a60551 - depends: - - ld_impl_linux-64 2.45.1 default_hbd61a6d_101 - - sysroot_linux-64 - - zstd >=1.5.7,<1.6.0a0 - license: GPL-3.0-only - license_family: GPL - size: 3744895 - timestamp: 1770267152681 -- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda - sha256: 4826f97d33cbe54459970a1e84500dbe0cccf8326aaf370e707372ae20ec5a47 - md5: dec96579f9a7035a59492bf6ee613b53 - depends: - - binutils_impl_linux-64 2.45.1 default_hfdba357_101 - license: GPL-3.0-only - license_family: GPL - size: 36060 - timestamp: 1770267177798 -- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda - sha256: 0b75d45f0bba3e95dc693336fa51f40ea28c980131fec438afb7ce6118ed05f6 - md5: d2ffd7602c02f2b316fd921d39876885 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: bzip2-1.0.6 - license_family: BSD - size: 260182 - timestamp: 1771350215188 -- conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - sha256: cc9accf72fa028d31c2a038460787751127317dcfa991f8d1f1babf216bb454e - md5: 920bb03579f15389b9e512095ad995b7 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: MIT - license_family: MIT - size: 207882 - timestamp: 1765214722852 -- conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda - sha256: 8e7a40f16400d7839c82581410aa05c1f8324a693c9d50079f8c50dc9fb241f0 - md5: abd85120de1187b0d1ec305c2173c71b - depends: - - binutils - - gcc - - gcc_linux-64 14.* - 
license: BSD-3-Clause - license_family: BSD - size: 6693 - timestamp: 1753098721814 -- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda - sha256: 67cc7101b36421c5913a1687ef1b99f85b5d6868da3abbf6ec1a4181e79782fc - md5: 4492fd26db29495f0ba23f146cd5638d - depends: - - __unix - license: ISC - size: 147413 - timestamp: 1772006283803 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda - sha256: c6339858a0aaf5d939e00d345c98b99e4558f285942b27232ac098ad17ac7f8e - md5: cf45f4278afd6f4e6d03eda0f435d527 - depends: - - __glibc >=2.17,<3.0.a0 - - libffi >=3.5.2,<3.6.0a0 - - libgcc >=14 - - pycparser - - python >=3.14,<3.15.0a0 - - python_abi 3.14.* *_cp314 - license: MIT - license_family: MIT - size: 300271 - timestamp: 1761203085220 -- conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda - sha256: aa589352e61bb221351a79e5946d56916e3c595783994884accdb3b97fe9d449 - md5: 381bd45fb7aa032691f3063aff47e3a1 - depends: - - python >=3.10 - license: MIT - license_family: MIT - size: 13589 - timestamp: 1763607964133 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda - sha256: 5ece78754577b8d9030ec1f09ce1cd481125f27d8d6fcdcfe2c1017661830c61 - md5: 51d37989c1758b5edfe98518088bf700 - depends: - - __glibc >=2.17,<3.0.a0 - - bzip2 >=1.0.8,<2.0a0 - - libcurl >=8.18.0,<9.0a0 - - libexpat >=2.7.4,<3.0a0 - - libgcc >=14 - - liblzma >=5.8.2,<6.0a0 - - libstdcxx >=14 - - libuv >=1.51.0,<2.0a0 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - rhash >=1.4.6,<2.0a0 - - zstd >=1.5.7,<1.6.0a0 - license: BSD-3-Clause - license_family: BSD - size: 22330508 - timestamp: 1771383666798 -- conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - sha256: ab29d57dc70786c1269633ba3dff20288b81664d3ff8d21af995742e2bb03287 - md5: 962b9857ee8e7018c22f2776ffa0b2d7 - depends: - - python >=3.9 - license: BSD-3-Clause - 
license_family: BSD - size: 27011 - timestamp: 1733218222191 -- conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda - sha256: b90ec0e6a9eb22f7240b3584fe785457cff961fec68d40e6aece5d596f9bbd9a - md5: 0e3e144115c43c9150d18fa20db5f31c - depends: - - gcc_impl_linux-64 >=14.3.0,<14.3.1.0a0 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 31705 - timestamp: 1771378159534 -- conda: .. - name: cuda-bindings - version: 13.2.0 - build: hb0f4dca_0 - subdir: linux-64 - variants: - target_platform: linux-64 - depends: - - python - - cuda-version - - cuda-pathfinder - - libnvjitlink - - cuda-nvrtc - - cuda-nvrtc >=13.2.51,<14.0a0 - - cuda-nvvm - - libnvfatbin - - libcufile - - libcufile >=1.17.0.44,<2.0a0 - - libgcc >=15 - - libgcc >=15 - - libstdcxx >=15 - - python_abi 3.14.* *_cp314 - license: LicenseRef-NVIDIA-SOFTWARE-LICENSE - sources: - cuda-pathfinder: - path: ../cuda_pathfinder -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.1.0-py314ha160325_1.conda - sha256: aecfbbc9a687e5daba66b896613a00c617e3eadc21a31b19e53e8e642e83d7a7 - md5: 3bd3abdf71e1b8c53310195677bf00be - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-nvrtc >=13,<14.0a0 - - cuda-nvvm-impl >=13,<14.0a0 - - cuda-pathfinder >=1.1.0,<2 - - cuda-version >=13,<14.0a0 - - libcufile >=1,<2.0a0 - - libgcc >=14 - - libnvjitlink >=13.0,<14.0a0 - - libstdcxx >=14 - - numpy - - python >=3.14,<3.15.0a0 - - python_abi 3.14.* *_cp314 - constrains: - - cuda-python >=13.1.0,<13.2.0a0 - - cuda-cudart >=13,<14.0a0 - license: LicenseRef-NVIDIA-SOFTWARE-LICENSE - size: 7267159 - timestamp: 1764919647948 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.1.115-ha770c72_0.conda - sha256: 0715f15da71587238600f0584bc8d243d8fde602c3d8856f421b58dff3fb9422 - md5: a179486129ff28d053bb16fdb533568e - depends: - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1277295 - 
timestamp: 1768272295906 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda - sha256: e539baa32e3be63f89bd11d421911363faac322903caf58a15a46ba68ae29867 - md5: 4910b7b709f1168baffc2a742b39a222 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1415308 - timestamp: 1773098874302 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.1.115-ha770c72_0.conda - sha256: 82ae1f3e492146722e258e237daa537f4d4df8157b2dfa49a0869eb41a11d284 - md5: 3723bca2a84e6cc0f0a98427b71bec73 - depends: - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 96480 - timestamp: 1768280269206 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda - sha256: dd9a74a40b196b1ea150b17ca8fb539dd8f75edd349af354a7bae6dbb43e43b4 - md5: 6f4a609f3d142d4b22728823955249e9 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 97122 - timestamp: 1773115163637 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.1.80-hecca717_0.conda - sha256: 00acb7564e7c7dd60be431bd2a1a937856e38a86535d72281461cd193500a0a4 - md5: 2e2b71c8d67f6ceb1d3820aa438f3580 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart_linux-64 13.1.80 h376f20c_0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24159 - timestamp: 1764883525821 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda - sha256: 9cc44fd4914738a32cf5c801925a08c61ce45b5534833cf1df1621236a9a321d - md5: 29f5b46965bd82b0e9cc27a96d13f2bd - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart_linux-64 13.2.51 h376f20c_0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24534 - 
timestamp: 1773104357094 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.1.80-hecca717_0.conda - sha256: 12aa5dcf82cdf863be18a48a9ad4d271aa864ef985752bc9707371b84085f0c8 - md5: e3cbe24bf8ae135e9f82450be520e886 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart 13.1.80 hecca717_0 - - cuda-cudart-dev_linux-64 13.1.80 h376f20c_0 - - cuda-cudart-static 13.1.80 hecca717_0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24597 - timestamp: 1764883573873 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda - sha256: f6d81c961b6212389c07ffc9dc1268966db63aa351d46875effee40447eb9dd8 - md5: 9b35a56418b6cbbde5ea5f7d84c26317 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart 13.2.51 hecca717_0 - - cuda-cudart-dev_linux-64 13.2.51 h376f20c_0 - - cuda-cudart-static 13.2.51 hecca717_0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24961 - timestamp: 1773104406956 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.1.80-h376f20c_0.conda - sha256: 41a1cc86f2759ef6ae47cc68e2180baaeb4b989709931366ee0cdc90f8e10f5f - md5: a36776a49ae0e47a26e129bdc82aeb3e - depends: - - cuda-cccl_linux-64 - - cuda-cudart-static_linux-64 - - cuda-cudart_linux-64 - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 392459 - timestamp: 1764883538793 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda - sha256: 86dd0dc301bab5263d63f13d47b02507e0cf2fd22ff9aefa37dea2dd03c6df83 - md5: 7e5cf4b991525b7b1a2cfa3f1c81462e - depends: - - cuda-cccl_linux-64 - - cuda-cudart-static_linux-64 - - cuda-cudart_linux-64 - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 399921 - timestamp: 1773104368666 
-- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda - sha256: 7cbf145b3e59d360052556bfe9425753b119c33cbba0c1f20f0191a7330ced5c - md5: 0e5edde73725a13f7d62ddf96b7656b9 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart-static_linux-64 13.1.80 h376f20c_0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24119 - timestamp: 1764883551735 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda - sha256: d4a316038b02161e04a864c8cd146d2ec62cbd114eb951197c6ef6042d3c46c4 - md5: daec4c4dc0355adcdf009dceb3b94259 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart-static_linux-64 13.2.51 h376f20c_0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24494 - timestamp: 1773104383494 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda - sha256: 2252e12fa9a806f685684b6395a660d845dc95bdc95e52a6bc09dba8a9eccec3 - md5: be9f8ef5a01fca1f28c8d523f8501771 - depends: - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1121385 - timestamp: 1764883490595 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - sha256: e3cc51809bd8be0a96bbe01a668f08e6e611c8fba60426c4d9f10926f3159456 - md5: aa9c7d5cd427042ffbd59c9ef6014f98 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1103784 - timestamp: 1773104321614 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - sha256: fca2951815564c36cf5a4e0f7ed0222429d206fda3d4e1aa3d52a969a293b868 - md5: 4dc4c3a1e010e06035f01d661c1b70bd - depends: - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - 
size: 199654 - timestamp: 1764883502803 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - sha256: e1d943a5582c8e171c9dcf2c0c72ddd5bf0a2ac9acd6ed15898d69d618cf53c6 - md5: 51a1624c7e26d8821b5d959ee7ecb517 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 203460 - timestamp: 1773104333900 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.1.80-h376f20c_0.conda - sha256: 83bf37d5a3b4a85853cded6a8b90db302b014845b7d9461ccdb84db8c2abfbc3 - md5: 1d7073905d0359ff234545494a933d59 - depends: - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 38992 - timestamp: 1764883514338 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda - sha256: 1b372b7af937a3a2fdb1cbd5356e6b365f3495d899a413ebf98369ab0c5c0c79 - md5: 970891239574056829fc1cfc208278a7 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 39485 - timestamp: 1773104345638 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda - sha256: 9cc4f9df70c02eea5121cdb0e865207b04cd52591f57ebcac2ba44fada10eb5b - md5: df16c9049d882cdaf4f83a5b90079589 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35339417 - timestamp: 1768272955912 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - sha256: 9de235d328b7124f715805715e9918eb7f8aa5b9c56a2afa62b84f84f98077a5 - md5: 0413baaa73be1a39d5d8e442184acc78 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35736655 - timestamp: 1773100338749 -- conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda - sha256: 2c929c592ca1909e3944edec62b77403d256156a4010bfa17fb0b948d33e54d3 - md5: 1096fce4abad7dd975ce6d9953fceb6a - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-nvrtc 13.1.115 hecca717_0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - constrains: - - cuda-nvrtc-static >=13.1.115 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35845 - timestamp: 1768273073971 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda - sha256: be60eb4e84ff4846b27b323eca402b075f52caf6c138ebb06268fbaa26ef1879 - md5: 83535200a9e77165d5291b4ac82ebf6a - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-nvrtc 13.2.51 hecca717_0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - constrains: - - cuda-nvrtc-static >=13.2.51 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 36305 - timestamp: 1773100458841 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - sha256: d0111ba8fa12b96d38989d2016ecec0c11410c0e566d839ed54f3925591efb0b - md5: 03cd3639b8e13623c7b91b1cb0136402 - depends: - - cuda-nvvm-dev_linux-64 13.2.51.* - - cuda-nvvm-impl 13.2.51.* - - cuda-nvvm-tools 13.2.51.* - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 25494 - timestamp: 1773157399568 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - sha256: f00fce92bf7f1da314654f7693f571a014aaa2ba1fae3762634f3e5be254da83 - md5: 57724ac113f7435762d0c39e1b1ad341 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 28399 - timestamp: 1773115185916 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda - sha256: 12d84615684f1279799c023ce4ccc7c34f151bec2a90e0c8d04798a8c8af437c - md5: bf76661bc0de83a60537c4913f339fb3 - depends: - - __glibc 
>=2.17,<3.0.a0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=12 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 21873791 - timestamp: 1768280315627 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda - sha256: bea7cbd2ff0f8bf07e0b90d522b4834533b4024237322c09f1b3875970c4abc9 - md5: 3c3872ff2bd6cc6368dcd4b35bb995f2 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=12 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 22202489 - timestamp: 1773115209641 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda - sha256: da5fd2dc57df2047215ff76f295685b1e1e586a46c2e46214120458cee18ee80 - md5: 2df6cd3b3d6d1365a2979285703056f9 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=12 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 25988523 - timestamp: 1773115248060 -- conda: ../../cuda_pathfinder - name: cuda-pathfinder - version: 1.3.4a0 - build: pyh4616a5c_0 - subdir: noarch - variants: - target_platform: noarch - depends: - - python >=3.10 - - python * - license: Apache-2.0 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda - sha256: edf16fdfbcce5bbb445118fd8d070dda8afe36b4b437a94f472fde153bc38151 - md5: 2d13e524da66b60e6e7d5c6585729ea8 - depends: - - python >=3.10 - - cuda-version >=12.0,<14 - - python - license: Apache-2.0 - license_family: APACHE - size: 39327 - timestamp: 1772059437166 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda - sha256: 176ac20fdb95611af8fb2bf0d3d16fee998019b1d0f12fc9ddd5fa0df4553992 - md5: d85448460c25ee43ff2f8346bb9ad52b - constrains: - - cudatoolkit 13.1|13.1.* - - __cuda >=13 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 21511 - timestamp: 1757017115788 -- conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda - sha256: 64aebe8ccb3a2c3ff446d3c0c0e88ef4fdb069a5732c03539bf3a37243c4c679 - md5: 45676e3dd76b30ec613f1f822d450eff - constrains: - - __cuda >=13 - - cudatoolkit 13.2|13.2.* - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 21908 - timestamp: 1773093709154 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda - sha256: 3fcc97ae3e89c150401a50a4de58794ffc67b1ed0e1851468fcc376980201e25 - md5: 5da8c935dca9186673987f79cef0b2a5 - depends: - - c-compiler 1.11.0 h4d9bdce_0 - - gxx - - gxx_linux-64 14.* - license: BSD-3-Clause - license_family: BSD - size: 6635 - timestamp: 1753098722177 -- conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda - sha256: 6d977f0b2fc24fee21a9554389ab83070db341af6d6f09285360b2e09ef8b26e - md5: 003b8ba0a94e2f1e117d0bd46aebc901 - depends: - - python >=3.9 - license: Apache-2.0 - license_family: APACHE - size: 275642 - timestamp: 1752823081585 -- conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda - sha256: ee6cf346d017d954255bbcbdb424cddea4d14e4ed7e9813e429db1d795d01144 - md5: 8e662bd460bda79b1ea39194e3c4c9ab - depends: - - python >=3.10 - - typing_extensions >=4.6.0 - license: MIT and PSF-2.0 - size: 21333 - timestamp: 1763918099466 -- conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda - sha256: dddea9ec53d5e179de82c24569d41198f98db93314f0adae6b15195085d5567f - md5: f58064cec97b12a7136ebb8a6f8a129b - depends: - - python >=3.10 - license: Unlicense - size: 25845 - timestamp: 1773314012590 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda - sha256: 9b34b57b06b485e33a40d430f71ac88c8f381673592507cf7161c50ff0832772 - md5: 52d6457abc42e320787ada5f9033fa99 - depends: - - conda-gcc-specs - - gcc_impl_linux-64 14.3.0 hbdf3cc3_18 - license: BSD-3-Clause - license_family: BSD - 
size: 29506 - timestamp: 1771378321585 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda - sha256: 3b31a273b806c6851e16e9cf63ef87cae28d19be0df148433f3948e7da795592 - md5: 30bb690150536f622873758b0e8d6712 - depends: - - binutils_impl_linux-64 >=2.45 - - libgcc >=14.3.0 - - libgcc-devel_linux-64 14.3.0 hf649bbc_118 - - libgomp >=14.3.0 - - libsanitizer 14.3.0 h8f1669f_18 - - libstdcxx >=14.3.0 - - libstdcxx-devel_linux-64 14.3.0 h9f08a49_118 - - sysroot_linux-64 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 76302378 - timestamp: 1771378056505 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda - sha256: 27ad0cd10dccffca74e20fb38c9f8643ff8fce56eee260bf89fa257d5ab0c90a - md5: 1403ed5fe091bd7442e4e8a229d14030 - depends: - - gcc_impl_linux-64 14.3.0.* - - binutils_linux-64 - - sysroot_linux-64 - license: BSD-3-Clause - license_family: BSD - size: 28946 - timestamp: 1770908213807 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda - sha256: 1b490c9be9669f9c559db7b2a1f7d8b973c58ca0c6f21a5d2ba3f0ab2da63362 - md5: 19189121d644d4ef75fed05383bc75f5 - depends: - - gcc 14.3.0 h0dff253_18 - - gxx_impl_linux-64 14.3.0 h2185e75_18 - license: BSD-3-Clause - license_family: BSD - size: 28883 - timestamp: 1771378355605 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda - sha256: 38ffca57cc9c264d461ac2ce9464a9d605e0f606d92d831de9075cb0d95fc68a - md5: 6514b3a10e84b6a849e1b15d3753eb22 - depends: - - gcc_impl_linux-64 14.3.0 hbdf3cc3_18 - - libstdcxx-devel_linux-64 14.3.0 h9f08a49_118 - - sysroot_linux-64 - - tzdata - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 14566100 - timestamp: 1771378271421 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda - sha256: 
1e07c197e0779fa9105e59cd55a835ded96bfde59eb169439736a89b27b48e5d - md5: 7b51f4ff82eeb1f386bfee20a7bed3ed - depends: - - gxx_impl_linux-64 14.3.0.* - - gcc_linux-64 ==14.3.0 h298d278_21 - - binutils_linux-64 - - sysroot_linux-64 - license: BSD-3-Clause - license_family: BSD - size: 27503 - timestamp: 1770908213813 -- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda - sha256: 142a722072fa96cf16ff98eaaf641f54ab84744af81754c292cb81e0881c0329 - md5: 186a18e3ba246eccfc7cff00cd19a870 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - libstdcxx >=14 - license: MIT - license_family: MIT - size: 12728445 - timestamp: 1767969922681 -- conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda - sha256: 7cd5eccdb171a0adbf83a1ad8fc4e17822f4fc3f5518da9040de64e88bc07343 - md5: 5b7ae2ec4e0750e094f804a6cf1b2a37 - depends: - - python >=3.10 - - ukkonen - license: MIT - license_family: MIT - size: 79520 - timestamp: 1772402363021 -- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda - sha256: c18ab120a0613ada4391b15981d86ff777b5690ca461ea7e9e49531e8f374745 - md5: 63ccfdc3a3ce25b027b8767eb722fca8 - depends: - - python >=3.9 - - zipp >=3.20 - - python - license: Apache-2.0 - license_family: APACHE - size: 34641 - timestamp: 1747934053147 -- conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda - sha256: e1a9e3b1c8fe62dc3932a616c284b5d8cbe3124bbfbedcf4ce5c828cb166ee19 - md5: 9614359868482abba1bd15ce465e3c42 - depends: - - python >=3.10 - license: MIT - license_family: MIT - size: 13387 - timestamp: 1760831448842 -- conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda - sha256: 41557eeadf641de6aeae49486cef30d02a6912d8da98585d687894afd65b356a - md5: 86d9cba083cd041bfbf242a01a7a1999 - constrains: - - sysroot_linux-64 ==2.28 - license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND 
GPL-2.0-or-later - license_family: GPL - size: 1278712 - timestamp: 1765578681495 -- conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda - sha256: 0960d06048a7185d3542d850986d807c6e37ca2e644342dd0c72feefcf26c2a4 - md5: b38117a3c920364aff79f870c984b4a3 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: LGPL-2.1-or-later - size: 134088 - timestamp: 1754905959823 -- conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda - sha256: 3e307628ca3527448dd1cb14ad7bb9d04d1d28c7d4c5f97ba196ae984571dd25 - md5: fb53fb07ce46a575c5d004bbc96032c2 - depends: - - __glibc >=2.17,<3.0.a0 - - keyutils >=1.6.3,<2.0a0 - - libedit >=3.1.20250104,<3.2.0a0 - - libedit >=3.1.20250104,<4.0a0 - - libgcc >=14 - - libstdcxx >=14 - - openssl >=3.5.5,<4.0a0 - license: MIT - license_family: MIT - size: 1386730 - timestamp: 1769769569681 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda - sha256: 565941ac1f8b0d2f2e8f02827cbca648f4d18cd461afc31f15604cd291b5c5f3 - md5: 12bd9a3f089ee6c9266a37dab82afabd - depends: - - __glibc >=2.17,<3.0.a0 - - zstd >=1.5.7,<1.6.0a0 - constrains: - - binutils_impl_linux-64 2.45.1 - license: GPL-3.0-only - license_family: GPL - size: 725507 - timestamp: 1770267139900 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - build_number: 5 - sha256: 18c72545080b86739352482ba14ba2c4815e19e26a7417ca21a95b76ec8da24c - md5: c160954f7418d7b6e87eaf05a8913fa9 - depends: - - libopenblas >=0.3.30,<0.3.31.0a0 - - libopenblas >=0.3.30,<1.0a0 - constrains: - - mkl <2026 - - liblapack 3.11.0 5*_openblas - - libcblas 3.11.0 5*_openblas - - blas 2.305 openblas - - liblapacke 3.11.0 5*_openblas - license: BSD-3-Clause - license_family: BSD - size: 18213 - timestamp: 1765818813880 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - sha256: 
9517cce5193144af0fcbf19b7bd67db0a329c2cc2618f28ffecaa921a1cbe9d3 - md5: 09c264d40c67b82b49a3f3b89037bd2e - depends: - - __glibc >=2.17,<3.0.a0 - - attr >=2.5.2,<2.6.0a0 - - libgcc >=14 - license: BSD-3-Clause - license_family: BSD - size: 121429 - timestamp: 1762349484074 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - build_number: 5 - sha256: 0cbdcc67901e02dc17f1d19e1f9170610bd828100dc207de4d5b6b8ad1ae7ad8 - md5: 6636a2b6f1a87572df2970d3ebc87cc0 - depends: - - libblas 3.11.0 5_h4a7cf45_openblas - constrains: - - liblapacke 3.11.0 5*_openblas - - blas 2.305 openblas - - liblapack 3.11.0 5*_openblas - license: BSD-3-Clause - license_family: BSD - size: 18194 - timestamp: 1765818837135 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda - sha256: 8c44b5bf947afad827df0df49fe7483cf1b2916694081b2db4fecdfd6a2bacd1 - md5: 48418c48dac04671fa46cb446122b8a5 - depends: - - __glibc >=2.28,<3.0.a0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - - rdma-core >=60.0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 990938 - timestamp: 1768273732081 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda - sha256: dc2b0c43aeacbaa686061353807e718236d8c5b346f624e76fed98b066898e19 - md5: 6d8ed8335d144ec7303b8d3587b2205c - depends: - - __glibc >=2.28,<3.0.a0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - - rdma-core >=61.0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1085341 - timestamp: 1773100191342 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda - sha256: c84e8dccb65ad5149c0121e4b54bdc47fa39303fd5f4979b8c44bb51b39a369b - md5: 1707cdd636af2ff697b53186572c9f77 - depends: - - __glibc >=2.17,<3.0.a0 - - krb5 >=1.22.2,<1.23.0a0 - - libgcc >=14 - - libnghttp2 >=1.67.0,<2.0a0 - - libssh2 >=1.11.1,<2.0a0 - - libzlib >=1.3.1,<2.0a0 - - 
openssl >=3.5.5,<4.0a0 - - zstd >=1.5.7,<1.6.0a0 - license: curl - license_family: MIT - size: 463621 - timestamp: 1770892808818 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda - sha256: d789471216e7aba3c184cd054ed61ce3f6dac6f87a50ec69291b9297f8c18724 - md5: c277e0a4d549b03ac1e9d6cbbe3d017b - depends: - - ncurses - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - - ncurses >=6.5,<7.0a0 - license: BSD-2-Clause - license_family: BSD - size: 134676 - timestamp: 1738479519902 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda - sha256: 1cd6048169fa0395af74ed5d8f1716e22c19a81a8a36f934c110ca3ad4dd27b4 - md5: 172bf1cd1ff8629f2b1179945ed45055 - depends: - - libgcc-ng >=12 - license: BSD-2-Clause - license_family: BSD - size: 112766 - timestamp: 1702146165126 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda - sha256: d78f1d3bea8c031d2f032b760f36676d87929b18146351c4464c66b0869df3f5 - md5: e7f7ce06ec24cfcfb9e36d28cf82ba57 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - constrains: - - expat 2.7.4.* - license: MIT - license_family: MIT - size: 76798 - timestamp: 1771259418166 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda - sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 - md5: a360c33a5abe61c07959e449fa1453eb - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: MIT - license_family: MIT - size: 58592 - timestamp: 1769456073053 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda - sha256: faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 - md5: 0aa00f03f9e39fb9876085dee11a85d4 - depends: - - __glibc >=2.17,<3.0.a0 - - _openmp_mutex >=4.5 - constrains: - - libgcc-ng ==15.2.0=*_18 - - libgomp 15.2.0 he0feb66_18 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 1041788 - timestamp: 1771378212382 
-- conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda - sha256: 1abc6a81ee66e8ac9ac09a26e2d6ad7bba23f0a0cc3a6118654f036f9c0e1854 - md5: 06901733131833f5edd68cf3d9679798 - depends: - - __unix - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 3084533 - timestamp: 1771377786730 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda - sha256: e318a711400f536c81123e753d4c797a821021fb38970cebfb3f454126016893 - md5: d5e96b1ed75ca01906b3d2469b4ce493 - depends: - - libgcc 15.2.0 he0feb66_18 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 27526 - timestamp: 1771378224552 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda - sha256: d2c9fad338fd85e4487424865da8e74006ab2e2475bd788f624d7a39b2a72aee - md5: 9063115da5bc35fdc3e1002e69b9ef6e - depends: - - libgfortran5 15.2.0 h68bc16d_18 - constrains: - - libgfortran-ng ==15.2.0=*_18 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 27523 - timestamp: 1771378269450 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda - sha256: 539b57cf50ec85509a94ba9949b7e30717839e4d694bc94f30d41c9d34de2d12 - md5: 646855f357199a12f02a87382d429b75 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=15.2.0 - constrains: - - libgfortran 15.2.0 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 2482475 - timestamp: 1771378241063 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda - sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 - md5: 239c5e9546c38a1e884d69effcf4c882 - depends: - - __glibc >=2.17,<3.0.a0 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 603262 - timestamp: 1771378117851 -- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda 
- build_number: 5 - sha256: c723b6599fcd4c6c75dee728359ef418307280fa3e2ee376e14e85e5bbdda053 - md5: b38076eb5c8e40d0106beda6f95d7609 - depends: - - libblas 3.11.0 5_h4a7cf45_openblas - constrains: - - blas 2.305 openblas - - liblapacke 3.11.0 5*_openblas - - libcblas 3.11.0 5*_openblas - license: BSD-3-Clause - license_family: BSD - size: 18200 - timestamp: 1765818857876 -- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda - sha256: 755c55ebab181d678c12e49cced893598f2bab22d582fbbf4d8b83c18be207eb - md5: c7c83eecbb72d88b940c249af56c8b17 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - constrains: - - xz 5.8.2.* - license: 0BSD - size: 113207 - timestamp: 1768752626120 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda - sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 - md5: 2c21e66f50753a083cbe6b80f38268fa - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: BSD-2-Clause - license_family: BSD - size: 92400 - timestamp: 1769482286018 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda - sha256: a4a7dab8db4dc81c736e9a9b42bdfd97b087816e029e221380511960ac46c690 - md5: b499ce4b026493a13774bcf0f4c33849 - depends: - - __glibc >=2.17,<3.0.a0 - - c-ares >=1.34.5,<2.0a0 - - libev >=4.33,<4.34.0a0 - - libev >=4.33,<5.0a0 - - libgcc >=14 - - libstdcxx >=14 - - libzlib >=1.3.1,<2.0a0 - - openssl >=3.5.2,<4.0a0 - license: MIT - license_family: MIT - size: 666600 - timestamp: 1756834976695 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda - sha256: ba7c5d294e3d80f08ac5a39564217702d1a752e352e486210faff794ac5001b4 - md5: db63358239cbe1ff86242406d440e44a - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: LGPL-2.1-or-later - license_family: LGPL - size: 741323 - timestamp: 1731846827427 -- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda - sha256: 66b7bbe40d259e4927b9c264569afd49d0e31a3813c585beea63f3415577f1b3 - md5: 7e6534bce7252c84efdedae1fae2148e - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 471076 - timestamp: 1773100181931 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.1.115-hecca717_1.conda - sha256: 6b5300bf9952da4bfdbfb45c13b042d786a0daffb1bd2fa45ea9ad971703fe96 - md5: 851acc1af02d31c732b931b9ffddc2d9 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 31328660 - timestamp: 1771443943495 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda - sha256: 2ca45a2c9e6cc307cea3c8a1bf27bceb745fa5e1150d7b768b63a781eeaee7a2 - md5: 20a82402e6851e5d4e0b13ee1083d370 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 31691081 - timestamp: 1773100788615 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda - sha256: 199d79c237afb0d4780ccd2fbf829cea80743df60df4705202558675e07dd2c5 - md5: be43915efc66345cccb3c310b6ed0374 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - libgfortran - - libgfortran5 >=14.3.0 - constrains: - - openblas >=0.3.30,<0.3.31.0a0 - license: BSD-3-Clause - license_family: BSD - size: 5927939 - timestamp: 1763114673331 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda - sha256: e03ed186eefb46d7800224ad34bad1268c9d19ecb8f621380a50601c6221a4a7 - md5: ad3a0e2dc4cce549b2860e2ef0e6d75b - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14.3.0 - - libstdcxx >=14.3.0 - license: 
GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 7949259 - timestamp: 1771377982207 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda - sha256: 04596fcee262a870e4b7c9807224680ff48d4d0cc0dac076a602503d3dc6d217 - md5: da5be73701eecd0e8454423fd6ffcf30 - depends: - - __glibc >=2.17,<3.0.a0 - - icu >=78.2,<79.0a0 - - libgcc >=14 - - libzlib >=1.3.1,<2.0a0 - license: blessing - size: 942808 - timestamp: 1768147973361 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda - sha256: fa39bfd69228a13e553bd24601332b7cfeb30ca11a3ca50bb028108fe90a7661 - md5: eecce068c7e4eddeb169591baac20ac4 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - - libzlib >=1.3.1,<2.0a0 - - openssl >=3.5.0,<4.0a0 - license: BSD-3-Clause - license_family: BSD - size: 304790 - timestamp: 1745608545575 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda - sha256: 78668020064fdaa27e9ab65cd2997e2c837b564ab26ce3bf0e58a2ce1a525c6e - md5: 1b08cd684f34175e4514474793d44bcb - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc 15.2.0 he0feb66_18 - constrains: - - libstdcxx-ng ==15.2.0=*_18 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 5852330 - timestamp: 1771378262446 -- conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda - sha256: b1c3824769b92a1486bf3e2cc5f13304d83ae613ea061b7bc47bb6080d6dfdba - md5: 865a399bce236119301ebd1532fced8d - depends: - - __unix - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 20171098 - timestamp: 1771377827750 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda - sha256: f0356bb344a684e7616fc84675cfca6401140320594e8686be30e8ac7547aed2 - md5: 1d4c18d75c51ed9d00092a891a547a7d - depends: - - __glibc >=2.17,<3.0.a0 - - libcap >=2.77,<2.78.0a0 - - libgcc >=14 - license: LGPL-2.1-or-later - size: 491953 
- timestamp: 1770738638119 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda - sha256: ed4d2c01fbeb1330f112f7e399408634db277d3dfb2dec1d0395f56feaa24351 - md5: 6c74fba677b61a0842cbf0f63eee683b - depends: - - __glibc >=2.17,<3.0.a0 - - libcap >=2.77,<2.78.0a0 - - libgcc >=14 - license: LGPL-2.1-or-later - size: 144654 - timestamp: 1770738650966 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda - sha256: 1a7539cfa7df00714e8943e18de0b06cceef6778e420a5ee3a2a145773758aee - md5: db409b7c1720428638e7c0d509d3e1b5 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: BSD-3-Clause - license_family: BSD - size: 40311 - timestamp: 1766271528534 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda - sha256: c180f4124a889ac343fc59d15558e93667d894a966ec6fdb61da1604481be26b - md5: 0f03292cc56bf91a077a134ea8747118 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: MIT - license_family: MIT - size: 895108 - timestamp: 1753948278280 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 - md5: edb0dca6bc32e4f4789199455a1dbeb8 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - constrains: - - zlib 1.3.1 *_2 - license: Zlib - license_family: Other - size: 60963 - timestamp: 1727963148474 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - sha256: 3fde293232fa3fca98635e1167de6b7c7fda83caf24b9d6c91ec9eefb4f4d586 - md5: 47e340acb35de30501a76c7c799c41d7 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: X11 AND BSD-3-Clause - size: 891641 - timestamp: 1738195959188 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda - sha256: 6f7d59dbec0a7b00bf5d103a4306e8886678b796ff2151b62452d4582b2a53fb - md5: b518e9e92493721281a60fa975bddc65 - depends: - - libstdcxx 
>=14 - - libgcc >=14 - - __glibc >=2.17,<3.0.a0 - license: Apache-2.0 - license_family: APACHE - size: 186323 - timestamp: 1763688260928 -- conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - sha256: 4fa40e3e13fc6ea0a93f67dfc76c96190afd7ea4ffc1bac2612d954b42cdc3ee - md5: eb52d14a901e23c39e9e7b4a1a5c015f - depends: - - python >=3.10 - - setuptools - license: BSD-3-Clause - license_family: BSD - size: 40866 - timestamp: 1766261270149 -- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda - sha256: 1d8377c8001c15ed12c2713b723213474b435706ab9d34ede69795d64af9e94d - md5: 4ea6b620fdf24a1a0bc4f1c7134dfafb - depends: - - python - - libstdcxx >=14 - - libgcc >=14 - - __glibc >=2.17,<3.0.a0 - - libcblas >=3.9.0,<4.0a0 - - python_abi 3.14.* *_cp314 - - libblas >=3.9.0,<4.0a0 - - liblapack >=3.9.0,<4.0a0 - constrains: - - numpy-base <0a0 - license: BSD-3-Clause - license_family: BSD - size: 8926994 - timestamp: 1770098474394 -- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda - sha256: 44c877f8af015332a5d12f5ff0fb20ca32f896526a7d0cdb30c769df1144fb5c - md5: f61eb8cd60ff9057122a3d338b99c00f - depends: - - __glibc >=2.17,<3.0.a0 - - ca-certificates - - libgcc >=14 - license: Apache-2.0 - license_family: Apache - size: 3164551 - timestamp: 1769555830639 -- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda - sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 - md5: b76541e68fea4d511b1ac46a28dcd2c6 - depends: - - python >=3.8 - - python - license: Apache-2.0 - license_family: APACHE - size: 72010 - timestamp: 1769093650580 -- conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda - sha256: 0289f0a38337ee201d984f8f31f11f6ef076cfbbfd0ab9181d12d9d1d099bf46 - md5: 82c1787f2a65c0155ef9652466ee98d6 - depends: - - python >=3.10 - - python - license: MIT - license_family: MIT - size: 
25646 - timestamp: 1773199142345 -- conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda - sha256: e14aafa63efa0528ca99ba568eaf506eb55a0371d12e6250aaaa61718d2eb62e - md5: d7585b6550ad04c8c5e21097ada2888e - depends: - - python >=3.9 - - python - license: MIT - license_family: MIT - size: 25877 - timestamp: 1764896838868 -- conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda - sha256: 5b81b7516d4baf43d0c185896b245fa7384b25dc5615e7baa504b7fa4e07b706 - md5: 7f3ac694319c7eaf81a0325d6405e974 - depends: - - cfgv >=2.0.0 - - identify >=1.0.0 - - nodeenv >=0.11.1 - - python >=3.10 - - pyyaml >=5.1 - - virtualenv >=20.10.0 - license: MIT - license_family: MIT - size: 200827 - timestamp: 1765937577534 -- conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda - sha256: 6d8f03c13d085a569fde931892cded813474acbef2e03381a1a87f420c7da035 - md5: 46830ee16925d5ed250850503b5dc3a8 - depends: - - python >=3.9 - license: MIT - license_family: MIT - size: 25766 - timestamp: 1733236452235 -- conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda - sha256: 79db7928d13fab2d892592223d7570f5061c192f27b9febd1a418427b719acc6 - md5: 12c566707c80111f9799308d9e265aef - depends: - - python >=3.9 - - python - license: BSD-3-Clause - license_family: BSD - size: 110100 - timestamp: 1733195786147 -- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a - md5: 6b6ece66ebcae2d5f326c77ef2c5a066 - depends: - - python >=3.9 - license: BSD-2-Clause - license_family: BSD - size: 889287 - timestamp: 1750615908735 -- conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda - sha256: 438c41b42530874928733299ca815f5994f36996c86024f3f37ca220ed910a07 - md5: ed166875b3876d5d7e6e39d2e8d1c6e3 - depends: - - python >=3.14,<3.15.0a0 - - python_abi 
3.14.* *_cp314 - - six - license: MIT - license_family: MIT - size: 273897 - timestamp: 1765980972868 -- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda - sha256: 9e749fb465a8bedf0184d8b8996992a38de351f7c64e967031944978de03a520 - md5: 2b694bad8a50dc2f712f5368de866480 - depends: - - pygments >=2.7.2 - - python >=3.10 - - iniconfig >=1.0.1 - - packaging >=22 - - pluggy >=1.5,<2 - - tomli >=1 - - colorama >=0.4 - - exceptiongroup >=1 - - python - constrains: - - pytest-faulthandler >=2 - license: MIT - license_family: MIT - size: 299581 - timestamp: 1765062031645 -- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda - sha256: 2f2229415a6e5387c1faaedf442ea8c07471cb2bf5ad1007b9cfb83ea85ca29a - md5: 0e7294ed4af8b833fcd2c101d647c3da - depends: - - py-cpuinfo - - pytest >=8.1 - - python >=3.10 - license: BSD-2-Clause - license_family: BSD - size: 43976 - timestamp: 1762716480208 -- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda - build_number: 101 - sha256: cb0628c5f1732f889f53a877484da98f5a0e0f47326622671396fb4f2b0cd6bd - md5: c014ad06e60441661737121d3eae8a60 - depends: - - __glibc >=2.17,<3.0.a0 - - bzip2 >=1.0.8,<2.0a0 - - ld_impl_linux-64 >=2.36.1 - - libexpat >=2.7.3,<3.0a0 - - libffi >=3.5.2,<3.6.0a0 - - libgcc >=14 - - liblzma >=5.8.2,<6.0a0 - - libmpdec >=4.0.0,<5.0a0 - - libsqlite >=3.51.2,<4.0a0 - - libuuid >=2.41.3,<3.0a0 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - openssl >=3.5.5,<4.0a0 - - python_abi 3.14.* *_cp314 - - readline >=8.3,<9.0a0 - - tk >=8.6.13,<8.7.0a0 - - tzdata - - zstd >=1.5.7,<1.6.0a0 - license: Python-2.0 - size: 36702440 - timestamp: 1770675584356 - python_site_packages_path: lib/python3.14/site-packages -- conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda - sha256: 36429765f626c345710fbae14aeeda676c1745427667eb480bb855b7089affba - md5: 
69fc0a99fc21b26b81026c72e00f83df - depends: - - python >=3.10 - - filelock >=3.15.4 - - platformdirs <5,>=4.3.6 - - python - license: MIT - license_family: MIT - size: 33996 - timestamp: 1773161039118 -- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda - build_number: 8 - sha256: ad6d2e9ac39751cc0529dd1566a26751a0bf2542adb0c232533d32e176e21db5 - md5: 0539938c55b6b1a59b560e843ad864a4 - constrains: - - python 3.14.* *_cp314 - license: BSD-3-Clause - license_family: BSD - size: 6989 - timestamp: 1752805904792 -- conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda - sha256: b318fb070c7a1f89980ef124b80a0b5ccf3928143708a85e0053cde0169c699d - md5: 2035f68f96be30dc60a5dfd7452c7941 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - python >=3.14,<3.15.0a0 - - python_abi 3.14.* *_cp314 - - yaml >=0.2.5,<0.3.0a0 - license: MIT - license_family: MIT - size: 202391 - timestamp: 1770223462836 -- conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda - sha256: 8e0b7962cf8bec9a016cd91a6c6dc1f9ebc8e7e316b1d572f7b9047d0de54717 - md5: d487d93d170e332ab39803e05912a762 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - libnl >=3.11.0,<4.0a0 - - libstdcxx >=14 - - libsystemd0 >=257.10 - - libudev1 >=257.10 - license: Linux-OpenIB - license_family: BSD - size: 1268666 - timestamp: 1769154883613 -- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda - sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 - md5: d7d95fc8287ea7bf33e0e7116d2b95ec - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - ncurses >=6.5,<7.0a0 - license: GPL-3.0-only - license_family: GPL - size: 345073 - timestamp: 1765813471974 -- conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda - sha256: d5c73079c1dd2c2a313c3bfd81c73dbd066b7eb08d213778c8bff520091ae894 - md5: c1c9b02933fdb2cfb791d936c20e887e - depends: - - 
__glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: MIT - license_family: MIT - size: 193775 - timestamp: 1748644872902 -- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda - sha256: 82088a6e4daa33329a30bc26dc19a98c7c1d3f05c0f73ce9845d4eab4924e9e1 - md5: 8e194e7b992f99a5015edbd4ebd38efd - depends: - - python >=3.10 - license: MIT - license_family: MIT - size: 639697 - timestamp: 1773074868565 -- conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - sha256: 458227f759d5e3fcec5d9b7acce54e10c9e1f4f4b7ec978f3bfd54ce4ee9853d - md5: 3339e3b65d58accf4ca4fb8748ab16b3 - depends: - - python >=3.9 - - python - license: MIT - license_family: MIT - size: 18455 - timestamp: 1753199211006 -- conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda - sha256: c47299fe37aebb0fcf674b3be588e67e4afb86225be4b0d452c7eb75c086b851 - md5: 13dc3adbc692664cd3beabd216434749 - depends: - - __glibc >=2.28 - - kernel-headers_linux-64 4.18.0 he073ed8_9 - - tzdata - license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND GPL-2.0-or-later - license_family: GPL - size: 24008591 - timestamp: 1765578833462 -- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda - sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac - md5: cffd3bdd58090148f4cfcd831f4b26ab - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - libzlib >=1.3.1,<2.0a0 - constrains: - - xorg-libx11 >=1.8.12,<2.0a0 - license: TCL - license_family: BSD - size: 3301196 - timestamp: 1769460227866 -- conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda - sha256: 62940c563de45790ba0f076b9f2085a842a65662268b02dd136a8e9b1eaf47a8 - md5: 72e780e9aa2d0a3295f59b1874e3768b - depends: - - python >=3.10 - - python - license: MIT - license_family: MIT - size: 21453 - timestamp: 1768146676791 -- conda: 
https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda - sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 - md5: 0caa1af407ecff61170c9437a808404d - depends: - - python >=3.10 - - python - license: PSF-2.0 - license_family: PSF - size: 51692 - timestamp: 1756220668932 -- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda - sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c - md5: ad659d0a2b3e47e38d829aa8cad2d610 - license: LicenseRef-Public-Domain - size: 119135 - timestamp: 1767016325805 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda - sha256: c84034056dc938c853e4f61e72e5bd37e2ec91927a661fb9762f678cbea52d43 - md5: 5d3c008e54c7f49592fca9c32896a76f - depends: - - __glibc >=2.17,<3.0.a0 - - cffi - - libgcc >=14 - - libstdcxx >=14 - - python >=3.14,<3.15.0a0 - - python_abi 3.14.* *_cp314 - license: MIT - license_family: MIT - size: 15004 - timestamp: 1769438727085 -- conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda - sha256: b83246d145ba0e6814d2ed0b616293e56924e6c7d6649101f5a4f97f9e757ed1 - md5: 704c22301912f7e37d0a92b2e7d5942d - depends: - - python >=3.10 - - distlib >=0.3.7,<1 - - filelock <4,>=3.24.2 - - importlib-metadata >=6.6 - - platformdirs >=3.9.1,<5 - - python-discovery >=1 - - typing_extensions >=4.13.2 - - python - license: MIT - license_family: MIT - size: 4647775 - timestamp: 1773133660203 -- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - sha256: 6d9ea2f731e284e9316d95fa61869fe7bbba33df7929f82693c121022810f4ad - md5: a77f85f77be52ff59391544bfe73390a - depends: - - libgcc >=14 - - __glibc >=2.17,<3.0.a0 - license: MIT - license_family: MIT - size: 85189 - timestamp: 1753484064210 -- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - sha256: 
b4533f7d9efc976511a73ef7d4a2473406d7f4c750884be8e8620b0ce70f4dae - md5: 30cd29cb87d819caead4d55184c1d115 - depends: - - python >=3.10 - - python - license: MIT - license_family: MIT - size: 24194 - timestamp: 1764460141901 -- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 - md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 - depends: - - __glibc >=2.17,<3.0.a0 - - libzlib >=1.3.1,<2.0a0 - license: BSD-3-Clause - license_family: BSD - size: 601375 - timestamp: 1764777111296 diff --git a/cuda_bindings/benchmarks/pixi.toml b/cuda_bindings/benchmarks/pixi.toml deleted file mode 100644 index a448e8d3e4..0000000000 --- a/cuda_bindings/benchmarks/pixi.toml +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -[workspace] -channels = ["conda-forge"] -platforms = ["linux-64"] -preview = ["pixi-build"] -channel-priority = "disabled" - -[feature.cu13.system-requirements] -cuda = "13" - -[feature.cu13-pinned.dependencies] -cuda-version = "13.1.*" - -[feature.cu13-source.dependencies] -cuda-version = "13.*" - -[feature.bench.dependencies] -python = "3.14.*" -pyperf = "*" -pytest = "*" -pytest-benchmark = "*" -numpy = "*" - -[feature.cpp-bench.dependencies] -cmake = "*" -ninja = "*" -cxx-compiler = "*" -cuda-cudart-dev = "*" -cuda-nvrtc-dev = "*" - -[feature.cpp-bench.target.linux-64.dependencies] -cuda-crt-dev_linux-64 = "*" -cuda-driver-dev_linux-64 = "*" - -[feature.cpp-bench.target.linux-64.activation.env] -CUDA_HOME = "$CONDA_PREFIX/targets/x86_64-linux" - -[feature.dev.dependencies] -pre-commit = "*" - -[feature.bindings-wheel.dependencies] -cuda-bindings = "==13.1.0" - -[feature.bindings-source.dependencies] -cuda-bindings = { path = ".." 
} - -[environments] -wheel = { features = ["cu13", "cu13-pinned", "bench", "cpp-bench", "dev", "bindings-wheel"] } -source = { features = ["cu13", "cu13-source", "bench", "cpp-bench", "dev", "bindings-source"] } - -[target.linux.tasks.bench] -cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py"] - -[target.linux.tasks.bench-smoke-test] -cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py", "--fast", "--min-time", "1" -] - -[target.linux.tasks.bench-legacy] -cmd = "pytest --benchmark-only --override-ini 'addopts=' $PIXI_PROJECT_ROOT/pytest-legacy/" - -[target.linux.tasks.bench-cpp-configure] -cmd = [ - "cmake", - "-S", - "$PIXI_PROJECT_ROOT/benchmarks/cpp", - "-B", - "$PIXI_PROJECT_ROOT/.build/cpp", - "-G", - "Ninja", -] - -[target.linux.tasks.bench-cpp-build] -cmd = ["cmake", "--build", "$PIXI_PROJECT_ROOT/.build/cpp"] -depends-on = [{ task = "bench-cpp-configure" }] - -[target.linux.tasks.bench-cpp] -cmd = ["python", "$PIXI_PROJECT_ROOT/run_cpp.py"] -depends-on = [{ task = "bench-cpp-build" }] - -[target.linux.tasks.bench-compare] -cmd = ["python", "$PIXI_PROJECT_ROOT/compare.py"] - -[target.linux.tasks.lint] -cmd = ["pre-commit", "run", "--all-files"] diff --git a/cuda_bindings/benchmarks/pytest-legacy/conftest.py b/cuda_bindings/benchmarks/pytest-legacy/conftest.py deleted file mode 100644 index 0ea7b1d772..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/conftest.py +++ /dev/null @@ -1,93 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import numpy as np -import pytest - -from cuda.bindings import driver as cuda -from cuda.bindings import nvrtc -from cuda.bindings import runtime as cudart - - -def ASSERT_DRV(err): - if isinstance(err, cuda.CUresult): - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError(f"Cuda Error: {err}") - elif isinstance(err, cudart.cudaError_t): - if err != cudart.cudaError_t.cudaSuccess: - raise RuntimeError(f"Cudart Error: {err}") - elif isinstance(err, nvrtc.nvrtcResult): - if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError(f"Nvrtc Error: {err}") - else: - raise RuntimeError(f"Unknown error type: {err}") - - -@pytest.fixture -def init_cuda(): - # Initialize - (err,) = cuda.cuInit(0) - ASSERT_DRV(err) - err, device = cuda.cuDeviceGet(0) - ASSERT_DRV(err) - err, ctx = cuda.cuCtxCreate(None, 0, device) - ASSERT_DRV(err) - - # create stream - err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) - ASSERT_DRV(err) - - yield device, ctx, stream - - (err,) = cuda.cuStreamDestroy(stream) - ASSERT_DRV(err) - (err,) = cuda.cuCtxDestroy(ctx) - ASSERT_DRV(err) - - -@pytest.fixture -def load_module(): - module = None - - def _load_module(kernel_string, device): - nonlocal module - # Get module - err, major = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device - ) - ASSERT_DRV(err) - err, minor = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device - ) - ASSERT_DRV(err) - - err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"kernelString.cu", 0, [], []) - ASSERT_DRV(err) - opts = [b"--fmad=false", bytes("--gpu-architecture=sm_" + str(major) + str(minor), "ascii")] - (err,) = nvrtc.nvrtcCompileProgram(prog, 2, opts) - - err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog) - ASSERT_DRV(err_log) - log = b" " * logSize - (err_log,) = 
nvrtc.nvrtcGetProgramLog(prog, log) - ASSERT_DRV(err_log) - result = log.decode() - if len(result) > 1: - print(result) - - ASSERT_DRV(err) - err, cubinSize = nvrtc.nvrtcGetCUBINSize(prog) - ASSERT_DRV(err) - cubin = b" " * cubinSize - (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin) - ASSERT_DRV(err) - cubin = np.char.array(cubin) - err, module = cuda.cuModuleLoadData(cubin) - ASSERT_DRV(err) - - return module - - yield _load_module - - (err,) = cuda.cuModuleUnload(module) - ASSERT_DRV(err) diff --git a/cuda_bindings/benchmarks/pytest-legacy/kernels.py b/cuda_bindings/benchmarks/pytest-legacy/kernels.py deleted file mode 100644 index 36646fba00..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/kernels.py +++ /dev/null @@ -1,159 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -kernel_string = """\ -#define ITEM_PARAM(x, T) T x -#define REP1(x, T) , ITEM_PARAM(x, T) -#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) -#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) -#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) -#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) -#define REP32(x, T) REP16(x##0, T) REP16(x##1, T) -#define REP64(x, T) REP32(x##0, T) REP32(x##1, T) -#define REP128(x, T) REP64(x##0, T) REP64(x##1, T) -#define REP256(x, T) REP128(x##0, T) REP128(x##1, T) - -template -struct KernelFunctionParam -{ - unsigned char p[maxBytes]; -}; - -extern "C" __global__ void small_kernel(float *f) -{ - *f = 0.0f; -} - -extern "C" __global__ void empty_kernel() -{ - return; -} - -extern "C" __global__ -void small_kernel_512_args( - ITEM_PARAM(F, int*) - REP1(A, int*) - REP2(A, int*) - REP4(A, int*) - REP8(A, int*) - REP16(A, int*) - REP32(A, int*) - REP64(A, int*) - REP128(A, int*) - REP256(A, int*)) -{ - *F = 0; -} - -extern "C" __global__ -void small_kernel_512_bools( - ITEM_PARAM(F, bool) - REP1(A, bool) - REP2(A, bool) - REP4(A, bool) - REP8(A, 
bool) - REP16(A, bool) - REP32(A, bool) - REP64(A, bool) - REP128(A, bool) - REP256(A, bool)) -{ - return; -} - -extern "C" __global__ -void small_kernel_512_ints( - ITEM_PARAM(F, int) - REP1(A, int) - REP2(A, int) - REP4(A, int) - REP8(A, int) - REP16(A, int) - REP32(A, int) - REP64(A, int) - REP128(A, int) - REP256(A, int)) -{ - return; -} - -extern "C" __global__ -void small_kernel_512_doubles( - ITEM_PARAM(F, double) - REP1(A, double) - REP2(A, double) - REP4(A, double) - REP8(A, double) - REP16(A, double) - REP32(A, double) - REP64(A, double) - REP128(A, double) - REP256(A, double)) -{ - return; -} - -extern "C" __global__ -void small_kernel_512_chars( - ITEM_PARAM(F, char) - REP1(A, char) - REP2(A, char) - REP4(A, char) - REP8(A, char) - REP16(A, char) - REP32(A, char) - REP64(A, char) - REP128(A, char) - REP256(A, char)) -{ - return; -} - -extern "C" __global__ -void small_kernel_512_longlongs( - ITEM_PARAM(F, long long) - REP1(A, long long) - REP2(A, long long) - REP4(A, long long) - REP8(A, long long) - REP16(A, long long) - REP32(A, long long) - REP64(A, long long) - REP128(A, long long) - REP256(A, long long)) -{ - return; -} - -extern "C" __global__ -void small_kernel_256_args( - ITEM_PARAM(F, int*) - REP1(A, int*) - REP2(A, int*) - REP4(A, int*) - REP8(A, int*) - REP16(A, int*) - REP32(A, int*) - REP64(A, int*) - REP128(A, int*)) -{ - *F = 0; -} - -extern "C" __global__ -void small_kernel_16_args( - ITEM_PARAM(F, int*) - REP1(A, int*) - REP2(A, int*) - REP4(A, int*) - REP8(A, int*)) -{ - *F = 0; -} - -extern "C" __global__ void small_kernel_2048B(KernelFunctionParam<2048> param) -{ - // Do not touch param to prevent compiler from copying - // the whole structure from const bank to lmem. 
-} -""" diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_cupy.py b/cuda_bindings/benchmarks/pytest-legacy/test_cupy.py deleted file mode 100644 index 76dd6e6a45..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/test_cupy.py +++ /dev/null @@ -1,199 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import ctypes - -import pytest - -try: - import cupy - - skip_tests = False -except ImportError: - skip_tests = True - -from kernels import kernel_string - - -def launch(kernel, args=()): - kernel((1,), (1,), args) - - -# Measure launch latency with no parmaeters -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_empty_kernel(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("empty_kernel") - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel) - stream.synchronize() - - -# Measure launch latency with a single parameter -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel") - cupy.cuda.set_allocator() - arg = cupy.cuda.alloc(ctypes.sizeof(ctypes.c_float)) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, (arg,)) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_args(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_args") - cupy.cuda.set_allocator() - - args = [] - for _ in 
range(512): - args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_bools(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_bools") - cupy.cuda.set_allocator() - - args = [True] * 512 - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_doubles(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_doubles") - cupy.cuda.set_allocator() - - args = [1.2345] * 512 - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_ints(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_ints") - cupy.cuda.set_allocator() - - args = [123] * 512 - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, 
reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_bytes(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_chars") - cupy.cuda.set_allocator() - - args = [127] * 512 - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_longlongs(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_longlongs") - cupy.cuda.set_allocator() - - args = [9223372036854775806] * 512 - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_256_args(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_256_args") - cupy.cuda.set_allocator() - - args = [] - for _ in range(256): - args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_16_args(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_16_args") - 
cupy.cuda.set_allocator() - - args = [] - for _ in range(16): - args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py b/cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py deleted file mode 100755 index dd994081a0..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py +++ /dev/null @@ -1,336 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import ctypes - -import pytest -from kernels import kernel_string - -from conftest import ASSERT_DRV -from cuda.bindings import driver as cuda - - -def launch(kernel, stream, args=(), arg_types=()): - cuda.cuLaunchKernel( - kernel, - 1, - 1, - 1, # grid dim - 1, - 1, - 1, # block dim - 0, - stream, # shared mem and stream - (args, arg_types), - 0, - ) # arguments - - -def launch_packed(kernel, stream, params): - cuda.cuLaunchKernel( - kernel, - 1, - 1, - 1, # grid dim - 1, - 1, - 1, # block dim - 0, - stream, # shared mem and stream - params, - 0, - ) # arguments - - -# Measure launch latency with no parmaeters -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") - ASSERT_DRV(err) - - benchmark(launch, func, stream) - - cuda.cuCtxSynchronize() - - -# Measure launch latency with a single parameter -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = 
cuda.cuModuleGetFunction(module, b"small_kernel") - ASSERT_DRV(err) - - err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) - ASSERT_DRV(err) - - benchmark(launch, func, stream, args=(f,), arg_types=(None,)) - - cuda.cuCtxSynchronize() - - (err,) = cuda.cuMemFree(f) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 512 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") - ASSERT_DRV(err) - - args = [True] * 512 - arg_types = [ctypes.c_bool] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") - ASSERT_DRV(err) - - args = [1.2345] * 512 - arg_types = [ctypes.c_double] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - 
- benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") - ASSERT_DRV(err) - - args = [123] * 512 - arg_types = [ctypes.c_int] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") - ASSERT_DRV(err) - - args = [127] * 512 - arg_types = [ctypes.c_byte] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") - ASSERT_DRV(err) - - args = [9223372036854775806] * 512 - arg_types = [ctypes.c_longlong] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = 
cuda.cuModuleGetFunction(module, b"small_kernel_256_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 256 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 16 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters, excluding parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - vals = [] - val_ps = [] - for i in range(512): - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - vals.append(p) - val_ps.append(ctypes.c_void_p(int(vals[i]))) - - packagedParams = (ctypes.c_void_p * 512)() - for i in range(512): - packagedParams[i] = ctypes.addressof(val_ps[i]) - - benchmark(launch_packed, func, stream, packagedParams) - - cuda.cuCtxSynchronize() - - for p 
in vals: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -def pack_and_launch(kernel, stream, params): - packed_params = (ctypes.c_void_p * len(params))() - ptrs = [0] * len(params) - for i in range(len(params)): - ptrs[i] = ctypes.c_void_p(int(params[i])) - packed_params[i] = ctypes.addressof(ptrs[i]) - - cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) - - -# Measure launch latency plus parameter packing using ctypes -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - vals = [] - for i in range(512): - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - vals.append(p) - - benchmark(pack_and_launch, func, stream, vals) - - cuda.cuCtxSynchronize() - - for p in vals: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with a single large struct parameter -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") - ASSERT_DRV(err) - - class struct_2048B(ctypes.Structure): - _fields_ = [("values", ctypes.c_uint8 * 2048)] - - benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) - - cuda.cuCtxSynchronize() diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_numba.py b/cuda_bindings/benchmarks/pytest-legacy/test_numba.py deleted file mode 100644 index dfe084c6b1..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/test_numba.py +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import numpy as np -import pytest - -try: - from numba import cuda - - skip_tests = False -except ImportError: - skip_tests = True - - -def launch_empty(kernel, stream): - kernel[1, 1, stream]() - - -def launch(kernel, stream, arg): - kernel[1, 1, stream](arg) - - -# Measure launch latency with no parmaeters -@pytest.mark.skipif(skip_tests, reason="Numba is not installed") -@pytest.mark.benchmark(group="numba", min_rounds=1000) -def test_launch_latency_empty_kernel(benchmark): - stream = cuda.stream() - - @cuda.jit - def empty_kernel(): - return - - benchmark(launch_empty, empty_kernel, stream) - - cuda.synchronize() - - -# Measure launch latency with a single parameter -@pytest.mark.skipif(skip_tests, reason="Numba is not installed") -@pytest.mark.benchmark(group="numba", min_rounds=1000) -def test_launch_latency_small_kernel(benchmark): - stream = cuda.stream() - - arg = cuda.device_array(1, dtype=np.float32, stream=stream) - - @cuda.jit - def small_kernel(array): - array[0] = 0.0 - - benchmark(launch, small_kernel, stream, arg) - - cuda.synchronize() diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py b/cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py deleted file mode 100644 index fae72ffd79..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py +++ /dev/null @@ -1,112 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import random - -import pytest - -from conftest import ASSERT_DRV -from cuda.bindings import driver as cuda - -random.seed(0) - -idx = 0 - - -def query_attribute(attribute, ptrs): - global idx - ptr = ptrs[idx] - idx = (idx + 1) % len(ptrs) - - cuda.cuPointerGetAttribute(attribute, ptr) - - -def query_attributes(attributes, ptrs): - global idx - ptr = ptrs[idx] - idx = (idx + 1) % len(ptrs) - - cuda.cuPointerGetAttributes(len(attributes), attributes, ptr) - - -@pytest.mark.benchmark(group="pointer-attributes") -# Measure cuPointerGetAttribute in the same way as C benchmarks -def test_pointer_get_attribute(benchmark, init_cuda): - _ = init_cuda - - ptrs = [] - for _ in range(500): - err, ptr = cuda.cuMemAlloc(1 << 18) - ASSERT_DRV(err) - ptrs.append(ptr) - - random.shuffle(ptrs) - - benchmark(query_attribute, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptrs) - - for p in ptrs: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -@pytest.mark.benchmark(group="pointer-attributes") -# Measure cuPointerGetAttributes with all attributes -def test_pointer_get_attributes_all(benchmark, init_cuda): - _ = init_cuda - - ptrs = [] - for _ in range(500): - err, ptr = cuda.cuMemAlloc(1 << 18) - ASSERT_DRV(err) - ptrs.append(ptr) - - random.shuffle(ptrs) - - attributes = [ - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, - 
cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE, - ] - - benchmark(query_attributes, attributes, ptrs) - - for p in ptrs: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -@pytest.mark.benchmark(group="pointer-attributes") -# Measure cuPointerGetAttributes with a single attribute -def test_pointer_get_attributes_single(benchmark, init_cuda): - _ = init_cuda - - ptrs = [] - for _ in range(500): - err, ptr = cuda.cuMemAlloc(1 << 18) - ASSERT_DRV(err) - ptrs.append(ptr) - - random.shuffle(ptrs) - - attributes = [ - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - ] - - benchmark(query_attributes, attributes, ptrs) - - for p in ptrs: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) diff --git a/cuda_bindings/benchmarks/run_cpp.py b/cuda_bindings/benchmarks/run_cpp.py deleted file mode 100644 index 96e50cb890..0000000000 --- a/cuda_bindings/benchmarks/run_cpp.py +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -from runner.cpp import main - -if __name__ == "__main__": - main() diff --git a/cuda_bindings/benchmarks/run_pyperf.py b/cuda_bindings/benchmarks/run_pyperf.py deleted file mode 100644 index f45af8c69a..0000000000 --- a/cuda_bindings/benchmarks/run_pyperf.py +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# SPDX-License-Identifier: Apache-2.0 - -from runner.main import main - -if __name__ == "__main__": - main() diff --git a/cuda_bindings/benchmarks/runner/__init__.py b/cuda_bindings/benchmarks/runner/__init__.py deleted file mode 100644 index 27422b3cb7..0000000000 --- a/cuda_bindings/benchmarks/runner/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_bindings/benchmarks/runner/cpp.py b/cuda_bindings/benchmarks/runner/cpp.py deleted file mode 100644 index f8c3490381..0000000000 --- a/cuda_bindings/benchmarks/runner/cpp.py +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import json -import subprocess -import sys -import tempfile -from pathlib import Path - -PROJECT_ROOT = Path(__file__).resolve().parent.parent -BUILD_DIR = PROJECT_ROOT / ".build" / "cpp" -DEFAULT_OUTPUT = PROJECT_ROOT / "results-cpp.json" - -BINARY_PREFIX = "bench_" -BINARY_SUFFIX = "_cpp" - - -def discover_binaries() -> dict[str, Path]: - """Discover C++ benchmark binaries in the build directory""" - if not BUILD_DIR.is_dir(): - return {} - - registry: dict[str, Path] = {} - for path in sorted(BUILD_DIR.iterdir()): - if not path.is_file() or not path.name.startswith(BINARY_PREFIX): - continue - if not path.name.endswith(BINARY_SUFFIX): - continue - name = path.name.removeprefix(BINARY_PREFIX).removesuffix(BINARY_SUFFIX) - registry[name] = path - return registry - - -def strip_output_args(argv: list[str]) -> list[str]: - cleaned: list[str] = [] - skip_next = False - for arg in argv: - if skip_next: - skip_next = False - continue - if arg in ("-o", "--output"): - skip_next = True - continue - if arg.startswith(("-o=", "--output=")): - continue - cleaned.append(arg) - return cleaned - - -def 
merge_pyperf_json(individual_files: list[Path], output_path: Path) -> int: - """Merge individual pyperf JSON files into a single BenchmarkSuite file. - - Each C++ binary produces a file with structure: - {"version": "1.0", "metadata": {...}, "benchmarks": [{...}]} - - We merge them by collecting all benchmark entries into one file. - """ - all_benchmarks = [] - - for path in individual_files: - with open(path) as f: - data = json.load(f) - - file_metadata = data.get("metadata", {}) - bench_name = file_metadata.get("name", "") - loops = file_metadata.get("loops") - unit = file_metadata.get("unit", "second") - - for bench in data.get("benchmarks", []): - for run in bench.get("runs", []): - run_meta = run.setdefault("metadata", {}) - if bench_name: - run_meta.setdefault("name", bench_name) - if loops is not None: - run_meta.setdefault("loops", loops) - run_meta.setdefault("unit", unit) - - all_benchmarks.append(bench) - - merged = { - "version": "1.0", - "benchmarks": all_benchmarks, - } - - with open(output_path, "w") as f: - json.dump(merged, f) - - return len(all_benchmarks) - - -def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: - parser = argparse.ArgumentParser( - description="Run C++ CUDA benchmarks", - add_help=False, - ) - parser.add_argument( - "--benchmark", - action="append", - default=[], - help="Benchmark name to run (e.g. 'ctx_device'). Repeat for multiple. 
Defaults to all.", - ) - parser.add_argument( - "--list", - action="store_true", - help="Print discovered benchmark names and exit.", - ) - parser.add_argument( - "-o", - "--output", - type=Path, - default=DEFAULT_OUTPUT, - help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})", - ) - parsed, remaining = parser.parse_known_args(argv) - return parsed, remaining - - -def main() -> None: - parsed, remaining_argv = parse_args(sys.argv[1:]) - - registry = discover_binaries() - if not registry: - print( - f"No C++ benchmark binaries found in {BUILD_DIR}.\nRun 'pixi run bench-cpp-build' first.", - file=sys.stderr, - ) - sys.exit(1) - - if parsed.list: - for name in sorted(registry): - print(name) - return - - if parsed.benchmark: - missing = sorted(set(parsed.benchmark) - set(registry)) - if missing: - known = ", ".join(sorted(registry)) - unknown = ", ".join(missing) - print( - f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}", - file=sys.stderr, - ) - sys.exit(1) - names = parsed.benchmark - else: - names = sorted(registry) - - # Strip any --output args to avoid conflicts with our output handling - passthrough_argv = strip_output_args(remaining_argv) - - output_path = parsed.output.resolve() - failed = False - individual_files: list[Path] = [] - - with tempfile.TemporaryDirectory(prefix="cuda_bench_cpp_") as tmpdir: - tmpdir_path = Path(tmpdir) - - for name in names: - binary = registry[name] - tmp_json = tmpdir_path / f"{name}.json" - cmd = [str(binary), "-o", str(tmp_json), *passthrough_argv] - result = subprocess.run(cmd, check=False) # noqa: S603 - if result.returncode != 0: - print(f"FAILED: {name} (exit code {result.returncode})", file=sys.stderr) - failed = True - elif tmp_json.exists(): - individual_files.append(tmp_json) - - if individual_files: - count = merge_pyperf_json(individual_files, output_path) - print(f"\nResults saved to {output_path} ({count} benchmark(s))") - - if failed: - sys.exit(1) - - -if __name__ == "__main__": - main() 
diff --git a/cuda_bindings/benchmarks/runner/main.py b/cuda_bindings/benchmarks/runner/main.py deleted file mode 100644 index 4089aa5559..0000000000 --- a/cuda_bindings/benchmarks/runner/main.py +++ /dev/null @@ -1,217 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import ast -import importlib.util -import os -import sys -from collections.abc import Callable -from pathlib import Path -from types import ModuleType - -import pyperf - -PROJECT_ROOT = Path(__file__).resolve().parent.parent -BENCH_DIR = PROJECT_ROOT / "benchmarks" -DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json" -PYPERF_INHERITED_ENV_VARS = ( - "CUDA_HOME", - "CUDA_PATH", - "CUDA_VISIBLE_DEVICES", - "LD_LIBRARY_PATH", - "NVIDIA_VISIBLE_DEVICES", -) -_MODULE_CACHE: dict[Path, ModuleType] = {} - - -def load_module(module_path: Path) -> ModuleType: - module_path = module_path.resolve() - cached_module = _MODULE_CACHE.get(module_path) - if cached_module is not None: - return cached_module - - module_name = f"cuda_bindings_bench_{module_path.stem}" - spec = importlib.util.spec_from_file_location(module_name, module_path) - if spec is None or spec.loader is None: - raise RuntimeError(f"Failed to load benchmark module: {module_path}") - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - _MODULE_CACHE[module_path] = module - return module - - -def benchmark_id(module_name: str, function_name: str) -> str: - module_suffix = module_name.removeprefix("bench_") - suffix = function_name.removeprefix("bench_") - return f"{module_suffix}.{suffix}" - - -def _discover_module_functions(module_path: Path) -> list[str]: - tree = ast.parse(module_path.read_text(encoding="utf-8"), filename=str(module_path)) - return [ - node.name - for node in tree.body - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("bench_") - ] - - -def 
_lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]: - loaded_function: Callable[[int], float] | None = None - - def run(loops: int) -> float: - nonlocal loaded_function - if loaded_function is None: - module = load_module(module_path) - loaded_function = getattr(module, function_name) - return loaded_function(loops) - - run.__name__ = function_name - return run - - -def discover_benchmarks() -> dict[str, Callable[[int], float]]: - """Discover bench_ functions. - - Each bench_ function must have the signature: bench_*(loops: int) -> float - where it calls the operation `loops` times and returns the total elapsed - time in seconds (using time.perf_counter). - """ - registry: dict[str, Callable[[int], float]] = {} - for module_path in sorted(BENCH_DIR.glob("bench_*.py")): - module_name = module_path.stem - for function_name in _discover_module_functions(module_path): - bench_id = benchmark_id(module_name, function_name) - if bench_id in registry: - raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}") - registry[bench_id] = _lazy_benchmark(module_path, function_name) - return registry - - -def strip_pyperf_output_args(argv: list[str]) -> list[str]: - cleaned: list[str] = [] - skip_next = False - for arg in argv: - if skip_next: - skip_next = False - continue - if arg in ("-o", "--output", "--append"): - skip_next = True - continue - if arg.startswith(("-o=", "--output=", "--append=")): - continue - cleaned.append(arg) - return cleaned - - -def _split_env_vars(arg_value: str) -> list[str]: - return [env_var for env_var in arg_value.split(",") if env_var] - - -def ensure_pyperf_worker_env(argv: list[str]) -> list[str]: - if "--copy-env" in argv: - return list(argv) - - inherited_env: list[str] = [] - cleaned: list[str] = [] - skip_next = False - for arg in argv: - if skip_next: - inherited_env.extend(_split_env_vars(arg)) - skip_next = False - continue - if arg == "--inherit-environ": - skip_next = True - continue - if 
arg.startswith("--inherit-environ="): - inherited_env.extend(_split_env_vars(arg.partition("=")[2])) - continue - cleaned.append(arg) - - if skip_next: - raise ValueError("Missing value for --inherit-environ") - - for env_var in PYPERF_INHERITED_ENV_VARS: - if env_var in os.environ: - inherited_env.append(env_var) - - deduped_env: list[str] = [] - for env_var in inherited_env: - if env_var not in deduped_env: - deduped_env.append(env_var) - - if deduped_env: - cleaned.extend(["--inherit-environ", ",".join(deduped_env)]) - - return cleaned - - -def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: - parser = argparse.ArgumentParser(add_help=False) - parser.add_argument( - "--benchmark", - action="append", - default=[], - help="Benchmark ID to run. Repeat to run multiple IDs. Defaults to all.", - ) - parser.add_argument( - "--list", - action="store_true", - help="Print discovered benchmark IDs and exit.", - ) - parser.add_argument( - "-o", - "--output", - type=Path, - default=DEFAULT_OUTPUT, - help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})", - ) - parsed, remaining = parser.parse_known_args(argv) - return parsed, remaining - - -def main() -> None: - parsed, remaining_argv = parse_args(sys.argv[1:]) - - registry = discover_benchmarks() - if not registry: - raise RuntimeError(f"No benchmark functions found in {BENCH_DIR}") - - if parsed.list: - for bench_id in sorted(registry): - print(bench_id) - return - - if parsed.benchmark: - missing = sorted(set(parsed.benchmark) - set(registry)) - if missing: - known = ", ".join(sorted(registry)) - unknown = ", ".join(missing) - raise ValueError(f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}") - benchmark_ids = parsed.benchmark - else: - benchmark_ids = sorted(registry) - - # Strip any --output args to avoid conflicts with our output handling. 
- output_path = parsed.output.resolve() - remaining_argv = strip_pyperf_output_args(remaining_argv) - remaining_argv = ensure_pyperf_worker_env(remaining_argv) - is_worker = "--worker" in remaining_argv - - # Delete the file so this run starts fresh. - if not is_worker: - output_path.unlink(missing_ok=True) - - sys.argv = [sys.argv[0], "--append", str(output_path), *remaining_argv] - - runner = pyperf.Runner() - for bench_id in benchmark_ids: - runner.bench_time_func(bench_id, registry[bench_id]) - - if not is_worker: - print(f"\nResults saved to {output_path}") - - -if __name__ == "__main__": - main() diff --git a/cuda_bindings/benchmarks/runner/runtime.py b/cuda_bindings/benchmarks/runner/runtime.py deleted file mode 100644 index c985adb2e2..0000000000 --- a/cuda_bindings/benchmarks/runner/runtime.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import atexit - -from cuda.bindings import driver as cuda -from cuda.bindings import nvrtc - -_ctx = None -_device = None -_persistent_ptrs: list[int] = [] -_modules: list = [] - - -def assert_drv(err) -> None: - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError(f"Cuda Error: {err}") - - -def ensure_context() -> int: - global _ctx, _device - if _ctx is not None: - return _ctx - - (err,) = cuda.cuInit(0) - assert_drv(err) - - err, device = cuda.cuDeviceGet(0) - assert_drv(err) - _device = device - - err, ctx = cuda.cuCtxCreate(None, 0, device) - assert_drv(err) - _ctx = ctx - return ctx - - -def alloc_persistent(size: int) -> int: - ensure_context() - err, ptr = cuda.cuMemAlloc(size) - assert_drv(err) - _persistent_ptrs.append(ptr) - return ptr - - -def compile_and_load(kernel_source: str) -> int: - """Compile CUDA C source and returns the CUmodule handle""" - ensure_context() - - err, major = cuda.cuDeviceGetAttribute( - 
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, _device - ) - assert_drv(err) - err, minor = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, _device - ) - assert_drv(err) - - err, prog = nvrtc.nvrtcCreateProgram(kernel_source.encode(), b"benchmark_kernel.cu", 0, [], []) - assert_drv(err) - - arch_flag = f"--gpu-architecture=sm_{major}{minor}".encode() - (err,) = nvrtc.nvrtcCompileProgram(prog, 2, [b"--fmad=false", arch_flag]) - - # check for compile errors - err_log, log_size = nvrtc.nvrtcGetProgramLogSize(prog) - assert_drv(err_log) - log = b" " * log_size - (err_log,) = nvrtc.nvrtcGetProgramLog(prog, log) - assert_drv(err_log) - assert_drv(err) - - err, cubin_size = nvrtc.nvrtcGetCUBINSize(prog) - assert_drv(err) - cubin = b" " * cubin_size - (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin) - assert_drv(err) - - err, module = cuda.cuModuleLoadData(cubin) - assert_drv(err) - _modules.append(module) - return module - - -def cleanup() -> None: - global _ctx - for ptr in reversed(_persistent_ptrs): - (err,) = cuda.cuMemFree(ptr) - assert_drv(err) - _persistent_ptrs.clear() - - for module in reversed(_modules): - (err,) = cuda.cuModuleUnload(module) - assert_drv(err) - _modules.clear() - - if _ctx is None: - return - (err,) = cuda.cuCtxDestroy(_ctx) - assert_drv(err) - _ctx = None - - -atexit.register(cleanup) diff --git a/cuda_bindings/benchmarks/tests/test_runner.py b/cuda_bindings/benchmarks/tests/test_runner.py deleted file mode 100644 index 612094dac9..0000000000 --- a/cuda_bindings/benchmarks/tests/test_runner.py +++ /dev/null @@ -1,166 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import importlib.util -import itertools -import sys -import types -from pathlib import Path - -REPO_ROOT = Path(__file__).resolve().parents[3] -RUNNER_MAIN_PATH = REPO_ROOT / "cuda_bindings/benchmarks/runner/main.py" -BENCH_LAUNCH_PATH = REPO_ROOT / "cuda_bindings/benchmarks/benchmarks/bench_launch.py" - - -def load_module_from_path(module_name: str, module_path: Path): - spec = importlib.util.spec_from_file_location(module_name, module_path) - if spec is None or spec.loader is None: - raise RuntimeError(f"Failed to load test module: {module_path}") - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def load_runner_main(monkeypatch): - pyperf_module = types.ModuleType("pyperf") - - class FakeRunner: - def bench_time_func(self, *_args, **_kwargs) -> None: - raise AssertionError("FakeRunner should not be used in these tests") - - pyperf_module.Runner = FakeRunner - monkeypatch.setitem(sys.modules, "pyperf", pyperf_module) - return load_module_from_path("test_cuda_bindings_bench_runner_main", RUNNER_MAIN_PATH) - - -def load_bench_launch(monkeypatch, calls: list[tuple]): - pointer_values = itertools.count(1000) - - runtime_module = types.ModuleType("runner.runtime") - - def alloc_persistent(size: int) -> int: - calls.append(("alloc_persistent", size)) - return next(pointer_values) - - def assert_drv(err) -> None: - calls.append(("assert_drv", err)) - assert err == 0 - - def compile_and_load(source: str) -> str: - calls.append(("compile_and_load", source)) - return "module" - - runtime_module.alloc_persistent = alloc_persistent - runtime_module.assert_drv = assert_drv - runtime_module.compile_and_load = compile_and_load - - runner_module = types.ModuleType("runner") - runner_module.runtime = runtime_module - - driver_module = types.ModuleType("cuda.bindings.driver") - - class FakeCUresult: - CUDA_SUCCESS = 0 - - class FakeCUstreamFlags: - 
CU_STREAM_NON_BLOCKING = types.SimpleNamespace(value=1) - - def cuModuleGetFunction(module, name): - calls.append(("cuModuleGetFunction", module, name)) - return 0, name - - def cuStreamCreate(flags): - calls.append(("cuStreamCreate", flags)) - return 0, "stream" - - def cuLaunchKernel(*args): - calls.append(("cuLaunchKernel", args)) - return 0 - - driver_module.CUresult = FakeCUresult - driver_module.CUstream_flags = FakeCUstreamFlags - driver_module.cuModuleGetFunction = cuModuleGetFunction - driver_module.cuStreamCreate = cuStreamCreate - driver_module.cuLaunchKernel = cuLaunchKernel - - cuda_module = types.ModuleType("cuda") - bindings_module = types.ModuleType("cuda.bindings") - bindings_module.driver = driver_module - cuda_module.bindings = bindings_module - - monkeypatch.setitem(sys.modules, "runner", runner_module) - monkeypatch.setitem(sys.modules, "runner.runtime", runtime_module) - monkeypatch.setitem(sys.modules, "cuda", cuda_module) - monkeypatch.setitem(sys.modules, "cuda.bindings", bindings_module) - monkeypatch.setitem(sys.modules, "cuda.bindings.driver", driver_module) - - return load_module_from_path("test_cuda_bindings_bench_launch", BENCH_LAUNCH_PATH) - - -def test_discover_benchmarks_is_lazy(monkeypatch, tmp_path): - runner_main = load_runner_main(monkeypatch) - - marker_path = tmp_path / "imported.txt" - bench_path = tmp_path / "bench_lazy.py" - bench_path.write_text( - "\n".join( - ( - "from pathlib import Path", - f"Path({str(marker_path)!r}).write_text('imported')", - "", - "def helper() -> float:", - " return 0.0", - "", - "def bench_visible(loops: int) -> float:", - " return loops + 0.5", - "", - ) - ), - encoding="utf-8", - ) - - monkeypatch.setattr(runner_main, "BENCH_DIR", tmp_path) - runner_main._MODULE_CACHE.clear() - - registry = runner_main.discover_benchmarks() - - assert sorted(registry) == ["lazy.visible"] - assert not marker_path.exists() - assert registry["lazy.visible"](3) == 3.5 - assert 
marker_path.read_text(encoding="utf-8") == "imported" - - -def test_ensure_pyperf_worker_env_preserves_existing_args(monkeypatch): - runner_main = load_runner_main(monkeypatch) - - for env_var in runner_main.PYPERF_INHERITED_ENV_VARS: - monkeypatch.delenv(env_var, raising=False) - monkeypatch.setenv("CUDA_PATH", "/opt/cuda") - monkeypatch.setenv("LD_LIBRARY_PATH", "/opt/cuda/lib64") - - argv = runner_main.ensure_pyperf_worker_env(["--fast", "--inherit-environ=FOO,BAR"]) - - assert argv == ["--fast", "--inherit-environ", "FOO,BAR,CUDA_PATH,LD_LIBRARY_PATH"] - - -def test_bench_launch_initializes_on_first_use(monkeypatch): - calls: list[tuple] = [] - bench_launch = load_bench_launch(monkeypatch, calls) - - assert calls == [] - - bench_launch.bench_launch_empty_kernel(1) - compile_calls = [call for call in calls if call[0] == "compile_and_load"] - launch_calls = [call for call in calls if call[0] == "cuLaunchKernel"] - - assert len(compile_calls) == 1 - assert len(launch_calls) == 1 - - bench_launch.bench_launch_16_args_pre_packed(1) - compile_calls = [call for call in calls if call[0] == "compile_and_load"] - launch_calls = [call for call in calls if call[0] == "cuLaunchKernel"] - - assert len(compile_calls) == 1 - assert len(launch_calls) == 2 From 5ecba207b527a72fb8465e3f4843ad4d3ba0b54e Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 16:04:39 -0500 Subject: [PATCH 05/10] lint --- .../benchmarks/bench_ctx_device.py | 20 ++++++------- .../cuda_bindings/benchmarks/bench_event.py | 20 ++++++------- .../cuda_bindings/benchmarks/bench_launch.py | 16 +++++----- .../cuda_bindings/benchmarks/bench_memory.py | 30 +++++++++---------- .../benchmarks/bench_pointer_attributes.py | 4 +-- .../cuda_bindings/benchmarks/bench_stream.py | 16 +++++----- 6 files changed, 52 insertions(+), 54 deletions(-) diff --git a/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py index 1c82cd4046..2e2cd11d93 
100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py @@ -15,48 +15,48 @@ def bench_ctx_get_current(loops: int) -> float: - _cuCtxGetCurrent = cuda.cuCtxGetCurrent + _fn = cuda.cuCtxGetCurrent t0 = time.perf_counter() for _ in range(loops): - _cuCtxGetCurrent() + _fn() return time.perf_counter() - t0 def bench_ctx_set_current(loops: int) -> float: - _cuCtxSetCurrent = cuda.cuCtxSetCurrent + _fn = cuda.cuCtxSetCurrent _ctx = CTX t0 = time.perf_counter() for _ in range(loops): - _cuCtxSetCurrent(_ctx) + _fn(_ctx) return time.perf_counter() - t0 def bench_ctx_get_device(loops: int) -> float: - _cuCtxGetDevice = cuda.cuCtxGetDevice + _fn = cuda.cuCtxGetDevice t0 = time.perf_counter() for _ in range(loops): - _cuCtxGetDevice() + _fn() return time.perf_counter() - t0 def bench_device_get(loops: int) -> float: - _cuDeviceGet = cuda.cuDeviceGet + _fn = cuda.cuDeviceGet t0 = time.perf_counter() for _ in range(loops): - _cuDeviceGet(0) + _fn(0) return time.perf_counter() - t0 def bench_device_get_attribute(loops: int) -> float: - _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute + _fn = cuda.cuDeviceGetAttribute _attr = ATTRIBUTE _dev = DEVICE t0 = time.perf_counter() for _ in range(loops): - _cuDeviceGetAttribute(_attr, _dev) + _fn(_attr, _dev) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_event.py b/benchmarks/cuda_bindings/benchmarks/bench_event.py index e8e319115d..041adc2553 100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_event.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_event.py @@ -20,43 +20,43 @@ def bench_event_create_destroy(loops: int) -> float: - _cuEventCreate = cuda.cuEventCreate - _cuEventDestroy = cuda.cuEventDestroy + _create = cuda.cuEventCreate + _destroy = cuda.cuEventDestroy _flags = EVENT_FLAGS t0 = time.perf_counter() for _ in range(loops): - _, e = _cuEventCreate(_flags) - _cuEventDestroy(e) + _, e = _create(_flags) + 
_destroy(e) return time.perf_counter() - t0 def bench_event_record(loops: int) -> float: - _cuEventRecord = cuda.cuEventRecord + _fn = cuda.cuEventRecord _event = EVENT _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuEventRecord(_event, _stream) + _fn(_event, _stream) return time.perf_counter() - t0 def bench_event_query(loops: int) -> float: - _cuEventQuery = cuda.cuEventQuery + _fn = cuda.cuEventQuery _event = EVENT t0 = time.perf_counter() for _ in range(loops): - _cuEventQuery(_event) + _fn(_event) return time.perf_counter() - t0 def bench_event_synchronize(loops: int) -> float: - _cuEventSynchronize = cuda.cuEventSynchronize + _fn = cuda.cuEventSynchronize _event = EVENT t0 = time.perf_counter() for _ in range(loops): - _cuEventSynchronize(_event) + _fn(_event) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_launch.py b/benchmarks/cuda_bindings/benchmarks/bench_launch.py index 931194fbd3..abf3f946cc 100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_launch.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_launch.py @@ -82,19 +82,19 @@ def _ensure_launch_state() -> None: def bench_launch_empty_kernel(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = EMPTY_KERNEL _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) return time.perf_counter() - t0 def bench_launch_small_kernel(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = SMALL_KERNEL _stream = STREAM _args = (FLOAT_PTR,) @@ -102,13 +102,13 @@ def bench_launch_small_kernel(loops: int) -> float: t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, 
_arg_types), 0) return time.perf_counter() - t0 def bench_launch_16_args(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = KERNEL_16_ARGS _stream = STREAM _args = INT_PTRS @@ -116,18 +116,18 @@ def bench_launch_16_args(loops: int) -> float: t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) return time.perf_counter() - t0 def bench_launch_16_args_pre_packed(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = KERNEL_16_ARGS _stream = STREAM _packed = PACKED_16 t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_memory.py b/benchmarks/cuda_bindings/benchmarks/bench_memory.py index faa4795580..875c060406 100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_memory.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_memory.py @@ -2,11 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -import ctypes import time import numpy as np - from runner.runtime import alloc_persistent, ensure_context from cuda.bindings import driver as cuda @@ -30,61 +28,61 @@ def bench_mem_alloc_free(loops: int) -> float: - _cuMemAlloc = cuda.cuMemAlloc - _cuMemFree = cuda.cuMemFree + _alloc = cuda.cuMemAlloc + _free = cuda.cuMemFree _size = ALLOC_SIZE t0 = time.perf_counter() for _ in range(loops): - _, ptr = _cuMemAlloc(_size) - _cuMemFree(ptr) + _, ptr = _alloc(_size) + _free(ptr) return time.perf_counter() - t0 def bench_mem_alloc_async_free_async(loops: int) -> float: - _cuMemAllocAsync = cuda.cuMemAllocAsync - _cuMemFreeAsync = cuda.cuMemFreeAsync + _alloc = cuda.cuMemAllocAsync + _free = 
cuda.cuMemFreeAsync _size = ALLOC_SIZE _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _, ptr = _cuMemAllocAsync(_size, _stream) - _cuMemFreeAsync(ptr, _stream) + _, ptr = _alloc(_size, _stream) + _free(ptr, _stream) return time.perf_counter() - t0 def bench_memcpy_htod(loops: int) -> float: - _cuMemcpyHtoD = cuda.cuMemcpyHtoD + _fn = cuda.cuMemcpyHtoD _dst = DST_DPTR _src = HOST_SRC _size = COPY_SIZE t0 = time.perf_counter() for _ in range(loops): - _cuMemcpyHtoD(_dst, _src, _size) + _fn(_dst, _src, _size) return time.perf_counter() - t0 def bench_memcpy_dtoh(loops: int) -> float: - _cuMemcpyDtoH = cuda.cuMemcpyDtoH + _fn = cuda.cuMemcpyDtoH _dst = HOST_DST _src = SRC_DPTR _size = COPY_SIZE t0 = time.perf_counter() for _ in range(loops): - _cuMemcpyDtoH(_dst, _src, _size) + _fn(_dst, _src, _size) return time.perf_counter() - t0 def bench_memcpy_dtod(loops: int) -> float: - _cuMemcpyDtoD = cuda.cuMemcpyDtoD + _fn = cuda.cuMemcpyDtoD _dst = DST_DPTR _src = SRC_DPTR _size = COPY_SIZE t0 = time.perf_counter() for _ in range(loops): - _cuMemcpyDtoD(_dst, _src, _size) + _fn(_dst, _src, _size) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py index a02b82c399..191da263ee 100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py @@ -15,11 +15,11 @@ def bench_pointer_get_attribute(loops: int) -> float: # Local references to avoid global lookups in the hot loop - _cuPointerGetAttribute = cuda.cuPointerGetAttribute + _fn = cuda.cuPointerGetAttribute _attr = ATTRIBUTE _ptr = PTR t0 = time.perf_counter() for _ in range(loops): - _cuPointerGetAttribute(_attr, _ptr) + _fn(_attr, _ptr) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_stream.py b/benchmarks/cuda_bindings/benchmarks/bench_stream.py index 
d816099ed5..3aab9288fc 100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_stream.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_stream.py @@ -14,32 +14,32 @@ def bench_stream_create_destroy(loops: int) -> float: - _cuStreamCreate = cuda.cuStreamCreate - _cuStreamDestroy = cuda.cuStreamDestroy + _create = cuda.cuStreamCreate + _destroy = cuda.cuStreamDestroy _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value t0 = time.perf_counter() for _ in range(loops): - _, s = _cuStreamCreate(_flags) - _cuStreamDestroy(s) + _, s = _create(_flags) + _destroy(s) return time.perf_counter() - t0 def bench_stream_query(loops: int) -> float: - _cuStreamQuery = cuda.cuStreamQuery + _fn = cuda.cuStreamQuery _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuStreamQuery(_stream) + _fn(_stream) return time.perf_counter() - t0 def bench_stream_synchronize(loops: int) -> float: - _cuStreamSynchronize = cuda.cuStreamSynchronize + _fn = cuda.cuStreamSynchronize _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuStreamSynchronize(_stream) + _fn(_stream) return time.perf_counter() - t0 From e4682631fe63b84cdd27885b916c3cdda8486137 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 16:08:00 -0500 Subject: [PATCH 06/10] lint --- benchmarks/cuda_bindings/runner/main.py | 2 +- ruff.toml | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/benchmarks/cuda_bindings/runner/main.py b/benchmarks/cuda_bindings/runner/main.py index 4089aa5559..b0f6e76f41 100644 --- a/benchmarks/cuda_bindings/runner/main.py +++ b/benchmarks/cuda_bindings/runner/main.py @@ -53,7 +53,7 @@ def _discover_module_functions(module_path: Path) -> list[str]: return [ node.name for node in tree.body - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("bench_") + if isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef) and node.name.startswith("bench_") ] diff --git a/ruff.toml b/ruff.toml index 
704e422c19..b0df8060b6 100644 --- a/ruff.toml +++ b/ruff.toml @@ -87,7 +87,6 @@ inline-quotes = "double" "ARG001", # unused function argument (fixtures) "ARG002", # unused method argument "RUF012", # mutable class default (ctypes _fields_ is standard) - "RUF059", # unused unpacked variable (side-effect assignments) "F841", # unused local variable (side-effect assignments) "E402", # module-level import not at top of file "E702", # multiple statements on one line (compact test tables) @@ -111,17 +110,21 @@ inline-quotes = "double" "**/examples/**" = [ "T201", # print "E402", # module-level import not at top of file - "RUF059", # unused unpacked variable ] "**/benchmarks/**" = [ "T201", # print "RUF012", # mutable class default (ctypes _fields_ is standard) - "RUF059", # unused unpacked variable "F841", # unused local variable "E402", # module-level import not at top of file ] +"**/pytest-legacy/**" = [ + "N801", # legacy CUDA naming conventions + "N802", + "N806", +] + # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions) # Keep examples opted-in to enforce naming conventions in example-local identifiers. 
"cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [ From ddd32480b4d3a6801e72981ce635c54935c8ea68 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 16:10:33 -0500 Subject: [PATCH 07/10] lint --- ruff.toml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ruff.toml b/ruff.toml index b0df8060b6..704e422c19 100644 --- a/ruff.toml +++ b/ruff.toml @@ -87,6 +87,7 @@ inline-quotes = "double" "ARG001", # unused function argument (fixtures) "ARG002", # unused method argument "RUF012", # mutable class default (ctypes _fields_ is standard) + "RUF059", # unused unpacked variable (side-effect assignments) "F841", # unused local variable (side-effect assignments) "E402", # module-level import not at top of file "E702", # multiple statements on one line (compact test tables) @@ -110,21 +111,17 @@ inline-quotes = "double" "**/examples/**" = [ "T201", # print "E402", # module-level import not at top of file + "RUF059", # unused unpacked variable ] "**/benchmarks/**" = [ "T201", # print "RUF012", # mutable class default (ctypes _fields_ is standard) + "RUF059", # unused unpacked variable "F841", # unused local variable "E402", # module-level import not at top of file ] -"**/pytest-legacy/**" = [ - "N801", # legacy CUDA naming conventions - "N802", - "N806", -] - # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions) # Keep examples opted-in to enforce naming conventions in example-local identifiers. "cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [ From f2c083818bdfad9a4c97c211092ab91239c8895a Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 17 Apr 2026 15:57:46 -0700 Subject: [PATCH 08/10] Recover missing SPDX follow-ups from PR 1913 GitHub merged PR 1913 before the later local commits were pushed, so replay the recovered SPDX policy follow-ups and related license fixes here. 
Context: https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4271701561 Made-with: Cursor --- .coveragerc | 2 +- .spdx-ignore | 3 - .../cuda_python_test_helpers/nvvm_bitcode.py | 2 +- toolshed/build_static_bitcode_input.py | 2 +- toolshed/check_spdx.py | 74 ++++++++++++++----- toolshed/dump_cutile_b64.py | 2 +- 6 files changed, 61 insertions(+), 24 deletions(-) diff --git a/.coveragerc b/.coveragerc index 36f0f7879a..1e1776fd56 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 [paths] source = diff --git a/.spdx-ignore b/.spdx-ignore index 8c1d155c47..7263b5414f 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -8,9 +8,6 @@ LICENSE requirements*.txt cuda_bindings/examples/* -# Will be moved in (see https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4252968149) -cuda_bindings/benchmarks/* - # Vendored cuda_core/cuda/core/_include/dlpack.h diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py index ddb6eae107..e6366ac95d 100644 --- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py +++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import binascii diff --git a/toolshed/build_static_bitcode_input.py b/toolshed/build_static_bitcode_input.py index 273ce33244..e2400100dd 100755 --- a/toolshed/build_static_bitcode_input.py +++ b/toolshed/build_static_bitcode_input.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 """ Helper to produce static bitcode input for test_nvvm.py. diff --git a/toolshed/check_spdx.py b/toolshed/check_spdx.py index 6be42282bf..d769ded66c 100644 --- a/toolshed/check_spdx.py +++ b/toolshed/check_spdx.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import datetime +import fnmatch import os import re import subprocess @@ -17,12 +18,27 @@ LICENSE_IDENTIFIER_REGEX = re.compile(re.escape(SPDX_LICENSE_IDENTIFIER_PREFIX) + rb"(?P[^\r\n]+)") -EXPECTED_LICENSE_IDENTIFIERS = ( - ("cuda_bindings/", "LicenseRef-NVIDIA-SOFTWARE-LICENSE"), - ("cuda_core/", "Apache-2.0"), - ("cuda_pathfinder/", "Apache-2.0"), - ("cuda_python/", "LicenseRef-NVIDIA-SOFTWARE-LICENSE"), -) +TOP_LEVEL_FILE_LICENSE_IDENTIFIER = "Apache-2.0" + +# Every top-level directory needs to have an entry here, so new paths +# can't slip in without a reviewed license decision. +TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS = { + ".github": "Apache-2.0", + "ci": "Apache-2.0", + "cuda_bindings": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", + "cuda_core": "Apache-2.0", + "cuda_pathfinder": "Apache-2.0", + "cuda_python": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", + "cuda_python_test_helpers": "Apache-2.0", + "scripts": "Apache-2.0", + "toolshed": "Apache-2.0", +} + +SPECIAL_CASE_LICENSE_IDENTIFIERS = { + # key: repo-relative path or glob, value: expected SPDX license identifier + "cuda_bindings/benchmarks/*": "Apache-2.0", + "cuda_bindings/benchmarks/pytest-legacy/*": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", +} SPDX_IGNORE_FILENAME = ".spdx-ignore" @@ -63,12 +79,34 @@ def normalize_repo_path(filepath): return PureWindowsPath(filepath).as_posix() +def get_top_level_directory(normalized_path): + if "/" not in normalized_path: + return None + return normalized_path.split("/", 1)[0] + + def get_expected_license_identifier(filepath): normalized_path = normalize_repo_path(filepath) - for prefix, license_identifier in 
EXPECTED_LICENSE_IDENTIFIERS: - if normalized_path.startswith(prefix): - return license_identifier - return None + matching_special_cases = [ + (prefix, license_identifier) + for prefix, license_identifier in SPECIAL_CASE_LICENSE_IDENTIFIERS.items() + if fnmatch.fnmatchcase(normalized_path, prefix) + ] + if matching_special_cases: + return max(matching_special_cases, key=lambda item: len(item[0]))[1], None + + top_level_directory = get_top_level_directory(normalized_path) + if top_level_directory is None: + return TOP_LEVEL_FILE_LICENSE_IDENTIFIER, None + + if top_level_directory not in TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS: + return ( + None, + f"MISSING TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS entry for top-level directory " + f"{top_level_directory!r} required by {filepath!r}", + ) + + return TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS[top_level_directory], None def validate_required_spdx_field(filepath, blob, expected_bytes): @@ -82,10 +120,11 @@ def extract_license_identifier(blob): match = LICENSE_IDENTIFIER_REGEX.search(blob) if match is None: return None - try: - return match.group("license_identifier").decode("ascii") - except UnicodeDecodeError: - return None + license_identifier = match.group("license_identifier").decode("ascii", errors="replace").strip() + for comment_suffix in ("-->", "*/"): + if license_identifier.endswith(comment_suffix): + license_identifier = license_identifier.removesuffix(comment_suffix).rstrip() + return license_identifier or None def validate_license_identifier(filepath, blob): @@ -94,9 +133,10 @@ def validate_license_identifier(filepath, blob): print(f"MISSING valid SPDX license identifier in {filepath!r}") return False - expected_license_identifier = get_expected_license_identifier(filepath) - if expected_license_identifier is None: - return True + expected_license_identifier, configuration_error = get_expected_license_identifier(filepath) + if configuration_error is not None: + print(configuration_error) + return False if license_identifier 
!= expected_license_identifier: print( diff --git a/toolshed/dump_cutile_b64.py b/toolshed/dump_cutile_b64.py index 84013ea94b..422bf95232 100644 --- a/toolshed/dump_cutile_b64.py +++ b/toolshed/dump_cutile_b64.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 """ Embeds a sample cuTile kernel, executes it with CUDA_TILE_DUMP_BYTECODE=., From efc34e8f0b0b208739614851572bcb5ae431b2e9 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 17 Apr 2026 16:06:55 -0700 Subject: [PATCH 09/10] Fix benchmarks/cuda_bindings/pytest-legacy license identifiers --- benchmarks/cuda_bindings/pytest-legacy/conftest.py | 2 +- benchmarks/cuda_bindings/pytest-legacy/kernels.py | 2 +- benchmarks/cuda_bindings/pytest-legacy/test_cupy.py | 2 +- benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py | 2 +- benchmarks/cuda_bindings/pytest-legacy/test_numba.py | 2 +- .../cuda_bindings/pytest-legacy/test_pointer_attributes.py | 2 +- toolshed/check_spdx.py | 1 + 7 files changed, 7 insertions(+), 6 deletions(-) diff --git a/benchmarks/cuda_bindings/pytest-legacy/conftest.py b/benchmarks/cuda_bindings/pytest-legacy/conftest.py index 0ea7b1d772..5d0cc95e7a 100644 --- a/benchmarks/cuda_bindings/pytest-legacy/conftest.py +++ b/benchmarks/cuda_bindings/pytest-legacy/conftest.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import numpy as np import pytest diff --git a/benchmarks/cuda_bindings/pytest-legacy/kernels.py b/benchmarks/cuda_bindings/pytest-legacy/kernels.py index 36646fba00..7e741110a3 100644 --- a/benchmarks/cuda_bindings/pytest-legacy/kernels.py +++ b/benchmarks/cuda_bindings/pytest-legacy/kernels.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 kernel_string = """\ #define ITEM_PARAM(x, T) T x diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py index 76dd6e6a45..3eea752ce0 100644 --- a/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import ctypes diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py index dd994081a0..ad421de382 100755 --- a/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import ctypes diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_numba.py b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py index dfe084c6b1..d9ae0cdfee 100644 --- a/benchmarks/cuda_bindings/pytest-legacy/test_numba.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import numpy as np import pytest diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py b/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py index fae72ffd79..6df32ec511 100644 --- a/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import random diff --git a/toolshed/check_spdx.py b/toolshed/check_spdx.py index d769ded66c..3d52142554 100644 --- a/toolshed/check_spdx.py +++ b/toolshed/check_spdx.py @@ -24,6 +24,7 @@ # can't slip in without a reviewed license decision. TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS = { ".github": "Apache-2.0", + "benchmarks": "Apache-2.0", "ci": "Apache-2.0", "cuda_bindings": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", "cuda_core": "Apache-2.0", From e17681724a02820996af41ad723e5d27d1a2427a Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Fri, 17 Apr 2026 16:20:04 -0700 Subject: [PATCH 10/10] Move legacy benchmark Ruff suppressions with code move The naming-rule suppressions used to live under cuda_bindings/benchmarks, so move the needed legacy-path suppressions to the relocated benchmarks/cuda_bindings pytest-legacy path and drop the stale old-path entry. Made-with: Cursor --- ruff.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index 704e422c19..210f852cd3 100644 --- a/ruff.toml +++ b/ruff.toml @@ -124,13 +124,18 @@ inline-quotes = "double" # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions) # Keep examples opted-in to enforce naming conventions in example-local identifiers. -"cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [ +"cuda_bindings/{cuda,docs,tests}/**" = [ "N801", # invalid-class-name "N802", # invalid-function-name "N803", # invalid-argument-name "N806", # non-lowercase-variable-in-function "N816", # mixed-case-variable-in-global-scope ] +"benchmarks/cuda_bindings/pytest-legacy/**" = [ + "N801", # invalid-class-name + "N802", # invalid-function-name + "N806", # non-lowercase-variable-in-function +] "cuda_bindings/{build_hooks.py,setup.py}" = ["N801", "N802", "N803", "N806", "N816"] # scripts and build tooling — print is the expected output method