From 9a83e1dec9de4ae67cb144bedf1ae22cc2d9bfde Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 11:11:16 -0500 Subject: [PATCH 01/10] Migrate to /benchmarks directory --- benchmarks/cuda_bindings/.gitignore | 16 + benchmarks/cuda_bindings/AGENTS.md | 4 + benchmarks/cuda_bindings/README.md | 74 + .../benchmarks/bench_ctx_device.py | 62 + .../cuda_bindings/benchmarks/bench_event.py | 62 + .../cuda_bindings/benchmarks/bench_launch.py | 133 ++ .../benchmarks/bench_pointer_attributes.py | 25 + .../cuda_bindings/benchmarks/bench_stream.py | 45 + .../benchmarks/cpp/CMakeLists.txt | 92 + .../benchmarks/cpp/bench_ctx_device.cpp | 87 + .../benchmarks/cpp/bench_event.cpp | 90 + .../benchmarks/cpp/bench_launch.cpp | 216 ++ .../cpp/bench_pointer_attributes.cpp | 59 + .../benchmarks/cpp/bench_stream.cpp | 74 + .../benchmarks/cpp/bench_support.hpp | 309 +++ benchmarks/cuda_bindings/compare.py | 118 ++ benchmarks/cuda_bindings/pixi.lock | 1767 +++++++++++++++++ benchmarks/cuda_bindings/pixi.toml | 87 + .../cuda_bindings/pytest-legacy/conftest.py | 93 + .../cuda_bindings/pytest-legacy/kernels.py | 159 ++ .../cuda_bindings/pytest-legacy/test_cupy.py | 199 ++ .../pytest-legacy/test_launch_latency.py | 336 ++++ .../cuda_bindings/pytest-legacy/test_numba.py | 52 + .../pytest-legacy/test_pointer_attributes.py | 112 ++ benchmarks/cuda_bindings/run_cpp.py | 8 + benchmarks/cuda_bindings/run_pyperf.py | 8 + benchmarks/cuda_bindings/runner/__init__.py | 3 + benchmarks/cuda_bindings/runner/cpp.py | 180 ++ benchmarks/cuda_bindings/runner/main.py | 217 ++ benchmarks/cuda_bindings/runner/runtime.py | 105 + benchmarks/cuda_bindings/tests/test_runner.py | 166 ++ 31 files changed, 4958 insertions(+) create mode 100644 benchmarks/cuda_bindings/.gitignore create mode 100644 benchmarks/cuda_bindings/AGENTS.md create mode 100644 benchmarks/cuda_bindings/README.md create mode 100644 benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py create mode 100644 
benchmarks/cuda_bindings/benchmarks/bench_event.py create mode 100644 benchmarks/cuda_bindings/benchmarks/bench_launch.py create mode 100644 benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py create mode 100644 benchmarks/cuda_bindings/benchmarks/bench_stream.py create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp create mode 100644 benchmarks/cuda_bindings/compare.py create mode 100644 benchmarks/cuda_bindings/pixi.lock create mode 100644 benchmarks/cuda_bindings/pixi.toml create mode 100644 benchmarks/cuda_bindings/pytest-legacy/conftest.py create mode 100644 benchmarks/cuda_bindings/pytest-legacy/kernels.py create mode 100644 benchmarks/cuda_bindings/pytest-legacy/test_cupy.py create mode 100755 benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py create mode 100644 benchmarks/cuda_bindings/pytest-legacy/test_numba.py create mode 100644 benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py create mode 100644 benchmarks/cuda_bindings/run_cpp.py create mode 100644 benchmarks/cuda_bindings/run_pyperf.py create mode 100644 benchmarks/cuda_bindings/runner/__init__.py create mode 100644 benchmarks/cuda_bindings/runner/cpp.py create mode 100644 benchmarks/cuda_bindings/runner/main.py create mode 100644 benchmarks/cuda_bindings/runner/runtime.py create mode 100644 benchmarks/cuda_bindings/tests/test_runner.py diff --git a/benchmarks/cuda_bindings/.gitignore b/benchmarks/cuda_bindings/.gitignore new file mode 100644 index 0000000000..b795782a32 --- 
/dev/null +++ b/benchmarks/cuda_bindings/.gitignore @@ -0,0 +1,16 @@ +# Build artifacts +.build/ +__pycache__/ + +# Benchmark results +*.json +.benchmarks/ + +# Pixi environments +.pixi/ + +# Override root .gitignore *.cpp rule (which targets Cython-generated files) +!benchmarks/cpp/*.cpp + +results-python.json +results-cpp.json diff --git a/benchmarks/cuda_bindings/AGENTS.md b/benchmarks/cuda_bindings/AGENTS.md new file mode 100644 index 0000000000..04f2f713fa --- /dev/null +++ b/benchmarks/cuda_bindings/AGENTS.md @@ -0,0 +1,4 @@ +# cuda.bindings benchmarks + +When generating code verify that the code is correct based on the source for cuda-bindings +that can be found in ../cuda_bindings diff --git a/benchmarks/cuda_bindings/README.md b/benchmarks/cuda_bindings/README.md new file mode 100644 index 0000000000..75e16db031 --- /dev/null +++ b/benchmarks/cuda_bindings/README.md @@ -0,0 +1,74 @@ +# cuda.bindings benchmarks + +These benchmarks are intended to measure the latency overhead of calling CUDA +Driver APIs through cuda.bindings, relative to a similar C++ baseline. + +The goal is to measure how much overhead the Python layer adds to calling +CUDA APIs, and to identify which operations do not meet our target of less than 1us of overhead. + +Each Python benchmark has a C++ counterpart, which is used to compare the +operations. We try to make each implementation perform small operations, +do nearly the same work, and run under similar conditions. + +These are **not** throughput benchmarks to measure the overall performance +of kernels and applications. + +## Usage + +Requires pixi.
+ +There are a couple of environments defined based on how `cuda.bindings` is installed: + +- `wheel`: Installs from conda packages +- `source`: Installs from source + +There are a couple of tasks defined: + +- `bench`: Runs the Python benchmarks +- `bench-cpp`: Runs the C++ benchmarks + +### System tuning + +For more stable results on Linux, tune the system before running benchmarks. +See: https://pyperf.readthedocs.io/en/latest/system.html#system + +```bash +# Show current system state +pixi run -e wheel -- python -m pyperf system show + +# Apply tuning (may require root) +sudo $(pixi run -e wheel -- which python) -m pyperf system tune +``` + +### Running benchmarks + +To run the benchmarks combine the environment and task: + +```bash +# Run the Python benchmarks in the wheel environment +pixi run -e wheel bench + +# Run the Python benchmarks in the source environment +pixi run -e source bench + +# Run the C++ benchmarks +pixi run -e wheel bench-cpp +``` + +Both runners automatically save results to JSON files in the benchmarks +directory: `results-python.json` and `results-cpp.json`. + +## Output JSON and analysis + +The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/). +Both Python and C++ results are saved in pyperf-compatible JSON format, +which can be analyzed with pyperf commands: + +```bash +# Show results and statistics +pixi run -e wheel -- python -m pyperf stats results-python.json +pixi run -e wheel -- python -m pyperf stats results-cpp.json + +# Compare C++ vs Python results +pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json +``` diff --git a/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py new file mode 100644 index 0000000000..1c82cd4046 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import ensure_context + +from cuda.bindings import driver as cuda + +CTX = ensure_context() + +_, DEVICE = cuda.cuDeviceGet(0) +ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR + + +def bench_ctx_get_current(loops: int) -> float: + _cuCtxGetCurrent = cuda.cuCtxGetCurrent + + t0 = time.perf_counter() + for _ in range(loops): + _cuCtxGetCurrent() + return time.perf_counter() - t0 + + +def bench_ctx_set_current(loops: int) -> float: + _cuCtxSetCurrent = cuda.cuCtxSetCurrent + _ctx = CTX + + t0 = time.perf_counter() + for _ in range(loops): + _cuCtxSetCurrent(_ctx) + return time.perf_counter() - t0 + + +def bench_ctx_get_device(loops: int) -> float: + _cuCtxGetDevice = cuda.cuCtxGetDevice + + t0 = time.perf_counter() + for _ in range(loops): + _cuCtxGetDevice() + return time.perf_counter() - t0 + + +def bench_device_get(loops: int) -> float: + _cuDeviceGet = cuda.cuDeviceGet + + t0 = time.perf_counter() + for _ in range(loops): + _cuDeviceGet(0) + return time.perf_counter() - t0 + + +def bench_device_get_attribute(loops: int) -> float: + _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute + _attr = ATTRIBUTE + _dev = DEVICE + + t0 = time.perf_counter() + for _ in range(loops): + _cuDeviceGetAttribute(_attr, _dev) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_event.py b/benchmarks/cuda_bindings/benchmarks/bench_event.py new file mode 100644 index 0000000000..e8e319115d --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_event.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + +_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) +_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value) + +cuda.cuEventRecord(EVENT, STREAM) +cuda.cuStreamSynchronize(STREAM) + +EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value + + +def bench_event_create_destroy(loops: int) -> float: + _cuEventCreate = cuda.cuEventCreate + _cuEventDestroy = cuda.cuEventDestroy + _flags = EVENT_FLAGS + + t0 = time.perf_counter() + for _ in range(loops): + _, e = _cuEventCreate(_flags) + _cuEventDestroy(e) + return time.perf_counter() - t0 + + +def bench_event_record(loops: int) -> float: + _cuEventRecord = cuda.cuEventRecord + _event = EVENT + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuEventRecord(_event, _stream) + return time.perf_counter() - t0 + + +def bench_event_query(loops: int) -> float: + _cuEventQuery = cuda.cuEventQuery + _event = EVENT + + t0 = time.perf_counter() + for _ in range(loops): + _cuEventQuery(_event) + return time.perf_counter() - t0 + + +def bench_event_synchronize(loops: int) -> float: + _cuEventSynchronize = cuda.cuEventSynchronize + _event = EVENT + + t0 = time.perf_counter() + for _ in range(loops): + _cuEventSynchronize(_event) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_launch.py b/benchmarks/cuda_bindings/benchmarks/bench_launch.py new file mode 100644 index 0000000000..931194fbd3 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_launch.py @@ -0,0 +1,133 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import ctypes +import time + +from runner.runtime import alloc_persistent, assert_drv, compile_and_load + +from cuda.bindings import driver as cuda + +# Compile kernels lazily so benchmark discovery does not need NVRTC. +KERNEL_SOURCE = """\ +extern "C" __global__ void empty_kernel() { return; } +extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } + +#define ITEM_PARAM(x, T) T x +#define REP1(x, T) , ITEM_PARAM(x, T) +#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) +#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) +#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) +#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) + +extern "C" __global__ +void small_kernel_16_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*)) +{ *F = 0; } +""" + +MODULE = None +EMPTY_KERNEL = None +SMALL_KERNEL = None +KERNEL_16_ARGS = None +STREAM = None +FLOAT_PTR = None +INT_PTRS = None +_VAL_PS = None +PACKED_16 = None + + +def _ensure_launch_state() -> None: + global MODULE, EMPTY_KERNEL, SMALL_KERNEL, KERNEL_16_ARGS, STREAM + global FLOAT_PTR, INT_PTRS, _VAL_PS, PACKED_16 + + if EMPTY_KERNEL is not None: + return + + module = compile_and_load(KERNEL_SOURCE) + + err, empty_kernel = cuda.cuModuleGetFunction(module, b"empty_kernel") + assert_drv(err) + err, small_kernel = cuda.cuModuleGetFunction(module, b"small_kernel") + assert_drv(err) + err, kernel_16_args = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") + assert_drv(err) + + err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + assert_drv(err) + + float_ptr = alloc_persistent(ctypes.sizeof(ctypes.c_float)) + int_ptrs = tuple(alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16)) + + val_ps = [ctypes.c_void_p(int(ptr)) for ptr in int_ptrs] + packed_16 = (ctypes.c_void_p * 16)() + for index, value_ptr in enumerate(val_ps): + packed_16[index] = ctypes.addressof(value_ptr) + + MODULE = module + 
EMPTY_KERNEL = empty_kernel + SMALL_KERNEL = small_kernel + KERNEL_16_ARGS = kernel_16_args + STREAM = stream + FLOAT_PTR = float_ptr + INT_PTRS = int_ptrs + _VAL_PS = val_ps + PACKED_16 = packed_16 + + +def bench_launch_empty_kernel(loops: int) -> float: + _ensure_launch_state() + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = EMPTY_KERNEL + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) + return time.perf_counter() - t0 + + +def bench_launch_small_kernel(loops: int) -> float: + _ensure_launch_state() + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = SMALL_KERNEL + _stream = STREAM + _args = (FLOAT_PTR,) + _arg_types = (None,) + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) + return time.perf_counter() - t0 + + +def bench_launch_16_args(loops: int) -> float: + _ensure_launch_state() + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = KERNEL_16_ARGS + _stream = STREAM + _args = INT_PTRS + _arg_types = (None,) * 16 + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) + return time.perf_counter() - t0 + + +def bench_launch_16_args_pre_packed(loops: int) -> float: + _ensure_launch_state() + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = KERNEL_16_ARGS + _stream = STREAM + _packed = PACKED_16 + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py new file mode 100644 index 0000000000..a02b82c399 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import alloc_persistent + +from cuda.bindings import driver as cuda + +# Allocate memory used by the tests +PTR = alloc_persistent(1 << 18) +ATTRIBUTE = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE + + +def bench_pointer_get_attribute(loops: int) -> float: + # Local references to avoid global lookups in the hot loop + _cuPointerGetAttribute = cuda.cuPointerGetAttribute + _attr = ATTRIBUTE + _ptr = PTR + + t0 = time.perf_counter() + for _ in range(loops): + _cuPointerGetAttribute(_attr, _ptr) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_stream.py b/benchmarks/cuda_bindings/benchmarks/bench_stream.py new file mode 100644 index 0000000000..d816099ed5 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_stream.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + +_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + + +def bench_stream_create_destroy(loops: int) -> float: + _cuStreamCreate = cuda.cuStreamCreate + _cuStreamDestroy = cuda.cuStreamDestroy + _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value + + t0 = time.perf_counter() + for _ in range(loops): + _, s = _cuStreamCreate(_flags) + _cuStreamDestroy(s) + return time.perf_counter() - t0 + + +def bench_stream_query(loops: int) -> float: + _cuStreamQuery = cuda.cuStreamQuery + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuStreamQuery(_stream) + return time.perf_counter() - t0 + + +def bench_stream_synchronize(loops: int) -> float: + _cuStreamSynchronize = cuda.cuStreamSynchronize + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuStreamSynchronize(_stream) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt b/benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt new file mode 100644 index 0000000000..83326911af --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.24) +project(cuda_bindings_cpp_benchmarks LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +set(CUDA_HOME_HINT "$ENV{CUDA_HOME}") +set(CONDA_PREFIX_HINT "$ENV{CONDA_PREFIX}") + +# Find cuda.h (driver API header) +find_path( + CUDA_DRIVER_INCLUDE_DIR + cuda.h + HINTS + "${CUDA_HOME_HINT}/include" + "${CONDA_PREFIX_HINT}/targets/x86_64-linux/include" + "${CONDA_PREFIX_HINT}/include" +) + +# Find libcuda (driver API library) — lives on the system, not in toolkit +find_library( + CUDA_DRIVER_LIBRARY + NAMES cuda + HINTS + "/usr/lib/x86_64-linux-gnu" + "/usr/lib64" + "${CUDA_HOME_HINT}/lib64/stubs" + "${CUDA_HOME_HINT}/lib/stubs" + "${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib/stubs" + "${CONDA_PREFIX_HINT}/lib/stubs" +) + +# Find nvrtc.h and libnvrtc (for runtime compilation benchmarks) +find_path( + NVRTC_INCLUDE_DIR + nvrtc.h + HINTS + "${CUDA_HOME_HINT}/include" + "${CONDA_PREFIX_HINT}/targets/x86_64-linux/include" + "${CONDA_PREFIX_HINT}/include" +) + +find_library( + NVRTC_LIBRARY + NAMES nvrtc + HINTS + "${CUDA_HOME_HINT}/lib64" + "${CUDA_HOME_HINT}/lib" + "${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib" + "${CONDA_PREFIX_HINT}/lib" +) + +if(NOT CUDA_DRIVER_INCLUDE_DIR) + message(FATAL_ERROR "Could not find cuda.h. Ensure CUDA_HOME is set or install cuda-crt-dev.") +endif() + +if(NOT CUDA_DRIVER_LIBRARY) + message(FATAL_ERROR "Could not find libcuda. 
Ensure the NVIDIA driver is installed.") endif() + +# Helper: add a benchmark that only needs the driver API +function(add_driver_benchmark name) + add_executable(${name}_cpp ${name}.cpp) + target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}") + target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}") +endfunction() + +# Helper: add a benchmark that needs driver API + NVRTC +function(add_nvrtc_benchmark name) + add_executable(${name}_cpp ${name}.cpp) + target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}" "${NVRTC_INCLUDE_DIR}") + target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}" "${NVRTC_LIBRARY}") +endfunction() + +# Driver-only benchmarks +add_driver_benchmark(bench_pointer_attributes) +add_driver_benchmark(bench_ctx_device) +add_driver_benchmark(bench_stream) +add_driver_benchmark(bench_event) +# add_driver_benchmark(bench_memory)  # bench_memory.cpp is not created by this patch; registering it would fail CMake configure + +# NVRTC benchmarks (require nvrtc for kernel compilation) +if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY) + add_nvrtc_benchmark(bench_launch) +else() + message(WARNING "NVRTC not found — skipping bench_launch. Install cuda-nvrtc-dev.") +endif() diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp new file mode 100644 index 0000000000..052df9cc1d --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp @@ -0,0 +1,87 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ?
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup: init CUDA and create a context + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + bench::BenchmarkSuite suite(options); + + // --- ctx_get_current --- + { + CUcontext current_ctx = nullptr; + suite.run("ctx_device.ctx_get_current", [&]() { + check_cu(cuCtxGetCurrent(¤t_ctx), "cuCtxGetCurrent failed"); + }); + } + + // --- ctx_set_current --- + { + suite.run("ctx_device.ctx_set_current", [&]() { + check_cu(cuCtxSetCurrent(ctx), "cuCtxSetCurrent failed"); + }); + } + + // --- ctx_get_device --- + { + CUdevice dev; + suite.run("ctx_device.ctx_get_device", [&]() { + check_cu(cuCtxGetDevice(&dev), "cuCtxGetDevice failed"); + }); + } + + // --- device_get --- + { + CUdevice dev; + suite.run("ctx_device.device_get", [&]() { + check_cu(cuDeviceGet(&dev, 0), "cuDeviceGet failed"); + }); + } + + // --- device_get_attribute --- + { + int value = 0; + suite.run("ctx_device.device_get_attribute", [&]() { + check_cu( + cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), + "cuDeviceGetAttribute failed" + ); + }); + } + + // Cleanup + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + // Write all results + suite.write(); + + return 0; +} diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp new file mode 100644 index 0000000000..44cd617778 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp @@ -0,0 +1,90 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + // Persistent event for query/synchronize/record benchmarks + CUevent event; + check_cu(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "cuEventCreate failed"); + + // Record and sync so the event starts in a completed state + check_cu(cuEventRecord(event, stream), "cuEventRecord failed"); + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + bench::BenchmarkSuite suite(options); + + // --- event_create_destroy --- + { + CUevent e; + suite.run("event.event_create_destroy", [&]() { + check_cu(cuEventCreate(&e, CU_EVENT_DISABLE_TIMING), "cuEventCreate failed"); + check_cu(cuEventDestroy(e), "cuEventDestroy failed"); + }); + } + + // --- event_record --- + { + suite.run("event.event_record", [&]() { + check_cu(cuEventRecord(event, stream), "cuEventRecord failed"); + }); + } + + // Re-sync so event is in a known completed state after the record benchmark + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + { + suite.run("event.event_query", [&]() { + // Returns CUDA_SUCCESS if complete, CUDA_ERROR_NOT_READY if not + cuEventQuery(event); + }); + } + + // --- event_synchronize --- + { + 
suite.run("event.event_synchronize", [&]() { + check_cu(cuEventSynchronize(event), "cuEventSynchronize failed"); + }); + } + + // Cleanup + check_cu(cuEventDestroy(event), "cuEventDestroy failed"); + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp new file mode 100644 index 0000000000..fb65da6d74 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp @@ -0,0 +1,216 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "bench_support.hpp" + +#include +#include +#include +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; + std::exit(1); + } +} + +static void check_nvrtc(nvrtcResult status, const char* message) { + if (status != NVRTC_SUCCESS) { + std::cerr << message << ": " << nvrtcGetErrorString(status) << '\n'; + std::exit(1); + } +} + +static CUmodule compile_and_load(const char* source, CUdevice device) { + int major = 0, minor = 0; + check_cu(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), + "cuDeviceGetAttribute failed"); + check_cu(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), + "cuDeviceGetAttribute failed"); + + nvrtcProgram prog; + check_nvrtc(nvrtcCreateProgram(&prog, source, "benchmark_kernel.cu", 0, nullptr, nullptr), + "nvrtcCreateProgram failed"); + + std::string arch = "--gpu-architecture=sm_" + std::to_string(major) + std::to_string(minor); + const char* opts[] = {"--fmad=false", arch.c_str()}; + nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, opts); + + // Print log on failure + if (compile_result != NVRTC_SUCCESS) { + size_t log_size = 0; + nvrtcGetProgramLogSize(prog, &log_size); + std::vector log(log_size); + nvrtcGetProgramLog(prog, log.data()); + std::cerr << "NVRTC compile failed:\n" << log.data() << '\n'; + std::exit(1); + } + + size_t cubin_size = 0; + check_nvrtc(nvrtcGetCUBINSize(prog, &cubin_size), "nvrtcGetCUBINSize failed"); + std::vector cubin(cubin_size); + check_nvrtc(nvrtcGetCUBIN(prog, cubin.data()), "nvrtcGetCUBIN failed"); + nvrtcDestroyProgram(&prog); + + CUmodule module; + check_cu(cuModuleLoadData(&module, cubin.data()), "cuModuleLoadData failed"); + return module; +} + + +static const char* KERNEL_SOURCE = R"( +extern "C" __global__ void empty_kernel() { return; } +extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } + +extern "C" __global__ +void small_kernel_16_args( + int* a0, int* a1, int* a2, int* a3, + int* a4, int* a5, int* a6, int* a7, + int* a8, int* a9, int* a10, int* a11, + int* a12, int* a13, int* 
a14, int* a15) +{ *a0 = 0; } +)"; + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUmodule module = compile_and_load(KERNEL_SOURCE, device); + + CUfunction empty_kernel, small_kernel, kernel_16_args; + check_cu(cuModuleGetFunction(&empty_kernel, module, "empty_kernel"), "GetFunction failed"); + check_cu(cuModuleGetFunction(&small_kernel, module, "small_kernel"), "GetFunction failed"); + check_cu(cuModuleGetFunction(&kernel_16_args, module, "small_kernel_16_args"), "GetFunction failed"); + + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + // Allocate device memory for arguments + CUdeviceptr float_ptr; + check_cu(cuMemAlloc(&float_ptr, sizeof(float)), "cuMemAlloc failed"); + + CUdeviceptr int_ptrs[16]; + for (int i = 0; i < 16; ++i) { + check_cu(cuMemAlloc(&int_ptrs[i], sizeof(int)), "cuMemAlloc failed"); + } + + // Pre-pack kernel params for the pre-packed benchmark + void* packed_16[16]; + for (int i = 0; i < 16; ++i) { + packed_16[i] = &int_ptrs[i]; + } + + bench::BenchmarkSuite suite(options); + + // --- launch_empty_kernel --- + { + suite.run("launch.launch_empty_kernel", [&]() { + check_cu( + cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + // Drain the stream between benchmarks so each starts with a clean queue + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + { + void* params[] = {&float_ptr}; + suite.run("launch.launch_small_kernel", [&]() { + check_cu( + cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + 
check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + // --- launch_16_args --- + { + suite.run("launch.launch_16_args", [&]() { + check_cu( + cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) --- + // In C++ the params are always pre-packed, so this is identical to launch_16_args. + // We include it for naming parity with the Python benchmark. + { + suite.run("launch.launch_16_args_pre_packed", [&]() { + check_cu( + cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + // Cleanup + for (int i = 0; i < 16; ++i) { + check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed"); + } + check_cu(cuMemFree(float_ptr), "cuMemFree failed"); + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuModuleUnload(module), "cuModuleUnload failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp new file mode 100644 index 0000000000..4d9afc6566 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ?
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup: init CUDA, allocate memory + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUdeviceptr ptr; + check_cu(cuMemAlloc(&ptr, 1 << 18), "cuMemAlloc failed"); + + bench::BenchmarkSuite suite(options); + + // --- pointer_get_attribute --- + { + unsigned int memory_type = 0; + suite.run("pointer_attributes.pointer_get_attribute", [&]() { + check_cu( + cuPointerGetAttribute(&memory_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr), + "cuPointerGetAttribute failed" + ); + }); + } + + // Cleanup + check_cu(cuMemFree(ptr), "cuMemFree failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp new file mode 100644 index 0000000000..702e86aef0 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + // Persistent stream for query/synchronize benchmarks + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + bench::BenchmarkSuite suite(options); + + // --- stream_create_destroy --- + { + CUstream s; + suite.run("stream.stream_create_destroy", [&]() { + check_cu(cuStreamCreate(&s, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + check_cu(cuStreamDestroy(s), "cuStreamDestroy failed"); + }); + } + + // --- stream_query --- + { + suite.run("stream.stream_query", [&]() { + // cuStreamQuery returns CUDA_SUCCESS if stream is idle, + // CUDA_ERROR_NOT_READY if busy — both are valid here. + cuStreamQuery(stream); + }); + } + + // --- stream_synchronize --- + { + suite.run("stream.stream_synchronize", [&]() { + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + }); + } + + // Cleanup + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp new file mode 100644 index 0000000000..837c15a9d1 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp @@ -0,0 +1,309 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
namespace bench {

// Harness configuration shared by every C++ micro-benchmark binary.
struct Options {
    std::uint64_t loops = 1000;   // fn() calls per timed value
    std::uint64_t warmups = 5;    // untimed warmup values per run
    std::uint64_t values = 20;    // timed values per run
    std::uint64_t runs = 20;      // independent runs
    std::string output_path;      // pyperf JSON destination ("" = none)
    std::string benchmark_name;   // optional benchmark-name override
};

// A single run result: warmup values and timed values (seconds per loop).
struct RunResult {
    std::string date;                   // UTC timestamp at run start
    double duration_sec;                // wall-clock duration of the run
    std::vector<double> warmup_values;  // seconds per loop, warmup phase
    std::vector<double> values;         // seconds per loop, timed phase
};

// Parse one numeric flag value; exits with status 2 on malformed input.
//
// The previous bare std::strtoull call silently mapped garbage ("abc") to 0
// and ignored trailing junk ("100x" -> 100); a zero --loops then divides by
// zero when run_benchmark converts elapsed time to seconds-per-loop.
inline std::uint64_t parse_count(const char* text, const char* flag, bool allow_zero) {
    char* end = nullptr;
    const std::uint64_t value = std::strtoull(text, &end, 10);
    if (end == text || *end != '\0' || (value == 0 && !allow_zero)) {
        std::cerr << "Invalid value for " << flag << ": " << text << '\n';
        std::exit(2);
    }
    return value;
}

// Parse harness command-line flags into Options.
//
// Recognized: --loops/--warmups/--values/--runs N, -o/--output FILE,
// --name NAME, -h/--help. Exits 0 on --help; exits 2 on an unknown flag,
// a flag missing its value, or a malformed count. Valid inputs behave
// exactly as before.
inline Options parse_args(int argc, char** argv) {
    Options options;
    for (int i = 1; i < argc; ++i) {
        const std::string arg(argv[i]);
        if (arg == "--loops" && i + 1 < argc) {
            // loops must be >= 1: it is the divisor for seconds-per-loop.
            options.loops = parse_count(argv[++i], "--loops", /*allow_zero=*/false);
            continue;
        }
        if (arg == "--warmups" && i + 1 < argc) {
            options.warmups = parse_count(argv[++i], "--warmups", /*allow_zero=*/true);
            continue;
        }
        if (arg == "--values" && i + 1 < argc) {
            options.values = parse_count(argv[++i], "--values", /*allow_zero=*/true);
            continue;
        }
        if (arg == "--runs" && i + 1 < argc) {
            options.runs = parse_count(argv[++i], "--runs", /*allow_zero=*/true);
            continue;
        }
        if ((arg == "-o" || arg == "--output") && i + 1 < argc) {
            options.output_path = argv[++i];
            continue;
        }
        if (arg == "--name" && i + 1 < argc) {
            options.benchmark_name = argv[++i];
            continue;
        }
        if (arg == "--help" || arg == "-h") {
            std::cout << "Usage: benchmark [options]\n"
                      << "  --loops N      Loop iterations per value (default: 1000)\n"
                      << "  --warmups N    Warmup values per run (default: 5)\n"
                      << "  --values N     Timed values per run (default: 20)\n"
                      << "  --runs N       Number of runs (default: 20)\n"
                      << "  -o, --output F Write pyperf-compatible JSON to file\n"
                      << "  --name S       Benchmark name (overrides default)\n";
            std::exit(0);
        }

        std::cerr << "Unknown argument: " << arg << '\n';
        std::exit(2);
    }
    return options;
}

}  // namespace bench
{ + const auto now = std::chrono::system_clock::now(); + const std::time_t t = std::chrono::system_clock::to_time_t(now); + std::tm tm{}; +#ifdef _WIN32 + gmtime_s(&tm, &t); +#else + gmtime_r(&t, &tm); +#endif + char buf[64]; + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &tm); + return std::string(buf); +} + +// Run a benchmark function. The function signature is: void fn() — one call = one operation. +// The harness calls fn() in a tight loop `loops` times per value. +template +std::vector run_benchmark(const Options& options, Fn&& fn) { + std::vector results; + results.reserve(options.runs); + + for (std::uint64_t r = 0; r < options.runs; ++r) { + RunResult run; + run.date = iso_now(); + const auto run_start = std::chrono::steady_clock::now(); + + // Warmups + for (std::uint64_t w = 0; w < options.warmups; ++w) { + const auto t0 = std::chrono::steady_clock::now(); + for (std::uint64_t i = 0; i < options.loops; ++i) { + fn(); + } + const auto t1 = std::chrono::steady_clock::now(); + const double elapsed = std::chrono::duration(t1 - t0).count(); + run.warmup_values.push_back(elapsed / static_cast(options.loops)); + } + + // Timed values + for (std::uint64_t v = 0; v < options.values; ++v) { + const auto t0 = std::chrono::steady_clock::now(); + for (std::uint64_t i = 0; i < options.loops; ++i) { + fn(); + } + const auto t1 = std::chrono::steady_clock::now(); + const double elapsed = std::chrono::duration(t1 - t0).count(); + run.values.push_back(elapsed / static_cast(options.loops)); + } + + const auto run_end = std::chrono::steady_clock::now(); + run.duration_sec = std::chrono::duration(run_end - run_start).count(); + results.push_back(std::move(run)); + } + + return results; +} + +inline void print_summary(const std::string& name, const std::vector& results) { + // Collect all timed values + std::vector all_values; + for (const auto& run : results) { + for (double v : run.values) { + all_values.push_back(v); + } + } + if (all_values.empty()) + return; + + 
// Escape a string for embedding in JSON output.
//
// The previous version returned the input verbatim between quotes, which
// produced invalid JSON whenever the string contained '"', '\\', or a
// control character (e.g. in a benchmark name). Escape per RFC 8259;
// ordinary names pass through unchanged, so existing output is unaffected.
inline std::string json_str(const std::string& s) {
    std::string out;
    out.reserve(s.size() + 2);
    out.push_back('"');
    for (const char c : s) {
        switch (c) {
            case '"':  out += "\\\""; break;
            case '\\': out += "\\\\"; break;
            case '\b': out += "\\b"; break;
            case '\f': out += "\\f"; break;
            case '\n': out += "\\n"; break;
            case '\r': out += "\\r"; break;
            case '\t': out += "\\t"; break;
            default:
                if (static_cast<unsigned char>(c) < 0x20) {
                    // Remaining control characters as \u00XX.
                    char buf[8];
                    std::snprintf(buf, sizeof(buf), "\\u%04x",
                                  static_cast<unsigned char>(c));
                    out += buf;
                } else {
                    out.push_back(c);
                }
                break;
        }
    }
    out.push_back('"');
    return out;
}
std::vector results; +}; + +// Collect multiple benchmarks from a single binary and write them all +// to one pyperf-compatible JSON file. +class BenchmarkSuite { +public: + explicit BenchmarkSuite(Options options) : options_(std::move(options)) {} + + // Run a benchmark and record it. The name is used as the benchmark ID. + template + void run(const std::string& name, Fn&& fn) { + auto results = run_benchmark(options_, std::forward(fn)); + print_summary(name, results); + entries_.push_back({name, options_.loops, std::move(results)}); + } + + // Write all collected benchmarks to the output file (if -o was given). + void write() const { + if (options_.output_path.empty() || entries_.empty()) + return; + write_multi_pyperf_json(options_.output_path, entries_); + } + +private: + Options options_; + std::vector entries_; + + static void write_multi_pyperf_json( + const std::string& output_path, + const std::vector& entries + ) { + std::ofstream out(output_path); + if (!out) { + std::cerr << "Failed to open output file: " << output_path << '\n'; + std::exit(3); + } + + out << std::setprecision(17); + out << "{\"version\": \"1.0\", \"benchmarks\": ["; + + for (std::size_t e = 0; e < entries.size(); ++e) { + const auto& entry = entries[e]; + if (e > 0) out << ", "; + + out << "{\"metadata\": {"; + out << "\"name\": " << json_str(entry.name) << ", "; + out << "\"loops\": " << entry.loops << ", "; + out << "\"unit\": \"second\""; + out << "}, \"runs\": ["; + + for (std::size_t r = 0; r < entry.results.size(); ++r) { + const auto& run = entry.results[r]; + if (r > 0) out << ", "; + + out << "{\"metadata\": {"; + out << "\"date\": " << json_str(run.date) << ", "; + out << "\"duration\": " << run.duration_sec; + out << "}, "; + + out << "\"warmups\": ["; + for (std::size_t w = 0; w < run.warmup_values.size(); ++w) { + if (w > 0) out << ", "; + out << "[" << entry.loops << ", " << run.warmup_values[w] << "]"; + } + out << "], "; + + out << "\"values\": ["; + for (std::size_t v 
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Compare Python and C++ benchmark results in a summary table."""

import argparse
import json
import statistics
import sys
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent
DEFAULT_PYTHON = PROJECT_ROOT / "results-python.json"
DEFAULT_CPP = PROJECT_ROOT / "results-cpp.json"


def load_benchmarks(path: Path) -> dict[str, list[float]]:
    """Load a pyperf JSON file and return {name: [values]}."""
    data = json.loads(Path(path).read_text())

    results: dict[str, list[float]] = {}
    for bench in data.get("benchmarks", []):
        name = bench.get("metadata", {}).get("name", "")
        if not name:
            # Fall back to the first run that carries a name in its metadata.
            for run in bench.get("runs", []):
                name = run.get("metadata", {}).get("name", "")
                if name:
                    break
        values = [v for run in bench.get("runs", []) for v in run.get("values", [])]
        if name and values:
            results[name] = values
    return results


def fmt_ns(seconds: float) -> str:
    """Format a duration (seconds) as "N ns" or, from 1000 ns up, "N.NN us"."""
    nanos = seconds * 1e9
    return f"{nanos / 1000:.2f} us" if nanos >= 1000 else f"{nanos:.0f} ns"


def main() -> None:
    """Print a comparison table of Python vs C++ benchmark means."""
    parser = argparse.ArgumentParser(description="Compare Python vs C++ benchmark results")
    parser.add_argument(
        "--python",
        type=Path,
        default=DEFAULT_PYTHON,
        help=f"Python results JSON (default: {DEFAULT_PYTHON.name})",
    )
    parser.add_argument(
        "--cpp",
        type=Path,
        default=DEFAULT_CPP,
        help=f"C++ results JSON (default: {DEFAULT_CPP.name})",
    )
    args = parser.parse_args()

    # Python results are mandatory; C++ results are optional.
    if not args.python.exists():
        print(f"Python results not found: {args.python}", file=sys.stderr)
        print("Run: pixi run -e wheel bench", file=sys.stderr)
        sys.exit(1)

    py_benchmarks = load_benchmarks(args.python)
    cpp_benchmarks = load_benchmarks(args.cpp) if args.cpp.exists() else {}

    if not py_benchmarks:
        print("No benchmarks found in Python results.", file=sys.stderr)
        sys.exit(1)

    # The name column must fit every benchmark name and the header label.
    all_names = sorted(set(py_benchmarks) | set(cpp_benchmarks))
    name_width = max(len("Benchmark"), *(len(n) for n in all_names))

    # Header layout depends on whether C++ results are present.
    if cpp_benchmarks:
        header = (
            f"{'Benchmark':<{name_width}} {'C++ (mean)':>12}"
            f" {'Python (mean)':>14} {'Overhead':>10}"
        )
    else:
        header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}"
    sep = "-" * len(header)
    print(sep)
    print(header)
    print(sep)

    for name in all_names:
        py_vals = py_benchmarks.get(name)
        cpp_vals = cpp_benchmarks.get(name)

        py_col = fmt_ns(statistics.mean(py_vals)) if py_vals else "-"
        cpp_col = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-"

        # Overhead is meaningful only when the benchmark exists on both sides.
        if py_vals and cpp_vals:
            delta_ns = (statistics.mean(py_vals) - statistics.mean(cpp_vals)) * 1e9
            overhead = f"+{delta_ns:.0f} ns"
        else:
            overhead = "-"

        if cpp_benchmarks:
            print(f"{name:<{name_width}} {cpp_col:>12} {py_col:>14} {overhead:>10}")
        else:
            print(f"{name:<{name_width}} {py_col:>14}")

    print(sep)


if __name__ == "__main__":
    main()
if-necessary-or-explicit + packages: {} + source: + channels: + - url: https://conda.anaconda.org/conda-forge/ + options: + channel-priority: disabled + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + - conda: .. 
+ - conda: ../../cuda_pathfinder + wheel: + channels: + - url: https://conda.anaconda.org/conda-forge/ + options: + channel-priority: disabled + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.1.0-py314ha160325_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.1.115-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.1.115-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.1.80-hecca717_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.1.80-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.1.80-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.1.80-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.1.115-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + build_number: 20 + sha256: 1dd3fffd892081df9726d7eb7e0dea6198962ba775bd88842135a4ddb4deb3c9 + md5: a9f577daf3de00bca7c3c76c0ecbd1de + depends: + - __glibc >=2.17,<3.0.a0 + - libgomp >=7.5.0 + constrains: + - openmp_impl <0.0a0 + license: BSD-3-Clause + license_family: BSD + size: 28948 + timestamp: 1770939786096 +- conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda + sha256: a9c114cbfeda42a226e2db1809a538929d2f118ef855372293bd188f71711c48 + md5: 791365c5f65975051e4e017b5da3abf5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: GPL-2.0-or-later + license_family: GPL + size: 68072 + timestamp: 1756738968573 +- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda + sha256: 
2851d34944b056d028543f0440fb631aeeff204151ea09589d8d9c13882395de + md5: 9902aeb08445c03fb31e01beeb173988 + depends: + - binutils_impl_linux-64 >=2.45.1,<2.45.2.0a0 + license: GPL-3.0-only + license_family: GPL + size: 35128 + timestamp: 1770267175160 +- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda + sha256: 74341b26a2b9475dc14ba3cf12432fcd10a23af285101883e720216d81d44676 + md5: 83aa53cb3f5fc849851a84d777a60551 + depends: + - ld_impl_linux-64 2.45.1 default_hbd61a6d_101 + - sysroot_linux-64 + - zstd >=1.5.7,<1.6.0a0 + license: GPL-3.0-only + license_family: GPL + size: 3744895 + timestamp: 1770267152681 +- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda + sha256: 4826f97d33cbe54459970a1e84500dbe0cccf8326aaf370e707372ae20ec5a47 + md5: dec96579f9a7035a59492bf6ee613b53 + depends: + - binutils_impl_linux-64 2.45.1 default_hfdba357_101 + license: GPL-3.0-only + license_family: GPL + size: 36060 + timestamp: 1770267177798 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + sha256: 0b75d45f0bba3e95dc693336fa51f40ea28c980131fec438afb7ce6118ed05f6 + md5: d2ffd7602c02f2b316fd921d39876885 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: bzip2-1.0.6 + license_family: BSD + size: 260182 + timestamp: 1771350215188 +- conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda + sha256: cc9accf72fa028d31c2a038460787751127317dcfa991f8d1f1babf216bb454e + md5: 920bb03579f15389b9e512095ad995b7 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + size: 207882 + timestamp: 1765214722852 +- conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda + sha256: 8e7a40f16400d7839c82581410aa05c1f8324a693c9d50079f8c50dc9fb241f0 + md5: abd85120de1187b0d1ec305c2173c71b + depends: + - binutils + - gcc + - gcc_linux-64 14.* + 
license: BSD-3-Clause + license_family: BSD + size: 6693 + timestamp: 1753098721814 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda + sha256: 67cc7101b36421c5913a1687ef1b99f85b5d6868da3abbf6ec1a4181e79782fc + md5: 4492fd26db29495f0ba23f146cd5638d + depends: + - __unix + license: ISC + size: 147413 + timestamp: 1772006283803 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + sha256: c6339858a0aaf5d939e00d345c98b99e4558f285942b27232ac098ad17ac7f8e + md5: cf45f4278afd6f4e6d03eda0f435d527 + depends: + - __glibc >=2.17,<3.0.a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - pycparser + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + size: 300271 + timestamp: 1761203085220 +- conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + sha256: aa589352e61bb221351a79e5946d56916e3c595783994884accdb3b97fe9d449 + md5: 381bd45fb7aa032691f3063aff47e3a1 + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 13589 + timestamp: 1763607964133 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda + sha256: 5ece78754577b8d9030ec1f09ce1cd481125f27d8d6fcdcfe2c1017661830c61 + md5: 51d37989c1758b5edfe98518088bf700 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - libcurl >=8.18.0,<9.0a0 + - libexpat >=2.7.4,<3.0a0 + - libgcc >=14 + - liblzma >=5.8.2,<6.0a0 + - libstdcxx >=14 + - libuv >=1.51.0,<2.0a0 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - rhash >=1.4.6,<2.0a0 + - zstd >=1.5.7,<1.6.0a0 + license: BSD-3-Clause + license_family: BSD + size: 22330508 + timestamp: 1771383666798 +- conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + sha256: ab29d57dc70786c1269633ba3dff20288b81664d3ff8d21af995742e2bb03287 + md5: 962b9857ee8e7018c22f2776ffa0b2d7 + depends: + - python >=3.9 + license: BSD-3-Clause + 
license_family: BSD + size: 27011 + timestamp: 1733218222191 +- conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda + sha256: b90ec0e6a9eb22f7240b3584fe785457cff961fec68d40e6aece5d596f9bbd9a + md5: 0e3e144115c43c9150d18fa20db5f31c + depends: + - gcc_impl_linux-64 >=14.3.0,<14.3.1.0a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 31705 + timestamp: 1771378159534 +- conda: .. + name: cuda-bindings + version: 13.2.0 + build: hb0f4dca_0 + subdir: linux-64 + variants: + target_platform: linux-64 + depends: + - python + - cuda-version + - cuda-pathfinder + - libnvjitlink + - cuda-nvrtc + - cuda-nvrtc >=13.2.51,<14.0a0 + - cuda-nvvm + - libnvfatbin + - libcufile + - libcufile >=1.17.0.44,<2.0a0 + - libgcc >=15 + - libgcc >=15 + - libstdcxx >=15 + - python_abi 3.14.* *_cp314 + license: LicenseRef-NVIDIA-SOFTWARE-LICENSE + sources: + cuda-pathfinder: + path: ../cuda_pathfinder +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.1.0-py314ha160325_1.conda + sha256: aecfbbc9a687e5daba66b896613a00c617e3eadc21a31b19e53e8e642e83d7a7 + md5: 3bd3abdf71e1b8c53310195677bf00be + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc >=13,<14.0a0 + - cuda-nvvm-impl >=13,<14.0a0 + - cuda-pathfinder >=1.1.0,<2 + - cuda-version >=13,<14.0a0 + - libcufile >=1,<2.0a0 + - libgcc >=14 + - libnvjitlink >=13.0,<14.0a0 + - libstdcxx >=14 + - numpy + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + constrains: + - cuda-python >=13.1.0,<13.2.0a0 + - cuda-cudart >=13,<14.0a0 + license: LicenseRef-NVIDIA-SOFTWARE-LICENSE + size: 7267159 + timestamp: 1764919647948 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.1.115-ha770c72_0.conda + sha256: 0715f15da71587238600f0584bc8d243d8fde602c3d8856f421b58dff3fb9422 + md5: a179486129ff28d053bb16fdb533568e + depends: + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1277295 + 
timestamp: 1768272295906 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda + sha256: e539baa32e3be63f89bd11d421911363faac322903caf58a15a46ba68ae29867 + md5: 4910b7b709f1168baffc2a742b39a222 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1415308 + timestamp: 1773098874302 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.1.115-ha770c72_0.conda + sha256: 82ae1f3e492146722e258e237daa537f4d4df8157b2dfa49a0869eb41a11d284 + md5: 3723bca2a84e6cc0f0a98427b71bec73 + depends: + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 96480 + timestamp: 1768280269206 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda + sha256: dd9a74a40b196b1ea150b17ca8fb539dd8f75edd349af354a7bae6dbb43e43b4 + md5: 6f4a609f3d142d4b22728823955249e9 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 97122 + timestamp: 1773115163637 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.1.80-hecca717_0.conda + sha256: 00acb7564e7c7dd60be431bd2a1a937856e38a86535d72281461cd193500a0a4 + md5: 2e2b71c8d67f6ceb1d3820aa438f3580 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart_linux-64 13.1.80 h376f20c_0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24159 + timestamp: 1764883525821 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda + sha256: 9cc44fd4914738a32cf5c801925a08c61ce45b5534833cf1df1621236a9a321d + md5: 29f5b46965bd82b0e9cc27a96d13f2bd + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart_linux-64 13.2.51 h376f20c_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24534 + 
timestamp: 1773104357094 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.1.80-hecca717_0.conda + sha256: 12aa5dcf82cdf863be18a48a9ad4d271aa864ef985752bc9707371b84085f0c8 + md5: e3cbe24bf8ae135e9f82450be520e886 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart 13.1.80 hecca717_0 + - cuda-cudart-dev_linux-64 13.1.80 h376f20c_0 + - cuda-cudart-static 13.1.80 hecca717_0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24597 + timestamp: 1764883573873 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda + sha256: f6d81c961b6212389c07ffc9dc1268966db63aa351d46875effee40447eb9dd8 + md5: 9b35a56418b6cbbde5ea5f7d84c26317 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart 13.2.51 hecca717_0 + - cuda-cudart-dev_linux-64 13.2.51 h376f20c_0 + - cuda-cudart-static 13.2.51 hecca717_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24961 + timestamp: 1773104406956 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.1.80-h376f20c_0.conda + sha256: 41a1cc86f2759ef6ae47cc68e2180baaeb4b989709931366ee0cdc90f8e10f5f + md5: a36776a49ae0e47a26e129bdc82aeb3e + depends: + - cuda-cccl_linux-64 + - cuda-cudart-static_linux-64 + - cuda-cudart_linux-64 + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 392459 + timestamp: 1764883538793 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda + sha256: 86dd0dc301bab5263d63f13d47b02507e0cf2fd22ff9aefa37dea2dd03c6df83 + md5: 7e5cf4b991525b7b1a2cfa3f1c81462e + depends: + - cuda-cccl_linux-64 + - cuda-cudart-static_linux-64 + - cuda-cudart_linux-64 + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 399921 + timestamp: 1773104368666 
+- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda + sha256: 7cbf145b3e59d360052556bfe9425753b119c33cbba0c1f20f0191a7330ced5c + md5: 0e5edde73725a13f7d62ddf96b7656b9 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart-static_linux-64 13.1.80 h376f20c_0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24119 + timestamp: 1764883551735 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda + sha256: d4a316038b02161e04a864c8cd146d2ec62cbd114eb951197c6ef6042d3c46c4 + md5: daec4c4dc0355adcdf009dceb3b94259 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart-static_linux-64 13.2.51 h376f20c_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24494 + timestamp: 1773104383494 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda + sha256: 2252e12fa9a806f685684b6395a660d845dc95bdc95e52a6bc09dba8a9eccec3 + md5: be9f8ef5a01fca1f28c8d523f8501771 + depends: + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1121385 + timestamp: 1764883490595 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda + sha256: e3cc51809bd8be0a96bbe01a668f08e6e611c8fba60426c4d9f10926f3159456 + md5: aa9c7d5cd427042ffbd59c9ef6014f98 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1103784 + timestamp: 1773104321614 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda + sha256: fca2951815564c36cf5a4e0f7ed0222429d206fda3d4e1aa3d52a969a293b868 + md5: 4dc4c3a1e010e06035f01d661c1b70bd + depends: + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + 
size: 199654 + timestamp: 1764883502803 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda + sha256: e1d943a5582c8e171c9dcf2c0c72ddd5bf0a2ac9acd6ed15898d69d618cf53c6 + md5: 51a1624c7e26d8821b5d959ee7ecb517 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 203460 + timestamp: 1773104333900 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.1.80-h376f20c_0.conda + sha256: 83bf37d5a3b4a85853cded6a8b90db302b014845b7d9461ccdb84db8c2abfbc3 + md5: 1d7073905d0359ff234545494a933d59 + depends: + - cuda-version >=13.1,<13.2.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 38992 + timestamp: 1764883514338 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda + sha256: 1b372b7af937a3a2fdb1cbd5356e6b365f3495d899a413ebf98369ab0c5c0c79 + md5: 970891239574056829fc1cfc208278a7 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 39485 + timestamp: 1773104345638 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + sha256: 9cc4f9df70c02eea5121cdb0e865207b04cd52591f57ebcac2ba44fada10eb5b + md5: df16c9049d882cdaf4f83a5b90079589 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 35339417 + timestamp: 1768272955912 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda + sha256: 9de235d328b7124f715805715e9918eb7f8aa5b9c56a2afa62b84f84f98077a5 + md5: 0413baaa73be1a39d5d8e442184acc78 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 35736655 + timestamp: 1773100338749 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda + sha256: 2c929c592ca1909e3944edec62b77403d256156a4010bfa17fb0b948d33e54d3 + md5: 1096fce4abad7dd975ce6d9953fceb6a + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc 13.1.115 hecca717_0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + constrains: + - cuda-nvrtc-static >=13.1.115 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 35845 + timestamp: 1768273073971 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda + sha256: be60eb4e84ff4846b27b323eca402b075f52caf6c138ebb06268fbaa26ef1879 + md5: 83535200a9e77165d5291b4ac82ebf6a + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc 13.2.51 hecca717_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + constrains: + - cuda-nvrtc-static >=13.2.51 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 36305 + timestamp: 1773100458841 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda + sha256: d0111ba8fa12b96d38989d2016ecec0c11410c0e566d839ed54f3925591efb0b + md5: 03cd3639b8e13623c7b91b1cb0136402 + depends: + - cuda-nvvm-dev_linux-64 13.2.51.* + - cuda-nvvm-impl 13.2.51.* + - cuda-nvvm-tools 13.2.51.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 25494 + timestamp: 1773157399568 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda + sha256: f00fce92bf7f1da314654f7693f571a014aaa2ba1fae3762634f3e5be254da83 + md5: 57724ac113f7435762d0c39e1b1ad341 + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 28399 + timestamp: 1773115185916 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda + sha256: 12d84615684f1279799c023ce4ccc7c34f151bec2a90e0c8d04798a8c8af437c + md5: bf76661bc0de83a60537c4913f339fb3 + depends: + - __glibc 
>=2.17,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=12 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21873791 + timestamp: 1768280315627 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda + sha256: bea7cbd2ff0f8bf07e0b90d522b4834533b4024237322c09f1b3875970c4abc9 + md5: 3c3872ff2bd6cc6368dcd4b35bb995f2 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=12 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 22202489 + timestamp: 1773115209641 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda + sha256: da5fd2dc57df2047215ff76f295685b1e1e586a46c2e46214120458cee18ee80 + md5: 2df6cd3b3d6d1365a2979285703056f9 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=12 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 25988523 + timestamp: 1773115248060 +- conda: ../../cuda_pathfinder + name: cuda-pathfinder + version: 1.3.4a0 + build: pyh4616a5c_0 + subdir: noarch + variants: + target_platform: noarch + depends: + - python >=3.10 + - python * + license: Apache-2.0 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda + sha256: edf16fdfbcce5bbb445118fd8d070dda8afe36b4b437a94f472fde153bc38151 + md5: 2d13e524da66b60e6e7d5c6585729ea8 + depends: + - python >=3.10 + - cuda-version >=12.0,<14 + - python + license: Apache-2.0 + license_family: APACHE + size: 39327 + timestamp: 1772059437166 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda + sha256: 176ac20fdb95611af8fb2bf0d3d16fee998019b1d0f12fc9ddd5fa0df4553992 + md5: d85448460c25ee43ff2f8346bb9ad52b + constrains: + - cudatoolkit 13.1|13.1.* + - __cuda >=13 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21511 + timestamp: 1757017115788 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + sha256: 64aebe8ccb3a2c3ff446d3c0c0e88ef4fdb069a5732c03539bf3a37243c4c679 + md5: 45676e3dd76b30ec613f1f822d450eff + constrains: + - __cuda >=13 + - cudatoolkit 13.2|13.2.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21908 + timestamp: 1773093709154 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda + sha256: 3fcc97ae3e89c150401a50a4de58794ffc67b1ed0e1851468fcc376980201e25 + md5: 5da8c935dca9186673987f79cef0b2a5 + depends: + - c-compiler 1.11.0 h4d9bdce_0 + - gxx + - gxx_linux-64 14.* + license: BSD-3-Clause + license_family: BSD + size: 6635 + timestamp: 1753098722177 +- conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + sha256: 6d977f0b2fc24fee21a9554389ab83070db341af6d6f09285360b2e09ef8b26e + md5: 003b8ba0a94e2f1e117d0bd46aebc901 + depends: + - python >=3.9 + license: Apache-2.0 + license_family: APACHE + size: 275642 + timestamp: 1752823081585 +- conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + sha256: ee6cf346d017d954255bbcbdb424cddea4d14e4ed7e9813e429db1d795d01144 + md5: 8e662bd460bda79b1ea39194e3c4c9ab + depends: + - python >=3.10 + - typing_extensions >=4.6.0 + license: MIT and PSF-2.0 + size: 21333 + timestamp: 1763918099466 +- conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda + sha256: dddea9ec53d5e179de82c24569d41198f98db93314f0adae6b15195085d5567f + md5: f58064cec97b12a7136ebb8a6f8a129b + depends: + - python >=3.10 + license: Unlicense + size: 25845 + timestamp: 1773314012590 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda + sha256: 9b34b57b06b485e33a40d430f71ac88c8f381673592507cf7161c50ff0832772 + md5: 52d6457abc42e320787ada5f9033fa99 + depends: + - conda-gcc-specs + - gcc_impl_linux-64 14.3.0 hbdf3cc3_18 + license: BSD-3-Clause + license_family: BSD + 
size: 29506 + timestamp: 1771378321585 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda + sha256: 3b31a273b806c6851e16e9cf63ef87cae28d19be0df148433f3948e7da795592 + md5: 30bb690150536f622873758b0e8d6712 + depends: + - binutils_impl_linux-64 >=2.45 + - libgcc >=14.3.0 + - libgcc-devel_linux-64 14.3.0 hf649bbc_118 + - libgomp >=14.3.0 + - libsanitizer 14.3.0 h8f1669f_18 + - libstdcxx >=14.3.0 + - libstdcxx-devel_linux-64 14.3.0 h9f08a49_118 + - sysroot_linux-64 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 76302378 + timestamp: 1771378056505 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda + sha256: 27ad0cd10dccffca74e20fb38c9f8643ff8fce56eee260bf89fa257d5ab0c90a + md5: 1403ed5fe091bd7442e4e8a229d14030 + depends: + - gcc_impl_linux-64 14.3.0.* + - binutils_linux-64 + - sysroot_linux-64 + license: BSD-3-Clause + license_family: BSD + size: 28946 + timestamp: 1770908213807 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda + sha256: 1b490c9be9669f9c559db7b2a1f7d8b973c58ca0c6f21a5d2ba3f0ab2da63362 + md5: 19189121d644d4ef75fed05383bc75f5 + depends: + - gcc 14.3.0 h0dff253_18 + - gxx_impl_linux-64 14.3.0 h2185e75_18 + license: BSD-3-Clause + license_family: BSD + size: 28883 + timestamp: 1771378355605 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda + sha256: 38ffca57cc9c264d461ac2ce9464a9d605e0f606d92d831de9075cb0d95fc68a + md5: 6514b3a10e84b6a849e1b15d3753eb22 + depends: + - gcc_impl_linux-64 14.3.0 hbdf3cc3_18 + - libstdcxx-devel_linux-64 14.3.0 h9f08a49_118 + - sysroot_linux-64 + - tzdata + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 14566100 + timestamp: 1771378271421 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda + sha256: 
1e07c197e0779fa9105e59cd55a835ded96bfde59eb169439736a89b27b48e5d + md5: 7b51f4ff82eeb1f386bfee20a7bed3ed + depends: + - gxx_impl_linux-64 14.3.0.* + - gcc_linux-64 ==14.3.0 h298d278_21 + - binutils_linux-64 + - sysroot_linux-64 + license: BSD-3-Clause + license_family: BSD + size: 27503 + timestamp: 1770908213813 +- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda + sha256: 142a722072fa96cf16ff98eaaf641f54ab84744af81754c292cb81e0881c0329 + md5: 186a18e3ba246eccfc7cff00cd19a870 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + size: 12728445 + timestamp: 1767969922681 +- conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda + sha256: 7cd5eccdb171a0adbf83a1ad8fc4e17822f4fc3f5518da9040de64e88bc07343 + md5: 5b7ae2ec4e0750e094f804a6cf1b2a37 + depends: + - python >=3.10 + - ukkonen + license: MIT + license_family: MIT + size: 79520 + timestamp: 1772402363021 +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda + sha256: c18ab120a0613ada4391b15981d86ff777b5690ca461ea7e9e49531e8f374745 + md5: 63ccfdc3a3ce25b027b8767eb722fca8 + depends: + - python >=3.9 + - zipp >=3.20 + - python + license: Apache-2.0 + license_family: APACHE + size: 34641 + timestamp: 1747934053147 +- conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + sha256: e1a9e3b1c8fe62dc3932a616c284b5d8cbe3124bbfbedcf4ce5c828cb166ee19 + md5: 9614359868482abba1bd15ce465e3c42 + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 13387 + timestamp: 1760831448842 +- conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda + sha256: 41557eeadf641de6aeae49486cef30d02a6912d8da98585d687894afd65b356a + md5: 86d9cba083cd041bfbf242a01a7a1999 + constrains: + - sysroot_linux-64 ==2.28 + license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND 
GPL-2.0-or-later + license_family: GPL + size: 1278712 + timestamp: 1765578681495 +- conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda + sha256: 0960d06048a7185d3542d850986d807c6e37ca2e644342dd0c72feefcf26c2a4 + md5: b38117a3c920364aff79f870c984b4a3 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: LGPL-2.1-or-later + size: 134088 + timestamp: 1754905959823 +- conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda + sha256: 3e307628ca3527448dd1cb14ad7bb9d04d1d28c7d4c5f97ba196ae984571dd25 + md5: fb53fb07ce46a575c5d004bbc96032c2 + depends: + - __glibc >=2.17,<3.0.a0 + - keyutils >=1.6.3,<2.0a0 + - libedit >=3.1.20250104,<3.2.0a0 + - libedit >=3.1.20250104,<4.0a0 + - libgcc >=14 + - libstdcxx >=14 + - openssl >=3.5.5,<4.0a0 + license: MIT + license_family: MIT + size: 1386730 + timestamp: 1769769569681 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + sha256: 565941ac1f8b0d2f2e8f02827cbca648f4d18cd461afc31f15604cd291b5c5f3 + md5: 12bd9a3f089ee6c9266a37dab82afabd + depends: + - __glibc >=2.17,<3.0.a0 + - zstd >=1.5.7,<1.6.0a0 + constrains: + - binutils_impl_linux-64 2.45.1 + license: GPL-3.0-only + license_family: GPL + size: 725507 + timestamp: 1770267139900 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + build_number: 5 + sha256: 18c72545080b86739352482ba14ba2c4815e19e26a7417ca21a95b76ec8da24c + md5: c160954f7418d7b6e87eaf05a8913fa9 + depends: + - libopenblas >=0.3.30,<0.3.31.0a0 + - libopenblas >=0.3.30,<1.0a0 + constrains: + - mkl <2026 + - liblapack 3.11.0 5*_openblas + - libcblas 3.11.0 5*_openblas + - blas 2.305 openblas + - liblapacke 3.11.0 5*_openblas + license: BSD-3-Clause + license_family: BSD + size: 18213 + timestamp: 1765818813880 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda + sha256: 
9517cce5193144af0fcbf19b7bd67db0a329c2cc2618f28ffecaa921a1cbe9d3 + md5: 09c264d40c67b82b49a3f3b89037bd2e + depends: + - __glibc >=2.17,<3.0.a0 + - attr >=2.5.2,<2.6.0a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + size: 121429 + timestamp: 1762349484074 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + build_number: 5 + sha256: 0cbdcc67901e02dc17f1d19e1f9170610bd828100dc207de4d5b6b8ad1ae7ad8 + md5: 6636a2b6f1a87572df2970d3ebc87cc0 + depends: + - libblas 3.11.0 5_h4a7cf45_openblas + constrains: + - liblapacke 3.11.0 5*_openblas + - blas 2.305 openblas + - liblapack 3.11.0 5*_openblas + license: BSD-3-Clause + license_family: BSD + size: 18194 + timestamp: 1765818837135 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + sha256: 8c44b5bf947afad827df0df49fe7483cf1b2916694081b2db4fecdfd6a2bacd1 + md5: 48418c48dac04671fa46cb446122b8a5 + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + - rdma-core >=60.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 990938 + timestamp: 1768273732081 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + sha256: dc2b0c43aeacbaa686061353807e718236d8c5b346f624e76fed98b066898e19 + md5: 6d8ed8335d144ec7303b8d3587b2205c + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + - rdma-core >=61.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1085341 + timestamp: 1773100191342 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda + sha256: c84e8dccb65ad5149c0121e4b54bdc47fa39303fd5f4979b8c44bb51b39a369b + md5: 1707cdd636af2ff697b53186572c9f77 + depends: + - __glibc >=2.17,<3.0.a0 + - krb5 >=1.22.2,<1.23.0a0 + - libgcc >=14 + - libnghttp2 >=1.67.0,<2.0a0 + - libssh2 >=1.11.1,<2.0a0 + - libzlib >=1.3.1,<2.0a0 + - 
openssl >=3.5.5,<4.0a0 + - zstd >=1.5.7,<1.6.0a0 + license: curl + license_family: MIT + size: 463621 + timestamp: 1770892808818 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + sha256: d789471216e7aba3c184cd054ed61ce3f6dac6f87a50ec69291b9297f8c18724 + md5: c277e0a4d549b03ac1e9d6cbbe3d017b + depends: + - ncurses + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - ncurses >=6.5,<7.0a0 + license: BSD-2-Clause + license_family: BSD + size: 134676 + timestamp: 1738479519902 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda + sha256: 1cd6048169fa0395af74ed5d8f1716e22c19a81a8a36f934c110ca3ad4dd27b4 + md5: 172bf1cd1ff8629f2b1179945ed45055 + depends: + - libgcc-ng >=12 + license: BSD-2-Clause + license_family: BSD + size: 112766 + timestamp: 1702146165126 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda + sha256: d78f1d3bea8c031d2f032b760f36676d87929b18146351c4464c66b0869df3f5 + md5: e7f7ce06ec24cfcfb9e36d28cf82ba57 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - expat 2.7.4.* + license: MIT + license_family: MIT + size: 76798 + timestamp: 1771259418166 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 + md5: a360c33a5abe61c07959e449fa1453eb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + size: 58592 + timestamp: 1769456073053 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + sha256: faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 + md5: 0aa00f03f9e39fb9876085dee11a85d4 + depends: + - __glibc >=2.17,<3.0.a0 + - _openmp_mutex >=4.5 + constrains: + - libgcc-ng ==15.2.0=*_18 + - libgomp 15.2.0 he0feb66_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 1041788 + timestamp: 1771378212382 
+- conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda + sha256: 1abc6a81ee66e8ac9ac09a26e2d6ad7bba23f0a0cc3a6118654f036f9c0e1854 + md5: 06901733131833f5edd68cf3d9679798 + depends: + - __unix + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 3084533 + timestamp: 1771377786730 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda + sha256: e318a711400f536c81123e753d4c797a821021fb38970cebfb3f454126016893 + md5: d5e96b1ed75ca01906b3d2469b4ce493 + depends: + - libgcc 15.2.0 he0feb66_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 27526 + timestamp: 1771378224552 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + sha256: d2c9fad338fd85e4487424865da8e74006ab2e2475bd788f624d7a39b2a72aee + md5: 9063115da5bc35fdc3e1002e69b9ef6e + depends: + - libgfortran5 15.2.0 h68bc16d_18 + constrains: + - libgfortran-ng ==15.2.0=*_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 27523 + timestamp: 1771378269450 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + sha256: 539b57cf50ec85509a94ba9949b7e30717839e4d694bc94f30d41c9d34de2d12 + md5: 646855f357199a12f02a87382d429b75 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=15.2.0 + constrains: + - libgfortran 15.2.0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 2482475 + timestamp: 1771378241063 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 + md5: 239c5e9546c38a1e884d69effcf4c882 + depends: + - __glibc >=2.17,<3.0.a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 603262 + timestamp: 1771378117851 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda 
+ build_number: 5 + sha256: c723b6599fcd4c6c75dee728359ef418307280fa3e2ee376e14e85e5bbdda053 + md5: b38076eb5c8e40d0106beda6f95d7609 + depends: + - libblas 3.11.0 5_h4a7cf45_openblas + constrains: + - blas 2.305 openblas + - liblapacke 3.11.0 5*_openblas + - libcblas 3.11.0 5*_openblas + license: BSD-3-Clause + license_family: BSD + size: 18200 + timestamp: 1765818857876 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda + sha256: 755c55ebab181d678c12e49cced893598f2bab22d582fbbf4d8b83c18be207eb + md5: c7c83eecbb72d88b940c249af56c8b17 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - xz 5.8.2.* + license: 0BSD + size: 113207 + timestamp: 1768752626120 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 + md5: 2c21e66f50753a083cbe6b80f38268fa + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-2-Clause + license_family: BSD + size: 92400 + timestamp: 1769482286018 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda + sha256: a4a7dab8db4dc81c736e9a9b42bdfd97b087816e029e221380511960ac46c690 + md5: b499ce4b026493a13774bcf0f4c33849 + depends: + - __glibc >=2.17,<3.0.a0 + - c-ares >=1.34.5,<2.0a0 + - libev >=4.33,<4.34.0a0 + - libev >=4.33,<5.0a0 + - libgcc >=14 + - libstdcxx >=14 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.2,<4.0a0 + license: MIT + license_family: MIT + size: 666600 + timestamp: 1756834976695 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + sha256: ba7c5d294e3d80f08ac5a39564217702d1a752e352e486210faff794ac5001b4 + md5: db63358239cbe1ff86242406d440e44a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: LGPL-2.1-or-later + license_family: LGPL + size: 741323 + timestamp: 1731846827427 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda + sha256: 66b7bbe40d259e4927b9c264569afd49d0e31a3813c585beea63f3415577f1b3 + md5: 7e6534bce7252c84efdedae1fae2148e + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 471076 + timestamp: 1773100181931 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.1.115-hecca717_1.conda + sha256: 6b5300bf9952da4bfdbfb45c13b042d786a0daffb1bd2fa45ea9ad971703fe96 + md5: 851acc1af02d31c732b931b9ffddc2d9 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 31328660 + timestamp: 1771443943495 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda + sha256: 2ca45a2c9e6cc307cea3c8a1bf27bceb745fa5e1150d7b768b63a781eeaee7a2 + md5: 20a82402e6851e5d4e0b13ee1083d370 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 31691081 + timestamp: 1773100788615 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + sha256: 199d79c237afb0d4780ccd2fbf829cea80743df60df4705202558675e07dd2c5 + md5: be43915efc66345cccb3c310b6ed0374 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libgfortran + - libgfortran5 >=14.3.0 + constrains: + - openblas >=0.3.30,<0.3.31.0a0 + license: BSD-3-Clause + license_family: BSD + size: 5927939 + timestamp: 1763114673331 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda + sha256: e03ed186eefb46d7800224ad34bad1268c9d19ecb8f621380a50601c6221a4a7 + md5: ad3a0e2dc4cce549b2860e2ef0e6d75b + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14.3.0 + - libstdcxx >=14.3.0 + license: 
GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 7949259 + timestamp: 1771377982207 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + sha256: 04596fcee262a870e4b7c9807224680ff48d4d0cc0dac076a602503d3dc6d217 + md5: da5be73701eecd0e8454423fd6ffcf30 + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.2,<79.0a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + license: blessing + size: 942808 + timestamp: 1768147973361 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda + sha256: fa39bfd69228a13e553bd24601332b7cfeb30ca11a3ca50bb028108fe90a7661 + md5: eecce068c7e4eddeb169591baac20ac4 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.0,<4.0a0 + license: BSD-3-Clause + license_family: BSD + size: 304790 + timestamp: 1745608545575 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + sha256: 78668020064fdaa27e9ab65cd2997e2c837b564ab26ce3bf0e58a2ce1a525c6e + md5: 1b08cd684f34175e4514474793d44bcb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc 15.2.0 he0feb66_18 + constrains: + - libstdcxx-ng ==15.2.0=*_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 5852330 + timestamp: 1771378262446 +- conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda + sha256: b1c3824769b92a1486bf3e2cc5f13304d83ae613ea061b7bc47bb6080d6dfdba + md5: 865a399bce236119301ebd1532fced8d + depends: + - __unix + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 20171098 + timestamp: 1771377827750 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda + sha256: f0356bb344a684e7616fc84675cfca6401140320594e8686be30e8ac7547aed2 + md5: 1d4c18d75c51ed9d00092a891a547a7d + depends: + - __glibc >=2.17,<3.0.a0 + - libcap >=2.77,<2.78.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + size: 491953 
+ timestamp: 1770738638119 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda + sha256: ed4d2c01fbeb1330f112f7e399408634db277d3dfb2dec1d0395f56feaa24351 + md5: 6c74fba677b61a0842cbf0f63eee683b + depends: + - __glibc >=2.17,<3.0.a0 + - libcap >=2.77,<2.78.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + size: 144654 + timestamp: 1770738650966 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + sha256: 1a7539cfa7df00714e8943e18de0b06cceef6778e420a5ee3a2a145773758aee + md5: db409b7c1720428638e7c0d509d3e1b5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + size: 40311 + timestamp: 1766271528534 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda + sha256: c180f4124a889ac343fc59d15558e93667d894a966ec6fdb61da1604481be26b + md5: 0f03292cc56bf91a077a134ea8747118 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + size: 895108 + timestamp: 1753948278280 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 + md5: edb0dca6bc32e4f4789199455a1dbeb8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + constrains: + - zlib 1.3.1 *_2 + license: Zlib + license_family: Other + size: 60963 + timestamp: 1727963148474 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + sha256: 3fde293232fa3fca98635e1167de6b7c7fda83caf24b9d6c91ec9eefb4f4d586 + md5: 47e340acb35de30501a76c7c799c41d7 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: X11 AND BSD-3-Clause + size: 891641 + timestamp: 1738195959188 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda + sha256: 6f7d59dbec0a7b00bf5d103a4306e8886678b796ff2151b62452d4582b2a53fb + md5: b518e9e92493721281a60fa975bddc65 + depends: + - libstdcxx 
>=14 + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + license: Apache-2.0 + license_family: APACHE + size: 186323 + timestamp: 1763688260928 +- conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + sha256: 4fa40e3e13fc6ea0a93f67dfc76c96190afd7ea4ffc1bac2612d954b42cdc3ee + md5: eb52d14a901e23c39e9e7b4a1a5c015f + depends: + - python >=3.10 + - setuptools + license: BSD-3-Clause + license_family: BSD + size: 40866 + timestamp: 1766261270149 +- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda + sha256: 1d8377c8001c15ed12c2713b723213474b435706ab9d34ede69795d64af9e94d + md5: 4ea6b620fdf24a1a0bc4f1c7134dfafb + depends: + - python + - libstdcxx >=14 + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - libcblas >=3.9.0,<4.0a0 + - python_abi 3.14.* *_cp314 + - libblas >=3.9.0,<4.0a0 + - liblapack >=3.9.0,<4.0a0 + constrains: + - numpy-base <0a0 + license: BSD-3-Clause + license_family: BSD + size: 8926994 + timestamp: 1770098474394 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + sha256: 44c877f8af015332a5d12f5ff0fb20ca32f896526a7d0cdb30c769df1144fb5c + md5: f61eb8cd60ff9057122a3d338b99c00f + depends: + - __glibc >=2.17,<3.0.a0 + - ca-certificates + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + size: 3164551 + timestamp: 1769555830639 +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda + sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 + md5: b76541e68fea4d511b1ac46a28dcd2c6 + depends: + - python >=3.8 + - python + license: Apache-2.0 + license_family: APACHE + size: 72010 + timestamp: 1769093650580 +- conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda + sha256: 0289f0a38337ee201d984f8f31f11f6ef076cfbbfd0ab9181d12d9d1d099bf46 + md5: 82c1787f2a65c0155ef9652466ee98d6 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 
25646 + timestamp: 1773199142345 +- conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + sha256: e14aafa63efa0528ca99ba568eaf506eb55a0371d12e6250aaaa61718d2eb62e + md5: d7585b6550ad04c8c5e21097ada2888e + depends: + - python >=3.9 + - python + license: MIT + license_family: MIT + size: 25877 + timestamp: 1764896838868 +- conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda + sha256: 5b81b7516d4baf43d0c185896b245fa7384b25dc5615e7baa504b7fa4e07b706 + md5: 7f3ac694319c7eaf81a0325d6405e974 + depends: + - cfgv >=2.0.0 + - identify >=1.0.0 + - nodeenv >=0.11.1 + - python >=3.10 + - pyyaml >=5.1 + - virtualenv >=20.10.0 + license: MIT + license_family: MIT + size: 200827 + timestamp: 1765937577534 +- conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda + sha256: 6d8f03c13d085a569fde931892cded813474acbef2e03381a1a87f420c7da035 + md5: 46830ee16925d5ed250850503b5dc3a8 + depends: + - python >=3.9 + license: MIT + license_family: MIT + size: 25766 + timestamp: 1733236452235 +- conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + sha256: 79db7928d13fab2d892592223d7570f5061c192f27b9febd1a418427b719acc6 + md5: 12c566707c80111f9799308d9e265aef + depends: + - python >=3.9 + - python + license: BSD-3-Clause + license_family: BSD + size: 110100 + timestamp: 1733195786147 +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a + md5: 6b6ece66ebcae2d5f326c77ef2c5a066 + depends: + - python >=3.9 + license: BSD-2-Clause + license_family: BSD + size: 889287 + timestamp: 1750615908735 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + sha256: 438c41b42530874928733299ca815f5994f36996c86024f3f37ca220ed910a07 + md5: ed166875b3876d5d7e6e39d2e8d1c6e3 + depends: + - python >=3.14,<3.15.0a0 + - python_abi 
3.14.* *_cp314 + - six + license: MIT + license_family: MIT + size: 273897 + timestamp: 1765980972868 +- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda + sha256: 9e749fb465a8bedf0184d8b8996992a38de351f7c64e967031944978de03a520 + md5: 2b694bad8a50dc2f712f5368de866480 + depends: + - pygments >=2.7.2 + - python >=3.10 + - iniconfig >=1.0.1 + - packaging >=22 + - pluggy >=1.5,<2 + - tomli >=1 + - colorama >=0.4 + - exceptiongroup >=1 + - python + constrains: + - pytest-faulthandler >=2 + license: MIT + license_family: MIT + size: 299581 + timestamp: 1765062031645 +- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda + sha256: 2f2229415a6e5387c1faaedf442ea8c07471cb2bf5ad1007b9cfb83ea85ca29a + md5: 0e7294ed4af8b833fcd2c101d647c3da + depends: + - py-cpuinfo + - pytest >=8.1 + - python >=3.10 + license: BSD-2-Clause + license_family: BSD + size: 43976 + timestamp: 1762716480208 +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda + build_number: 101 + sha256: cb0628c5f1732f889f53a877484da98f5a0e0f47326622671396fb4f2b0cd6bd + md5: c014ad06e60441661737121d3eae8a60 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 + - libexpat >=2.7.3,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - liblzma >=5.8.2,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.51.2,<4.0a0 + - libuuid >=2.41.3,<3.0a0 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.5,<4.0a0 + - python_abi 3.14.* *_cp314 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - zstd >=1.5.7,<1.6.0a0 + license: Python-2.0 + size: 36702440 + timestamp: 1770675584356 + python_site_packages_path: lib/python3.14/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda + sha256: 36429765f626c345710fbae14aeeda676c1745427667eb480bb855b7089affba + md5: 
69fc0a99fc21b26b81026c72e00f83df + depends: + - python >=3.10 + - filelock >=3.15.4 + - platformdirs <5,>=4.3.6 + - python + license: MIT + license_family: MIT + size: 33996 + timestamp: 1773161039118 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + build_number: 8 + sha256: ad6d2e9ac39751cc0529dd1566a26751a0bf2542adb0c232533d32e176e21db5 + md5: 0539938c55b6b1a59b560e843ad864a4 + constrains: + - python 3.14.* *_cp314 + license: BSD-3-Clause + license_family: BSD + size: 6989 + timestamp: 1752805904792 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + sha256: b318fb070c7a1f89980ef124b80a0b5ccf3928143708a85e0053cde0169c699d + md5: 2035f68f96be30dc60a5dfd7452c7941 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + - yaml >=0.2.5,<0.3.0a0 + license: MIT + license_family: MIT + size: 202391 + timestamp: 1770223462836 +- conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + sha256: 8e0b7962cf8bec9a016cd91a6c6dc1f9ebc8e7e316b1d572f7b9047d0de54717 + md5: d487d93d170e332ab39803e05912a762 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libnl >=3.11.0,<4.0a0 + - libstdcxx >=14 + - libsystemd0 >=257.10 + - libudev1 >=257.10 + license: Linux-OpenIB + license_family: BSD + size: 1268666 + timestamp: 1769154883613 +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 + md5: d7d95fc8287ea7bf33e0e7116d2b95ec + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + size: 345073 + timestamp: 1765813471974 +- conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda + sha256: d5c73079c1dd2c2a313c3bfd81c73dbd066b7eb08d213778c8bff520091ae894 + md5: c1c9b02933fdb2cfb791d936c20e887e + depends: + - 
__glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: MIT + license_family: MIT + size: 193775 + timestamp: 1748644872902 +- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + sha256: 82088a6e4daa33329a30bc26dc19a98c7c1d3f05c0f73ce9845d4eab4924e9e1 + md5: 8e194e7b992f99a5015edbd4ebd38efd + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 639697 + timestamp: 1773074868565 +- conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + sha256: 458227f759d5e3fcec5d9b7acce54e10c9e1f4f4b7ec978f3bfd54ce4ee9853d + md5: 3339e3b65d58accf4ca4fb8748ab16b3 + depends: + - python >=3.9 + - python + license: MIT + license_family: MIT + size: 18455 + timestamp: 1753199211006 +- conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda + sha256: c47299fe37aebb0fcf674b3be588e67e4afb86225be4b0d452c7eb75c086b851 + md5: 13dc3adbc692664cd3beabd216434749 + depends: + - __glibc >=2.28 + - kernel-headers_linux-64 4.18.0 he073ed8_9 + - tzdata + license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND GPL-2.0-or-later + license_family: GPL + size: 24008591 + timestamp: 1765578833462 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac + md5: cffd3bdd58090148f4cfcd831f4b26ab + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + constrains: + - xorg-libx11 >=1.8.12,<2.0a0 + license: TCL + license_family: BSD + size: 3301196 + timestamp: 1769460227866 +- conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda + sha256: 62940c563de45790ba0f076b9f2085a842a65662268b02dd136a8e9b1eaf47a8 + md5: 72e780e9aa2d0a3295f59b1874e3768b + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 21453 + timestamp: 1768146676791 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 + md5: 0caa1af407ecff61170c9437a808404d + depends: + - python >=3.10 + - python + license: PSF-2.0 + license_family: PSF + size: 51692 + timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c + md5: ad659d0a2b3e47e38d829aa8cad2d610 + license: LicenseRef-Public-Domain + size: 119135 + timestamp: 1767016325805 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + sha256: c84034056dc938c853e4f61e72e5bd37e2ec91927a661fb9762f678cbea52d43 + md5: 5d3c008e54c7f49592fca9c32896a76f + depends: + - __glibc >=2.17,<3.0.a0 + - cffi + - libgcc >=14 + - libstdcxx >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + size: 15004 + timestamp: 1769438727085 +- conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda + sha256: b83246d145ba0e6814d2ed0b616293e56924e6c7d6649101f5a4f97f9e757ed1 + md5: 704c22301912f7e37d0a92b2e7d5942d + depends: + - python >=3.10 + - distlib >=0.3.7,<1 + - filelock <4,>=3.24.2 + - importlib-metadata >=6.6 + - platformdirs >=3.9.1,<5 + - python-discovery >=1 + - typing_extensions >=4.13.2 + - python + license: MIT + license_family: MIT + size: 4647775 + timestamp: 1773133660203 +- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + sha256: 6d9ea2f731e284e9316d95fa61869fe7bbba33df7929f82693c121022810f4ad + md5: a77f85f77be52ff59391544bfe73390a + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + license: MIT + license_family: MIT + size: 85189 + timestamp: 1753484064210 +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + sha256: 
b4533f7d9efc976511a73ef7d4a2473406d7f4c750884be8e8620b0ce70f4dae + md5: 30cd29cb87d819caead4d55184c1d115 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 24194 + timestamp: 1764460141901 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 + md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 + depends: + - __glibc >=2.17,<3.0.a0 + - libzlib >=1.3.1,<2.0a0 + license: BSD-3-Clause + license_family: BSD + size: 601375 + timestamp: 1764777111296 diff --git a/benchmarks/cuda_bindings/pixi.toml b/benchmarks/cuda_bindings/pixi.toml new file mode 100644 index 0000000000..a448e8d3e4 --- /dev/null +++ b/benchmarks/cuda_bindings/pixi.toml @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +[workspace] +channels = ["conda-forge"] +platforms = ["linux-64"] +preview = ["pixi-build"] +channel-priority = "disabled" + +[feature.cu13.system-requirements] +cuda = "13" + +[feature.cu13-pinned.dependencies] +cuda-version = "13.1.*" + +[feature.cu13-source.dependencies] +cuda-version = "13.*" + +[feature.bench.dependencies] +python = "3.14.*" +pyperf = "*" +pytest = "*" +pytest-benchmark = "*" +numpy = "*" + +[feature.cpp-bench.dependencies] +cmake = "*" +ninja = "*" +cxx-compiler = "*" +cuda-cudart-dev = "*" +cuda-nvrtc-dev = "*" + +[feature.cpp-bench.target.linux-64.dependencies] +cuda-crt-dev_linux-64 = "*" +cuda-driver-dev_linux-64 = "*" + +[feature.cpp-bench.target.linux-64.activation.env] +CUDA_HOME = "$CONDA_PREFIX/targets/x86_64-linux" + +[feature.dev.dependencies] +pre-commit = "*" + +[feature.bindings-wheel.dependencies] +cuda-bindings = "==13.1.0" + +[feature.bindings-source.dependencies] +cuda-bindings = { path = ".." 
} + +[environments] +wheel = { features = ["cu13", "cu13-pinned", "bench", "cpp-bench", "dev", "bindings-wheel"] } +source = { features = ["cu13", "cu13-source", "bench", "cpp-bench", "dev", "bindings-source"] } + +[target.linux.tasks.bench] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py"] + +[target.linux.tasks.bench-smoke-test] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py", "--fast", "--min-time", "1" +] + +[target.linux.tasks.bench-legacy] +cmd = "pytest --benchmark-only --override-ini 'addopts=' $PIXI_PROJECT_ROOT/pytest-legacy/" + +[target.linux.tasks.bench-cpp-configure] +cmd = [ + "cmake", + "-S", + "$PIXI_PROJECT_ROOT/benchmarks/cpp", + "-B", + "$PIXI_PROJECT_ROOT/.build/cpp", + "-G", + "Ninja", +] + +[target.linux.tasks.bench-cpp-build] +cmd = ["cmake", "--build", "$PIXI_PROJECT_ROOT/.build/cpp"] +depends-on = [{ task = "bench-cpp-configure" }] + +[target.linux.tasks.bench-cpp] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_cpp.py"] +depends-on = [{ task = "bench-cpp-build" }] + +[target.linux.tasks.bench-compare] +cmd = ["python", "$PIXI_PROJECT_ROOT/compare.py"] + +[target.linux.tasks.lint] +cmd = ["pre-commit", "run", "--all-files"] diff --git a/benchmarks/cuda_bindings/pytest-legacy/conftest.py b/benchmarks/cuda_bindings/pytest-legacy/conftest.py new file mode 100644 index 0000000000..0ea7b1d772 --- /dev/null +++ b/benchmarks/cuda_bindings/pytest-legacy/conftest.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import numpy as np +import pytest + +from cuda.bindings import driver as cuda +from cuda.bindings import nvrtc +from cuda.bindings import runtime as cudart + + +def ASSERT_DRV(err): + if isinstance(err, cuda.CUresult): + if err != cuda.CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Cuda Error: {err}") + elif isinstance(err, cudart.cudaError_t): + if err != cudart.cudaError_t.cudaSuccess: + raise RuntimeError(f"Cudart Error: {err}") + elif isinstance(err, nvrtc.nvrtcResult): + if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: + raise RuntimeError(f"Nvrtc Error: {err}") + else: + raise RuntimeError(f"Unknown error type: {err}") + + +@pytest.fixture +def init_cuda(): + # Initialize + (err,) = cuda.cuInit(0) + ASSERT_DRV(err) + err, device = cuda.cuDeviceGet(0) + ASSERT_DRV(err) + err, ctx = cuda.cuCtxCreate(None, 0, device) + ASSERT_DRV(err) + + # create stream + err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + ASSERT_DRV(err) + + yield device, ctx, stream + + (err,) = cuda.cuStreamDestroy(stream) + ASSERT_DRV(err) + (err,) = cuda.cuCtxDestroy(ctx) + ASSERT_DRV(err) + + +@pytest.fixture +def load_module(): + module = None + + def _load_module(kernel_string, device): + nonlocal module + # Get module + err, major = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device + ) + ASSERT_DRV(err) + err, minor = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device + ) + ASSERT_DRV(err) + + err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"kernelString.cu", 0, [], []) + ASSERT_DRV(err) + opts = [b"--fmad=false", bytes("--gpu-architecture=sm_" + str(major) + str(minor), "ascii")] + (err,) = nvrtc.nvrtcCompileProgram(prog, 2, opts) + + err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog) + ASSERT_DRV(err_log) + log = b" " * logSize + (err_log,) = 
nvrtc.nvrtcGetProgramLog(prog, log) + ASSERT_DRV(err_log) + result = log.decode() + if len(result) > 1: + print(result) + + ASSERT_DRV(err) + err, cubinSize = nvrtc.nvrtcGetCUBINSize(prog) + ASSERT_DRV(err) + cubin = b" " * cubinSize + (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin) + ASSERT_DRV(err) + cubin = np.char.array(cubin) + err, module = cuda.cuModuleLoadData(cubin) + ASSERT_DRV(err) + + return module + + yield _load_module + + (err,) = cuda.cuModuleUnload(module) + ASSERT_DRV(err) diff --git a/benchmarks/cuda_bindings/pytest-legacy/kernels.py b/benchmarks/cuda_bindings/pytest-legacy/kernels.py new file mode 100644 index 0000000000..36646fba00 --- /dev/null +++ b/benchmarks/cuda_bindings/pytest-legacy/kernels.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +kernel_string = """\ +#define ITEM_PARAM(x, T) T x +#define REP1(x, T) , ITEM_PARAM(x, T) +#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) +#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) +#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) +#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) +#define REP32(x, T) REP16(x##0, T) REP16(x##1, T) +#define REP64(x, T) REP32(x##0, T) REP32(x##1, T) +#define REP128(x, T) REP64(x##0, T) REP64(x##1, T) +#define REP256(x, T) REP128(x##0, T) REP128(x##1, T) + +template +struct KernelFunctionParam +{ + unsigned char p[maxBytes]; +}; + +extern "C" __global__ void small_kernel(float *f) +{ + *f = 0.0f; +} + +extern "C" __global__ void empty_kernel() +{ + return; +} + +extern "C" __global__ +void small_kernel_512_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*) + REP16(A, int*) + REP32(A, int*) + REP64(A, int*) + REP128(A, int*) + REP256(A, int*)) +{ + *F = 0; +} + +extern "C" __global__ +void small_kernel_512_bools( + ITEM_PARAM(F, bool) + REP1(A, bool) + REP2(A, bool) + REP4(A, bool) + REP8(A, bool) 
+ REP16(A, bool) + REP32(A, bool) + REP64(A, bool) + REP128(A, bool) + REP256(A, bool)) +{ + return; +} + +extern "C" __global__ +void small_kernel_512_ints( + ITEM_PARAM(F, int) + REP1(A, int) + REP2(A, int) + REP4(A, int) + REP8(A, int) + REP16(A, int) + REP32(A, int) + REP64(A, int) + REP128(A, int) + REP256(A, int)) +{ + return; +} + +extern "C" __global__ +void small_kernel_512_doubles( + ITEM_PARAM(F, double) + REP1(A, double) + REP2(A, double) + REP4(A, double) + REP8(A, double) + REP16(A, double) + REP32(A, double) + REP64(A, double) + REP128(A, double) + REP256(A, double)) +{ + return; +} + +extern "C" __global__ +void small_kernel_512_chars( + ITEM_PARAM(F, char) + REP1(A, char) + REP2(A, char) + REP4(A, char) + REP8(A, char) + REP16(A, char) + REP32(A, char) + REP64(A, char) + REP128(A, char) + REP256(A, char)) +{ + return; +} + +extern "C" __global__ +void small_kernel_512_longlongs( + ITEM_PARAM(F, long long) + REP1(A, long long) + REP2(A, long long) + REP4(A, long long) + REP8(A, long long) + REP16(A, long long) + REP32(A, long long) + REP64(A, long long) + REP128(A, long long) + REP256(A, long long)) +{ + return; +} + +extern "C" __global__ +void small_kernel_256_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*) + REP16(A, int*) + REP32(A, int*) + REP64(A, int*) + REP128(A, int*)) +{ + *F = 0; +} + +extern "C" __global__ +void small_kernel_16_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*)) +{ + *F = 0; +} + +extern "C" __global__ void small_kernel_2048B(KernelFunctionParam<2048> param) +{ + // Do not touch param to prevent compiler from copying + // the whole structure from const bank to lmem. 
+} +""" diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py new file mode 100644 index 0000000000..76dd6e6a45 --- /dev/null +++ b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py @@ -0,0 +1,199 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import ctypes + +import pytest + +try: + import cupy + + skip_tests = False +except ImportError: + skip_tests = True + +from kernels import kernel_string + + +def launch(kernel, args=()): + kernel((1,), (1,), args) + + +# Measure launch latency with no parmaeters +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_empty_kernel(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("empty_kernel") + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel) + stream.synchronize() + + +# Measure launch latency with a single parameter +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel") + cupy.cuda.set_allocator() + arg = cupy.cuda.alloc(ctypes.sizeof(ctypes.c_float)) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, (arg,)) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_args(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_args") + cupy.cuda.set_allocator() + + args = [] + for _ in range(512): 
+ args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_bools(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_bools") + cupy.cuda.set_allocator() + + args = [True] * 512 + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_doubles(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_doubles") + cupy.cuda.set_allocator() + + args = [1.2345] * 512 + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_ints(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_ints") + cupy.cuda.set_allocator() + + args = [123] * 512 + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy 
is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_bytes(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_chars") + cupy.cuda.set_allocator() + + args = [127] * 512 + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_512_longlongs(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_512_longlongs") + cupy.cuda.set_allocator() + + args = [9223372036854775806] * 512 + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_256_args(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_256_args") + cupy.cuda.set_allocator() + + args = [] + for _ in range(256): + args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +@pytest.mark.benchmark(group="cupy") +def test_launch_latency_small_kernel_16_args(benchmark): + module = cupy.RawModule(code=kernel_string) + kernel = module.get_function("small_kernel_16_args") + cupy.cuda.set_allocator() + + 
args = [] + for _ in range(16): + args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) + args = tuple(args) + + stream = cupy.cuda.stream.Stream(non_blocking=True) + + with stream: + benchmark(launch, kernel, args) + stream.synchronize() diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py new file mode 100755 index 0000000000..dd994081a0 --- /dev/null +++ b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py @@ -0,0 +1,336 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import ctypes + +import pytest +from kernels import kernel_string + +from conftest import ASSERT_DRV +from cuda.bindings import driver as cuda + + +def launch(kernel, stream, args=(), arg_types=()): + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + (args, arg_types), + 0, + ) # arguments + + +def launch_packed(kernel, stream, params): + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + params, + 0, + ) # arguments + + +# Measure launch latency with no parmaeters +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") + ASSERT_DRV(err) + + benchmark(launch, func, stream) + + cuda.cuCtxSynchronize() + + +# Measure launch latency with a single parameter +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel") + 
ASSERT_DRV(err) + + err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) + ASSERT_DRV(err) + + benchmark(launch, func, stream, args=(f,), arg_types=(None,)) + + cuda.cuCtxSynchronize() + + (err,) = cuda.cuMemFree(f) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 512 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") + ASSERT_DRV(err) + + args = [True] * 512 + arg_types = [ctypes.c_bool] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") + ASSERT_DRV(err) + + args = [1.2345] * 512 + arg_types = [ctypes.c_double] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, 
arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") + ASSERT_DRV(err) + + args = [123] * 512 + arg_types = [ctypes.c_int] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") + ASSERT_DRV(err) + + args = [127] * 512 + arg_types = [ctypes.c_byte] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") + ASSERT_DRV(err) + + args = [9223372036854775806] * 512 + arg_types = [ctypes.c_longlong] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args") 
+ ASSERT_DRV(err) + + args = [] + arg_types = [None] * 256 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 16 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters, excluding parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + vals = [] + val_ps = [] + for i in range(512): + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + vals.append(p) + val_ps.append(ctypes.c_void_p(int(vals[i]))) + + packagedParams = (ctypes.c_void_p * 512)() + for i in range(512): + packagedParams[i] = ctypes.addressof(val_ps[i]) + + benchmark(launch_packed, func, stream, packagedParams) + + cuda.cuCtxSynchronize() + + for p in vals: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + 
+def pack_and_launch(kernel, stream, params): + packed_params = (ctypes.c_void_p * len(params))() + ptrs = [0] * len(params) + for i in range(len(params)): + ptrs[i] = ctypes.c_void_p(int(params[i])) + packed_params[i] = ctypes.addressof(ptrs[i]) + + cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) + + +# Measure launch latency plus parameter packing using ctypes +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + vals = [] + for i in range(512): + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + vals.append(p) + + benchmark(pack_and_launch, func, stream, vals) + + cuda.cuCtxSynchronize() + + for p in vals: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with a single large struct parameter +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") + ASSERT_DRV(err) + + class struct_2048B(ctypes.Structure): + _fields_ = [("values", ctypes.c_uint8 * 2048)] + + benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) + + cuda.cuCtxSynchronize() diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_numba.py b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py new file mode 100644 index 0000000000..dfe084c6b1 --- /dev/null +++ b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import numpy as np
import pytest

try:
    from numba import cuda

    skip_tests = False
except ImportError:
    # Numba is optional; the benchmarks below are skipped when it is absent.
    skip_tests = True


def launch_empty(kernel, stream):
    """Launch a zero-argument kernel on a 1x1 grid/block on `stream`."""
    kernel[1, 1, stream]()


def launch(kernel, stream, arg):
    """Launch a one-argument kernel on a 1x1 grid/block on `stream`."""
    kernel[1, 1, stream](arg)


# Measure launch latency with no parameters
@pytest.mark.skipif(skip_tests, reason="Numba is not installed")
@pytest.mark.benchmark(group="numba", min_rounds=1000)
def test_launch_latency_empty_kernel(benchmark):
    stream = cuda.stream()

    @cuda.jit
    def empty_kernel():
        return

    benchmark(launch_empty, empty_kernel, stream)

    cuda.synchronize()


# Measure launch latency with a single parameter
@pytest.mark.skipif(skip_tests, reason="Numba is not installed")
@pytest.mark.benchmark(group="numba", min_rounds=1000)
def test_launch_latency_small_kernel(benchmark):
    stream = cuda.stream()

    arg = cuda.device_array(1, dtype=np.float32, stream=stream)

    @cuda.jit
    def small_kernel(array):
        array[0] = 0.0

    benchmark(launch, small_kernel, stream, arg)

    cuda.synchronize()
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import random

import pytest

from conftest import ASSERT_DRV
from cuda.bindings import driver as cuda

# Deterministic shuffle order across runs.
random.seed(0)

# Round-robin cursor shared by the query helpers below.
idx = 0


def query_attribute(attribute, ptrs):
    """Query one attribute of the next pointer, cycling through `ptrs` round-robin."""
    global idx
    target = ptrs[idx]
    idx = (idx + 1) % len(ptrs)

    cuda.cuPointerGetAttribute(attribute, target)


def query_attributes(attributes, ptrs):
    """Query a batch of attributes of the next pointer, cycling round-robin."""
    global idx
    target = ptrs[idx]
    idx = (idx + 1) % len(ptrs)

    cuda.cuPointerGetAttributes(len(attributes), attributes, target)


def _allocate_shuffled_ptrs():
    """Allocate 500 device buffers of 1<<18 bytes and return them shuffled."""
    allocations = []
    for _ in range(500):
        err, ptr = cuda.cuMemAlloc(1 << 18)
        ASSERT_DRV(err)
        allocations.append(ptr)
    random.shuffle(allocations)
    return allocations


def _release_ptrs(ptrs):
    """Free every device allocation in `ptrs`."""
    for ptr in ptrs:
        (err,) = cuda.cuMemFree(ptr)
        ASSERT_DRV(err)


# Measure cuPointerGetAttribute in the same way as C benchmarks
@pytest.mark.benchmark(group="pointer-attributes")
def test_pointer_get_attribute(benchmark, init_cuda):
    _ = init_cuda

    ptrs = _allocate_shuffled_ptrs()

    benchmark(query_attribute, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptrs)

    _release_ptrs(ptrs)


# Measure cuPointerGetAttributes with all attributes
@pytest.mark.benchmark(group="pointer-attributes")
def test_pointer_get_attributes_all(benchmark, init_cuda):
    _ = init_cuda

    ptrs = _allocate_shuffled_ptrs()

    attributes = [
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS,
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE,
    ]

    benchmark(query_attributes, attributes, ptrs)

    _release_ptrs(ptrs)


# Measure cuPointerGetAttributes with a single attribute
@pytest.mark.benchmark(group="pointer-attributes")
def test_pointer_get_attributes_single(benchmark, init_cuda):
    _ = init_cuda

    ptrs = _allocate_shuffled_ptrs()

    attributes = [
        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
    ]

    benchmark(query_attributes, attributes, ptrs)

    _release_ptrs(ptrs)
# SPDX-License-Identifier: Apache-2.0

import argparse
import json
import subprocess
import sys
import tempfile
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent
BUILD_DIR = PROJECT_ROOT / ".build" / "cpp"
DEFAULT_OUTPUT = PROJECT_ROOT / "results-cpp.json"

# C++ benchmark executables are named bench_<name>_cpp.
BINARY_PREFIX = "bench_"
BINARY_SUFFIX = "_cpp"


def discover_binaries() -> dict[str, Path]:
    """Discover C++ benchmark binaries in the build directory"""
    if not BUILD_DIR.is_dir():
        return {}

    found: dict[str, Path] = {}
    for candidate in sorted(BUILD_DIR.iterdir()):
        fname = candidate.name
        if not (candidate.is_file() and fname.startswith(BINARY_PREFIX) and fname.endswith(BINARY_SUFFIX)):
            continue
        key = fname.removeprefix(BINARY_PREFIX).removesuffix(BINARY_SUFFIX)
        found[key] = candidate
    return found


def strip_output_args(argv: list[str]) -> list[str]:
    """Return `argv` with every -o/--output option (and its value) removed."""
    kept: list[str] = []
    tokens = iter(argv)
    for token in tokens:
        if token in ("-o", "--output"):
            next(tokens, None)  # drop the option's value as well
            continue
        if token.startswith(("-o=", "--output=")):
            continue
        kept.append(token)
    return kept


def merge_pyperf_json(individual_files: list[Path], output_path: Path) -> int:
    """Merge individual pyperf JSON files into a single BenchmarkSuite file.

    Each C++ binary produces a file shaped like
    {"version": "1.0", "metadata": {...}, "benchmarks": [{...}]}; the file-level
    name/loops/unit metadata is pushed down onto each run (without clobbering
    anything the run already carries) and all benchmark entries are concatenated.
    Returns the number of merged benchmark entries.
    """
    collected = []

    for json_path in individual_files:
        payload = json.loads(Path(json_path).read_text())

        file_meta = payload.get("metadata", {})
        bench_name = file_meta.get("name", "")
        loops = file_meta.get("loops")
        unit = file_meta.get("unit", "second")

        for entry in payload.get("benchmarks", []):
            for run in entry.get("runs", []):
                run_meta = run.setdefault("metadata", {})
                if bench_name:
                    run_meta.setdefault("name", bench_name)
                if loops is not None:
                    run_meta.setdefault("loops", loops)
                run_meta.setdefault("unit", unit)

            collected.append(entry)

    with open(output_path, "w") as out:
        json.dump({"version": "1.0", "benchmarks": collected}, out)

    return len(collected)


def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
    """Parse runner-specific options; unknown flags are passed through to the binaries."""
    parser = argparse.ArgumentParser(
        description="Run C++ CUDA benchmarks",
        add_help=False,
    )
    parser.add_argument(
        "--benchmark",
        action="append",
        default=[],
        help="Benchmark name to run (e.g. 'ctx_device'). Repeat for multiple. Defaults to all.",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="Print discovered benchmark names and exit.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=DEFAULT_OUTPUT,
        help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})",
    )
    parsed, remaining = parser.parse_known_args(argv)
    return parsed, remaining


def main() -> None:
    """Entry point: run the selected C++ benchmark binaries and merge their results."""
    parsed, remaining_argv = parse_args(sys.argv[1:])

    registry = discover_binaries()
    if not registry:
        print(
            f"No C++ benchmark binaries found in {BUILD_DIR}.\nRun 'pixi run bench-cpp-build' first.",
            file=sys.stderr,
        )
        sys.exit(1)

    if parsed.list:
        for name in sorted(registry):
            print(name)
        return

    if parsed.benchmark:
        missing = sorted(set(parsed.benchmark) - set(registry))
        if missing:
            known = ", ".join(sorted(registry))
            unknown = ", ".join(missing)
            print(
                f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}",
                file=sys.stderr,
            )
            sys.exit(1)
        names = parsed.benchmark
    else:
        names = sorted(registry)

    # Strip any --output args to avoid conflicts with our output handling
    passthrough_argv = strip_output_args(remaining_argv)

    output_path = parsed.output.resolve()
    failed = False
    individual_files: list[Path] = []

    with tempfile.TemporaryDirectory(prefix="cuda_bench_cpp_") as tmpdir:
        scratch = Path(tmpdir)

        for name in names:
            tmp_json = scratch / f"{name}.json"
            cmd = [str(registry[name]), "-o", str(tmp_json), *passthrough_argv]
            proc = subprocess.run(cmd, check=False)  # noqa: S603
            if proc.returncode != 0:
                print(f"FAILED: {name} (exit code {proc.returncode})", file=sys.stderr)
                failed = True
            elif tmp_json.exists():
                individual_files.append(tmp_json)

        if individual_files:
            count = merge_pyperf_json(individual_files, output_path)
            print(f"\nResults saved to {output_path} ({count} benchmark(s))")

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()
# SPDX-License-Identifier: Apache-2.0
# --- benchmarks/cuda_bindings/runner/main.py ---

import argparse
import ast
import importlib.util
import os
import sys
from collections.abc import Callable
from pathlib import Path
from types import ModuleType

import pyperf

PROJECT_ROOT = Path(__file__).resolve().parent.parent
BENCH_DIR = PROJECT_ROOT / "benchmarks"
DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json"
# Environment variables pyperf worker processes must inherit for CUDA to work.
PYPERF_INHERITED_ENV_VARS = (
    "CUDA_HOME",
    "CUDA_PATH",
    "CUDA_VISIBLE_DEVICES",
    "LD_LIBRARY_PATH",
    "NVIDIA_VISIBLE_DEVICES",
)
_MODULE_CACHE: dict[Path, ModuleType] = {}


def load_module(module_path: Path) -> ModuleType:
    """Import a benchmark module from its file path, caching per resolved path."""
    module_path = module_path.resolve()
    cached = _MODULE_CACHE.get(module_path)
    if cached is not None:
        return cached

    module_name = f"cuda_bindings_bench_{module_path.stem}"
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to load benchmark module: {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    _MODULE_CACHE[module_path] = module
    return module


def benchmark_id(module_name: str, function_name: str) -> str:
    """Build the public ID '<module>.<function>' with 'bench_' prefixes stripped."""
    module_suffix = module_name.removeprefix("bench_")
    suffix = function_name.removeprefix("bench_")
    return f"{module_suffix}.{suffix}"


def _discover_module_functions(module_path: Path) -> list[str]:
    """List top-level bench_* function names via AST, without importing the module."""
    tree = ast.parse(module_path.read_text(encoding="utf-8"), filename=str(module_path))
    return [
        node.name
        for node in tree.body
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("bench_")
    ]


def _lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]:
    """Wrap a bench_ function so its module is imported only on first call."""
    loaded_function: Callable[[int], float] | None = None

    def run(loops: int) -> float:
        nonlocal loaded_function
        if loaded_function is None:
            module = load_module(module_path)
            loaded_function = getattr(module, function_name)
        return loaded_function(loops)

    run.__name__ = function_name
    return run


def discover_benchmarks() -> dict[str, Callable[[int], float]]:
    """Discover bench_ functions.

    Each bench_ function must have the signature: bench_*(loops: int) -> float
    where it calls the operation `loops` times and returns the total elapsed
    time in seconds (using time.perf_counter).
    """
    registry: dict[str, Callable[[int], float]] = {}
    for module_path in sorted(BENCH_DIR.glob("bench_*.py")):
        module_name = module_path.stem
        for function_name in _discover_module_functions(module_path):
            bench_id = benchmark_id(module_name, function_name)
            if bench_id in registry:
                raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}")
            registry[bench_id] = _lazy_benchmark(module_path, function_name)
    return registry


def strip_pyperf_output_args(argv: list[str]) -> list[str]:
    """Return `argv` with -o/--output/--append options (and their values) removed."""
    kept: list[str] = []
    tokens = iter(argv)
    for token in tokens:
        if token in ("-o", "--output", "--append"):
            next(tokens, None)  # drop the option's value as well
            continue
        if token.startswith(("-o=", "--output=", "--append=")):
            continue
        kept.append(token)
    return kept


def _split_env_vars(arg_value: str) -> list[str]:
    """Split a comma-separated --inherit-environ value, dropping empty entries."""
    return [env_var for env_var in arg_value.split(",") if env_var]


def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
    """Fold the CUDA-related env vars into a single --inherit-environ option.

    With --copy-env the whole environment is already inherited, so argv is
    returned unchanged. Otherwise any user-supplied --inherit-environ values
    are collected, merged with PYPERF_INHERITED_ENV_VARS that are actually set,
    de-duplicated (first occurrence wins), and re-appended as one option.
    """
    if "--copy-env" in argv:
        return list(argv)

    inherited_env: list[str] = []
    cleaned: list[str] = []
    skip_next = False
    for arg in argv:
        if skip_next:
            inherited_env.extend(_split_env_vars(arg))
            skip_next = False
            continue
        if arg == "--inherit-environ":
            skip_next = True
            continue
        if arg.startswith("--inherit-environ="):
            inherited_env.extend(_split_env_vars(arg.partition("=")[2]))
            continue
        cleaned.append(arg)

    if skip_next:
        raise ValueError("Missing value for --inherit-environ")

    for env_var in PYPERF_INHERITED_ENV_VARS:
        if env_var in os.environ:
            inherited_env.append(env_var)

    deduped_env: list[str] = []
    for env_var in inherited_env:
        if env_var not in deduped_env:
            deduped_env.append(env_var)

    if deduped_env:
        cleaned.extend(["--inherit-environ", ",".join(deduped_env)])

    return cleaned


def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
    """Parse runner options; everything unknown is forwarded to pyperf."""
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        "--benchmark",
        action="append",
        default=[],
        help="Benchmark ID to run. Repeat to run multiple IDs. Defaults to all.",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="Print discovered benchmark IDs and exit.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=DEFAULT_OUTPUT,
        help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})",
    )
    parsed, remaining = parser.parse_known_args(argv)
    return parsed, remaining


def main() -> None:
    """Entry point: discover benchmarks and run the requested ones via pyperf."""
    parsed, remaining_argv = parse_args(sys.argv[1:])

    registry = discover_benchmarks()
    if not registry:
        raise RuntimeError(f"No benchmark functions found in {BENCH_DIR}")

    if parsed.list:
        for bench_id in sorted(registry):
            print(bench_id)
        return

    if parsed.benchmark:
        missing = sorted(set(parsed.benchmark) - set(registry))
        if missing:
            known = ", ".join(sorted(registry))
            unknown = ", ".join(missing)
            raise ValueError(f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}")
        benchmark_ids = parsed.benchmark
    else:
        benchmark_ids = sorted(registry)

    # Strip any --output args to avoid conflicts with our output handling.
    output_path = parsed.output.resolve()
    remaining_argv = strip_pyperf_output_args(remaining_argv)
    remaining_argv = ensure_pyperf_worker_env(remaining_argv)
    is_worker = "--worker" in remaining_argv

    # Delete the file so this run starts fresh.
    if not is_worker:
        output_path.unlink(missing_ok=True)

    sys.argv = [sys.argv[0], "--append", str(output_path), *remaining_argv]

    runner = pyperf.Runner()
    for bench_id in benchmark_ids:
        runner.bench_time_func(bench_id, registry[bench_id])

    if not is_worker:
        print(f"\nResults saved to {output_path}")


if __name__ == "__main__":
    main()


# --- benchmarks/cuda_bindings/runner/runtime.py ---
# SPDX-License-Identifier: Apache-2.0

import atexit

from cuda.bindings import driver as cuda
from cuda.bindings import nvrtc

# Lazily created driver context/device shared by all benchmarks in a worker.
_ctx = None
_device = None
_persistent_ptrs: list[int] = []
_modules: list = []


def assert_drv(err) -> None:
    """Raise RuntimeError unless `err` is CUDA_SUCCESS."""
    if err != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"Cuda Error: {err}")


def ensure_context() -> int:
    """Create (once) and return the process-wide CUDA context on device 0."""
    global _ctx, _device
    if _ctx is not None:
        return _ctx

    (err,) = cuda.cuInit(0)
    assert_drv(err)

    err, device = cuda.cuDeviceGet(0)
    assert_drv(err)
    _device = device

    err, ctx = cuda.cuCtxCreate(None, 0, device)
    assert_drv(err)
    _ctx = ctx
    return ctx


def alloc_persistent(size: int) -> int:
    """Allocate `size` bytes of device memory, freed automatically at exit."""
    ensure_context()
    err, ptr = cuda.cuMemAlloc(size)
    assert_drv(err)
    _persistent_ptrs.append(ptr)
    return ptr


def compile_and_load(kernel_source: str) -> int:
    """Compile CUDA C source and returns the CUmodule handle"""
    ensure_context()

    # Target the current device's compute capability.
    err, major = cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, _device
    )
    assert_drv(err)
    err, minor = cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, _device
    )
    assert_drv(err)

    err, prog = nvrtc.nvrtcCreateProgram(kernel_source.encode(), b"benchmark_kernel.cu", 0, [], [])
    assert_drv(err)

    arch_flag = f"--gpu-architecture=sm_{major}{minor}".encode()
    (err,) = nvrtc.nvrtcCompileProgram(prog, 2, [b"--fmad=false", arch_flag])

    # check for compile errors
    err_log, log_size = nvrtc.nvrtcGetProgramLogSize(prog)
    assert_drv(err_log)
    log = b" " * log_size
    (err_log,) = nvrtc.nvrtcGetProgramLog(prog, log)
    assert_drv(err_log)
    assert_drv(err)

    err, cubin_size = nvrtc.nvrtcGetCUBINSize(prog)
    assert_drv(err)
    cubin = b" " * cubin_size
    (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin)
    assert_drv(err)

    err, module = cuda.cuModuleLoadData(cubin)
    assert_drv(err)
    _modules.append(module)
    return module


def cleanup() -> None:
    """Free persistent allocations, unload modules and destroy the context."""
    global _ctx
    for ptr in reversed(_persistent_ptrs):
        (err,) = cuda.cuMemFree(ptr)
        assert_drv(err)
    _persistent_ptrs.clear()

    for module in reversed(_modules):
        (err,) = cuda.cuModuleUnload(module)
        assert_drv(err)
    _modules.clear()

    if _ctx is None:
        return
    (err,) = cuda.cuCtxDestroy(_ctx)
    assert_drv(err)
    _ctx = None


atexit.register(cleanup)
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import importlib.util +import itertools +import sys +import types +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[3] +RUNNER_MAIN_PATH = REPO_ROOT / "cuda_bindings/benchmarks/runner/main.py" +BENCH_LAUNCH_PATH = REPO_ROOT / "cuda_bindings/benchmarks/benchmarks/bench_launch.py" + + +def load_module_from_path(module_name: str, module_path: Path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Failed to load test module: {module_path}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def load_runner_main(monkeypatch): + pyperf_module = types.ModuleType("pyperf") + + class FakeRunner: + def bench_time_func(self, *_args, **_kwargs) -> None: + raise AssertionError("FakeRunner should not be used in these tests") + + pyperf_module.Runner = FakeRunner + monkeypatch.setitem(sys.modules, "pyperf", pyperf_module) + return load_module_from_path("test_cuda_bindings_bench_runner_main", RUNNER_MAIN_PATH) + + +def load_bench_launch(monkeypatch, calls: list[tuple]): + pointer_values = itertools.count(1000) + + runtime_module = types.ModuleType("runner.runtime") + + def alloc_persistent(size: int) -> int: + calls.append(("alloc_persistent", size)) + return next(pointer_values) + + def assert_drv(err) -> None: + calls.append(("assert_drv", err)) + assert err == 0 + + def compile_and_load(source: str) -> str: + calls.append(("compile_and_load", source)) + return "module" + + runtime_module.alloc_persistent = alloc_persistent + runtime_module.assert_drv = assert_drv + runtime_module.compile_and_load = compile_and_load + + runner_module = types.ModuleType("runner") + runner_module.runtime = runtime_module + + driver_module = types.ModuleType("cuda.bindings.driver") + + class FakeCUresult: + CUDA_SUCCESS = 0 + + class FakeCUstreamFlags: + 
CU_STREAM_NON_BLOCKING = types.SimpleNamespace(value=1) + + def cuModuleGetFunction(module, name): + calls.append(("cuModuleGetFunction", module, name)) + return 0, name + + def cuStreamCreate(flags): + calls.append(("cuStreamCreate", flags)) + return 0, "stream" + + def cuLaunchKernel(*args): + calls.append(("cuLaunchKernel", args)) + return 0 + + driver_module.CUresult = FakeCUresult + driver_module.CUstream_flags = FakeCUstreamFlags + driver_module.cuModuleGetFunction = cuModuleGetFunction + driver_module.cuStreamCreate = cuStreamCreate + driver_module.cuLaunchKernel = cuLaunchKernel + + cuda_module = types.ModuleType("cuda") + bindings_module = types.ModuleType("cuda.bindings") + bindings_module.driver = driver_module + cuda_module.bindings = bindings_module + + monkeypatch.setitem(sys.modules, "runner", runner_module) + monkeypatch.setitem(sys.modules, "runner.runtime", runtime_module) + monkeypatch.setitem(sys.modules, "cuda", cuda_module) + monkeypatch.setitem(sys.modules, "cuda.bindings", bindings_module) + monkeypatch.setitem(sys.modules, "cuda.bindings.driver", driver_module) + + return load_module_from_path("test_cuda_bindings_bench_launch", BENCH_LAUNCH_PATH) + + +def test_discover_benchmarks_is_lazy(monkeypatch, tmp_path): + runner_main = load_runner_main(monkeypatch) + + marker_path = tmp_path / "imported.txt" + bench_path = tmp_path / "bench_lazy.py" + bench_path.write_text( + "\n".join( + ( + "from pathlib import Path", + f"Path({str(marker_path)!r}).write_text('imported')", + "", + "def helper() -> float:", + " return 0.0", + "", + "def bench_visible(loops: int) -> float:", + " return loops + 0.5", + "", + ) + ), + encoding="utf-8", + ) + + monkeypatch.setattr(runner_main, "BENCH_DIR", tmp_path) + runner_main._MODULE_CACHE.clear() + + registry = runner_main.discover_benchmarks() + + assert sorted(registry) == ["lazy.visible"] + assert not marker_path.exists() + assert registry["lazy.visible"](3) == 3.5 + assert 
marker_path.read_text(encoding="utf-8") == "imported" + + +def test_ensure_pyperf_worker_env_preserves_existing_args(monkeypatch): + runner_main = load_runner_main(monkeypatch) + + for env_var in runner_main.PYPERF_INHERITED_ENV_VARS: + monkeypatch.delenv(env_var, raising=False) + monkeypatch.setenv("CUDA_PATH", "/opt/cuda") + monkeypatch.setenv("LD_LIBRARY_PATH", "/opt/cuda/lib64") + + argv = runner_main.ensure_pyperf_worker_env(["--fast", "--inherit-environ=FOO,BAR"]) + + assert argv == ["--fast", "--inherit-environ", "FOO,BAR,CUDA_PATH,LD_LIBRARY_PATH"] + + +def test_bench_launch_initializes_on_first_use(monkeypatch): + calls: list[tuple] = [] + bench_launch = load_bench_launch(monkeypatch, calls) + + assert calls == [] + + bench_launch.bench_launch_empty_kernel(1) + compile_calls = [call for call in calls if call[0] == "compile_and_load"] + launch_calls = [call for call in calls if call[0] == "cuLaunchKernel"] + + assert len(compile_calls) == 1 + assert len(launch_calls) == 1 + + bench_launch.bench_launch_16_args_pre_packed(1) + compile_calls = [call for call in calls if call[0] == "compile_and_load"] + launch_calls = [call for call in calls if call[0] == "cuLaunchKernel"] + + assert len(compile_calls) == 1 + assert len(launch_calls) == 2 From dcf93f87e2b46934afa1c2c9f74f7be1b414a489 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 11:11:25 -0500 Subject: [PATCH 02/10] Add Memory benchmarks --- .../cuda_bindings/benchmarks/bench_memory.py | 90 +++++++++++++++ .../benchmarks/cpp/bench_memory.cpp | 106 ++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 benchmarks/cuda_bindings/benchmarks/bench_memory.py create mode 100644 benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp diff --git a/benchmarks/cuda_bindings/benchmarks/bench_memory.py b/benchmarks/cuda_bindings/benchmarks/bench_memory.py new file mode 100644 index 0000000000..faa4795580 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_memory.py @@ 
-0,0 +1,90 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import ctypes +import time + +import numpy as np + +from runner.runtime import alloc_persistent, ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + +# Allocation size for alloc/free benchmarks +ALLOC_SIZE = 1024 + +# Small transfer size (8 bytes) to measure call overhead, not bandwidth +COPY_SIZE = 8 + +# Pre-allocate device memory and host buffers for memcpy benchmarks +DST_DPTR = alloc_persistent(COPY_SIZE) +SRC_DPTR = alloc_persistent(COPY_SIZE) +HOST_SRC = np.zeros(COPY_SIZE, dtype=np.uint8) +HOST_DST = np.zeros(COPY_SIZE, dtype=np.uint8) + +# Stream for async operations +_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + + +def bench_mem_alloc_free(loops: int) -> float: + _cuMemAlloc = cuda.cuMemAlloc + _cuMemFree = cuda.cuMemFree + _size = ALLOC_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _, ptr = _cuMemAlloc(_size) + _cuMemFree(ptr) + return time.perf_counter() - t0 + + +def bench_mem_alloc_async_free_async(loops: int) -> float: + _cuMemAllocAsync = cuda.cuMemAllocAsync + _cuMemFreeAsync = cuda.cuMemFreeAsync + _size = ALLOC_SIZE + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _, ptr = _cuMemAllocAsync(_size, _stream) + _cuMemFreeAsync(ptr, _stream) + return time.perf_counter() - t0 + + +def bench_memcpy_htod(loops: int) -> float: + _cuMemcpyHtoD = cuda.cuMemcpyHtoD + _dst = DST_DPTR + _src = HOST_SRC + _size = COPY_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _cuMemcpyHtoD(_dst, _src, _size) + return time.perf_counter() - t0 + + +def bench_memcpy_dtoh(loops: int) -> float: + _cuMemcpyDtoH = cuda.cuMemcpyDtoH + _dst = HOST_DST + _src = SRC_DPTR + _size = COPY_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _cuMemcpyDtoH(_dst, _src, _size) + return 
time.perf_counter() - t0 + + +def bench_memcpy_dtod(loops: int) -> float: + _cuMemcpyDtoD = cuda.cuMemcpyDtoD + _dst = DST_DPTR + _src = SRC_DPTR + _size = COPY_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _cuMemcpyDtoD(_dst, _src, _size) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp new file mode 100644 index 0000000000..4e71b73fb5 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp @@ -0,0 +1,106 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +static constexpr size_t ALLOC_SIZE = 1024; +static constexpr size_t COPY_SIZE = 8; + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + // Pre-allocate device memory for memcpy benchmarks + CUdeviceptr dst_dptr, src_dptr; + check_cu(cuMemAlloc(&dst_dptr, COPY_SIZE), "cuMemAlloc failed"); + check_cu(cuMemAlloc(&src_dptr, COPY_SIZE), "cuMemAlloc failed"); + + // Host buffers for memcpy + uint8_t host_src[COPY_SIZE] = {}; + uint8_t host_dst[COPY_SIZE] = {}; + + bench::BenchmarkSuite suite(options); + + // --- mem_alloc_free --- + { + CUdeviceptr ptr; + suite.run("memory.mem_alloc_free", [&]() { + check_cu(cuMemAlloc(&ptr, ALLOC_SIZE), "cuMemAlloc failed"); + check_cu(cuMemFree(ptr), "cuMemFree failed"); + }); + } + + // --- mem_alloc_async_free_async --- + { + CUdeviceptr ptr; + suite.run("memory.mem_alloc_async_free_async", [&]() { + check_cu(cuMemAllocAsync(&ptr, ALLOC_SIZE, stream), "cuMemAllocAsync failed"); + check_cu(cuMemFreeAsync(ptr, stream), "cuMemFreeAsync failed"); + }); + } + + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + // --- memcpy_htod --- + { + suite.run("memory.memcpy_htod", [&]() { + check_cu(cuMemcpyHtoD(dst_dptr, host_src, COPY_SIZE), "cuMemcpyHtoD failed"); + }); + } + + // --- memcpy_dtoh --- + { + suite.run("memory.memcpy_dtoh", [&]() { + check_cu(cuMemcpyDtoH(host_dst, src_dptr, COPY_SIZE), "cuMemcpyDtoH failed"); + }); + } + + // --- memcpy_dtod --- + { + suite.run("memory.memcpy_dtod", [&]() { + check_cu(cuMemcpyDtoD(dst_dptr, src_dptr, 
COPY_SIZE), "cuMemcpyDtoD failed"); + }); + } + + // Cleanup + check_cu(cuMemFree(dst_dptr), "cuMemFree failed"); + check_cu(cuMemFree(src_dptr), "cuMemFree failed"); + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} From 4097d747c42213ba32dd406d67e39cdfac1b7e6c Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 13:54:40 -0500 Subject: [PATCH 03/10] Add memory benchmarks --- benchmarks/cuda_bindings/AGENTS.md | 4 +- benchmarks/cuda_bindings/README.md | 2 +- .../benchmarks/cpp/bench_launch.cpp | 33 ------------- benchmarks/cuda_bindings/pixi.lock | 48 +++++++++---------- benchmarks/cuda_bindings/pixi.toml | 2 +- 5 files changed, 29 insertions(+), 60 deletions(-) diff --git a/benchmarks/cuda_bindings/AGENTS.md b/benchmarks/cuda_bindings/AGENTS.md index 04f2f713fa..b9096a737f 100644 --- a/benchmarks/cuda_bindings/AGENTS.md +++ b/benchmarks/cuda_bindings/AGENTS.md @@ -1,4 +1,6 @@ # cuda.bindings benchmarks +Read the README.md in this directory for more details about the benchmarks. 
+ When generating code verify that that the code is correct based on the source for cuda-bindings -that can be found in ../cuda_bindings +that can be found in ../../cuda_bindings diff --git a/benchmarks/cuda_bindings/README.md b/benchmarks/cuda_bindings/README.md index 75e16db031..f8d5ccf043 100644 --- a/benchmarks/cuda_bindings/README.md +++ b/benchmarks/cuda_bindings/README.md @@ -37,7 +37,7 @@ See: https://pyperf.readthedocs.io/en/latest/system.html#system pixi run -e wheel -- python -m pyperf system show # Apply tuning (may require root) -sudo $(pixi run -e wheel -- which python) -m pyperf system tune +$(pixi run -e wheel -- which python) -m pyperf system tune ``` ### Running benchmarks diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp index fb65da6d74..a249426963 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp @@ -168,39 +168,6 @@ int main(int argc, char** argv) { }); } - // --- launch_small_kernel --- - { - void* params[] = {&float_ptr}; - suite.run("launch.launch_small_kernel", [&]() { - check_cu( - cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_16_args --- - { - suite.run("launch.launch_16_args", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) --- - // In C++ the params are always pre-packed, so this is identical to launch_16_args. - // We include it for naming parity with the Python benchmark. 
- { - suite.run("launch.launch_16_args_pre_packed", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - // Cleanup for (int i = 0; i < 16; ++i) { check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed"); diff --git a/benchmarks/cuda_bindings/pixi.lock b/benchmarks/cuda_bindings/pixi.lock index c610db2f45..c571d4756c 100644 --- a/benchmarks/cuda_bindings/pixi.lock +++ b/benchmarks/cuda_bindings/pixi.lock @@ -38,8 +38,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.78-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda @@ -66,7 +66,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda @@ -130,7 +130,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - - conda: .. + - conda: ../../cuda_bindings - conda: ../../cuda_pathfinder wheel: channels: @@ -406,7 +406,7 @@ packages: license_family: GPL size: 31705 timestamp: 1771378159534 -- conda: .. +- conda: ../../cuda_bindings name: cuda-bindings version: 13.2.0 build: hb0f4dca_0 @@ -419,11 +419,11 @@ packages: - cuda-pathfinder - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.2.51,<14.0a0 + - cuda-nvrtc >=13.2.78,<14.0a0 - cuda-nvvm - libnvfatbin - libcufile - - libcufile >=1.17.0.44,<2.0a0 + - libcufile >=1.17.1.22,<2.0a0 - libgcc >=15 - libgcc >=15 - libstdcxx >=15 @@ -643,17 +643,17 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 35339417 timestamp: 1768272955912 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - sha256: 9de235d328b7124f715805715e9918eb7f8aa5b9c56a2afa62b84f84f98077a5 - md5: 0413baaa73be1a39d5d8e442184acc78 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda + sha256: 73fbc9d15c062c3ea60891e8183002f6b055fa6638402d17581677af0aaa20d8 + md5: 66623d882c42506fa3f1780b90841400 depends: - __glibc >=2.17,<3.0.a0 - cuda-version >=13.2,<13.3.0a0 - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35736655 - timestamp: 1773100338749 + size: 35670504 + timestamp: 
1776109867257 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda sha256: 2c929c592ca1909e3944edec62b77403d256156a4010bfa17fb0b948d33e54d3 md5: 1096fce4abad7dd975ce6d9953fceb6a @@ -668,20 +668,20 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 35845 timestamp: 1768273073971 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda - sha256: be60eb4e84ff4846b27b323eca402b075f52caf6c138ebb06268fbaa26ef1879 - md5: 83535200a9e77165d5291b4ac82ebf6a +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.78-hecca717_0.conda + sha256: 12505f1bbc222acf2a63da5c84e4176d2f9c18b458e2bde28939fdf326b6d292 + md5: cc313f0ea18ebc6e713a8980611431f5 depends: - __glibc >=2.17,<3.0.a0 - - cuda-nvrtc 13.2.51 hecca717_0 + - cuda-nvrtc 13.2.78 hecca717_0 - cuda-version >=13.2,<13.3.0a0 - libgcc >=14 - libstdcxx >=14 constrains: - - cuda-nvrtc-static >=13.2.51 + - cuda-nvrtc-static >=13.2.78 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 36305 - timestamp: 1773100458841 + size: 36312 + timestamp: 1776109983818 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda sha256: d0111ba8fa12b96d38989d2016ecec0c11410c0e566d839ed54f3925591efb0b md5: 03cd3639b8e13623c7b91b1cb0136402 @@ -1018,9 +1018,9 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 990938 timestamp: 1768273732081 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda - sha256: dc2b0c43aeacbaa686061353807e718236d8c5b346f624e76fed98b066898e19 - md5: 6d8ed8335d144ec7303b8d3587b2205c +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda + sha256: a24ad0ca488aa3e237049cd5b5c6d7fe3d2d4330682ed329203064e332ea1d74 + md5: 056a67706108efd1f9c24682ba8d3685 depends: - __glibc >=2.28,<3.0.a0 - cuda-version >=13.2,<13.3.0a0 @@ -1028,8 +1028,8 @@ packages: - 
libstdcxx >=14 - rdma-core >=61.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1085341 - timestamp: 1773100191342 + size: 1082447 + timestamp: 1776110053053 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda sha256: c84e8dccb65ad5149c0121e4b54bdc47fa39303fd5f4979b8c44bb51b39a369b md5: 1707cdd636af2ff697b53186572c9f77 diff --git a/benchmarks/cuda_bindings/pixi.toml b/benchmarks/cuda_bindings/pixi.toml index a448e8d3e4..dbbddcd939 100644 --- a/benchmarks/cuda_bindings/pixi.toml +++ b/benchmarks/cuda_bindings/pixi.toml @@ -45,7 +45,7 @@ pre-commit = "*" cuda-bindings = "==13.1.0" [feature.bindings-source.dependencies] -cuda-bindings = { path = ".." } +cuda-bindings = { path = "../../cuda_bindings" } [environments] wheel = { features = ["cu13", "cu13-pinned", "bench", "cpp-bench", "dev", "bindings-wheel"] } From a368a48e07969e911f62d35589c2bc8328da4b1e Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 13:59:42 -0500 Subject: [PATCH 04/10] Move to top level of the repo --- cuda_bindings/benchmarks/.gitignore | 16 - cuda_bindings/benchmarks/README.md | 74 - .../benchmarks/benchmarks/bench_ctx_device.py | 62 - .../benchmarks/benchmarks/bench_event.py | 62 - .../benchmarks/benchmarks/bench_launch.py | 133 -- .../benchmarks/bench_pointer_attributes.py | 25 - .../benchmarks/benchmarks/bench_stream.py | 45 - .../benchmarks/benchmarks/cpp/CMakeLists.txt | 91 - .../benchmarks/cpp/bench_ctx_device.cpp | 87 - .../benchmarks/benchmarks/cpp/bench_event.cpp | 90 - .../benchmarks/cpp/bench_launch.cpp | 216 -- .../cpp/bench_pointer_attributes.cpp | 59 - .../benchmarks/cpp/bench_stream.cpp | 74 - .../benchmarks/cpp/bench_support.hpp | 309 --- cuda_bindings/benchmarks/compare.py | 118 -- cuda_bindings/benchmarks/pixi.lock | 1767 ----------------- cuda_bindings/benchmarks/pixi.toml | 87 - .../benchmarks/pytest-legacy/conftest.py | 93 - .../benchmarks/pytest-legacy/kernels.py | 159 -- 
.../benchmarks/pytest-legacy/test_cupy.py | 199 -- .../pytest-legacy/test_launch_latency.py | 336 ---- .../benchmarks/pytest-legacy/test_numba.py | 52 - .../pytest-legacy/test_pointer_attributes.py | 112 -- cuda_bindings/benchmarks/run_cpp.py | 8 - cuda_bindings/benchmarks/run_pyperf.py | 8 - cuda_bindings/benchmarks/runner/__init__.py | 3 - cuda_bindings/benchmarks/runner/cpp.py | 180 -- cuda_bindings/benchmarks/runner/main.py | 217 -- cuda_bindings/benchmarks/runner/runtime.py | 105 - cuda_bindings/benchmarks/tests/test_runner.py | 166 -- 30 files changed, 4953 deletions(-) delete mode 100644 cuda_bindings/benchmarks/.gitignore delete mode 100644 cuda_bindings/benchmarks/README.md delete mode 100644 cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py delete mode 100644 cuda_bindings/benchmarks/benchmarks/bench_event.py delete mode 100644 cuda_bindings/benchmarks/benchmarks/bench_launch.py delete mode 100644 cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py delete mode 100644 cuda_bindings/benchmarks/benchmarks/bench_stream.py delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp delete mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp delete mode 100644 cuda_bindings/benchmarks/compare.py delete mode 100644 cuda_bindings/benchmarks/pixi.lock delete mode 100644 cuda_bindings/benchmarks/pixi.toml delete mode 100644 cuda_bindings/benchmarks/pytest-legacy/conftest.py delete mode 100644 cuda_bindings/benchmarks/pytest-legacy/kernels.py delete mode 100644 cuda_bindings/benchmarks/pytest-legacy/test_cupy.py delete mode 100755 
cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py delete mode 100644 cuda_bindings/benchmarks/pytest-legacy/test_numba.py delete mode 100644 cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py delete mode 100644 cuda_bindings/benchmarks/run_cpp.py delete mode 100644 cuda_bindings/benchmarks/run_pyperf.py delete mode 100644 cuda_bindings/benchmarks/runner/__init__.py delete mode 100644 cuda_bindings/benchmarks/runner/cpp.py delete mode 100644 cuda_bindings/benchmarks/runner/main.py delete mode 100644 cuda_bindings/benchmarks/runner/runtime.py delete mode 100644 cuda_bindings/benchmarks/tests/test_runner.py diff --git a/cuda_bindings/benchmarks/.gitignore b/cuda_bindings/benchmarks/.gitignore deleted file mode 100644 index b795782a32..0000000000 --- a/cuda_bindings/benchmarks/.gitignore +++ /dev/null @@ -1,16 +0,0 @@ -# Build artifacts -.build/ -__pycache__/ - -# Benchmark results -*.json -.benchmarks/ - -# Pixi environments -.pixi/ - -# Override root .gitignore *.cpp rule (which targets Cython-generated files) -!benchmarks/cpp/*.cpp - -results-python.json -results-cpp.json diff --git a/cuda_bindings/benchmarks/README.md b/cuda_bindings/benchmarks/README.md deleted file mode 100644 index 75e16db031..0000000000 --- a/cuda_bindings/benchmarks/README.md +++ /dev/null @@ -1,74 +0,0 @@ -# cuda.bindings benchmarks - -These benchmarks are intended to measure the latency overhead of calling CUDA -Driver APIs through cuda.bindings, relative to a similar C++ baseline. - -The goal is to benchmark how much overhead does the Python layer adds to calling -CUDA APIs and what operations are not in our target of less than 1us of overhead. - -Each Python benchmark has a C++ counterpart, which is used to compare the -operations. We try to make each implementation perform small operations -and nearly the same work as possible and are run under similar conditions. 
- -These are **not** throughput benchmarks to measure the overall performance -of kernels and applications. - -## Usage - -Requires pixi. - -There are a couple of environments defined based on how `cuda.bindings` is installed: - -- `wheel`: Installs from conda packages -- `source`: Installs from source - -There are a couple of tasks defined: - -- `bench`: Runs the Python benchmarks -- `bench-cpp`: Runs the C++ benchmarks - -### System tuning - -For more stable results on Linux, tune the system before running benchmarks. -See: https://pyperf.readthedocs.io/en/latest/system.html#system - -```bash -# Show current system state -pixi run -e wheel -- python -m pyperf system show - -# Apply tuning (may require root) -sudo $(pixi run -e wheel -- which python) -m pyperf system tune -``` - -### Running benchmarks - -To run the benchmarks combine the environment and task: - -```bash -# Run the Python benchmarks in the wheel environment -pixi run -e wheel bench - -# Run the Python benchmarks in the source environment -pixi run -e source bench - -# Run the C++ benchmarks -pixi run -e wheel bench-cpp -``` - -Both runners automatically save results to JSON files in the benchmarks -directory: `results-python.json` and `results-cpp.json`. - -## Output JSON and analysis - -The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/). 
-Both Python and C++ results are saved in pyperf-compatible JSON format, -which can be analyzed with pyperf commands: - -```bash -# Show results and statistics -pixi run -e wheel -- python -m pyperf stats results-python.json -pixi run -e wheel -- python -m pyperf stats results-cpp.json - -# Compare C++ vs Python results -pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json -``` diff --git a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py b/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py deleted file mode 100644 index 1c82cd4046..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import time - -from runner.runtime import ensure_context - -from cuda.bindings import driver as cuda - -CTX = ensure_context() - -_, DEVICE = cuda.cuDeviceGet(0) -ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR - - -def bench_ctx_get_current(loops: int) -> float: - _cuCtxGetCurrent = cuda.cuCtxGetCurrent - - t0 = time.perf_counter() - for _ in range(loops): - _cuCtxGetCurrent() - return time.perf_counter() - t0 - - -def bench_ctx_set_current(loops: int) -> float: - _cuCtxSetCurrent = cuda.cuCtxSetCurrent - _ctx = CTX - - t0 = time.perf_counter() - for _ in range(loops): - _cuCtxSetCurrent(_ctx) - return time.perf_counter() - t0 - - -def bench_ctx_get_device(loops: int) -> float: - _cuCtxGetDevice = cuda.cuCtxGetDevice - - t0 = time.perf_counter() - for _ in range(loops): - _cuCtxGetDevice() - return time.perf_counter() - t0 - - -def bench_device_get(loops: int) -> float: - _cuDeviceGet = cuda.cuDeviceGet - - t0 = time.perf_counter() - for _ in range(loops): - _cuDeviceGet(0) - return time.perf_counter() - t0 - - -def bench_device_get_attribute(loops: int) -> float: - _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute 
- _attr = ATTRIBUTE - _dev = DEVICE - - t0 = time.perf_counter() - for _ in range(loops): - _cuDeviceGetAttribute(_attr, _dev) - return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/bench_event.py b/cuda_bindings/benchmarks/benchmarks/bench_event.py deleted file mode 100644 index e8e319115d..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/bench_event.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import time - -from runner.runtime import ensure_context - -from cuda.bindings import driver as cuda - -ensure_context() - -_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) -_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value) - -cuda.cuEventRecord(EVENT, STREAM) -cuda.cuStreamSynchronize(STREAM) - -EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value - - -def bench_event_create_destroy(loops: int) -> float: - _cuEventCreate = cuda.cuEventCreate - _cuEventDestroy = cuda.cuEventDestroy - _flags = EVENT_FLAGS - - t0 = time.perf_counter() - for _ in range(loops): - _, e = _cuEventCreate(_flags) - _cuEventDestroy(e) - return time.perf_counter() - t0 - - -def bench_event_record(loops: int) -> float: - _cuEventRecord = cuda.cuEventRecord - _event = EVENT - _stream = STREAM - - t0 = time.perf_counter() - for _ in range(loops): - _cuEventRecord(_event, _stream) - return time.perf_counter() - t0 - - -def bench_event_query(loops: int) -> float: - _cuEventQuery = cuda.cuEventQuery - _event = EVENT - - t0 = time.perf_counter() - for _ in range(loops): - _cuEventQuery(_event) - return time.perf_counter() - t0 - - -def bench_event_synchronize(loops: int) -> float: - _cuEventSynchronize = cuda.cuEventSynchronize - _event = EVENT - - t0 = time.perf_counter() - for _ in range(loops): - _cuEventSynchronize(_event) - return time.perf_counter() - t0 diff 
--git a/cuda_bindings/benchmarks/benchmarks/bench_launch.py b/cuda_bindings/benchmarks/benchmarks/bench_launch.py deleted file mode 100644 index 931194fbd3..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/bench_launch.py +++ /dev/null @@ -1,133 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import ctypes -import time - -from runner.runtime import alloc_persistent, assert_drv, compile_and_load - -from cuda.bindings import driver as cuda - -# Compile kernels lazily so benchmark discovery does not need NVRTC. -KERNEL_SOURCE = """\ -extern "C" __global__ void empty_kernel() { return; } -extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } - -#define ITEM_PARAM(x, T) T x -#define REP1(x, T) , ITEM_PARAM(x, T) -#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) -#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) -#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) -#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) - -extern "C" __global__ -void small_kernel_16_args( - ITEM_PARAM(F, int*) - REP1(A, int*) - REP2(A, int*) - REP4(A, int*) - REP8(A, int*)) -{ *F = 0; } -""" - -MODULE = None -EMPTY_KERNEL = None -SMALL_KERNEL = None -KERNEL_16_ARGS = None -STREAM = None -FLOAT_PTR = None -INT_PTRS = None -_VAL_PS = None -PACKED_16 = None - - -def _ensure_launch_state() -> None: - global MODULE, EMPTY_KERNEL, SMALL_KERNEL, KERNEL_16_ARGS, STREAM - global FLOAT_PTR, INT_PTRS, _VAL_PS, PACKED_16 - - if EMPTY_KERNEL is not None: - return - - module = compile_and_load(KERNEL_SOURCE) - - err, empty_kernel = cuda.cuModuleGetFunction(module, b"empty_kernel") - assert_drv(err) - err, small_kernel = cuda.cuModuleGetFunction(module, b"small_kernel") - assert_drv(err) - err, kernel_16_args = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") - assert_drv(err) - - err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) - assert_drv(err) - - 
float_ptr = alloc_persistent(ctypes.sizeof(ctypes.c_float)) - int_ptrs = tuple(alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16)) - - val_ps = [ctypes.c_void_p(int(ptr)) for ptr in int_ptrs] - packed_16 = (ctypes.c_void_p * 16)() - for index, value_ptr in enumerate(val_ps): - packed_16[index] = ctypes.addressof(value_ptr) - - MODULE = module - EMPTY_KERNEL = empty_kernel - SMALL_KERNEL = small_kernel - KERNEL_16_ARGS = kernel_16_args - STREAM = stream - FLOAT_PTR = float_ptr - INT_PTRS = int_ptrs - _VAL_PS = val_ps - PACKED_16 = packed_16 - - -def bench_launch_empty_kernel(loops: int) -> float: - _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel - _kernel = EMPTY_KERNEL - _stream = STREAM - - t0 = time.perf_counter() - for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) - return time.perf_counter() - t0 - - -def bench_launch_small_kernel(loops: int) -> float: - _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel - _kernel = SMALL_KERNEL - _stream = STREAM - _args = (FLOAT_PTR,) - _arg_types = (None,) - - t0 = time.perf_counter() - for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) - return time.perf_counter() - t0 - - -def bench_launch_16_args(loops: int) -> float: - _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel - _kernel = KERNEL_16_ARGS - _stream = STREAM - _args = INT_PTRS - _arg_types = (None,) * 16 - - t0 = time.perf_counter() - for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) - return time.perf_counter() - t0 - - -def bench_launch_16_args_pre_packed(loops: int) -> float: - _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel - _kernel = KERNEL_16_ARGS - _stream = STREAM - _packed = PACKED_16 - - t0 = time.perf_counter() - for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) - return time.perf_counter() - t0 diff 
--git a/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py b/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py deleted file mode 100644 index a02b82c399..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import time - -from runner.runtime import alloc_persistent - -from cuda.bindings import driver as cuda - -# Allocate memory used by the tests -PTR = alloc_persistent(1 << 18) -ATTRIBUTE = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE - - -def bench_pointer_get_attribute(loops: int) -> float: - # Local references to avoid global lookups in the hot loop - _cuPointerGetAttribute = cuda.cuPointerGetAttribute - _attr = ATTRIBUTE - _ptr = PTR - - t0 = time.perf_counter() - for _ in range(loops): - _cuPointerGetAttribute(_attr, _ptr) - return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/bench_stream.py b/cuda_bindings/benchmarks/benchmarks/bench_stream.py deleted file mode 100644 index d816099ed5..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/bench_stream.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# SPDX-License-Identifier: Apache-2.0 - -import time - -from runner.runtime import ensure_context - -from cuda.bindings import driver as cuda - -ensure_context() - -_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) - - -def bench_stream_create_destroy(loops: int) -> float: - _cuStreamCreate = cuda.cuStreamCreate - _cuStreamDestroy = cuda.cuStreamDestroy - _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value - - t0 = time.perf_counter() - for _ in range(loops): - _, s = _cuStreamCreate(_flags) - _cuStreamDestroy(s) - return time.perf_counter() - t0 - - -def bench_stream_query(loops: int) -> float: - _cuStreamQuery = cuda.cuStreamQuery - _stream = STREAM - - t0 = time.perf_counter() - for _ in range(loops): - _cuStreamQuery(_stream) - return time.perf_counter() - t0 - - -def bench_stream_synchronize(loops: int) -> float: - _cuStreamSynchronize = cuda.cuStreamSynchronize - _stream = STREAM - - t0 = time.perf_counter() - for _ in range(loops): - _cuStreamSynchronize(_stream) - return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt b/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt deleted file mode 100644 index b4285834aa..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.24) -project(cuda_bindings_cpp_benchmarks LANGUAGES CXX) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_EXTENSIONS OFF) - -set(CUDA_HOME_HINT "$ENV{CUDA_HOME}") -set(CONDA_PREFIX_HINT "$ENV{CONDA_PREFIX}") - -# Find cuda.h (driver API header) -find_path( - CUDA_DRIVER_INCLUDE_DIR - cuda.h - HINTS - "${CUDA_HOME_HINT}/include" - "${CONDA_PREFIX_HINT}/targets/x86_64-linux/include" - "${CONDA_PREFIX_HINT}/include" -) - -# Find libcuda (driver API library) — lives on the system, not in toolkit -find_library( - CUDA_DRIVER_LIBRARY - NAMES cuda - HINTS - "/usr/lib/x86_64-linux-gnu" - "/usr/lib64" - "${CUDA_HOME_HINT}/lib64/stubs" - "${CUDA_HOME_HINT}/lib/stubs" - "${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib/stubs" - "${CONDA_PREFIX_HINT}/lib/stubs" -) - -# Find nvrtc.h and libnvrtc (for runtime compilation benchmarks) -find_path( - NVRTC_INCLUDE_DIR - nvrtc.h - HINTS - "${CUDA_HOME_HINT}/include" - "${CONDA_PREFIX_HINT}/targets/x86_64-linux/include" - "${CONDA_PREFIX_HINT}/include" -) - -find_library( - NVRTC_LIBRARY - NAMES nvrtc - HINTS - "${CUDA_HOME_HINT}/lib64" - "${CUDA_HOME_HINT}/lib" - "${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib" - "${CONDA_PREFIX_HINT}/lib" -) - -if(NOT CUDA_DRIVER_INCLUDE_DIR) - message(FATAL_ERROR "Could not find cuda.h. Ensure CUDA_HOME is set or install cuda-crt-dev.") -endif() - -if(NOT CUDA_DRIVER_LIBRARY) - message(FATAL_ERROR "Could not find libcuda. 
Ensure the NVIDIA driver is installed.") -endif() - -# Helper: add a benchmark that only needs the driver API -function(add_driver_benchmark name) - add_executable(${name}_cpp ${name}.cpp) - target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}") - target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}") -endfunction() - -# Helper: add a benchmark that needs driver API + NVRTC -function(add_nvrtc_benchmark name) - add_executable(${name}_cpp ${name}.cpp) - target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}" "${NVRTC_INCLUDE_DIR}") - target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}" "${NVRTC_LIBRARY}") -endfunction() - -# Driver-only benchmarks -add_driver_benchmark(bench_pointer_attributes) -add_driver_benchmark(bench_ctx_device) -add_driver_benchmark(bench_stream) -add_driver_benchmark(bench_event) - -# NVRTC benchmarks (require nvrtc for kernel compilation) -if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY) - add_nvrtc_benchmark(bench_launch) -else() - message(WARNING "NVRTC not found — skipping bench_launch. Install cuda-nvrtc-dev.") -endif() diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp deleted file mode 100644 index 052df9cc1d..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "bench_support.hpp" - -#include -#include - - -static void check_cu(CUresult status, const char* message) { - if (status != CUDA_SUCCESS) { - const char* error_name = nullptr; - cuGetErrorName(status, &error_name); - std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; - std::exit(1); - } -} - - -int main(int argc, char** argv) { - bench::Options options = bench::parse_args(argc, argv); - - // Setup: init CUDA and create a context - check_cu(cuInit(0), "cuInit failed"); - - CUdevice device; - check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); - - CUcontext ctx; - CUctxCreateParams ctxParams = {}; - check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - - bench::BenchmarkSuite suite(options); - - // --- ctx_get_current --- - { - CUcontext current_ctx = nullptr; - suite.run("ctx_device.ctx_get_current", [&]() { - check_cu(cuCtxGetCurrent(¤t_ctx), "cuCtxGetCurrent failed"); - }); - } - - // --- ctx_set_current --- - { - suite.run("ctx_device.ctx_set_current", [&]() { - check_cu(cuCtxSetCurrent(ctx), "cuCtxSetCurrent failed"); - }); - } - - // --- ctx_get_device --- - { - CUdevice dev; - suite.run("ctx_device.ctx_get_device", [&]() { - check_cu(cuCtxGetDevice(&dev), "cuCtxGetDevice failed"); - }); - } - - // --- device_get --- - { - CUdevice dev; - suite.run("ctx_device.device_get", [&]() { - check_cu(cuDeviceGet(&dev, 0), "cuDeviceGet failed"); - }); - } - - // --- device_get_attribute --- - { - int value = 0; - suite.run("ctx_device.device_get_attribute", [&]() { - check_cu( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), - "cuDeviceGetAttribute failed" - ); - }); - } - - // Cleanup - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - - // Write all results - suite.write(); - - return 0; -} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp deleted file mode 100644 index 44cd617778..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp +++ /dev/null @@ -1,90 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "bench_support.hpp" - -#include -#include - - -static void check_cu(CUresult status, const char* message) { - if (status != CUDA_SUCCESS) { - const char* error_name = nullptr; - cuGetErrorName(status, &error_name); - std::cerr << message << ": " << (error_name ? error_name : "unknown") << '\n'; - std::exit(1); - } -} - - -int main(int argc, char** argv) { - bench::Options options = bench::parse_args(argc, argv); - - // Setup - check_cu(cuInit(0), "cuInit failed"); - - CUdevice device; - check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); - - CUcontext ctx; - CUctxCreateParams ctxParams = {}; - check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - - CUstream stream; - check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); - - // Persistent event for query/synchronize/record benchmarks - CUevent event; - check_cu(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "cuEventCreate failed"); - - // Record and sync so the event starts in a completed state - check_cu(cuEventRecord(event, stream), "cuEventRecord failed"); - check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - - bench::BenchmarkSuite suite(options); - - // --- event_create_destroy --- - { - CUevent e; - suite.run("event.event_create_destroy", [&]() { - check_cu(cuEventCreate(&e, CU_EVENT_DISABLE_TIMING), "cuEventCreate failed"); - check_cu(cuEventDestroy(e), "cuEventDestroy failed"); - }); - } - - // --- event_record --- - { - suite.run("event.event_record", [&]() { - check_cu(cuEventRecord(event, stream), "cuEventRecord failed"); - }); - } - - // Re-sync so event is in a known completed state after the record benchmark - check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - - { - suite.run("event.event_query", [&]() { - // Returns CUDA_SUCCESS if complete, CUDA_ERROR_NOT_READY if not - cuEventQuery(event); - }); - } - - // --- event_synchronize --- - { - 
suite.run("event.event_synchronize", [&]() { - check_cu(cuEventSynchronize(event), "cuEventSynchronize failed"); - }); - } - - // Cleanup - check_cu(cuEventDestroy(event), "cuEventDestroy failed"); - check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - - suite.write(); - - return 0; -} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp deleted file mode 100644 index fb65da6d74..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp +++ /dev/null @@ -1,216 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include "bench_support.hpp" - -#include -#include -#include -#include -#include - - -static void check_cu(CUresult status, const char* message) { - if (status != CUDA_SUCCESS) { - const char* error_name = nullptr; - cuGetErrorName(status, &error_name); - std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; - std::exit(1); - } -} - -static void check_nvrtc(nvrtcResult status, const char* message) { - if (status != NVRTC_SUCCESS) { - std::cerr << message << ": " << nvrtcGetErrorString(status) << '\n'; - std::exit(1); - } -} - -static CUmodule compile_and_load(const char* source, CUdevice device) { - int major = 0, minor = 0; - check_cu(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), - "cuDeviceGetAttribute failed"); - check_cu(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), - "cuDeviceGetAttribute failed"); - - nvrtcProgram prog; - check_nvrtc(nvrtcCreateProgram(&prog, source, "benchmark_kernel.cu", 0, nullptr, nullptr), - "nvrtcCreateProgram failed"); - - std::string arch = "--gpu-architecture=sm_" + std::to_string(major) + std::to_string(minor); - const char* opts[] = {"--fmad=false", arch.c_str()}; - nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, opts); - - // Print log on failure - if (compile_result != NVRTC_SUCCESS) { - size_t log_size = 0; - nvrtcGetProgramLogSize(prog, &log_size); - std::vector log(log_size); - nvrtcGetProgramLog(prog, log.data()); - std::cerr << "NVRTC compile failed:\n" << log.data() << '\n'; - std::exit(1); - } - - size_t cubin_size = 0; - check_nvrtc(nvrtcGetCUBINSize(prog, &cubin_size), "nvrtcGetCUBINSize failed"); - std::vector cubin(cubin_size); - check_nvrtc(nvrtcGetCUBIN(prog, cubin.data()), "nvrtcGetCUBIN failed"); - nvrtcDestroyProgram(&prog); - - CUmodule module; - check_cu(cuModuleLoadData(&module, cubin.data()), "cuModuleLoadData failed"); - return module; -} - - -static const char* KERNEL_SOURCE = R"( -extern "C" __global__ void empty_kernel() { return; } -extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } - -extern "C" __global__ -void small_kernel_16_args( - int* a0, int* a1, int* a2, int* a3, - int* a4, int* a5, int* a6, int* a7, - int* a8, int* a9, int* a10, int* a11, - int* a12, int* a13, int* 
a14, int* a15) -{ *a0 = 0; } -)"; - - -int main(int argc, char** argv) { - bench::Options options = bench::parse_args(argc, argv); - - // Setup - check_cu(cuInit(0), "cuInit failed"); - - CUdevice device; - check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); - - CUcontext ctx; - CUctxCreateParams ctxParams = {}; - check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - - CUmodule module = compile_and_load(KERNEL_SOURCE, device); - - CUfunction empty_kernel, small_kernel, kernel_16_args; - check_cu(cuModuleGetFunction(&empty_kernel, module, "empty_kernel"), "GetFunction failed"); - check_cu(cuModuleGetFunction(&small_kernel, module, "small_kernel"), "GetFunction failed"); - check_cu(cuModuleGetFunction(&kernel_16_args, module, "small_kernel_16_args"), "GetFunction failed"); - - CUstream stream; - check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); - - // Allocate device memory for arguments - CUdeviceptr float_ptr; - check_cu(cuMemAlloc(&float_ptr, sizeof(float)), "cuMemAlloc failed"); - - CUdeviceptr int_ptrs[16]; - for (int i = 0; i < 16; ++i) { - check_cu(cuMemAlloc(&int_ptrs[i], sizeof(int)), "cuMemAlloc failed"); - } - - // Pre-pack kernel params for the pre-packed benchmark - void* packed_16[16]; - for (int i = 0; i < 16; ++i) { - packed_16[i] = &int_ptrs[i]; - } - - bench::BenchmarkSuite suite(options); - - // --- launch_empty_kernel --- - { - suite.run("launch.launch_empty_kernel", [&]() { - check_cu( - cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // Drain the stream between benchmarks so each starts with a clean queue - check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - - { - void* params[] = {&float_ptr}; - suite.run("launch.launch_small_kernel", [&]() { - check_cu( - cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - 
check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - - { - suite.run("launch.launch_16_args", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - - // In C++ the params are always pre-packed, so this is identical to launch_16_args. - // We include it for naming parity with the Python benchmark. - { - suite.run("launch.launch_16_args_pre_packed", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_small_kernel --- - { - void* params[] = {&float_ptr}; - suite.run("launch.launch_small_kernel", [&]() { - check_cu( - cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_16_args --- - { - suite.run("launch.launch_16_args", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) --- - // In C++ the params are always pre-packed, so this is identical to launch_16_args. - // We include it for naming parity with the Python benchmark. 
- { - suite.run("launch.launch_16_args_pre_packed", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // Cleanup - for (int i = 0; i < 16; ++i) { - check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed"); - } - check_cu(cuMemFree(float_ptr), "cuMemFree failed"); - check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); - check_cu(cuModuleUnload(module), "cuModuleUnload failed"); - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - - suite.write(); - - return 0; -} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp deleted file mode 100644 index 4d9afc6566..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "bench_support.hpp" - -#include -#include - - -static void check_cu(CUresult status, const char* message) { - if (status != CUDA_SUCCESS) { - const char* error_name = nullptr; - cuGetErrorName(status, &error_name); - std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; - std::exit(1); - } -} - - -int main(int argc, char** argv) { - bench::Options options = bench::parse_args(argc, argv); - - // Setup: init CUDA, allocate memory - check_cu(cuInit(0), "cuInit failed"); - - CUdevice device; - check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); - - CUcontext ctx; - CUctxCreateParams ctxParams = {}; - check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - - CUdeviceptr ptr; - check_cu(cuMemAlloc(&ptr, 1 << 18), "cuMemAlloc failed"); - - bench::BenchmarkSuite suite(options); - - // --- pointer_get_attribute --- - { - unsigned int memory_type = 0; - suite.run("pointer_attributes.pointer_get_attribute", [&]() { - check_cu( - cuPointerGetAttribute(&memory_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr), - "cuPointerGetAttribute failed" - ); - }); - } - - // Cleanup - check_cu(cuMemFree(ptr), "cuMemFree failed"); - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - - suite.write(); - - return 0; -} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp deleted file mode 100644 index 702e86aef0..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "bench_support.hpp" - -#include -#include - - -static void check_cu(CUresult status, const char* message) { - if (status != CUDA_SUCCESS) { - const char* error_name = nullptr; - cuGetErrorName(status, &error_name); - std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; - std::exit(1); - } -} - - -int main(int argc, char** argv) { - bench::Options options = bench::parse_args(argc, argv); - - // Setup - check_cu(cuInit(0), "cuInit failed"); - - CUdevice device; - check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); - - CUcontext ctx; - CUctxCreateParams ctxParams = {}; - check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - - // Persistent stream for query/synchronize benchmarks - CUstream stream; - check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); - - bench::BenchmarkSuite suite(options); - - // --- stream_create_destroy --- - { - CUstream s; - suite.run("stream.stream_create_destroy", [&]() { - check_cu(cuStreamCreate(&s, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); - check_cu(cuStreamDestroy(s), "cuStreamDestroy failed"); - }); - } - - // --- stream_query --- - { - suite.run("stream.stream_query", [&]() { - // cuStreamQuery returns CUDA_SUCCESS if stream is idle, - // CUDA_ERROR_NOT_READY if busy — both are valid here. - cuStreamQuery(stream); - }); - } - - // --- stream_synchronize --- - { - suite.run("stream.stream_synchronize", [&]() { - check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); - }); - } - - // Cleanup - check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - - suite.write(); - - return 0; -} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp deleted file mode 100644 index 837c15a9d1..0000000000 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp +++ /dev/null @@ -1,309 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace bench { - -struct Options { - std::uint64_t loops = 1000; - std::uint64_t warmups = 5; - std::uint64_t values = 20; - std::uint64_t runs = 20; - std::string output_path; - std::string benchmark_name; -}; - -// A single run result: warmup values and timed values (seconds per loop) -struct RunResult { - std::string date; - double duration_sec; - std::vector warmup_values; // seconds per loop - std::vector values; // seconds per loop -}; - -inline Options parse_args(int argc, char** argv) { - Options options; - for (int i = 1; i < argc; ++i) { - const std::string arg(argv[i]); - if (arg == "--loops" && i + 1 < argc) { - options.loops = std::strtoull(argv[++i], nullptr, 10); - continue; - } - if (arg == "--warmups" && i + 1 < argc) { - options.warmups = std::strtoull(argv[++i], nullptr, 10); - continue; - } - if (arg == "--values" && i + 1 < argc) { - options.values = std::strtoull(argv[++i], nullptr, 10); - continue; - } - if (arg == "--runs" && i + 1 < argc) { - options.runs = std::strtoull(argv[++i], nullptr, 10); - continue; - } - if ((arg == "-o" || arg == "--output") && i + 1 < argc) { - options.output_path = argv[++i]; - continue; - } - if (arg == "--name" && i + 1 < argc) { - options.benchmark_name = argv[++i]; - continue; - } - if (arg == "--help" || arg == "-h") { - std::cout << "Usage: benchmark [options]\n" - << " --loops N Loop iterations per value (default: 1000)\n" - << " --warmups N Warmup values per run (default: 5)\n" - << " --values N Timed values per run (default: 20)\n" - << " --runs N Number of runs (default: 20)\n" - << " -o, --output F Write pyperf-compatible JSON to file\n" - << " --name S Benchmark name (overrides default)\n"; - std::exit(0); - } - - std::cerr << "Unknown argument: " << arg << '\n'; - std::exit(2); - } - return options; -} - -inline std::string iso_now() 
{ - const auto now = std::chrono::system_clock::now(); - const std::time_t t = std::chrono::system_clock::to_time_t(now); - std::tm tm{}; -#ifdef _WIN32 - gmtime_s(&tm, &t); -#else - gmtime_r(&t, &tm); -#endif - char buf[64]; - std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &tm); - return std::string(buf); -} - -// Run a benchmark function. The function signature is: void fn() — one call = one operation. -// The harness calls fn() in a tight loop `loops` times per value. -template -std::vector run_benchmark(const Options& options, Fn&& fn) { - std::vector results; - results.reserve(options.runs); - - for (std::uint64_t r = 0; r < options.runs; ++r) { - RunResult run; - run.date = iso_now(); - const auto run_start = std::chrono::steady_clock::now(); - - // Warmups - for (std::uint64_t w = 0; w < options.warmups; ++w) { - const auto t0 = std::chrono::steady_clock::now(); - for (std::uint64_t i = 0; i < options.loops; ++i) { - fn(); - } - const auto t1 = std::chrono::steady_clock::now(); - const double elapsed = std::chrono::duration(t1 - t0).count(); - run.warmup_values.push_back(elapsed / static_cast(options.loops)); - } - - // Timed values - for (std::uint64_t v = 0; v < options.values; ++v) { - const auto t0 = std::chrono::steady_clock::now(); - for (std::uint64_t i = 0; i < options.loops; ++i) { - fn(); - } - const auto t1 = std::chrono::steady_clock::now(); - const double elapsed = std::chrono::duration(t1 - t0).count(); - run.values.push_back(elapsed / static_cast(options.loops)); - } - - const auto run_end = std::chrono::steady_clock::now(); - run.duration_sec = std::chrono::duration(run_end - run_start).count(); - results.push_back(std::move(run)); - } - - return results; -} - -inline void print_summary(const std::string& name, const std::vector& results) { - // Collect all timed values - std::vector all_values; - for (const auto& run : results) { - for (double v : run.values) { - all_values.push_back(v); - } - } - if (all_values.empty()) - return; - - 
double sum = 0; - for (double v : all_values) - sum += v; - - double mean = sum / static_cast(all_values.size()); - - double sq_sum = 0; - for (double v : all_values) { - double diff = v - mean; - sq_sum += diff * diff; - } - double stdev = std::sqrt(sq_sum / static_cast(all_values.size())); - - std::cout << name << ": Mean +- std dev: " - << std::fixed << std::setprecision(0) - << (mean * 1e9) << " ns +- " - << (stdev * 1e9) << " ns\n"; -} - -// Escape a JSON string (minimal — no control chars expected) -inline std::string json_str(const std::string& s) { - return "\"" + s + "\""; -} - -inline void write_pyperf_json( - const std::string& output_path, - const std::string& name, - std::uint64_t loops, - const std::vector& results -) { - std::ofstream out(output_path); - if (!out) { - std::cerr << "Failed to open output file: " << output_path << '\n'; - std::exit(3); - } - - out << std::setprecision(17); - - out << "{\"version\": \"1.0\", "; - out << "\"metadata\": {"; - out << "\"name\": " << json_str(name) << ", "; - out << "\"loops\": " << loops << ", "; - out << "\"unit\": \"second\""; - out << "}, "; - - out << "\"benchmarks\": [{\"runs\": ["; - - for (std::size_t r = 0; r < results.size(); ++r) { - const auto& run = results[r]; - if (r > 0) out << ", "; - - out << "{\"metadata\": {"; - out << "\"date\": " << json_str(run.date) << ", "; - out << "\"duration\": " << run.duration_sec; - out << "}, "; - - // Warmups: array of [loops, value] pairs - out << "\"warmups\": ["; - for (std::size_t w = 0; w < run.warmup_values.size(); ++w) { - if (w > 0) out << ", "; - out << "[" << loops << ", " << run.warmup_values[w] << "]"; - } - out << "], "; - - // Values - out << "\"values\": ["; - for (std::size_t v = 0; v < run.values.size(); ++v) { - if (v > 0) out << ", "; - out << run.values[v]; - } - out << "]}"; - } - - out << "]}]}\n"; -} - -// A collected benchmark entry: name, loops, and run results -struct BenchmarkEntry { - std::string name; - std::uint64_t loops; - 
std::vector results; -}; - -// Collect multiple benchmarks from a single binary and write them all -// to one pyperf-compatible JSON file. -class BenchmarkSuite { -public: - explicit BenchmarkSuite(Options options) : options_(std::move(options)) {} - - // Run a benchmark and record it. The name is used as the benchmark ID. - template - void run(const std::string& name, Fn&& fn) { - auto results = run_benchmark(options_, std::forward(fn)); - print_summary(name, results); - entries_.push_back({name, options_.loops, std::move(results)}); - } - - // Write all collected benchmarks to the output file (if -o was given). - void write() const { - if (options_.output_path.empty() || entries_.empty()) - return; - write_multi_pyperf_json(options_.output_path, entries_); - } - -private: - Options options_; - std::vector entries_; - - static void write_multi_pyperf_json( - const std::string& output_path, - const std::vector& entries - ) { - std::ofstream out(output_path); - if (!out) { - std::cerr << "Failed to open output file: " << output_path << '\n'; - std::exit(3); - } - - out << std::setprecision(17); - out << "{\"version\": \"1.0\", \"benchmarks\": ["; - - for (std::size_t e = 0; e < entries.size(); ++e) { - const auto& entry = entries[e]; - if (e > 0) out << ", "; - - out << "{\"metadata\": {"; - out << "\"name\": " << json_str(entry.name) << ", "; - out << "\"loops\": " << entry.loops << ", "; - out << "\"unit\": \"second\""; - out << "}, \"runs\": ["; - - for (std::size_t r = 0; r < entry.results.size(); ++r) { - const auto& run = entry.results[r]; - if (r > 0) out << ", "; - - out << "{\"metadata\": {"; - out << "\"date\": " << json_str(run.date) << ", "; - out << "\"duration\": " << run.duration_sec; - out << "}, "; - - out << "\"warmups\": ["; - for (std::size_t w = 0; w < run.warmup_values.size(); ++w) { - if (w > 0) out << ", "; - out << "[" << entry.loops << ", " << run.warmup_values[w] << "]"; - } - out << "], "; - - out << "\"values\": ["; - for (std::size_t v 
= 0; v < run.values.size(); ++v) { - if (v > 0) out << ", "; - out << run.values[v]; - } - out << "]}"; - } - out << "]}"; - } - out << "]}\n"; - } -}; - -} // namespace bench diff --git a/cuda_bindings/benchmarks/compare.py b/cuda_bindings/benchmarks/compare.py deleted file mode 100644 index 6a3e94f344..0000000000 --- a/cuda_bindings/benchmarks/compare.py +++ /dev/null @@ -1,118 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -"""Compare Python and C++ benchmark results in a summary table.""" - -import argparse -import json -import statistics -import sys -from pathlib import Path - -PROJECT_ROOT = Path(__file__).resolve().parent -DEFAULT_PYTHON = PROJECT_ROOT / "results-python.json" -DEFAULT_CPP = PROJECT_ROOT / "results-cpp.json" - - -def load_benchmarks(path: Path) -> dict[str, list[float]]: - """Load a pyperf JSON file and return {name: [values]}.""" - with open(path) as f: - data = json.load(f) - - results: dict[str, list[float]] = {} - for bench in data.get("benchmarks", []): - name = bench.get("metadata", {}).get("name", "") - if not name: - # Try to find name in run metadata - for run in bench.get("runs", []): - name = run.get("metadata", {}).get("name", "") - if name: - break - values = [] - for run in bench.get("runs", []): - values.extend(run.get("values", [])) - if name and values: - results[name] = values - return results - - -def fmt_ns(seconds: float) -> str: - ns = seconds * 1e9 - if ns >= 1000: - return f"{ns / 1000:.2f} us" - return f"{ns:.0f} ns" - - -def main() -> None: - parser = argparse.ArgumentParser(description="Compare Python vs C++ benchmark results") - parser.add_argument( - "--python", - type=Path, - default=DEFAULT_PYTHON, - help=f"Python results JSON (default: {DEFAULT_PYTHON.name})", - ) - parser.add_argument( - "--cpp", - type=Path, - default=DEFAULT_CPP, - help=f"C++ results JSON (default: {DEFAULT_CPP.name})", - ) - args = 
parser.parse_args() - - if not args.python.exists(): - print(f"Python results not found: {args.python}", file=sys.stderr) - print("Run: pixi run -e wheel bench", file=sys.stderr) - sys.exit(1) - - py_benchmarks = load_benchmarks(args.python) - cpp_benchmarks = load_benchmarks(args.cpp) if args.cpp.exists() else {} - - if not py_benchmarks: - print("No benchmarks found in Python results.", file=sys.stderr) - sys.exit(1) - - # Column widths - all_names = sorted(set(py_benchmarks) | set(cpp_benchmarks)) - name_width = max(len(n) for n in all_names) - name_width = max(name_width, len("Benchmark")) - - # Header - if cpp_benchmarks: - header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}" - sep = "-" * len(header) - print(sep) - print(header) - print(sep) - else: - header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}" - sep = "-" * len(header) - print(sep) - print(header) - print(sep) - - for name in all_names: - py_vals = py_benchmarks.get(name) - cpp_vals = cpp_benchmarks.get(name) - - py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-" - cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-" - - if py_vals and cpp_vals: - py_mean = statistics.mean(py_vals) - cpp_mean = statistics.mean(cpp_vals) - overhead_ns = (py_mean - cpp_mean) * 1e9 - overhead_str = f"+{overhead_ns:.0f} ns" - else: - overhead_str = "-" - - if cpp_benchmarks: - print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}") - else: - print(f"{name:<{name_width}} {py_str:>14}") - - print(sep) - - -if __name__ == "__main__": - main() diff --git a/cuda_bindings/benchmarks/pixi.lock b/cuda_bindings/benchmarks/pixi.lock deleted file mode 100644 index c610db2f45..0000000000 --- a/cuda_bindings/benchmarks/pixi.lock +++ /dev/null @@ -1,1767 +0,0 @@ -version: 6 -environments: - default: - channels: - - url: https://conda.anaconda.org/conda-forge/ - options: - channel-priority: disabled - pypi-prerelease-mode: 
if-necessary-or-explicit - packages: {} - source: - channels: - - url: https://conda.anaconda.org/conda-forge/ - options: - channel-priority: disabled - pypi-prerelease-mode: if-necessary-or-explicit - packages: - linux-64: - - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda - - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - - conda: .. 
- - conda: ../../cuda_pathfinder - wheel: - channels: - - url: https://conda.anaconda.org/conda-forge/ - options: - channel-priority: disabled - pypi-prerelease-mode: if-necessary-or-explicit - packages: - linux-64: - - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.1.0-py314ha160325_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.1.115-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.1.115-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.1.80-hecca717_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.1.80-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.1.115-hecca717_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda -packages: -- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda - build_number: 20 - sha256: 1dd3fffd892081df9726d7eb7e0dea6198962ba775bd88842135a4ddb4deb3c9 - md5: a9f577daf3de00bca7c3c76c0ecbd1de - depends: - - __glibc >=2.17,<3.0.a0 - - libgomp >=7.5.0 - constrains: - - openmp_impl <0.0a0 - license: BSD-3-Clause - license_family: BSD - size: 28948 - timestamp: 1770939786096 -- conda: https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda - sha256: a9c114cbfeda42a226e2db1809a538929d2f118ef855372293bd188f71711c48 - md5: 791365c5f65975051e4e017b5da3abf5 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: GPL-2.0-or-later - license_family: GPL - size: 68072 - timestamp: 1756738968573 -- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils-2.45.1-default_h4852527_101.conda - sha256: 
2851d34944b056d028543f0440fb631aeeff204151ea09589d8d9c13882395de - md5: 9902aeb08445c03fb31e01beeb173988 - depends: - - binutils_impl_linux-64 >=2.45.1,<2.45.2.0a0 - license: GPL-3.0-only - license_family: GPL - size: 35128 - timestamp: 1770267175160 -- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_101.conda - sha256: 74341b26a2b9475dc14ba3cf12432fcd10a23af285101883e720216d81d44676 - md5: 83aa53cb3f5fc849851a84d777a60551 - depends: - - ld_impl_linux-64 2.45.1 default_hbd61a6d_101 - - sysroot_linux-64 - - zstd >=1.5.7,<1.6.0a0 - license: GPL-3.0-only - license_family: GPL - size: 3744895 - timestamp: 1770267152681 -- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.45.1-default_h4852527_101.conda - sha256: 4826f97d33cbe54459970a1e84500dbe0cccf8326aaf370e707372ae20ec5a47 - md5: dec96579f9a7035a59492bf6ee613b53 - depends: - - binutils_impl_linux-64 2.45.1 default_hfdba357_101 - license: GPL-3.0-only - license_family: GPL - size: 36060 - timestamp: 1770267177798 -- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda - sha256: 0b75d45f0bba3e95dc693336fa51f40ea28c980131fec438afb7ce6118ed05f6 - md5: d2ffd7602c02f2b316fd921d39876885 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: bzip2-1.0.6 - license_family: BSD - size: 260182 - timestamp: 1771350215188 -- conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - sha256: cc9accf72fa028d31c2a038460787751127317dcfa991f8d1f1babf216bb454e - md5: 920bb03579f15389b9e512095ad995b7 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: MIT - license_family: MIT - size: 207882 - timestamp: 1765214722852 -- conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.11.0-h4d9bdce_0.conda - sha256: 8e7a40f16400d7839c82581410aa05c1f8324a693c9d50079f8c50dc9fb241f0 - md5: abd85120de1187b0d1ec305c2173c71b - depends: - - binutils - - gcc - - gcc_linux-64 14.* - 
license: BSD-3-Clause - license_family: BSD - size: 6693 - timestamp: 1753098721814 -- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda - sha256: 67cc7101b36421c5913a1687ef1b99f85b5d6868da3abbf6ec1a4181e79782fc - md5: 4492fd26db29495f0ba23f146cd5638d - depends: - - __unix - license: ISC - size: 147413 - timestamp: 1772006283803 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda - sha256: c6339858a0aaf5d939e00d345c98b99e4558f285942b27232ac098ad17ac7f8e - md5: cf45f4278afd6f4e6d03eda0f435d527 - depends: - - __glibc >=2.17,<3.0.a0 - - libffi >=3.5.2,<3.6.0a0 - - libgcc >=14 - - pycparser - - python >=3.14,<3.15.0a0 - - python_abi 3.14.* *_cp314 - license: MIT - license_family: MIT - size: 300271 - timestamp: 1761203085220 -- conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda - sha256: aa589352e61bb221351a79e5946d56916e3c595783994884accdb3b97fe9d449 - md5: 381bd45fb7aa032691f3063aff47e3a1 - depends: - - python >=3.10 - license: MIT - license_family: MIT - size: 13589 - timestamp: 1763607964133 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cmake-4.2.3-hc85cc9f_1.conda - sha256: 5ece78754577b8d9030ec1f09ce1cd481125f27d8d6fcdcfe2c1017661830c61 - md5: 51d37989c1758b5edfe98518088bf700 - depends: - - __glibc >=2.17,<3.0.a0 - - bzip2 >=1.0.8,<2.0a0 - - libcurl >=8.18.0,<9.0a0 - - libexpat >=2.7.4,<3.0a0 - - libgcc >=14 - - liblzma >=5.8.2,<6.0a0 - - libstdcxx >=14 - - libuv >=1.51.0,<2.0a0 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - rhash >=1.4.6,<2.0a0 - - zstd >=1.5.7,<1.6.0a0 - license: BSD-3-Clause - license_family: BSD - size: 22330508 - timestamp: 1771383666798 -- conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - sha256: ab29d57dc70786c1269633ba3dff20288b81664d3ff8d21af995742e2bb03287 - md5: 962b9857ee8e7018c22f2776ffa0b2d7 - depends: - - python >=3.9 - license: BSD-3-Clause - 
license_family: BSD - size: 27011 - timestamp: 1733218222191 -- conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-14.3.0-he8ccf15_18.conda - sha256: b90ec0e6a9eb22f7240b3584fe785457cff961fec68d40e6aece5d596f9bbd9a - md5: 0e3e144115c43c9150d18fa20db5f31c - depends: - - gcc_impl_linux-64 >=14.3.0,<14.3.1.0a0 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 31705 - timestamp: 1771378159534 -- conda: .. - name: cuda-bindings - version: 13.2.0 - build: hb0f4dca_0 - subdir: linux-64 - variants: - target_platform: linux-64 - depends: - - python - - cuda-version - - cuda-pathfinder - - libnvjitlink - - cuda-nvrtc - - cuda-nvrtc >=13.2.51,<14.0a0 - - cuda-nvvm - - libnvfatbin - - libcufile - - libcufile >=1.17.0.44,<2.0a0 - - libgcc >=15 - - libgcc >=15 - - libstdcxx >=15 - - python_abi 3.14.* *_cp314 - license: LicenseRef-NVIDIA-SOFTWARE-LICENSE - sources: - cuda-pathfinder: - path: ../cuda_pathfinder -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.1.0-py314ha160325_1.conda - sha256: aecfbbc9a687e5daba66b896613a00c617e3eadc21a31b19e53e8e642e83d7a7 - md5: 3bd3abdf71e1b8c53310195677bf00be - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-nvrtc >=13,<14.0a0 - - cuda-nvvm-impl >=13,<14.0a0 - - cuda-pathfinder >=1.1.0,<2 - - cuda-version >=13,<14.0a0 - - libcufile >=1,<2.0a0 - - libgcc >=14 - - libnvjitlink >=13.0,<14.0a0 - - libstdcxx >=14 - - numpy - - python >=3.14,<3.15.0a0 - - python_abi 3.14.* *_cp314 - constrains: - - cuda-python >=13.1.0,<13.2.0a0 - - cuda-cudart >=13,<14.0a0 - license: LicenseRef-NVIDIA-SOFTWARE-LICENSE - size: 7267159 - timestamp: 1764919647948 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.1.115-ha770c72_0.conda - sha256: 0715f15da71587238600f0584bc8d243d8fde602c3d8856f421b58dff3fb9422 - md5: a179486129ff28d053bb16fdb533568e - depends: - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1277295 - 
timestamp: 1768272295906 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda - sha256: e539baa32e3be63f89bd11d421911363faac322903caf58a15a46ba68ae29867 - md5: 4910b7b709f1168baffc2a742b39a222 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1415308 - timestamp: 1773098874302 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.1.115-ha770c72_0.conda - sha256: 82ae1f3e492146722e258e237daa537f4d4df8157b2dfa49a0869eb41a11d284 - md5: 3723bca2a84e6cc0f0a98427b71bec73 - depends: - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 96480 - timestamp: 1768280269206 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda - sha256: dd9a74a40b196b1ea150b17ca8fb539dd8f75edd349af354a7bae6dbb43e43b4 - md5: 6f4a609f3d142d4b22728823955249e9 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 97122 - timestamp: 1773115163637 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.1.80-hecca717_0.conda - sha256: 00acb7564e7c7dd60be431bd2a1a937856e38a86535d72281461cd193500a0a4 - md5: 2e2b71c8d67f6ceb1d3820aa438f3580 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart_linux-64 13.1.80 h376f20c_0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24159 - timestamp: 1764883525821 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda - sha256: 9cc44fd4914738a32cf5c801925a08c61ce45b5534833cf1df1621236a9a321d - md5: 29f5b46965bd82b0e9cc27a96d13f2bd - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart_linux-64 13.2.51 h376f20c_0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24534 - 
timestamp: 1773104357094 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.1.80-hecca717_0.conda - sha256: 12aa5dcf82cdf863be18a48a9ad4d271aa864ef985752bc9707371b84085f0c8 - md5: e3cbe24bf8ae135e9f82450be520e886 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart 13.1.80 hecca717_0 - - cuda-cudart-dev_linux-64 13.1.80 h376f20c_0 - - cuda-cudart-static 13.1.80 hecca717_0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24597 - timestamp: 1764883573873 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda - sha256: f6d81c961b6212389c07ffc9dc1268966db63aa351d46875effee40447eb9dd8 - md5: 9b35a56418b6cbbde5ea5f7d84c26317 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart 13.2.51 hecca717_0 - - cuda-cudart-dev_linux-64 13.2.51 h376f20c_0 - - cuda-cudart-static 13.2.51 hecca717_0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24961 - timestamp: 1773104406956 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.1.80-h376f20c_0.conda - sha256: 41a1cc86f2759ef6ae47cc68e2180baaeb4b989709931366ee0cdc90f8e10f5f - md5: a36776a49ae0e47a26e129bdc82aeb3e - depends: - - cuda-cccl_linux-64 - - cuda-cudart-static_linux-64 - - cuda-cudart_linux-64 - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 392459 - timestamp: 1764883538793 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda - sha256: 86dd0dc301bab5263d63f13d47b02507e0cf2fd22ff9aefa37dea2dd03c6df83 - md5: 7e5cf4b991525b7b1a2cfa3f1c81462e - depends: - - cuda-cccl_linux-64 - - cuda-cudart-static_linux-64 - - cuda-cudart_linux-64 - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 399921 - timestamp: 1773104368666 
-- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda - sha256: 7cbf145b3e59d360052556bfe9425753b119c33cbba0c1f20f0191a7330ced5c - md5: 0e5edde73725a13f7d62ddf96b7656b9 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart-static_linux-64 13.1.80 h376f20c_0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24119 - timestamp: 1764883551735 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda - sha256: d4a316038b02161e04a864c8cd146d2ec62cbd114eb951197c6ef6042d3c46c4 - md5: daec4c4dc0355adcdf009dceb3b94259 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-cudart-static_linux-64 13.2.51 h376f20c_0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24494 - timestamp: 1773104383494 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda - sha256: 2252e12fa9a806f685684b6395a660d845dc95bdc95e52a6bc09dba8a9eccec3 - md5: be9f8ef5a01fca1f28c8d523f8501771 - depends: - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1121385 - timestamp: 1764883490595 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - sha256: e3cc51809bd8be0a96bbe01a668f08e6e611c8fba60426c4d9f10926f3159456 - md5: aa9c7d5cd427042ffbd59c9ef6014f98 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1103784 - timestamp: 1773104321614 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - sha256: fca2951815564c36cf5a4e0f7ed0222429d206fda3d4e1aa3d52a969a293b868 - md5: 4dc4c3a1e010e06035f01d661c1b70bd - depends: - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - 
size: 199654 - timestamp: 1764883502803 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - sha256: e1d943a5582c8e171c9dcf2c0c72ddd5bf0a2ac9acd6ed15898d69d618cf53c6 - md5: 51a1624c7e26d8821b5d959ee7ecb517 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 203460 - timestamp: 1773104333900 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.1.80-h376f20c_0.conda - sha256: 83bf37d5a3b4a85853cded6a8b90db302b014845b7d9461ccdb84db8c2abfbc3 - md5: 1d7073905d0359ff234545494a933d59 - depends: - - cuda-version >=13.1,<13.2.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 38992 - timestamp: 1764883514338 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda - sha256: 1b372b7af937a3a2fdb1cbd5356e6b365f3495d899a413ebf98369ab0c5c0c79 - md5: 970891239574056829fc1cfc208278a7 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 39485 - timestamp: 1773104345638 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda - sha256: 9cc4f9df70c02eea5121cdb0e865207b04cd52591f57ebcac2ba44fada10eb5b - md5: df16c9049d882cdaf4f83a5b90079589 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35339417 - timestamp: 1768272955912 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - sha256: 9de235d328b7124f715805715e9918eb7f8aa5b9c56a2afa62b84f84f98077a5 - md5: 0413baaa73be1a39d5d8e442184acc78 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35736655 - timestamp: 1773100338749 -- conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda - sha256: 2c929c592ca1909e3944edec62b77403d256156a4010bfa17fb0b948d33e54d3 - md5: 1096fce4abad7dd975ce6d9953fceb6a - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-nvrtc 13.1.115 hecca717_0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - constrains: - - cuda-nvrtc-static >=13.1.115 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35845 - timestamp: 1768273073971 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda - sha256: be60eb4e84ff4846b27b323eca402b075f52caf6c138ebb06268fbaa26ef1879 - md5: 83535200a9e77165d5291b4ac82ebf6a - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-nvrtc 13.2.51 hecca717_0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - constrains: - - cuda-nvrtc-static >=13.2.51 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 36305 - timestamp: 1773100458841 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - sha256: d0111ba8fa12b96d38989d2016ecec0c11410c0e566d839ed54f3925591efb0b - md5: 03cd3639b8e13623c7b91b1cb0136402 - depends: - - cuda-nvvm-dev_linux-64 13.2.51.* - - cuda-nvvm-impl 13.2.51.* - - cuda-nvvm-tools 13.2.51.* - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 25494 - timestamp: 1773157399568 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - sha256: f00fce92bf7f1da314654f7693f571a014aaa2ba1fae3762634f3e5be254da83 - md5: 57724ac113f7435762d0c39e1b1ad341 - depends: - - cuda-version >=13.2,<13.3.0a0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 28399 - timestamp: 1773115185916 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda - sha256: 12d84615684f1279799c023ce4ccc7c34f151bec2a90e0c8d04798a8c8af437c - md5: bf76661bc0de83a60537c4913f339fb3 - depends: - - __glibc 
>=2.17,<3.0.a0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=12 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 21873791 - timestamp: 1768280315627 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda - sha256: bea7cbd2ff0f8bf07e0b90d522b4834533b4024237322c09f1b3875970c4abc9 - md5: 3c3872ff2bd6cc6368dcd4b35bb995f2 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=12 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 22202489 - timestamp: 1773115209641 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda - sha256: da5fd2dc57df2047215ff76f295685b1e1e586a46c2e46214120458cee18ee80 - md5: 2df6cd3b3d6d1365a2979285703056f9 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=12 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 25988523 - timestamp: 1773115248060 -- conda: ../../cuda_pathfinder - name: cuda-pathfinder - version: 1.3.4a0 - build: pyh4616a5c_0 - subdir: noarch - variants: - target_platform: noarch - depends: - - python >=3.10 - - python * - license: Apache-2.0 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda - sha256: edf16fdfbcce5bbb445118fd8d070dda8afe36b4b437a94f472fde153bc38151 - md5: 2d13e524da66b60e6e7d5c6585729ea8 - depends: - - python >=3.10 - - cuda-version >=12.0,<14 - - python - license: Apache-2.0 - license_family: APACHE - size: 39327 - timestamp: 1772059437166 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda - sha256: 176ac20fdb95611af8fb2bf0d3d16fee998019b1d0f12fc9ddd5fa0df4553992 - md5: d85448460c25ee43ff2f8346bb9ad52b - constrains: - - cudatoolkit 13.1|13.1.* - - __cuda >=13 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 21511 - timestamp: 1757017115788 -- conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda - sha256: 64aebe8ccb3a2c3ff446d3c0c0e88ef4fdb069a5732c03539bf3a37243c4c679 - md5: 45676e3dd76b30ec613f1f822d450eff - constrains: - - __cuda >=13 - - cudatoolkit 13.2|13.2.* - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 21908 - timestamp: 1773093709154 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda - sha256: 3fcc97ae3e89c150401a50a4de58794ffc67b1ed0e1851468fcc376980201e25 - md5: 5da8c935dca9186673987f79cef0b2a5 - depends: - - c-compiler 1.11.0 h4d9bdce_0 - - gxx - - gxx_linux-64 14.* - license: BSD-3-Clause - license_family: BSD - size: 6635 - timestamp: 1753098722177 -- conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda - sha256: 6d977f0b2fc24fee21a9554389ab83070db341af6d6f09285360b2e09ef8b26e - md5: 003b8ba0a94e2f1e117d0bd46aebc901 - depends: - - python >=3.9 - license: Apache-2.0 - license_family: APACHE - size: 275642 - timestamp: 1752823081585 -- conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda - sha256: ee6cf346d017d954255bbcbdb424cddea4d14e4ed7e9813e429db1d795d01144 - md5: 8e662bd460bda79b1ea39194e3c4c9ab - depends: - - python >=3.10 - - typing_extensions >=4.6.0 - license: MIT and PSF-2.0 - size: 21333 - timestamp: 1763918099466 -- conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda - sha256: dddea9ec53d5e179de82c24569d41198f98db93314f0adae6b15195085d5567f - md5: f58064cec97b12a7136ebb8a6f8a129b - depends: - - python >=3.10 - license: Unlicense - size: 25845 - timestamp: 1773314012590 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda - sha256: 9b34b57b06b485e33a40d430f71ac88c8f381673592507cf7161c50ff0832772 - md5: 52d6457abc42e320787ada5f9033fa99 - depends: - - conda-gcc-specs - - gcc_impl_linux-64 14.3.0 hbdf3cc3_18 - license: BSD-3-Clause - license_family: BSD - 
size: 29506 - timestamp: 1771378321585 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-14.3.0-hbdf3cc3_18.conda - sha256: 3b31a273b806c6851e16e9cf63ef87cae28d19be0df148433f3948e7da795592 - md5: 30bb690150536f622873758b0e8d6712 - depends: - - binutils_impl_linux-64 >=2.45 - - libgcc >=14.3.0 - - libgcc-devel_linux-64 14.3.0 hf649bbc_118 - - libgomp >=14.3.0 - - libsanitizer 14.3.0 h8f1669f_18 - - libstdcxx >=14.3.0 - - libstdcxx-devel_linux-64 14.3.0 h9f08a49_118 - - sysroot_linux-64 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 76302378 - timestamp: 1771378056505 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-14.3.0-h298d278_21.conda - sha256: 27ad0cd10dccffca74e20fb38c9f8643ff8fce56eee260bf89fa257d5ab0c90a - md5: 1403ed5fe091bd7442e4e8a229d14030 - depends: - - gcc_impl_linux-64 14.3.0.* - - binutils_linux-64 - - sysroot_linux-64 - license: BSD-3-Clause - license_family: BSD - size: 28946 - timestamp: 1770908213807 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-14.3.0-h76987e4_18.conda - sha256: 1b490c9be9669f9c559db7b2a1f7d8b973c58ca0c6f21a5d2ba3f0ab2da63362 - md5: 19189121d644d4ef75fed05383bc75f5 - depends: - - gcc 14.3.0 h0dff253_18 - - gxx_impl_linux-64 14.3.0 h2185e75_18 - license: BSD-3-Clause - license_family: BSD - size: 28883 - timestamp: 1771378355605 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-14.3.0-h2185e75_18.conda - sha256: 38ffca57cc9c264d461ac2ce9464a9d605e0f606d92d831de9075cb0d95fc68a - md5: 6514b3a10e84b6a849e1b15d3753eb22 - depends: - - gcc_impl_linux-64 14.3.0 hbdf3cc3_18 - - libstdcxx-devel_linux-64 14.3.0 h9f08a49_118 - - sysroot_linux-64 - - tzdata - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 14566100 - timestamp: 1771378271421 -- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-14.3.0-he467f4b_21.conda - sha256: 
1e07c197e0779fa9105e59cd55a835ded96bfde59eb169439736a89b27b48e5d - md5: 7b51f4ff82eeb1f386bfee20a7bed3ed - depends: - - gxx_impl_linux-64 14.3.0.* - - gcc_linux-64 ==14.3.0 h298d278_21 - - binutils_linux-64 - - sysroot_linux-64 - license: BSD-3-Clause - license_family: BSD - size: 27503 - timestamp: 1770908213813 -- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda - sha256: 142a722072fa96cf16ff98eaaf641f54ab84744af81754c292cb81e0881c0329 - md5: 186a18e3ba246eccfc7cff00cd19a870 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - libstdcxx >=14 - license: MIT - license_family: MIT - size: 12728445 - timestamp: 1767969922681 -- conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.17-pyhd8ed1ab_0.conda - sha256: 7cd5eccdb171a0adbf83a1ad8fc4e17822f4fc3f5518da9040de64e88bc07343 - md5: 5b7ae2ec4e0750e094f804a6cf1b2a37 - depends: - - python >=3.10 - - ukkonen - license: MIT - license_family: MIT - size: 79520 - timestamp: 1772402363021 -- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda - sha256: c18ab120a0613ada4391b15981d86ff777b5690ca461ea7e9e49531e8f374745 - md5: 63ccfdc3a3ce25b027b8767eb722fca8 - depends: - - python >=3.9 - - zipp >=3.20 - - python - license: Apache-2.0 - license_family: APACHE - size: 34641 - timestamp: 1747934053147 -- conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda - sha256: e1a9e3b1c8fe62dc3932a616c284b5d8cbe3124bbfbedcf4ce5c828cb166ee19 - md5: 9614359868482abba1bd15ce465e3c42 - depends: - - python >=3.10 - license: MIT - license_family: MIT - size: 13387 - timestamp: 1760831448842 -- conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda - sha256: 41557eeadf641de6aeae49486cef30d02a6912d8da98585d687894afd65b356a - md5: 86d9cba083cd041bfbf242a01a7a1999 - constrains: - - sysroot_linux-64 ==2.28 - license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND 
GPL-2.0-or-later - license_family: GPL - size: 1278712 - timestamp: 1765578681495 -- conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda - sha256: 0960d06048a7185d3542d850986d807c6e37ca2e644342dd0c72feefcf26c2a4 - md5: b38117a3c920364aff79f870c984b4a3 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: LGPL-2.1-or-later - size: 134088 - timestamp: 1754905959823 -- conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda - sha256: 3e307628ca3527448dd1cb14ad7bb9d04d1d28c7d4c5f97ba196ae984571dd25 - md5: fb53fb07ce46a575c5d004bbc96032c2 - depends: - - __glibc >=2.17,<3.0.a0 - - keyutils >=1.6.3,<2.0a0 - - libedit >=3.1.20250104,<3.2.0a0 - - libedit >=3.1.20250104,<4.0a0 - - libgcc >=14 - - libstdcxx >=14 - - openssl >=3.5.5,<4.0a0 - license: MIT - license_family: MIT - size: 1386730 - timestamp: 1769769569681 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda - sha256: 565941ac1f8b0d2f2e8f02827cbca648f4d18cd461afc31f15604cd291b5c5f3 - md5: 12bd9a3f089ee6c9266a37dab82afabd - depends: - - __glibc >=2.17,<3.0.a0 - - zstd >=1.5.7,<1.6.0a0 - constrains: - - binutils_impl_linux-64 2.45.1 - license: GPL-3.0-only - license_family: GPL - size: 725507 - timestamp: 1770267139900 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - build_number: 5 - sha256: 18c72545080b86739352482ba14ba2c4815e19e26a7417ca21a95b76ec8da24c - md5: c160954f7418d7b6e87eaf05a8913fa9 - depends: - - libopenblas >=0.3.30,<0.3.31.0a0 - - libopenblas >=0.3.30,<1.0a0 - constrains: - - mkl <2026 - - liblapack 3.11.0 5*_openblas - - libcblas 3.11.0 5*_openblas - - blas 2.305 openblas - - liblapacke 3.11.0 5*_openblas - license: BSD-3-Clause - license_family: BSD - size: 18213 - timestamp: 1765818813880 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - sha256: 
9517cce5193144af0fcbf19b7bd67db0a329c2cc2618f28ffecaa921a1cbe9d3 - md5: 09c264d40c67b82b49a3f3b89037bd2e - depends: - - __glibc >=2.17,<3.0.a0 - - attr >=2.5.2,<2.6.0a0 - - libgcc >=14 - license: BSD-3-Clause - license_family: BSD - size: 121429 - timestamp: 1762349484074 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - build_number: 5 - sha256: 0cbdcc67901e02dc17f1d19e1f9170610bd828100dc207de4d5b6b8ad1ae7ad8 - md5: 6636a2b6f1a87572df2970d3ebc87cc0 - depends: - - libblas 3.11.0 5_h4a7cf45_openblas - constrains: - - liblapacke 3.11.0 5*_openblas - - blas 2.305 openblas - - liblapack 3.11.0 5*_openblas - license: BSD-3-Clause - license_family: BSD - size: 18194 - timestamp: 1765818837135 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda - sha256: 8c44b5bf947afad827df0df49fe7483cf1b2916694081b2db4fecdfd6a2bacd1 - md5: 48418c48dac04671fa46cb446122b8a5 - depends: - - __glibc >=2.28,<3.0.a0 - - cuda-version >=13.1,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - - rdma-core >=60.0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 990938 - timestamp: 1768273732081 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda - sha256: dc2b0c43aeacbaa686061353807e718236d8c5b346f624e76fed98b066898e19 - md5: 6d8ed8335d144ec7303b8d3587b2205c - depends: - - __glibc >=2.28,<3.0.a0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - - rdma-core >=61.0 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1085341 - timestamp: 1773100191342 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda - sha256: c84e8dccb65ad5149c0121e4b54bdc47fa39303fd5f4979b8c44bb51b39a369b - md5: 1707cdd636af2ff697b53186572c9f77 - depends: - - __glibc >=2.17,<3.0.a0 - - krb5 >=1.22.2,<1.23.0a0 - - libgcc >=14 - - libnghttp2 >=1.67.0,<2.0a0 - - libssh2 >=1.11.1,<2.0a0 - - libzlib >=1.3.1,<2.0a0 - - 
openssl >=3.5.5,<4.0a0 - - zstd >=1.5.7,<1.6.0a0 - license: curl - license_family: MIT - size: 463621 - timestamp: 1770892808818 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda - sha256: d789471216e7aba3c184cd054ed61ce3f6dac6f87a50ec69291b9297f8c18724 - md5: c277e0a4d549b03ac1e9d6cbbe3d017b - depends: - - ncurses - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - - ncurses >=6.5,<7.0a0 - license: BSD-2-Clause - license_family: BSD - size: 134676 - timestamp: 1738479519902 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda - sha256: 1cd6048169fa0395af74ed5d8f1716e22c19a81a8a36f934c110ca3ad4dd27b4 - md5: 172bf1cd1ff8629f2b1179945ed45055 - depends: - - libgcc-ng >=12 - license: BSD-2-Clause - license_family: BSD - size: 112766 - timestamp: 1702146165126 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda - sha256: d78f1d3bea8c031d2f032b760f36676d87929b18146351c4464c66b0869df3f5 - md5: e7f7ce06ec24cfcfb9e36d28cf82ba57 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - constrains: - - expat 2.7.4.* - license: MIT - license_family: MIT - size: 76798 - timestamp: 1771259418166 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda - sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 - md5: a360c33a5abe61c07959e449fa1453eb - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: MIT - license_family: MIT - size: 58592 - timestamp: 1769456073053 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda - sha256: faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 - md5: 0aa00f03f9e39fb9876085dee11a85d4 - depends: - - __glibc >=2.17,<3.0.a0 - - _openmp_mutex >=4.5 - constrains: - - libgcc-ng ==15.2.0=*_18 - - libgomp 15.2.0 he0feb66_18 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 1041788 - timestamp: 1771378212382 
-- conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-14.3.0-hf649bbc_118.conda - sha256: 1abc6a81ee66e8ac9ac09a26e2d6ad7bba23f0a0cc3a6118654f036f9c0e1854 - md5: 06901733131833f5edd68cf3d9679798 - depends: - - __unix - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 3084533 - timestamp: 1771377786730 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda - sha256: e318a711400f536c81123e753d4c797a821021fb38970cebfb3f454126016893 - md5: d5e96b1ed75ca01906b3d2469b4ce493 - depends: - - libgcc 15.2.0 he0feb66_18 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 27526 - timestamp: 1771378224552 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda - sha256: d2c9fad338fd85e4487424865da8e74006ab2e2475bd788f624d7a39b2a72aee - md5: 9063115da5bc35fdc3e1002e69b9ef6e - depends: - - libgfortran5 15.2.0 h68bc16d_18 - constrains: - - libgfortran-ng ==15.2.0=*_18 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 27523 - timestamp: 1771378269450 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda - sha256: 539b57cf50ec85509a94ba9949b7e30717839e4d694bc94f30d41c9d34de2d12 - md5: 646855f357199a12f02a87382d429b75 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=15.2.0 - constrains: - - libgfortran 15.2.0 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 2482475 - timestamp: 1771378241063 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda - sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 - md5: 239c5e9546c38a1e884d69effcf4c882 - depends: - - __glibc >=2.17,<3.0.a0 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 603262 - timestamp: 1771378117851 -- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda 
- build_number: 5 - sha256: c723b6599fcd4c6c75dee728359ef418307280fa3e2ee376e14e85e5bbdda053 - md5: b38076eb5c8e40d0106beda6f95d7609 - depends: - - libblas 3.11.0 5_h4a7cf45_openblas - constrains: - - blas 2.305 openblas - - liblapacke 3.11.0 5*_openblas - - libcblas 3.11.0 5*_openblas - license: BSD-3-Clause - license_family: BSD - size: 18200 - timestamp: 1765818857876 -- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda - sha256: 755c55ebab181d678c12e49cced893598f2bab22d582fbbf4d8b83c18be207eb - md5: c7c83eecbb72d88b940c249af56c8b17 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - constrains: - - xz 5.8.2.* - license: 0BSD - size: 113207 - timestamp: 1768752626120 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda - sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 - md5: 2c21e66f50753a083cbe6b80f38268fa - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: BSD-2-Clause - license_family: BSD - size: 92400 - timestamp: 1769482286018 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda - sha256: a4a7dab8db4dc81c736e9a9b42bdfd97b087816e029e221380511960ac46c690 - md5: b499ce4b026493a13774bcf0f4c33849 - depends: - - __glibc >=2.17,<3.0.a0 - - c-ares >=1.34.5,<2.0a0 - - libev >=4.33,<4.34.0a0 - - libev >=4.33,<5.0a0 - - libgcc >=14 - - libstdcxx >=14 - - libzlib >=1.3.1,<2.0a0 - - openssl >=3.5.2,<4.0a0 - license: MIT - license_family: MIT - size: 666600 - timestamp: 1756834976695 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda - sha256: ba7c5d294e3d80f08ac5a39564217702d1a752e352e486210faff794ac5001b4 - md5: db63358239cbe1ff86242406d440e44a - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: LGPL-2.1-or-later - license_family: LGPL - size: 741323 - timestamp: 1731846827427 -- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda - sha256: 66b7bbe40d259e4927b9c264569afd49d0e31a3813c585beea63f3415577f1b3 - md5: 7e6534bce7252c84efdedae1fae2148e - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13.2,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 471076 - timestamp: 1773100181931 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.1.115-hecca717_1.conda - sha256: 6b5300bf9952da4bfdbfb45c13b042d786a0daffb1bd2fa45ea9ad971703fe96 - md5: 851acc1af02d31c732b931b9ffddc2d9 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13,<13.2.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 31328660 - timestamp: 1771443943495 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda - sha256: 2ca45a2c9e6cc307cea3c8a1bf27bceb745fa5e1150d7b768b63a781eeaee7a2 - md5: 20a82402e6851e5d4e0b13ee1083d370 - depends: - - __glibc >=2.17,<3.0.a0 - - cuda-version >=13,<13.3.0a0 - - libgcc >=14 - - libstdcxx >=14 - license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 31691081 - timestamp: 1773100788615 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda - sha256: 199d79c237afb0d4780ccd2fbf829cea80743df60df4705202558675e07dd2c5 - md5: be43915efc66345cccb3c310b6ed0374 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - libgfortran - - libgfortran5 >=14.3.0 - constrains: - - openblas >=0.3.30,<0.3.31.0a0 - license: BSD-3-Clause - license_family: BSD - size: 5927939 - timestamp: 1763114673331 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda - sha256: e03ed186eefb46d7800224ad34bad1268c9d19ecb8f621380a50601c6221a4a7 - md5: ad3a0e2dc4cce549b2860e2ef0e6d75b - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14.3.0 - - libstdcxx >=14.3.0 - license: 
GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 7949259 - timestamp: 1771377982207 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda - sha256: 04596fcee262a870e4b7c9807224680ff48d4d0cc0dac076a602503d3dc6d217 - md5: da5be73701eecd0e8454423fd6ffcf30 - depends: - - __glibc >=2.17,<3.0.a0 - - icu >=78.2,<79.0a0 - - libgcc >=14 - - libzlib >=1.3.1,<2.0a0 - license: blessing - size: 942808 - timestamp: 1768147973361 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda - sha256: fa39bfd69228a13e553bd24601332b7cfeb30ca11a3ca50bb028108fe90a7661 - md5: eecce068c7e4eddeb169591baac20ac4 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - - libzlib >=1.3.1,<2.0a0 - - openssl >=3.5.0,<4.0a0 - license: BSD-3-Clause - license_family: BSD - size: 304790 - timestamp: 1745608545575 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda - sha256: 78668020064fdaa27e9ab65cd2997e2c837b564ab26ce3bf0e58a2ce1a525c6e - md5: 1b08cd684f34175e4514474793d44bcb - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc 15.2.0 he0feb66_18 - constrains: - - libstdcxx-ng ==15.2.0=*_18 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 5852330 - timestamp: 1771378262446 -- conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-14.3.0-h9f08a49_118.conda - sha256: b1c3824769b92a1486bf3e2cc5f13304d83ae613ea061b7bc47bb6080d6dfdba - md5: 865a399bce236119301ebd1532fced8d - depends: - - __unix - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 20171098 - timestamp: 1771377827750 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda - sha256: f0356bb344a684e7616fc84675cfca6401140320594e8686be30e8ac7547aed2 - md5: 1d4c18d75c51ed9d00092a891a547a7d - depends: - - __glibc >=2.17,<3.0.a0 - - libcap >=2.77,<2.78.0a0 - - libgcc >=14 - license: LGPL-2.1-or-later - size: 491953 
- timestamp: 1770738638119 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda - sha256: ed4d2c01fbeb1330f112f7e399408634db277d3dfb2dec1d0395f56feaa24351 - md5: 6c74fba677b61a0842cbf0f63eee683b - depends: - - __glibc >=2.17,<3.0.a0 - - libcap >=2.77,<2.78.0a0 - - libgcc >=14 - license: LGPL-2.1-or-later - size: 144654 - timestamp: 1770738650966 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda - sha256: 1a7539cfa7df00714e8943e18de0b06cceef6778e420a5ee3a2a145773758aee - md5: db409b7c1720428638e7c0d509d3e1b5 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: BSD-3-Clause - license_family: BSD - size: 40311 - timestamp: 1766271528534 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda - sha256: c180f4124a889ac343fc59d15558e93667d894a966ec6fdb61da1604481be26b - md5: 0f03292cc56bf91a077a134ea8747118 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - license: MIT - license_family: MIT - size: 895108 - timestamp: 1753948278280 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 - md5: edb0dca6bc32e4f4789199455a1dbeb8 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - constrains: - - zlib 1.3.1 *_2 - license: Zlib - license_family: Other - size: 60963 - timestamp: 1727963148474 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - sha256: 3fde293232fa3fca98635e1167de6b7c7fda83caf24b9d6c91ec9eefb4f4d586 - md5: 47e340acb35de30501a76c7c799c41d7 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: X11 AND BSD-3-Clause - size: 891641 - timestamp: 1738195959188 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.2-h171cf75_0.conda - sha256: 6f7d59dbec0a7b00bf5d103a4306e8886678b796ff2151b62452d4582b2a53fb - md5: b518e9e92493721281a60fa975bddc65 - depends: - - libstdcxx 
>=14 - - libgcc >=14 - - __glibc >=2.17,<3.0.a0 - license: Apache-2.0 - license_family: APACHE - size: 186323 - timestamp: 1763688260928 -- conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - sha256: 4fa40e3e13fc6ea0a93f67dfc76c96190afd7ea4ffc1bac2612d954b42cdc3ee - md5: eb52d14a901e23c39e9e7b4a1a5c015f - depends: - - python >=3.10 - - setuptools - license: BSD-3-Clause - license_family: BSD - size: 40866 - timestamp: 1766261270149 -- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda - sha256: 1d8377c8001c15ed12c2713b723213474b435706ab9d34ede69795d64af9e94d - md5: 4ea6b620fdf24a1a0bc4f1c7134dfafb - depends: - - python - - libstdcxx >=14 - - libgcc >=14 - - __glibc >=2.17,<3.0.a0 - - libcblas >=3.9.0,<4.0a0 - - python_abi 3.14.* *_cp314 - - libblas >=3.9.0,<4.0a0 - - liblapack >=3.9.0,<4.0a0 - constrains: - - numpy-base <0a0 - license: BSD-3-Clause - license_family: BSD - size: 8926994 - timestamp: 1770098474394 -- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda - sha256: 44c877f8af015332a5d12f5ff0fb20ca32f896526a7d0cdb30c769df1144fb5c - md5: f61eb8cd60ff9057122a3d338b99c00f - depends: - - __glibc >=2.17,<3.0.a0 - - ca-certificates - - libgcc >=14 - license: Apache-2.0 - license_family: Apache - size: 3164551 - timestamp: 1769555830639 -- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda - sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 - md5: b76541e68fea4d511b1ac46a28dcd2c6 - depends: - - python >=3.8 - - python - license: Apache-2.0 - license_family: APACHE - size: 72010 - timestamp: 1769093650580 -- conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda - sha256: 0289f0a38337ee201d984f8f31f11f6ef076cfbbfd0ab9181d12d9d1d099bf46 - md5: 82c1787f2a65c0155ef9652466ee98d6 - depends: - - python >=3.10 - - python - license: MIT - license_family: MIT - size: 
25646 - timestamp: 1773199142345 -- conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda - sha256: e14aafa63efa0528ca99ba568eaf506eb55a0371d12e6250aaaa61718d2eb62e - md5: d7585b6550ad04c8c5e21097ada2888e - depends: - - python >=3.9 - - python - license: MIT - license_family: MIT - size: 25877 - timestamp: 1764896838868 -- conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.5.1-pyha770c72_0.conda - sha256: 5b81b7516d4baf43d0c185896b245fa7384b25dc5615e7baa504b7fa4e07b706 - md5: 7f3ac694319c7eaf81a0325d6405e974 - depends: - - cfgv >=2.0.0 - - identify >=1.0.0 - - nodeenv >=0.11.1 - - python >=3.10 - - pyyaml >=5.1 - - virtualenv >=20.10.0 - license: MIT - license_family: MIT - size: 200827 - timestamp: 1765937577534 -- conda: https://conda.anaconda.org/conda-forge/noarch/py-cpuinfo-9.0.0-pyhd8ed1ab_1.conda - sha256: 6d8f03c13d085a569fde931892cded813474acbef2e03381a1a87f420c7da035 - md5: 46830ee16925d5ed250850503b5dc3a8 - depends: - - python >=3.9 - license: MIT - license_family: MIT - size: 25766 - timestamp: 1733236452235 -- conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda - sha256: 79db7928d13fab2d892592223d7570f5061c192f27b9febd1a418427b719acc6 - md5: 12c566707c80111f9799308d9e265aef - depends: - - python >=3.9 - - python - license: BSD-3-Clause - license_family: BSD - size: 110100 - timestamp: 1733195786147 -- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a - md5: 6b6ece66ebcae2d5f326c77ef2c5a066 - depends: - - python >=3.9 - license: BSD-2-Clause - license_family: BSD - size: 889287 - timestamp: 1750615908735 -- conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda - sha256: 438c41b42530874928733299ca815f5994f36996c86024f3f37ca220ed910a07 - md5: ed166875b3876d5d7e6e39d2e8d1c6e3 - depends: - - python >=3.14,<3.15.0a0 - - python_abi 
3.14.* *_cp314 - - six - license: MIT - license_family: MIT - size: 273897 - timestamp: 1765980972868 -- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda - sha256: 9e749fb465a8bedf0184d8b8996992a38de351f7c64e967031944978de03a520 - md5: 2b694bad8a50dc2f712f5368de866480 - depends: - - pygments >=2.7.2 - - python >=3.10 - - iniconfig >=1.0.1 - - packaging >=22 - - pluggy >=1.5,<2 - - tomli >=1 - - colorama >=0.4 - - exceptiongroup >=1 - - python - constrains: - - pytest-faulthandler >=2 - license: MIT - license_family: MIT - size: 299581 - timestamp: 1765062031645 -- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda - sha256: 2f2229415a6e5387c1faaedf442ea8c07471cb2bf5ad1007b9cfb83ea85ca29a - md5: 0e7294ed4af8b833fcd2c101d647c3da - depends: - - py-cpuinfo - - pytest >=8.1 - - python >=3.10 - license: BSD-2-Clause - license_family: BSD - size: 43976 - timestamp: 1762716480208 -- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda - build_number: 101 - sha256: cb0628c5f1732f889f53a877484da98f5a0e0f47326622671396fb4f2b0cd6bd - md5: c014ad06e60441661737121d3eae8a60 - depends: - - __glibc >=2.17,<3.0.a0 - - bzip2 >=1.0.8,<2.0a0 - - ld_impl_linux-64 >=2.36.1 - - libexpat >=2.7.3,<3.0a0 - - libffi >=3.5.2,<3.6.0a0 - - libgcc >=14 - - liblzma >=5.8.2,<6.0a0 - - libmpdec >=4.0.0,<5.0a0 - - libsqlite >=3.51.2,<4.0a0 - - libuuid >=2.41.3,<3.0a0 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - openssl >=3.5.5,<4.0a0 - - python_abi 3.14.* *_cp314 - - readline >=8.3,<9.0a0 - - tk >=8.6.13,<8.7.0a0 - - tzdata - - zstd >=1.5.7,<1.6.0a0 - license: Python-2.0 - size: 36702440 - timestamp: 1770675584356 - python_site_packages_path: lib/python3.14/site-packages -- conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.1.3-pyhcf101f3_0.conda - sha256: 36429765f626c345710fbae14aeeda676c1745427667eb480bb855b7089affba - md5: 
69fc0a99fc21b26b81026c72e00f83df - depends: - - python >=3.10 - - filelock >=3.15.4 - - platformdirs <5,>=4.3.6 - - python - license: MIT - license_family: MIT - size: 33996 - timestamp: 1773161039118 -- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda - build_number: 8 - sha256: ad6d2e9ac39751cc0529dd1566a26751a0bf2542adb0c232533d32e176e21db5 - md5: 0539938c55b6b1a59b560e843ad864a4 - constrains: - - python 3.14.* *_cp314 - license: BSD-3-Clause - license_family: BSD - size: 6989 - timestamp: 1752805904792 -- conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda - sha256: b318fb070c7a1f89980ef124b80a0b5ccf3928143708a85e0053cde0169c699d - md5: 2035f68f96be30dc60a5dfd7452c7941 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - python >=3.14,<3.15.0a0 - - python_abi 3.14.* *_cp314 - - yaml >=0.2.5,<0.3.0a0 - license: MIT - license_family: MIT - size: 202391 - timestamp: 1770223462836 -- conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda - sha256: 8e0b7962cf8bec9a016cd91a6c6dc1f9ebc8e7e316b1d572f7b9047d0de54717 - md5: d487d93d170e332ab39803e05912a762 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - libnl >=3.11.0,<4.0a0 - - libstdcxx >=14 - - libsystemd0 >=257.10 - - libudev1 >=257.10 - license: Linux-OpenIB - license_family: BSD - size: 1268666 - timestamp: 1769154883613 -- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda - sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 - md5: d7d95fc8287ea7bf33e0e7116d2b95ec - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - ncurses >=6.5,<7.0a0 - license: GPL-3.0-only - license_family: GPL - size: 345073 - timestamp: 1765813471974 -- conda: https://conda.anaconda.org/conda-forge/linux-64/rhash-1.4.6-hb9d3cd8_1.conda - sha256: d5c73079c1dd2c2a313c3bfd81c73dbd066b7eb08d213778c8bff520091ae894 - md5: c1c9b02933fdb2cfb791d936c20e887e - depends: - - 
__glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: MIT - license_family: MIT - size: 193775 - timestamp: 1748644872902 -- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda - sha256: 82088a6e4daa33329a30bc26dc19a98c7c1d3f05c0f73ce9845d4eab4924e9e1 - md5: 8e194e7b992f99a5015edbd4ebd38efd - depends: - - python >=3.10 - license: MIT - license_family: MIT - size: 639697 - timestamp: 1773074868565 -- conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - sha256: 458227f759d5e3fcec5d9b7acce54e10c9e1f4f4b7ec978f3bfd54ce4ee9853d - md5: 3339e3b65d58accf4ca4fb8748ab16b3 - depends: - - python >=3.9 - - python - license: MIT - license_family: MIT - size: 18455 - timestamp: 1753199211006 -- conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda - sha256: c47299fe37aebb0fcf674b3be588e67e4afb86225be4b0d452c7eb75c086b851 - md5: 13dc3adbc692664cd3beabd216434749 - depends: - - __glibc >=2.28 - - kernel-headers_linux-64 4.18.0 he073ed8_9 - - tzdata - license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND GPL-2.0-or-later - license_family: GPL - size: 24008591 - timestamp: 1765578833462 -- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda - sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac - md5: cffd3bdd58090148f4cfcd831f4b26ab - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=14 - - libzlib >=1.3.1,<2.0a0 - constrains: - - xorg-libx11 >=1.8.12,<2.0a0 - license: TCL - license_family: BSD - size: 3301196 - timestamp: 1769460227866 -- conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda - sha256: 62940c563de45790ba0f076b9f2085a842a65662268b02dd136a8e9b1eaf47a8 - md5: 72e780e9aa2d0a3295f59b1874e3768b - depends: - - python >=3.10 - - python - license: MIT - license_family: MIT - size: 21453 - timestamp: 1768146676791 -- conda: 
https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda - sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 - md5: 0caa1af407ecff61170c9437a808404d - depends: - - python >=3.10 - - python - license: PSF-2.0 - license_family: PSF - size: 51692 - timestamp: 1756220668932 -- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda - sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c - md5: ad659d0a2b3e47e38d829aa8cad2d610 - license: LicenseRef-Public-Domain - size: 119135 - timestamp: 1767016325805 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda - sha256: c84034056dc938c853e4f61e72e5bd37e2ec91927a661fb9762f678cbea52d43 - md5: 5d3c008e54c7f49592fca9c32896a76f - depends: - - __glibc >=2.17,<3.0.a0 - - cffi - - libgcc >=14 - - libstdcxx >=14 - - python >=3.14,<3.15.0a0 - - python_abi 3.14.* *_cp314 - license: MIT - license_family: MIT - size: 15004 - timestamp: 1769438727085 -- conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.2.0-pyhcf101f3_0.conda - sha256: b83246d145ba0e6814d2ed0b616293e56924e6c7d6649101f5a4f97f9e757ed1 - md5: 704c22301912f7e37d0a92b2e7d5942d - depends: - - python >=3.10 - - distlib >=0.3.7,<1 - - filelock <4,>=3.24.2 - - importlib-metadata >=6.6 - - platformdirs >=3.9.1,<5 - - python-discovery >=1 - - typing_extensions >=4.13.2 - - python - license: MIT - license_family: MIT - size: 4647775 - timestamp: 1773133660203 -- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - sha256: 6d9ea2f731e284e9316d95fa61869fe7bbba33df7929f82693c121022810f4ad - md5: a77f85f77be52ff59391544bfe73390a - depends: - - libgcc >=14 - - __glibc >=2.17,<3.0.a0 - license: MIT - license_family: MIT - size: 85189 - timestamp: 1753484064210 -- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - sha256: 
b4533f7d9efc976511a73ef7d4a2473406d7f4c750884be8e8620b0ce70f4dae - md5: 30cd29cb87d819caead4d55184c1d115 - depends: - - python >=3.10 - - python - license: MIT - license_family: MIT - size: 24194 - timestamp: 1764460141901 -- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 - md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 - depends: - - __glibc >=2.17,<3.0.a0 - - libzlib >=1.3.1,<2.0a0 - license: BSD-3-Clause - license_family: BSD - size: 601375 - timestamp: 1764777111296 diff --git a/cuda_bindings/benchmarks/pixi.toml b/cuda_bindings/benchmarks/pixi.toml deleted file mode 100644 index a448e8d3e4..0000000000 --- a/cuda_bindings/benchmarks/pixi.toml +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -[workspace] -channels = ["conda-forge"] -platforms = ["linux-64"] -preview = ["pixi-build"] -channel-priority = "disabled" - -[feature.cu13.system-requirements] -cuda = "13" - -[feature.cu13-pinned.dependencies] -cuda-version = "13.1.*" - -[feature.cu13-source.dependencies] -cuda-version = "13.*" - -[feature.bench.dependencies] -python = "3.14.*" -pyperf = "*" -pytest = "*" -pytest-benchmark = "*" -numpy = "*" - -[feature.cpp-bench.dependencies] -cmake = "*" -ninja = "*" -cxx-compiler = "*" -cuda-cudart-dev = "*" -cuda-nvrtc-dev = "*" - -[feature.cpp-bench.target.linux-64.dependencies] -cuda-crt-dev_linux-64 = "*" -cuda-driver-dev_linux-64 = "*" - -[feature.cpp-bench.target.linux-64.activation.env] -CUDA_HOME = "$CONDA_PREFIX/targets/x86_64-linux" - -[feature.dev.dependencies] -pre-commit = "*" - -[feature.bindings-wheel.dependencies] -cuda-bindings = "==13.1.0" - -[feature.bindings-source.dependencies] -cuda-bindings = { path = ".." 
} - -[environments] -wheel = { features = ["cu13", "cu13-pinned", "bench", "cpp-bench", "dev", "bindings-wheel"] } -source = { features = ["cu13", "cu13-source", "bench", "cpp-bench", "dev", "bindings-source"] } - -[target.linux.tasks.bench] -cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py"] - -[target.linux.tasks.bench-smoke-test] -cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py", "--fast", "--min-time", "1" -] - -[target.linux.tasks.bench-legacy] -cmd = "pytest --benchmark-only --override-ini 'addopts=' $PIXI_PROJECT_ROOT/pytest-legacy/" - -[target.linux.tasks.bench-cpp-configure] -cmd = [ - "cmake", - "-S", - "$PIXI_PROJECT_ROOT/benchmarks/cpp", - "-B", - "$PIXI_PROJECT_ROOT/.build/cpp", - "-G", - "Ninja", -] - -[target.linux.tasks.bench-cpp-build] -cmd = ["cmake", "--build", "$PIXI_PROJECT_ROOT/.build/cpp"] -depends-on = [{ task = "bench-cpp-configure" }] - -[target.linux.tasks.bench-cpp] -cmd = ["python", "$PIXI_PROJECT_ROOT/run_cpp.py"] -depends-on = [{ task = "bench-cpp-build" }] - -[target.linux.tasks.bench-compare] -cmd = ["python", "$PIXI_PROJECT_ROOT/compare.py"] - -[target.linux.tasks.lint] -cmd = ["pre-commit", "run", "--all-files"] diff --git a/cuda_bindings/benchmarks/pytest-legacy/conftest.py b/cuda_bindings/benchmarks/pytest-legacy/conftest.py deleted file mode 100644 index 0ea7b1d772..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/conftest.py +++ /dev/null @@ -1,93 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import numpy as np -import pytest - -from cuda.bindings import driver as cuda -from cuda.bindings import nvrtc -from cuda.bindings import runtime as cudart - - -def ASSERT_DRV(err): - if isinstance(err, cuda.CUresult): - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError(f"Cuda Error: {err}") - elif isinstance(err, cudart.cudaError_t): - if err != cudart.cudaError_t.cudaSuccess: - raise RuntimeError(f"Cudart Error: {err}") - elif isinstance(err, nvrtc.nvrtcResult): - if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError(f"Nvrtc Error: {err}") - else: - raise RuntimeError(f"Unknown error type: {err}") - - -@pytest.fixture -def init_cuda(): - # Initialize - (err,) = cuda.cuInit(0) - ASSERT_DRV(err) - err, device = cuda.cuDeviceGet(0) - ASSERT_DRV(err) - err, ctx = cuda.cuCtxCreate(None, 0, device) - ASSERT_DRV(err) - - # create stream - err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) - ASSERT_DRV(err) - - yield device, ctx, stream - - (err,) = cuda.cuStreamDestroy(stream) - ASSERT_DRV(err) - (err,) = cuda.cuCtxDestroy(ctx) - ASSERT_DRV(err) - - -@pytest.fixture -def load_module(): - module = None - - def _load_module(kernel_string, device): - nonlocal module - # Get module - err, major = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device - ) - ASSERT_DRV(err) - err, minor = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device - ) - ASSERT_DRV(err) - - err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"kernelString.cu", 0, [], []) - ASSERT_DRV(err) - opts = [b"--fmad=false", bytes("--gpu-architecture=sm_" + str(major) + str(minor), "ascii")] - (err,) = nvrtc.nvrtcCompileProgram(prog, 2, opts) - - err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog) - ASSERT_DRV(err_log) - log = b" " * logSize - (err_log,) = 
nvrtc.nvrtcGetProgramLog(prog, log) - ASSERT_DRV(err_log) - result = log.decode() - if len(result) > 1: - print(result) - - ASSERT_DRV(err) - err, cubinSize = nvrtc.nvrtcGetCUBINSize(prog) - ASSERT_DRV(err) - cubin = b" " * cubinSize - (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin) - ASSERT_DRV(err) - cubin = np.char.array(cubin) - err, module = cuda.cuModuleLoadData(cubin) - ASSERT_DRV(err) - - return module - - yield _load_module - - (err,) = cuda.cuModuleUnload(module) - ASSERT_DRV(err) diff --git a/cuda_bindings/benchmarks/pytest-legacy/kernels.py b/cuda_bindings/benchmarks/pytest-legacy/kernels.py deleted file mode 100644 index 36646fba00..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/kernels.py +++ /dev/null @@ -1,159 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -kernel_string = """\ -#define ITEM_PARAM(x, T) T x -#define REP1(x, T) , ITEM_PARAM(x, T) -#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) -#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) -#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) -#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) -#define REP32(x, T) REP16(x##0, T) REP16(x##1, T) -#define REP64(x, T) REP32(x##0, T) REP32(x##1, T) -#define REP128(x, T) REP64(x##0, T) REP64(x##1, T) -#define REP256(x, T) REP128(x##0, T) REP128(x##1, T) - -template -struct KernelFunctionParam -{ - unsigned char p[maxBytes]; -}; - -extern "C" __global__ void small_kernel(float *f) -{ - *f = 0.0f; -} - -extern "C" __global__ void empty_kernel() -{ - return; -} - -extern "C" __global__ -void small_kernel_512_args( - ITEM_PARAM(F, int*) - REP1(A, int*) - REP2(A, int*) - REP4(A, int*) - REP8(A, int*) - REP16(A, int*) - REP32(A, int*) - REP64(A, int*) - REP128(A, int*) - REP256(A, int*)) -{ - *F = 0; -} - -extern "C" __global__ -void small_kernel_512_bools( - ITEM_PARAM(F, bool) - REP1(A, bool) - REP2(A, bool) - REP4(A, bool) - REP8(A, 
bool) - REP16(A, bool) - REP32(A, bool) - REP64(A, bool) - REP128(A, bool) - REP256(A, bool)) -{ - return; -} - -extern "C" __global__ -void small_kernel_512_ints( - ITEM_PARAM(F, int) - REP1(A, int) - REP2(A, int) - REP4(A, int) - REP8(A, int) - REP16(A, int) - REP32(A, int) - REP64(A, int) - REP128(A, int) - REP256(A, int)) -{ - return; -} - -extern "C" __global__ -void small_kernel_512_doubles( - ITEM_PARAM(F, double) - REP1(A, double) - REP2(A, double) - REP4(A, double) - REP8(A, double) - REP16(A, double) - REP32(A, double) - REP64(A, double) - REP128(A, double) - REP256(A, double)) -{ - return; -} - -extern "C" __global__ -void small_kernel_512_chars( - ITEM_PARAM(F, char) - REP1(A, char) - REP2(A, char) - REP4(A, char) - REP8(A, char) - REP16(A, char) - REP32(A, char) - REP64(A, char) - REP128(A, char) - REP256(A, char)) -{ - return; -} - -extern "C" __global__ -void small_kernel_512_longlongs( - ITEM_PARAM(F, long long) - REP1(A, long long) - REP2(A, long long) - REP4(A, long long) - REP8(A, long long) - REP16(A, long long) - REP32(A, long long) - REP64(A, long long) - REP128(A, long long) - REP256(A, long long)) -{ - return; -} - -extern "C" __global__ -void small_kernel_256_args( - ITEM_PARAM(F, int*) - REP1(A, int*) - REP2(A, int*) - REP4(A, int*) - REP8(A, int*) - REP16(A, int*) - REP32(A, int*) - REP64(A, int*) - REP128(A, int*)) -{ - *F = 0; -} - -extern "C" __global__ -void small_kernel_16_args( - ITEM_PARAM(F, int*) - REP1(A, int*) - REP2(A, int*) - REP4(A, int*) - REP8(A, int*)) -{ - *F = 0; -} - -extern "C" __global__ void small_kernel_2048B(KernelFunctionParam<2048> param) -{ - // Do not touch param to prevent compiler from copying - // the whole structure from const bank to lmem. 
-} -""" diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_cupy.py b/cuda_bindings/benchmarks/pytest-legacy/test_cupy.py deleted file mode 100644 index 76dd6e6a45..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/test_cupy.py +++ /dev/null @@ -1,199 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import ctypes - -import pytest - -try: - import cupy - - skip_tests = False -except ImportError: - skip_tests = True - -from kernels import kernel_string - - -def launch(kernel, args=()): - kernel((1,), (1,), args) - - -# Measure launch latency with no parmaeters -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_empty_kernel(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("empty_kernel") - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel) - stream.synchronize() - - -# Measure launch latency with a single parameter -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel") - cupy.cuda.set_allocator() - arg = cupy.cuda.alloc(ctypes.sizeof(ctypes.c_float)) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, (arg,)) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_args(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_args") - cupy.cuda.set_allocator() - - args = [] - for _ in 
range(512): - args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_bools(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_bools") - cupy.cuda.set_allocator() - - args = [True] * 512 - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_doubles(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_doubles") - cupy.cuda.set_allocator() - - args = [1.2345] * 512 - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_ints(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_ints") - cupy.cuda.set_allocator() - - args = [123] * 512 - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, 
reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_bytes(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_chars") - cupy.cuda.set_allocator() - - args = [127] * 512 - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_512_longlongs(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_512_longlongs") - cupy.cuda.set_allocator() - - args = [9223372036854775806] * 512 - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_256_args(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_256_args") - cupy.cuda.set_allocator() - - args = [] - for _ in range(256): - args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -@pytest.mark.benchmark(group="cupy") -def test_launch_latency_small_kernel_16_args(benchmark): - module = cupy.RawModule(code=kernel_string) - kernel = module.get_function("small_kernel_16_args") - 
cupy.cuda.set_allocator() - - args = [] - for _ in range(16): - args.append(cupy.cuda.alloc(ctypes.sizeof(ctypes.c_int))) - args = tuple(args) - - stream = cupy.cuda.stream.Stream(non_blocking=True) - - with stream: - benchmark(launch, kernel, args) - stream.synchronize() diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py b/cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py deleted file mode 100755 index dd994081a0..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py +++ /dev/null @@ -1,336 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import ctypes - -import pytest -from kernels import kernel_string - -from conftest import ASSERT_DRV -from cuda.bindings import driver as cuda - - -def launch(kernel, stream, args=(), arg_types=()): - cuda.cuLaunchKernel( - kernel, - 1, - 1, - 1, # grid dim - 1, - 1, - 1, # block dim - 0, - stream, # shared mem and stream - (args, arg_types), - 0, - ) # arguments - - -def launch_packed(kernel, stream, params): - cuda.cuLaunchKernel( - kernel, - 1, - 1, - 1, # grid dim - 1, - 1, - 1, # block dim - 0, - stream, # shared mem and stream - params, - 0, - ) # arguments - - -# Measure launch latency with no parmaeters -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") - ASSERT_DRV(err) - - benchmark(launch, func, stream) - - cuda.cuCtxSynchronize() - - -# Measure launch latency with a single parameter -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = 
cuda.cuModuleGetFunction(module, b"small_kernel") - ASSERT_DRV(err) - - err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) - ASSERT_DRV(err) - - benchmark(launch, func, stream, args=(f,), arg_types=(None,)) - - cuda.cuCtxSynchronize() - - (err,) = cuda.cuMemFree(f) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 512 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") - ASSERT_DRV(err) - - args = [True] * 512 - arg_types = [ctypes.c_bool] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") - ASSERT_DRV(err) - - args = [1.2345] * 512 - arg_types = [ctypes.c_double] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - 
- benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") - ASSERT_DRV(err) - - args = [123] * 512 - arg_types = [ctypes.c_int] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") - ASSERT_DRV(err) - - args = [127] * 512 - arg_types = [ctypes.c_byte] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") - ASSERT_DRV(err) - - args = [9223372036854775806] * 512 - arg_types = [ctypes.c_longlong] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = 
cuda.cuModuleGetFunction(module, b"small_kernel_256_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 256 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 16 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters, excluding parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - vals = [] - val_ps = [] - for i in range(512): - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - vals.append(p) - val_ps.append(ctypes.c_void_p(int(vals[i]))) - - packagedParams = (ctypes.c_void_p * 512)() - for i in range(512): - packagedParams[i] = ctypes.addressof(val_ps[i]) - - benchmark(launch_packed, func, stream, packagedParams) - - cuda.cuCtxSynchronize() - - for p 
in vals: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -def pack_and_launch(kernel, stream, params): - packed_params = (ctypes.c_void_p * len(params))() - ptrs = [0] * len(params) - for i in range(len(params)): - ptrs[i] = ctypes.c_void_p(int(params[i])) - packed_params[i] = ctypes.addressof(ptrs[i]) - - cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) - - -# Measure launch latency plus parameter packing using ctypes -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - vals = [] - for i in range(512): - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - vals.append(p) - - benchmark(pack_and_launch, func, stream, vals) - - cuda.cuCtxSynchronize() - - for p in vals: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with a single large struct parameter -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") - ASSERT_DRV(err) - - class struct_2048B(ctypes.Structure): - _fields_ = [("values", ctypes.c_uint8 * 2048)] - - benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) - - cuda.cuCtxSynchronize() diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_numba.py b/cuda_bindings/benchmarks/pytest-legacy/test_numba.py deleted file mode 100644 index dfe084c6b1..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/test_numba.py +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import numpy as np -import pytest - -try: - from numba import cuda - - skip_tests = False -except ImportError: - skip_tests = True - - -def launch_empty(kernel, stream): - kernel[1, 1, stream]() - - -def launch(kernel, stream, arg): - kernel[1, 1, stream](arg) - - -# Measure launch latency with no parmaeters -@pytest.mark.skipif(skip_tests, reason="Numba is not installed") -@pytest.mark.benchmark(group="numba", min_rounds=1000) -def test_launch_latency_empty_kernel(benchmark): - stream = cuda.stream() - - @cuda.jit - def empty_kernel(): - return - - benchmark(launch_empty, empty_kernel, stream) - - cuda.synchronize() - - -# Measure launch latency with a single parameter -@pytest.mark.skipif(skip_tests, reason="Numba is not installed") -@pytest.mark.benchmark(group="numba", min_rounds=1000) -def test_launch_latency_small_kernel(benchmark): - stream = cuda.stream() - - arg = cuda.device_array(1, dtype=np.float32, stream=stream) - - @cuda.jit - def small_kernel(array): - array[0] = 0.0 - - benchmark(launch, small_kernel, stream, arg) - - cuda.synchronize() diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py b/cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py deleted file mode 100644 index fae72ffd79..0000000000 --- a/cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py +++ /dev/null @@ -1,112 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import random - -import pytest - -from conftest import ASSERT_DRV -from cuda.bindings import driver as cuda - -random.seed(0) - -idx = 0 - - -def query_attribute(attribute, ptrs): - global idx - ptr = ptrs[idx] - idx = (idx + 1) % len(ptrs) - - cuda.cuPointerGetAttribute(attribute, ptr) - - -def query_attributes(attributes, ptrs): - global idx - ptr = ptrs[idx] - idx = (idx + 1) % len(ptrs) - - cuda.cuPointerGetAttributes(len(attributes), attributes, ptr) - - -@pytest.mark.benchmark(group="pointer-attributes") -# Measure cuPointerGetAttribute in the same way as C benchmarks -def test_pointer_get_attribute(benchmark, init_cuda): - _ = init_cuda - - ptrs = [] - for _ in range(500): - err, ptr = cuda.cuMemAlloc(1 << 18) - ASSERT_DRV(err) - ptrs.append(ptr) - - random.shuffle(ptrs) - - benchmark(query_attribute, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptrs) - - for p in ptrs: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -@pytest.mark.benchmark(group="pointer-attributes") -# Measure cuPointerGetAttributes with all attributes -def test_pointer_get_attributes_all(benchmark, init_cuda): - _ = init_cuda - - ptrs = [] - for _ in range(500): - err, ptr = cuda.cuMemAlloc(1 << 18) - ASSERT_DRV(err) - ptrs.append(ptr) - - random.shuffle(ptrs) - - attributes = [ - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, - 
cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE, - ] - - benchmark(query_attributes, attributes, ptrs) - - for p in ptrs: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -@pytest.mark.benchmark(group="pointer-attributes") -# Measure cuPointerGetAttributes with a single attribute -def test_pointer_get_attributes_single(benchmark, init_cuda): - _ = init_cuda - - ptrs = [] - for _ in range(500): - err, ptr = cuda.cuMemAlloc(1 << 18) - ASSERT_DRV(err) - ptrs.append(ptr) - - random.shuffle(ptrs) - - attributes = [ - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - ] - - benchmark(query_attributes, attributes, ptrs) - - for p in ptrs: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) diff --git a/cuda_bindings/benchmarks/run_cpp.py b/cuda_bindings/benchmarks/run_cpp.py deleted file mode 100644 index 96e50cb890..0000000000 --- a/cuda_bindings/benchmarks/run_cpp.py +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -from runner.cpp import main - -if __name__ == "__main__": - main() diff --git a/cuda_bindings/benchmarks/run_pyperf.py b/cuda_bindings/benchmarks/run_pyperf.py deleted file mode 100644 index f45af8c69a..0000000000 --- a/cuda_bindings/benchmarks/run_pyperf.py +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# SPDX-License-Identifier: Apache-2.0 - -from runner.main import main - -if __name__ == "__main__": - main() diff --git a/cuda_bindings/benchmarks/runner/__init__.py b/cuda_bindings/benchmarks/runner/__init__.py deleted file mode 100644 index 27422b3cb7..0000000000 --- a/cuda_bindings/benchmarks/runner/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_bindings/benchmarks/runner/cpp.py b/cuda_bindings/benchmarks/runner/cpp.py deleted file mode 100644 index f8c3490381..0000000000 --- a/cuda_bindings/benchmarks/runner/cpp.py +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import json -import subprocess -import sys -import tempfile -from pathlib import Path - -PROJECT_ROOT = Path(__file__).resolve().parent.parent -BUILD_DIR = PROJECT_ROOT / ".build" / "cpp" -DEFAULT_OUTPUT = PROJECT_ROOT / "results-cpp.json" - -BINARY_PREFIX = "bench_" -BINARY_SUFFIX = "_cpp" - - -def discover_binaries() -> dict[str, Path]: - """Discover C++ benchmark binaries in the build directory""" - if not BUILD_DIR.is_dir(): - return {} - - registry: dict[str, Path] = {} - for path in sorted(BUILD_DIR.iterdir()): - if not path.is_file() or not path.name.startswith(BINARY_PREFIX): - continue - if not path.name.endswith(BINARY_SUFFIX): - continue - name = path.name.removeprefix(BINARY_PREFIX).removesuffix(BINARY_SUFFIX) - registry[name] = path - return registry - - -def strip_output_args(argv: list[str]) -> list[str]: - cleaned: list[str] = [] - skip_next = False - for arg in argv: - if skip_next: - skip_next = False - continue - if arg in ("-o", "--output"): - skip_next = True - continue - if arg.startswith(("-o=", "--output=")): - continue - cleaned.append(arg) - return cleaned - - -def 
merge_pyperf_json(individual_files: list[Path], output_path: Path) -> int: - """Merge individual pyperf JSON files into a single BenchmarkSuite file. - - Each C++ binary produces a file with structure: - {"version": "1.0", "metadata": {...}, "benchmarks": [{...}]} - - We merge them by collecting all benchmark entries into one file. - """ - all_benchmarks = [] - - for path in individual_files: - with open(path) as f: - data = json.load(f) - - file_metadata = data.get("metadata", {}) - bench_name = file_metadata.get("name", "") - loops = file_metadata.get("loops") - unit = file_metadata.get("unit", "second") - - for bench in data.get("benchmarks", []): - for run in bench.get("runs", []): - run_meta = run.setdefault("metadata", {}) - if bench_name: - run_meta.setdefault("name", bench_name) - if loops is not None: - run_meta.setdefault("loops", loops) - run_meta.setdefault("unit", unit) - - all_benchmarks.append(bench) - - merged = { - "version": "1.0", - "benchmarks": all_benchmarks, - } - - with open(output_path, "w") as f: - json.dump(merged, f) - - return len(all_benchmarks) - - -def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: - parser = argparse.ArgumentParser( - description="Run C++ CUDA benchmarks", - add_help=False, - ) - parser.add_argument( - "--benchmark", - action="append", - default=[], - help="Benchmark name to run (e.g. 'ctx_device'). Repeat for multiple. 
Defaults to all.", - ) - parser.add_argument( - "--list", - action="store_true", - help="Print discovered benchmark names and exit.", - ) - parser.add_argument( - "-o", - "--output", - type=Path, - default=DEFAULT_OUTPUT, - help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})", - ) - parsed, remaining = parser.parse_known_args(argv) - return parsed, remaining - - -def main() -> None: - parsed, remaining_argv = parse_args(sys.argv[1:]) - - registry = discover_binaries() - if not registry: - print( - f"No C++ benchmark binaries found in {BUILD_DIR}.\nRun 'pixi run bench-cpp-build' first.", - file=sys.stderr, - ) - sys.exit(1) - - if parsed.list: - for name in sorted(registry): - print(name) - return - - if parsed.benchmark: - missing = sorted(set(parsed.benchmark) - set(registry)) - if missing: - known = ", ".join(sorted(registry)) - unknown = ", ".join(missing) - print( - f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}", - file=sys.stderr, - ) - sys.exit(1) - names = parsed.benchmark - else: - names = sorted(registry) - - # Strip any --output args to avoid conflicts with our output handling - passthrough_argv = strip_output_args(remaining_argv) - - output_path = parsed.output.resolve() - failed = False - individual_files: list[Path] = [] - - with tempfile.TemporaryDirectory(prefix="cuda_bench_cpp_") as tmpdir: - tmpdir_path = Path(tmpdir) - - for name in names: - binary = registry[name] - tmp_json = tmpdir_path / f"{name}.json" - cmd = [str(binary), "-o", str(tmp_json), *passthrough_argv] - result = subprocess.run(cmd, check=False) # noqa: S603 - if result.returncode != 0: - print(f"FAILED: {name} (exit code {result.returncode})", file=sys.stderr) - failed = True - elif tmp_json.exists(): - individual_files.append(tmp_json) - - if individual_files: - count = merge_pyperf_json(individual_files, output_path) - print(f"\nResults saved to {output_path} ({count} benchmark(s))") - - if failed: - sys.exit(1) - - -if __name__ == "__main__": - main() 
diff --git a/cuda_bindings/benchmarks/runner/main.py b/cuda_bindings/benchmarks/runner/main.py deleted file mode 100644 index 4089aa5559..0000000000 --- a/cuda_bindings/benchmarks/runner/main.py +++ /dev/null @@ -1,217 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import ast -import importlib.util -import os -import sys -from collections.abc import Callable -from pathlib import Path -from types import ModuleType - -import pyperf - -PROJECT_ROOT = Path(__file__).resolve().parent.parent -BENCH_DIR = PROJECT_ROOT / "benchmarks" -DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json" -PYPERF_INHERITED_ENV_VARS = ( - "CUDA_HOME", - "CUDA_PATH", - "CUDA_VISIBLE_DEVICES", - "LD_LIBRARY_PATH", - "NVIDIA_VISIBLE_DEVICES", -) -_MODULE_CACHE: dict[Path, ModuleType] = {} - - -def load_module(module_path: Path) -> ModuleType: - module_path = module_path.resolve() - cached_module = _MODULE_CACHE.get(module_path) - if cached_module is not None: - return cached_module - - module_name = f"cuda_bindings_bench_{module_path.stem}" - spec = importlib.util.spec_from_file_location(module_name, module_path) - if spec is None or spec.loader is None: - raise RuntimeError(f"Failed to load benchmark module: {module_path}") - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - _MODULE_CACHE[module_path] = module - return module - - -def benchmark_id(module_name: str, function_name: str) -> str: - module_suffix = module_name.removeprefix("bench_") - suffix = function_name.removeprefix("bench_") - return f"{module_suffix}.{suffix}" - - -def _discover_module_functions(module_path: Path) -> list[str]: - tree = ast.parse(module_path.read_text(encoding="utf-8"), filename=str(module_path)) - return [ - node.name - for node in tree.body - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("bench_") - ] - - -def 
_lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]: - loaded_function: Callable[[int], float] | None = None - - def run(loops: int) -> float: - nonlocal loaded_function - if loaded_function is None: - module = load_module(module_path) - loaded_function = getattr(module, function_name) - return loaded_function(loops) - - run.__name__ = function_name - return run - - -def discover_benchmarks() -> dict[str, Callable[[int], float]]: - """Discover bench_ functions. - - Each bench_ function must have the signature: bench_*(loops: int) -> float - where it calls the operation `loops` times and returns the total elapsed - time in seconds (using time.perf_counter). - """ - registry: dict[str, Callable[[int], float]] = {} - for module_path in sorted(BENCH_DIR.glob("bench_*.py")): - module_name = module_path.stem - for function_name in _discover_module_functions(module_path): - bench_id = benchmark_id(module_name, function_name) - if bench_id in registry: - raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}") - registry[bench_id] = _lazy_benchmark(module_path, function_name) - return registry - - -def strip_pyperf_output_args(argv: list[str]) -> list[str]: - cleaned: list[str] = [] - skip_next = False - for arg in argv: - if skip_next: - skip_next = False - continue - if arg in ("-o", "--output", "--append"): - skip_next = True - continue - if arg.startswith(("-o=", "--output=", "--append=")): - continue - cleaned.append(arg) - return cleaned - - -def _split_env_vars(arg_value: str) -> list[str]: - return [env_var for env_var in arg_value.split(",") if env_var] - - -def ensure_pyperf_worker_env(argv: list[str]) -> list[str]: - if "--copy-env" in argv: - return list(argv) - - inherited_env: list[str] = [] - cleaned: list[str] = [] - skip_next = False - for arg in argv: - if skip_next: - inherited_env.extend(_split_env_vars(arg)) - skip_next = False - continue - if arg == "--inherit-environ": - skip_next = True - continue - if 
arg.startswith("--inherit-environ="): - inherited_env.extend(_split_env_vars(arg.partition("=")[2])) - continue - cleaned.append(arg) - - if skip_next: - raise ValueError("Missing value for --inherit-environ") - - for env_var in PYPERF_INHERITED_ENV_VARS: - if env_var in os.environ: - inherited_env.append(env_var) - - deduped_env: list[str] = [] - for env_var in inherited_env: - if env_var not in deduped_env: - deduped_env.append(env_var) - - if deduped_env: - cleaned.extend(["--inherit-environ", ",".join(deduped_env)]) - - return cleaned - - -def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: - parser = argparse.ArgumentParser(add_help=False) - parser.add_argument( - "--benchmark", - action="append", - default=[], - help="Benchmark ID to run. Repeat to run multiple IDs. Defaults to all.", - ) - parser.add_argument( - "--list", - action="store_true", - help="Print discovered benchmark IDs and exit.", - ) - parser.add_argument( - "-o", - "--output", - type=Path, - default=DEFAULT_OUTPUT, - help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})", - ) - parsed, remaining = parser.parse_known_args(argv) - return parsed, remaining - - -def main() -> None: - parsed, remaining_argv = parse_args(sys.argv[1:]) - - registry = discover_benchmarks() - if not registry: - raise RuntimeError(f"No benchmark functions found in {BENCH_DIR}") - - if parsed.list: - for bench_id in sorted(registry): - print(bench_id) - return - - if parsed.benchmark: - missing = sorted(set(parsed.benchmark) - set(registry)) - if missing: - known = ", ".join(sorted(registry)) - unknown = ", ".join(missing) - raise ValueError(f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}") - benchmark_ids = parsed.benchmark - else: - benchmark_ids = sorted(registry) - - # Strip any --output args to avoid conflicts with our output handling. 
- output_path = parsed.output.resolve() - remaining_argv = strip_pyperf_output_args(remaining_argv) - remaining_argv = ensure_pyperf_worker_env(remaining_argv) - is_worker = "--worker" in remaining_argv - - # Delete the file so this run starts fresh. - if not is_worker: - output_path.unlink(missing_ok=True) - - sys.argv = [sys.argv[0], "--append", str(output_path), *remaining_argv] - - runner = pyperf.Runner() - for bench_id in benchmark_ids: - runner.bench_time_func(bench_id, registry[bench_id]) - - if not is_worker: - print(f"\nResults saved to {output_path}") - - -if __name__ == "__main__": - main() diff --git a/cuda_bindings/benchmarks/runner/runtime.py b/cuda_bindings/benchmarks/runner/runtime.py deleted file mode 100644 index c985adb2e2..0000000000 --- a/cuda_bindings/benchmarks/runner/runtime.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import atexit - -from cuda.bindings import driver as cuda -from cuda.bindings import nvrtc - -_ctx = None -_device = None -_persistent_ptrs: list[int] = [] -_modules: list = [] - - -def assert_drv(err) -> None: - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError(f"Cuda Error: {err}") - - -def ensure_context() -> int: - global _ctx, _device - if _ctx is not None: - return _ctx - - (err,) = cuda.cuInit(0) - assert_drv(err) - - err, device = cuda.cuDeviceGet(0) - assert_drv(err) - _device = device - - err, ctx = cuda.cuCtxCreate(None, 0, device) - assert_drv(err) - _ctx = ctx - return ctx - - -def alloc_persistent(size: int) -> int: - ensure_context() - err, ptr = cuda.cuMemAlloc(size) - assert_drv(err) - _persistent_ptrs.append(ptr) - return ptr - - -def compile_and_load(kernel_source: str) -> int: - """Compile CUDA C source and returns the CUmodule handle""" - ensure_context() - - err, major = cuda.cuDeviceGetAttribute( - 
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, _device - ) - assert_drv(err) - err, minor = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, _device - ) - assert_drv(err) - - err, prog = nvrtc.nvrtcCreateProgram(kernel_source.encode(), b"benchmark_kernel.cu", 0, [], []) - assert_drv(err) - - arch_flag = f"--gpu-architecture=sm_{major}{minor}".encode() - (err,) = nvrtc.nvrtcCompileProgram(prog, 2, [b"--fmad=false", arch_flag]) - - # check for compile errors - err_log, log_size = nvrtc.nvrtcGetProgramLogSize(prog) - assert_drv(err_log) - log = b" " * log_size - (err_log,) = nvrtc.nvrtcGetProgramLog(prog, log) - assert_drv(err_log) - assert_drv(err) - - err, cubin_size = nvrtc.nvrtcGetCUBINSize(prog) - assert_drv(err) - cubin = b" " * cubin_size - (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin) - assert_drv(err) - - err, module = cuda.cuModuleLoadData(cubin) - assert_drv(err) - _modules.append(module) - return module - - -def cleanup() -> None: - global _ctx - for ptr in reversed(_persistent_ptrs): - (err,) = cuda.cuMemFree(ptr) - assert_drv(err) - _persistent_ptrs.clear() - - for module in reversed(_modules): - (err,) = cuda.cuModuleUnload(module) - assert_drv(err) - _modules.clear() - - if _ctx is None: - return - (err,) = cuda.cuCtxDestroy(_ctx) - assert_drv(err) - _ctx = None - - -atexit.register(cleanup) diff --git a/cuda_bindings/benchmarks/tests/test_runner.py b/cuda_bindings/benchmarks/tests/test_runner.py deleted file mode 100644 index 612094dac9..0000000000 --- a/cuda_bindings/benchmarks/tests/test_runner.py +++ /dev/null @@ -1,166 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import importlib.util -import itertools -import sys -import types -from pathlib import Path - -REPO_ROOT = Path(__file__).resolve().parents[3] -RUNNER_MAIN_PATH = REPO_ROOT / "cuda_bindings/benchmarks/runner/main.py" -BENCH_LAUNCH_PATH = REPO_ROOT / "cuda_bindings/benchmarks/benchmarks/bench_launch.py" - - -def load_module_from_path(module_name: str, module_path: Path): - spec = importlib.util.spec_from_file_location(module_name, module_path) - if spec is None or spec.loader is None: - raise RuntimeError(f"Failed to load test module: {module_path}") - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def load_runner_main(monkeypatch): - pyperf_module = types.ModuleType("pyperf") - - class FakeRunner: - def bench_time_func(self, *_args, **_kwargs) -> None: - raise AssertionError("FakeRunner should not be used in these tests") - - pyperf_module.Runner = FakeRunner - monkeypatch.setitem(sys.modules, "pyperf", pyperf_module) - return load_module_from_path("test_cuda_bindings_bench_runner_main", RUNNER_MAIN_PATH) - - -def load_bench_launch(monkeypatch, calls: list[tuple]): - pointer_values = itertools.count(1000) - - runtime_module = types.ModuleType("runner.runtime") - - def alloc_persistent(size: int) -> int: - calls.append(("alloc_persistent", size)) - return next(pointer_values) - - def assert_drv(err) -> None: - calls.append(("assert_drv", err)) - assert err == 0 - - def compile_and_load(source: str) -> str: - calls.append(("compile_and_load", source)) - return "module" - - runtime_module.alloc_persistent = alloc_persistent - runtime_module.assert_drv = assert_drv - runtime_module.compile_and_load = compile_and_load - - runner_module = types.ModuleType("runner") - runner_module.runtime = runtime_module - - driver_module = types.ModuleType("cuda.bindings.driver") - - class FakeCUresult: - CUDA_SUCCESS = 0 - - class FakeCUstreamFlags: - 
CU_STREAM_NON_BLOCKING = types.SimpleNamespace(value=1) - - def cuModuleGetFunction(module, name): - calls.append(("cuModuleGetFunction", module, name)) - return 0, name - - def cuStreamCreate(flags): - calls.append(("cuStreamCreate", flags)) - return 0, "stream" - - def cuLaunchKernel(*args): - calls.append(("cuLaunchKernel", args)) - return 0 - - driver_module.CUresult = FakeCUresult - driver_module.CUstream_flags = FakeCUstreamFlags - driver_module.cuModuleGetFunction = cuModuleGetFunction - driver_module.cuStreamCreate = cuStreamCreate - driver_module.cuLaunchKernel = cuLaunchKernel - - cuda_module = types.ModuleType("cuda") - bindings_module = types.ModuleType("cuda.bindings") - bindings_module.driver = driver_module - cuda_module.bindings = bindings_module - - monkeypatch.setitem(sys.modules, "runner", runner_module) - monkeypatch.setitem(sys.modules, "runner.runtime", runtime_module) - monkeypatch.setitem(sys.modules, "cuda", cuda_module) - monkeypatch.setitem(sys.modules, "cuda.bindings", bindings_module) - monkeypatch.setitem(sys.modules, "cuda.bindings.driver", driver_module) - - return load_module_from_path("test_cuda_bindings_bench_launch", BENCH_LAUNCH_PATH) - - -def test_discover_benchmarks_is_lazy(monkeypatch, tmp_path): - runner_main = load_runner_main(monkeypatch) - - marker_path = tmp_path / "imported.txt" - bench_path = tmp_path / "bench_lazy.py" - bench_path.write_text( - "\n".join( - ( - "from pathlib import Path", - f"Path({str(marker_path)!r}).write_text('imported')", - "", - "def helper() -> float:", - " return 0.0", - "", - "def bench_visible(loops: int) -> float:", - " return loops + 0.5", - "", - ) - ), - encoding="utf-8", - ) - - monkeypatch.setattr(runner_main, "BENCH_DIR", tmp_path) - runner_main._MODULE_CACHE.clear() - - registry = runner_main.discover_benchmarks() - - assert sorted(registry) == ["lazy.visible"] - assert not marker_path.exists() - assert registry["lazy.visible"](3) == 3.5 - assert 
marker_path.read_text(encoding="utf-8") == "imported" - - -def test_ensure_pyperf_worker_env_preserves_existing_args(monkeypatch): - runner_main = load_runner_main(monkeypatch) - - for env_var in runner_main.PYPERF_INHERITED_ENV_VARS: - monkeypatch.delenv(env_var, raising=False) - monkeypatch.setenv("CUDA_PATH", "/opt/cuda") - monkeypatch.setenv("LD_LIBRARY_PATH", "/opt/cuda/lib64") - - argv = runner_main.ensure_pyperf_worker_env(["--fast", "--inherit-environ=FOO,BAR"]) - - assert argv == ["--fast", "--inherit-environ", "FOO,BAR,CUDA_PATH,LD_LIBRARY_PATH"] - - -def test_bench_launch_initializes_on_first_use(monkeypatch): - calls: list[tuple] = [] - bench_launch = load_bench_launch(monkeypatch, calls) - - assert calls == [] - - bench_launch.bench_launch_empty_kernel(1) - compile_calls = [call for call in calls if call[0] == "compile_and_load"] - launch_calls = [call for call in calls if call[0] == "cuLaunchKernel"] - - assert len(compile_calls) == 1 - assert len(launch_calls) == 1 - - bench_launch.bench_launch_16_args_pre_packed(1) - compile_calls = [call for call in calls if call[0] == "compile_and_load"] - launch_calls = [call for call in calls if call[0] == "cuLaunchKernel"] - - assert len(compile_calls) == 1 - assert len(launch_calls) == 2 From 5ecba207b527a72fb8465e3f4843ad4d3ba0b54e Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 16:04:39 -0500 Subject: [PATCH 05/10] lint --- .../benchmarks/bench_ctx_device.py | 20 ++++++------- .../cuda_bindings/benchmarks/bench_event.py | 20 ++++++------- .../cuda_bindings/benchmarks/bench_launch.py | 16 +++++----- .../cuda_bindings/benchmarks/bench_memory.py | 30 +++++++++---------- .../benchmarks/bench_pointer_attributes.py | 4 +-- .../cuda_bindings/benchmarks/bench_stream.py | 16 +++++----- 6 files changed, 52 insertions(+), 54 deletions(-) diff --git a/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py index 1c82cd4046..2e2cd11d93 
100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py @@ -15,48 +15,48 @@ def bench_ctx_get_current(loops: int) -> float: - _cuCtxGetCurrent = cuda.cuCtxGetCurrent + _fn = cuda.cuCtxGetCurrent t0 = time.perf_counter() for _ in range(loops): - _cuCtxGetCurrent() + _fn() return time.perf_counter() - t0 def bench_ctx_set_current(loops: int) -> float: - _cuCtxSetCurrent = cuda.cuCtxSetCurrent + _fn = cuda.cuCtxSetCurrent _ctx = CTX t0 = time.perf_counter() for _ in range(loops): - _cuCtxSetCurrent(_ctx) + _fn(_ctx) return time.perf_counter() - t0 def bench_ctx_get_device(loops: int) -> float: - _cuCtxGetDevice = cuda.cuCtxGetDevice + _fn = cuda.cuCtxGetDevice t0 = time.perf_counter() for _ in range(loops): - _cuCtxGetDevice() + _fn() return time.perf_counter() - t0 def bench_device_get(loops: int) -> float: - _cuDeviceGet = cuda.cuDeviceGet + _fn = cuda.cuDeviceGet t0 = time.perf_counter() for _ in range(loops): - _cuDeviceGet(0) + _fn(0) return time.perf_counter() - t0 def bench_device_get_attribute(loops: int) -> float: - _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute + _fn = cuda.cuDeviceGetAttribute _attr = ATTRIBUTE _dev = DEVICE t0 = time.perf_counter() for _ in range(loops): - _cuDeviceGetAttribute(_attr, _dev) + _fn(_attr, _dev) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_event.py b/benchmarks/cuda_bindings/benchmarks/bench_event.py index e8e319115d..041adc2553 100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_event.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_event.py @@ -20,43 +20,43 @@ def bench_event_create_destroy(loops: int) -> float: - _cuEventCreate = cuda.cuEventCreate - _cuEventDestroy = cuda.cuEventDestroy + _create = cuda.cuEventCreate + _destroy = cuda.cuEventDestroy _flags = EVENT_FLAGS t0 = time.perf_counter() for _ in range(loops): - _, e = _cuEventCreate(_flags) - _cuEventDestroy(e) + _, e = _create(_flags) + 
_destroy(e) return time.perf_counter() - t0 def bench_event_record(loops: int) -> float: - _cuEventRecord = cuda.cuEventRecord + _fn = cuda.cuEventRecord _event = EVENT _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuEventRecord(_event, _stream) + _fn(_event, _stream) return time.perf_counter() - t0 def bench_event_query(loops: int) -> float: - _cuEventQuery = cuda.cuEventQuery + _fn = cuda.cuEventQuery _event = EVENT t0 = time.perf_counter() for _ in range(loops): - _cuEventQuery(_event) + _fn(_event) return time.perf_counter() - t0 def bench_event_synchronize(loops: int) -> float: - _cuEventSynchronize = cuda.cuEventSynchronize + _fn = cuda.cuEventSynchronize _event = EVENT t0 = time.perf_counter() for _ in range(loops): - _cuEventSynchronize(_event) + _fn(_event) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_launch.py b/benchmarks/cuda_bindings/benchmarks/bench_launch.py index 931194fbd3..abf3f946cc 100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_launch.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_launch.py @@ -82,19 +82,19 @@ def _ensure_launch_state() -> None: def bench_launch_empty_kernel(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = EMPTY_KERNEL _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) return time.perf_counter() - t0 def bench_launch_small_kernel(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = SMALL_KERNEL _stream = STREAM _args = (FLOAT_PTR,) @@ -102,13 +102,13 @@ def bench_launch_small_kernel(loops: int) -> float: t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, 
_arg_types), 0) return time.perf_counter() - t0 def bench_launch_16_args(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = KERNEL_16_ARGS _stream = STREAM _args = INT_PTRS @@ -116,18 +116,18 @@ def bench_launch_16_args(loops: int) -> float: t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) return time.perf_counter() - t0 def bench_launch_16_args_pre_packed(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = KERNEL_16_ARGS _stream = STREAM _packed = PACKED_16 t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_memory.py b/benchmarks/cuda_bindings/benchmarks/bench_memory.py index faa4795580..875c060406 100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_memory.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_memory.py @@ -2,11 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -import ctypes import time import numpy as np - from runner.runtime import alloc_persistent, ensure_context from cuda.bindings import driver as cuda @@ -30,61 +28,61 @@ def bench_mem_alloc_free(loops: int) -> float: - _cuMemAlloc = cuda.cuMemAlloc - _cuMemFree = cuda.cuMemFree + _alloc = cuda.cuMemAlloc + _free = cuda.cuMemFree _size = ALLOC_SIZE t0 = time.perf_counter() for _ in range(loops): - _, ptr = _cuMemAlloc(_size) - _cuMemFree(ptr) + _, ptr = _alloc(_size) + _free(ptr) return time.perf_counter() - t0 def bench_mem_alloc_async_free_async(loops: int) -> float: - _cuMemAllocAsync = cuda.cuMemAllocAsync - _cuMemFreeAsync = cuda.cuMemFreeAsync + _alloc = cuda.cuMemAllocAsync + _free = 
cuda.cuMemFreeAsync _size = ALLOC_SIZE _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _, ptr = _cuMemAllocAsync(_size, _stream) - _cuMemFreeAsync(ptr, _stream) + _, ptr = _alloc(_size, _stream) + _free(ptr, _stream) return time.perf_counter() - t0 def bench_memcpy_htod(loops: int) -> float: - _cuMemcpyHtoD = cuda.cuMemcpyHtoD + _fn = cuda.cuMemcpyHtoD _dst = DST_DPTR _src = HOST_SRC _size = COPY_SIZE t0 = time.perf_counter() for _ in range(loops): - _cuMemcpyHtoD(_dst, _src, _size) + _fn(_dst, _src, _size) return time.perf_counter() - t0 def bench_memcpy_dtoh(loops: int) -> float: - _cuMemcpyDtoH = cuda.cuMemcpyDtoH + _fn = cuda.cuMemcpyDtoH _dst = HOST_DST _src = SRC_DPTR _size = COPY_SIZE t0 = time.perf_counter() for _ in range(loops): - _cuMemcpyDtoH(_dst, _src, _size) + _fn(_dst, _src, _size) return time.perf_counter() - t0 def bench_memcpy_dtod(loops: int) -> float: - _cuMemcpyDtoD = cuda.cuMemcpyDtoD + _fn = cuda.cuMemcpyDtoD _dst = DST_DPTR _src = SRC_DPTR _size = COPY_SIZE t0 = time.perf_counter() for _ in range(loops): - _cuMemcpyDtoD(_dst, _src, _size) + _fn(_dst, _src, _size) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py index a02b82c399..191da263ee 100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py @@ -15,11 +15,11 @@ def bench_pointer_get_attribute(loops: int) -> float: # Local references to avoid global lookups in the hot loop - _cuPointerGetAttribute = cuda.cuPointerGetAttribute + _fn = cuda.cuPointerGetAttribute _attr = ATTRIBUTE _ptr = PTR t0 = time.perf_counter() for _ in range(loops): - _cuPointerGetAttribute(_attr, _ptr) + _fn(_attr, _ptr) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_stream.py b/benchmarks/cuda_bindings/benchmarks/bench_stream.py index 
d816099ed5..3aab9288fc 100644 --- a/benchmarks/cuda_bindings/benchmarks/bench_stream.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_stream.py @@ -14,32 +14,32 @@ def bench_stream_create_destroy(loops: int) -> float: - _cuStreamCreate = cuda.cuStreamCreate - _cuStreamDestroy = cuda.cuStreamDestroy + _create = cuda.cuStreamCreate + _destroy = cuda.cuStreamDestroy _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value t0 = time.perf_counter() for _ in range(loops): - _, s = _cuStreamCreate(_flags) - _cuStreamDestroy(s) + _, s = _create(_flags) + _destroy(s) return time.perf_counter() - t0 def bench_stream_query(loops: int) -> float: - _cuStreamQuery = cuda.cuStreamQuery + _fn = cuda.cuStreamQuery _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuStreamQuery(_stream) + _fn(_stream) return time.perf_counter() - t0 def bench_stream_synchronize(loops: int) -> float: - _cuStreamSynchronize = cuda.cuStreamSynchronize + _fn = cuda.cuStreamSynchronize _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuStreamSynchronize(_stream) + _fn(_stream) return time.perf_counter() - t0 From e4682631fe63b84cdd27885b916c3cdda8486137 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 16:08:00 -0500 Subject: [PATCH 06/10] lint --- benchmarks/cuda_bindings/runner/main.py | 2 +- ruff.toml | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/benchmarks/cuda_bindings/runner/main.py b/benchmarks/cuda_bindings/runner/main.py index 4089aa5559..b0f6e76f41 100644 --- a/benchmarks/cuda_bindings/runner/main.py +++ b/benchmarks/cuda_bindings/runner/main.py @@ -53,7 +53,7 @@ def _discover_module_functions(module_path: Path) -> list[str]: return [ node.name for node in tree.body - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("bench_") + if isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef) and node.name.startswith("bench_") ] diff --git a/ruff.toml b/ruff.toml index 
704e422c19..b0df8060b6 100644 --- a/ruff.toml +++ b/ruff.toml @@ -87,7 +87,6 @@ inline-quotes = "double" "ARG001", # unused function argument (fixtures) "ARG002", # unused method argument "RUF012", # mutable class default (ctypes _fields_ is standard) - "RUF059", # unused unpacked variable (side-effect assignments) "F841", # unused local variable (side-effect assignments) "E402", # module-level import not at top of file "E702", # multiple statements on one line (compact test tables) @@ -111,17 +110,21 @@ inline-quotes = "double" "**/examples/**" = [ "T201", # print "E402", # module-level import not at top of file - "RUF059", # unused unpacked variable ] "**/benchmarks/**" = [ "T201", # print "RUF012", # mutable class default (ctypes _fields_ is standard) - "RUF059", # unused unpacked variable "F841", # unused local variable "E402", # module-level import not at top of file ] +"**/pytest-legacy/**" = [ + "N801", # legacy CUDA naming conventions + "N802", + "N806", +] + # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions) # Keep examples opted-in to enforce naming conventions in example-local identifiers. 
"cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [ From ddd32480b4d3a6801e72981ce635c54935c8ea68 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 17 Apr 2026 16:10:33 -0500 Subject: [PATCH 07/10] lint --- ruff.toml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ruff.toml b/ruff.toml index b0df8060b6..704e422c19 100644 --- a/ruff.toml +++ b/ruff.toml @@ -87,6 +87,7 @@ inline-quotes = "double" "ARG001", # unused function argument (fixtures) "ARG002", # unused method argument "RUF012", # mutable class default (ctypes _fields_ is standard) + "RUF059", # unused unpacked variable (side-effect assignments) "F841", # unused local variable (side-effect assignments) "E402", # module-level import not at top of file "E702", # multiple statements on one line (compact test tables) @@ -110,21 +111,17 @@ inline-quotes = "double" "**/examples/**" = [ "T201", # print "E402", # module-level import not at top of file + "RUF059", # unused unpacked variable ] "**/benchmarks/**" = [ "T201", # print "RUF012", # mutable class default (ctypes _fields_ is standard) + "RUF059", # unused unpacked variable "F841", # unused local variable "E402", # module-level import not at top of file ] -"**/pytest-legacy/**" = [ - "N801", # legacy CUDA naming conventions - "N802", - "N806", -] - # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions) # Keep examples opted-in to enforce naming conventions in example-local identifiers. "cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [ From f2c083818bdfad9a4c97c211092ab91239c8895a Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 17 Apr 2026 15:57:46 -0700 Subject: [PATCH 08/10] Recover missing SPDX follow-ups from PR 1913 GitHub merged PR 1913 before the later local commits were pushed, so replay the recovered SPDX policy follow-ups and related license fixes here. 
Context: https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4271701561 Made-with: Cursor --- .coveragerc | 2 +- .spdx-ignore | 3 - .../cuda_python_test_helpers/nvvm_bitcode.py | 2 +- toolshed/build_static_bitcode_input.py | 2 +- toolshed/check_spdx.py | 74 ++++++++++++++----- toolshed/dump_cutile_b64.py | 2 +- 6 files changed, 61 insertions(+), 24 deletions(-) diff --git a/.coveragerc b/.coveragerc index 36f0f7879a..1e1776fd56 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 [paths] source = diff --git a/.spdx-ignore b/.spdx-ignore index 8c1d155c47..7263b5414f 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -8,9 +8,6 @@ LICENSE requirements*.txt cuda_bindings/examples/* -# Will be moved in (see https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4252968149) -cuda_bindings/benchmarks/* - # Vendored cuda_core/cuda/core/_include/dlpack.h diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py index ddb6eae107..e6366ac95d 100644 --- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py +++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import binascii diff --git a/toolshed/build_static_bitcode_input.py b/toolshed/build_static_bitcode_input.py index 273ce33244..e2400100dd 100755 --- a/toolshed/build_static_bitcode_input.py +++ b/toolshed/build_static_bitcode_input.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 """ Helper to produce static bitcode input for test_nvvm.py. diff --git a/toolshed/check_spdx.py b/toolshed/check_spdx.py index 6be42282bf..d769ded66c 100644 --- a/toolshed/check_spdx.py +++ b/toolshed/check_spdx.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import datetime +import fnmatch import os import re import subprocess @@ -17,12 +18,27 @@ LICENSE_IDENTIFIER_REGEX = re.compile(re.escape(SPDX_LICENSE_IDENTIFIER_PREFIX) + rb"(?P[^\r\n]+)") -EXPECTED_LICENSE_IDENTIFIERS = ( - ("cuda_bindings/", "LicenseRef-NVIDIA-SOFTWARE-LICENSE"), - ("cuda_core/", "Apache-2.0"), - ("cuda_pathfinder/", "Apache-2.0"), - ("cuda_python/", "LicenseRef-NVIDIA-SOFTWARE-LICENSE"), -) +TOP_LEVEL_FILE_LICENSE_IDENTIFIER = "Apache-2.0" + +# Every top-level directory needs to have an entry here, so new paths +# can't slip in without a reviewed license decision. +TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS = { + ".github": "Apache-2.0", + "ci": "Apache-2.0", + "cuda_bindings": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", + "cuda_core": "Apache-2.0", + "cuda_pathfinder": "Apache-2.0", + "cuda_python": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", + "cuda_python_test_helpers": "Apache-2.0", + "scripts": "Apache-2.0", + "toolshed": "Apache-2.0", +} + +SPECIAL_CASE_LICENSE_IDENTIFIERS = { + # key: repo-relative path or glob, value: expected SPDX license identifier + "cuda_bindings/benchmarks/*": "Apache-2.0", + "cuda_bindings/benchmarks/pytest-legacy/*": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", +} SPDX_IGNORE_FILENAME = ".spdx-ignore" @@ -63,12 +79,34 @@ def normalize_repo_path(filepath): return PureWindowsPath(filepath).as_posix() +def get_top_level_directory(normalized_path): + if "/" not in normalized_path: + return None + return normalized_path.split("/", 1)[0] + + def get_expected_license_identifier(filepath): normalized_path = normalize_repo_path(filepath) - for prefix, license_identifier in 
EXPECTED_LICENSE_IDENTIFIERS: - if normalized_path.startswith(prefix): - return license_identifier - return None + matching_special_cases = [ + (prefix, license_identifier) + for prefix, license_identifier in SPECIAL_CASE_LICENSE_IDENTIFIERS.items() + if fnmatch.fnmatchcase(normalized_path, prefix) + ] + if matching_special_cases: + return max(matching_special_cases, key=lambda item: len(item[0]))[1], None + + top_level_directory = get_top_level_directory(normalized_path) + if top_level_directory is None: + return TOP_LEVEL_FILE_LICENSE_IDENTIFIER, None + + if top_level_directory not in TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS: + return ( + None, + f"MISSING TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS entry for top-level directory " + f"{top_level_directory!r} required by {filepath!r}", + ) + + return TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS[top_level_directory], None def validate_required_spdx_field(filepath, blob, expected_bytes): @@ -82,10 +120,11 @@ def extract_license_identifier(blob): match = LICENSE_IDENTIFIER_REGEX.search(blob) if match is None: return None - try: - return match.group("license_identifier").decode("ascii") - except UnicodeDecodeError: - return None + license_identifier = match.group("license_identifier").decode("ascii", errors="replace").strip() + for comment_suffix in ("-->", "*/"): + if license_identifier.endswith(comment_suffix): + license_identifier = license_identifier.removesuffix(comment_suffix).rstrip() + return license_identifier or None def validate_license_identifier(filepath, blob): @@ -94,9 +133,10 @@ def validate_license_identifier(filepath, blob): print(f"MISSING valid SPDX license identifier in {filepath!r}") return False - expected_license_identifier = get_expected_license_identifier(filepath) - if expected_license_identifier is None: - return True + expected_license_identifier, configuration_error = get_expected_license_identifier(filepath) + if configuration_error is not None: + print(configuration_error) + return False if license_identifier 
!= expected_license_identifier: print( diff --git a/toolshed/dump_cutile_b64.py b/toolshed/dump_cutile_b64.py index 84013ea94b..422bf95232 100644 --- a/toolshed/dump_cutile_b64.py +++ b/toolshed/dump_cutile_b64.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 """ Embeds a sample cuTile kernel, executes it with CUDA_TILE_DUMP_BYTECODE=., From efc34e8f0b0b208739614851572bcb5ae431b2e9 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 17 Apr 2026 16:06:55 -0700 Subject: [PATCH 09/10] Fix benchmarks/cuda_bindings/pytest-legacy license identifiers --- benchmarks/cuda_bindings/pytest-legacy/conftest.py | 2 +- benchmarks/cuda_bindings/pytest-legacy/kernels.py | 2 +- benchmarks/cuda_bindings/pytest-legacy/test_cupy.py | 2 +- benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py | 2 +- benchmarks/cuda_bindings/pytest-legacy/test_numba.py | 2 +- .../cuda_bindings/pytest-legacy/test_pointer_attributes.py | 2 +- toolshed/check_spdx.py | 1 + 7 files changed, 7 insertions(+), 6 deletions(-) diff --git a/benchmarks/cuda_bindings/pytest-legacy/conftest.py b/benchmarks/cuda_bindings/pytest-legacy/conftest.py index 0ea7b1d772..5d0cc95e7a 100644 --- a/benchmarks/cuda_bindings/pytest-legacy/conftest.py +++ b/benchmarks/cuda_bindings/pytest-legacy/conftest.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import numpy as np import pytest diff --git a/benchmarks/cuda_bindings/pytest-legacy/kernels.py b/benchmarks/cuda_bindings/pytest-legacy/kernels.py index 36646fba00..7e741110a3 100644 --- a/benchmarks/cuda_bindings/pytest-legacy/kernels.py +++ b/benchmarks/cuda_bindings/pytest-legacy/kernels.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 kernel_string = """\ #define ITEM_PARAM(x, T) T x diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py index 76dd6e6a45..3eea752ce0 100644 --- a/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import ctypes diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py index dd994081a0..ad421de382 100755 --- a/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import ctypes diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_numba.py b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py index dfe084c6b1..d9ae0cdfee 100644 --- a/benchmarks/cuda_bindings/pytest-legacy/test_numba.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import numpy as np import pytest diff --git a/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py b/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py index fae72ffd79..6df32ec511 100644 --- a/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import random diff --git a/toolshed/check_spdx.py b/toolshed/check_spdx.py index d769ded66c..3d52142554 100644 --- a/toolshed/check_spdx.py +++ b/toolshed/check_spdx.py @@ -24,6 +24,7 @@ # can't slip in without a reviewed license decision. TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS = { ".github": "Apache-2.0", + "benchmarks": "Apache-2.0", "ci": "Apache-2.0", "cuda_bindings": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", "cuda_core": "Apache-2.0", From e17681724a02820996af41ad723e5d27d1a2427a Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Fri, 17 Apr 2026 16:20:04 -0700 Subject: [PATCH 10/10] Move legacy benchmark Ruff suppressions with code move The naming-rule suppressions used to live under cuda_bindings/benchmarks, so move the needed legacy-path suppressions to the relocated benchmarks/cuda_bindings pytest-legacy path and drop the stale old-path entry. Made-with: Cursor --- ruff.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index 704e422c19..210f852cd3 100644 --- a/ruff.toml +++ b/ruff.toml @@ -124,13 +124,18 @@ inline-quotes = "double" # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions) # Keep examples opted-in to enforce naming conventions in example-local identifiers. -"cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [ +"cuda_bindings/{cuda,docs,tests}/**" = [ "N801", # invalid-class-name "N802", # invalid-function-name "N803", # invalid-argument-name "N806", # non-lowercase-variable-in-function "N816", # mixed-case-variable-in-global-scope ] +"benchmarks/cuda_bindings/pytest-legacy/**" = [ + "N801", # invalid-class-name + "N802", # invalid-function-name + "N806", # non-lowercase-variable-in-function +] "cuda_bindings/{build_hooks.py,setup.py}" = ["N801", "N802", "N803", "N806", "N816"] # scripts and build tooling — print is the expected output method