Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .github/workflows/linux_cuda_plugin_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,27 @@ jobs:
cd /onnxruntime_src/onnxruntime/test/python/transformers
python test_cuda_plugin_ep.py
"
# --- Run the CUDA plugin EP C++ GoogleTest binary ---
# onnxruntime_provider_test is built into the artifact and links the plugin tests
# (gated by ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP). The user-stream + CUDA graph test
# registers the plugin .so via GetSharedLibraryFileName("onnxruntime_providers_cuda_plugin"),
# which returns the platform-specific filename without a directory component. Run from
# /build/Release/Release so that filename resolves to the plugin .so built there.
- name: Run CUDA Plugin EP C++ Tests
run: |
docker run --rm --gpus all \
-v ${{ github.workspace }}:/onnxruntime_src \
-v ${{ runner.temp }}/Release:/build/Release \
-e NVIDIA_VISIBLE_DEVICES=all \
${{ steps.build_docker_image_step.outputs.full-image-name }} \
bash -c "
set -ex
export PATH=/opt/python/cp312-cp312/bin:\$PATH
# Make libcudart.so.13 (and the plugin's CUDA deps) findable; see note above.
export LD_LIBRARY_PATH=/build/Release/Release:/usr/local/cuda-13.0/lib64:\${LD_LIBRARY_PATH:-}
cd /build/Release/Release
ls -la onnxruntime_provider_test libonnxruntime_providers_cuda_plugin.so
./onnxruntime_provider_test --gtest_filter='CudaPluginUserStreamGraphTest.*'
"
2 changes: 1 addition & 1 deletion cmake/deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/downlo
psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013
pthreadpool;https://github.com/google/pthreadpool/archive/dcc9f28589066af0dbd4555579281230abbf74dd.zip;533a77943203ef15ca608bcd9dbe2c94da7451d2
pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v3.0.2.zip;a064e663b4d7a337ac291d1bef7337ef4e60a1ae
pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145950b1c0997a9f748b57.zip;30b2a07fe4bae8574f89176e56274cacdd6d135b
pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/4628dc060ce4e82345dc166bbac875609db4ff69.zip;e58d4b47c16a982111c897e669ae4f1821a393d7
re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88
safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac
tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381
Expand Down
4 changes: 1 addition & 3 deletions cmake/external/onnxruntime_external_deps.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -371,9 +371,7 @@ if (CPUINFO_SUPPORTED)
PATCH_COMMAND
${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_cpuinfo_h_for_arm64ec.patch &&
# https://github.com/pytorch/cpuinfo/pull/324
${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_vcpkg_arm64ec_support.patch &&
# https://github.com/pytorch/cpuinfo/pull/348
${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/win_arm_fp16_detection_fallback.patch
${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_vcpkg_arm64ec_support.patch
FIND_PACKAGE_ARGS NAMES cpuinfo
)
elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
Expand Down
1 change: 1 addition & 0 deletions cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
${MLAS_SRC_DIR}/flashattn.cpp
${MLAS_SRC_DIR}/flashattn_qkv.cpp
${MLAS_SRC_DIR}/flashattn_gqa.cpp
${MLAS_SRC_DIR}/qkv_quant.cpp
${MLAS_SRC_DIR}/cast.cpp
${MLAS_SRC_DIR}/layernorm.cpp
Expand Down
9 changes: 5 additions & 4 deletions cmake/onnxruntime_providers_cuda_plugin.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,11 @@ list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tensor/sequence_op\\.cc$")
# in the CPU provider and is not linked into the plugin.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tensor/size\\.cc$")

# Permanently excluded — pure CPU ops, handled by GetCpuPreferredNodes.
# shape_op.cc inherits from onnxruntime::OpKernel (framework)
# which cannot convert to ep::adapter::OpKernel in the plugin build.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tensor/shape_op\\.cc$")
# shape_op.cc is INCLUDED in the plugin build. It provides an adapter-based
# Shape kernel under #ifdef BUILD_CUDA_EP_AS_PLUGIN (the CPU onnxruntime::Shape
# class, which derives from the framework OpKernel, is only used in the
# non-plugin build). Registering Shape on the EP keeps it off the CPU EP and
# avoids Memcpy nodes that would otherwise break CUDA Graph capture.

# Exclude contrib training ops (shrunken_gather depends on provider_api.h in header).
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/tensor/shrunken_gather\\.cc$")
Expand Down
4 changes: 2 additions & 2 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1609,8 +1609,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)

endif()


if(onnxruntime_USE_QNN)
# Build ep_weight_sharing_ctx_gen for all supported EPs (QNN, TensorRT, OpenVINO, VitisAI)
if(onnxruntime_USE_QNN OR onnxruntime_USE_TENSORRT OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_VITISAI)
# EP weight-sharing context generator (ep_weight_sharing_ctx_gen); no longer QNN-specific
set(ep_weight_sharing_ctx_gen_src_dir ${TEST_SRC_DIR}/ep_weight_sharing_ctx_gen)
set(ep_weight_sharing_ctx_gen_src_patterns
Expand Down
58 changes: 50 additions & 8 deletions cmake/patches/cpuinfo/fix_missing_sysfs_fallback.patch
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
diff --git a/src/linux/processors.c b/src/linux/processors.c
index 47bee76..d0c5569 100644
index fd040a3..2ca8ec4 100644
--- a/src/linux/processors.c
+++ b/src/linux/processors.c
@@ -2,0 +3 @@
@@ -3,6 +3,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <unistd.h>
@@ -291,0 +293,22 @@

#if !defined(__ANDROID__)
/*
@@ -289,6 +290,28 @@ static bool max_processor_number_parser(uint32_t processor_list_start, uint32_t
return true;
}

+static uint32_t cpuinfo_linux_get_max_processor_from_sysconf(
+ uint32_t max_processors_count,
+ const char* processor_list_name) {
Expand All @@ -27,13 +36,31 @@ index 47bee76..d0c5569 100644
+ return max_processor;
+}
+
@@ -301 +324 @@
uint32_t cpuinfo_linux_get_max_possible_processor(uint32_t max_processors_count) {
uint32_t max_possible_processor = 0;
if (!cpuinfo_linux_parse_cpulist(
@@ -298,7 +321,7 @@ uint32_t cpuinfo_linux_get_max_possible_processor(uint32_t max_processors_count)
#else
cpuinfo_log_warning("failed to parse the list of possible processors in %s", POSSIBLE_CPULIST_FILENAME);
#endif
- return UINT32_MAX;
+ return cpuinfo_linux_get_max_processor_from_sysconf(max_processors_count, POSSIBLE_CPULIST_FILENAME);
@@ -323 +346 @@
}
if (max_possible_processor >= max_processors_count) {
cpuinfo_log_warning(
@@ -320,7 +343,7 @@ uint32_t cpuinfo_linux_get_max_present_processor(uint32_t max_processors_count)
#else
cpuinfo_log_warning("failed to parse the list of present processors in %s", PRESENT_CPULIST_FILENAME);
#endif
- return UINT32_MAX;
+ return cpuinfo_linux_get_max_processor_from_sysconf(max_processors_count, PRESENT_CPULIST_FILENAME);
@@ -357,0 +381,31 @@
}
if (max_present_processor >= max_processors_count) {
cpuinfo_log_warning(
@@ -355,6 +378,37 @@ static bool detect_processor_parser(uint32_t processor_list_start, uint32_t proc
return true;
}

+static bool cpuinfo_linux_detect_processors_from_sysconf(
+ uint32_t max_processors_count,
+ uint32_t* processor0_flags,
Expand Down Expand Up @@ -65,19 +92,34 @@ index 47bee76..d0c5569 100644
+ return true;
+}
+
@@ -373 +427,6 @@
bool cpuinfo_linux_detect_possible_processors(
uint32_t max_processors_count,
uint32_t* processor0_flags,
@@ -370,7 +424,12 @@ bool cpuinfo_linux_detect_possible_processors(
return true;
} else {
cpuinfo_log_warning("failed to parse the list of possible processors in %s", POSSIBLE_CPULIST_FILENAME);
- return false;
+ return cpuinfo_linux_detect_processors_from_sysconf(
+ max_processors_count,
+ processor0_flags,
+ processor_struct_size,
+ possible_flag,
+ POSSIBLE_CPULIST_FILENAME);
@@ -392 +451,6 @@
}
}

@@ -389,7 +448,12 @@ bool cpuinfo_linux_detect_present_processors(
return true;
} else {
cpuinfo_log_warning("failed to parse the list of present processors in %s", PRESENT_CPULIST_FILENAME);
- return false;
+ return cpuinfo_linux_detect_processors_from_sysconf(
+ max_processors_count,
+ processor0_flags,
+ processor_struct_size,
+ present_flag,
+ PRESENT_CPULIST_FILENAME);
}
}

4 changes: 2 additions & 2 deletions cmake/patches/cpuinfo/patch_vcpkg_arm64ec_support.patch
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aedc983..dab589e 100644
index 072c987..e43d6ab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,17 @@ IF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD" AND CPUINFO_TARGET_PROCESSOR STREQUAL "am
ENDIF()
IF(IS_APPLE_OS AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64.*)$")
SET(CPUINFO_TARGET_PROCESSOR "${CMAKE_OSX_ARCHITECTURES}")
+ELSEIF(MSVC AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.10")
+ # Use CMAKE_C_COMPILER_ARCHITECTURE_ID. MSVC values are documented as available since CMake 3.10.
+ # Use CMAKE_C_COMPILER_ARCHITECTURE_ID for non-VS generators (e.g. Ninja) with MSVC.
+ IF(CMAKE_C_COMPILER_ARCHITECTURE_ID STREQUAL "X86")
+ SET(CPUINFO_TARGET_PROCESSOR "x86")
+ ELSEIF(CMAKE_C_COMPILER_ARCHITECTURE_ID STREQUAL "x64")
Expand Down
19 changes: 0 additions & 19 deletions cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch

This file was deleted.

4 changes: 2 additions & 2 deletions cmake/vcpkg-ports/cpuinfo/patch_vcpkg_arm64ec_support.patch
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aedc983..dab589e 100644
index 072c987..e43d6ab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,17 @@ IF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD" AND CPUINFO_TARGET_PROCESSOR STREQUAL "am
ENDIF()
IF(IS_APPLE_OS AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64.*)$")
SET(CPUINFO_TARGET_PROCESSOR "${CMAKE_OSX_ARCHITECTURES}")
+ELSEIF(MSVC AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.10")
+ # Use CMAKE_C_COMPILER_ARCHITECTURE_ID. MSVC values are documented as available since CMake 3.10.
+ # Use CMAKE_C_COMPILER_ARCHITECTURE_ID for non-VS generators (e.g. Ninja) with MSVC.
+ IF(CMAKE_C_COMPILER_ARCHITECTURE_ID STREQUAL "X86")
+ SET(CPUINFO_TARGET_PROCESSOR "x86")
+ ELSEIF(CMAKE_C_COMPILER_ARCHITECTURE_ID STREQUAL "x64")
Expand Down
7 changes: 3 additions & 4 deletions cmake/vcpkg-ports/cpuinfo/portfile.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@ endif()
vcpkg_from_github(
OUT_SOURCE_PATH SOURCE_PATH
REPO pytorch/cpuinfo
REF 403d652dca4c1046e8145950b1c0997a9f748b57
SHA512 f7cd6dc44bd1120af610cae1337ed4c0f557ba78d2de9c73fed350fa3dfe9512643a1619ae55f5a540c6316a87d641856cca27297bb8766e48f39b7b7a59da1f
HEAD_REF master
REF 4628dc060ce4e82345dc166bbac875609db4ff69
SHA512 db7a93279f2f6daaf825fbd8552935d8ed671d276b65ad614e11f722b6a6848e663850d65180d33b554d67ef1a36aae842feb368699f90be8f21172a1af1924e
HEAD_REF main
PATCHES
patch_cpuinfo_h_for_arm64ec.patch
patch_vcpkg_arm64ec_support.patch # https://github.com/pytorch/cpuinfo/pull/324
win_arm_fp16_detection_fallback.patch # https://github.com/pytorch/cpuinfo/pull/348
)

vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS
Expand Down
19 changes: 0 additions & 19 deletions cmake/vcpkg-ports/cpuinfo/win_arm_fp16_detection_fallback.patch

This file was deleted.

Loading
Loading