diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index b332583035890..8650cc53d93ef 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -259,6 +259,12 @@ option(onnxruntime_USE_AZURE "Build with azure inferencing support" OFF)
 option(onnxruntime_USE_LOCK_FREE_QUEUE "Build with lock-free task queue for threadpool." OFF)
 option(onnxruntime_FORCE_GENERIC_ALGORITHMS "Disable optimized arch-specific algorithms. Use only for testing and debugging generic algorithms." OFF)
 
+option(onnxruntime_USE_TENSORRT_INTERFACE "Build ONNXRuntime shared lib which is compatible with TensorRT EP interface" OFF)
+option(onnxruntime_USE_CUDA_INTERFACE "Build ONNXRuntime shared lib which is compatible with Cuda EP interface" OFF)
+option(onnxruntime_USE_OPENVINO_INTERFACE "Build ONNXRuntime shared lib which is compatible with OpenVINO EP interface" OFF)
+option(onnxruntime_USE_VITISAI_INTERFACE "Build ONNXRuntime shared lib which is compatible with Vitis-AI EP interface" OFF)
+option(onnxruntime_USE_QNN_INTERFACE "Build ONNXRuntime shared lib which is compatible with QNN EP interface" OFF)
+
 # ENABLE_TRAINING includes all training functionality
 # The following 2 entry points
 # 1. ORTModule
@@ -703,7 +709,7 @@ if (WIN32)
     # structure was padded due to __declspec(align())
     list(APPEND ORT_WARNING_FLAGS "/wd4324")
     # warning C4800: Implicit conversion from 'X' to bool. Possible information loss
-    if (onnxruntime_USE_OPENVINO)
+    if (onnxruntime_USE_OPENVINO OR onnxruntime_USE_OPENVINO_INTERFACE)
       list(APPEND ORT_WARNING_FLAGS "/wd4800")
     endif()
     # operator 'operator-name': deprecated between enumerations of different types
@@ -864,7 +870,7 @@ else()
   set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
 endif()
 
-if (onnxruntime_USE_CUDA)
+if (onnxruntime_USE_CUDA OR onnxruntime_USE_CUDA_INTERFACE)
   list(APPEND ORT_PROVIDER_FLAGS -DUSE_CUDA=1)
   list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_CUDA=1)
   list(APPEND ONNXRUNTIME_PROVIDER_NAMES cuda)
@@ -888,7 +894,7 @@ if (onnxruntime_USE_CUDA)
   endif()
 endif()
 
-if (onnxruntime_USE_VITISAI)
+if (onnxruntime_USE_VITISAI OR onnxruntime_USE_VITISAI_INTERFACE)
   list(APPEND ORT_PROVIDER_FLAGS -DUSE_VITISAI=1)
   list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_VITISAI=1)
   list(APPEND ONNXRUNTIME_PROVIDER_NAMES vitisai)
@@ -898,12 +904,12 @@ if (onnxruntime_USE_DNNL)
   list(APPEND ONNXRUNTIME_PROVIDER_NAMES dnnl)
   list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_DNNL=1)
 endif()
-if (onnxruntime_USE_OPENVINO)
+if (onnxruntime_USE_OPENVINO OR onnxruntime_USE_OPENVINO_INTERFACE)
   list(APPEND ORT_PROVIDER_FLAGS -DUSE_OPENVINO=1)
   list(APPEND ONNXRUNTIME_PROVIDER_NAMES openvino)
   list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_OPENVINO=1)
 endif()
-if (onnxruntime_USE_TENSORRT)
+if (onnxruntime_USE_TENSORRT OR onnxruntime_USE_TENSORRT_INTERFACE)
   list(APPEND ORT_PROVIDER_FLAGS -DUSE_TENSORRT=1)
   #TODO: remove the following line and change the test code in onnxruntime_shared_lib_test to use the new EP API.
   list(APPEND ONNXRUNTIME_PROVIDER_NAMES tensorrt)
@@ -929,7 +935,7 @@ if (onnxruntime_USE_JSEP)
   list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_JSEP=1)
   list(APPEND ONNXRUNTIME_PROVIDER_NAMES js)
 endif()
-if (onnxruntime_USE_QNN)
+if (onnxruntime_USE_QNN OR onnxruntime_USE_QNN_INTERFACE)
  list(APPEND ORT_PROVIDER_FLAGS -DUSE_QNN=1)
  list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_QNN=1)
  list(APPEND ONNXRUNTIME_PROVIDER_NAMES qnn)
@@ -957,7 +963,7 @@ if (onnxruntime_USE_QNN)
    endif()
  endif()
 
-  if (MSVC OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+  if ((NOT onnxruntime_USE_QNN_INTERFACE) AND (MSVC OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux"))
    file(GLOB QNN_LIB_FILES LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/libQnn*.so"
         "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/Qnn*.dll"
         "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/libHtpPrepare.so"
@@ -1416,7 +1422,7 @@ if (onnxruntime_ENABLE_TRAINING_APIS)
  )
 endif()
 
-if (onnxruntime_USE_OPENVINO)
+if (onnxruntime_USE_OPENVINO OR onnxruntime_USE_OPENVINO_INTERFACE)
 
  add_definitions(-DUSE_OPENVINO=1)
 
@@ -1429,7 +1435,7 @@ if (onnxruntime_USE_OPENVINO)
    add_definitions(-DOPENVINO_CONFIG_GPU=1)
  endif()
 
-  if (onnxruntime_USE_OPENVINO_CPU)
+  if (onnxruntime_USE_OPENVINO_CPU OR onnxruntime_USE_OPENVINO_INTERFACE)  # OpenVino CPU interface is default built.
    add_definitions(-DOPENVINO_CONFIG_CPU=1)
  endif()
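Taken together, the new options map one-to-one onto the existing EP compile definitions. Below is a minimal sketch of that mapping, illustrative only and not code from this patch: each `*_INTERFACE` option turns on the same `USE_*` definition and provider name as its real EP counterpart, without building the EP itself.

```python
# Illustrative summary derived from the CMake hunks above (not part of the PR):
# enabling an *_INTERFACE option yields the same compile definition as the real EP option.
INTERFACE_TO_COMPILE_DEF = {
    "onnxruntime_USE_TENSORRT_INTERFACE": "USE_TENSORRT",
    "onnxruntime_USE_CUDA_INTERFACE": "USE_CUDA",
    "onnxruntime_USE_OPENVINO_INTERFACE": "USE_OPENVINO",
    "onnxruntime_USE_VITISAI_INTERFACE": "USE_VITISAI",
    "onnxruntime_USE_QNN_INTERFACE": "USE_QNN",
}

for option, compile_def in INTERFACE_TO_COMPILE_DEF.items():
    print(f"{option}=ON  ->  -D{compile_def}=1")
```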
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index 962d10d8952d6..a1bb86598ebc0 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -178,7 +178,6 @@ struct ProviderHost {
   virtual std::string demangle(const char* name) = 0;
   virtual std::string demangle(const std::string& name) = 0;
 
-#ifdef USE_CUDA
   virtual std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) = 0;
   virtual std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(const char* name) = 0;
   virtual std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() = 0;
@@ -190,7 +189,6 @@ struct ProviderHost {
   virtual Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0;
   virtual void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0;
-#endif
 
 #ifdef USE_MIGRAPHX
   virtual std::unique_ptr<IAllocator> CreateMIGraphXAllocator(int16_t device_id, const char* name) = 0;
@@ -200,7 +198,6 @@ struct ProviderHost {
 #ifdef USE_ROCM
   virtual std::unique_ptr<IAllocator> CreateROCMAllocator(int16_t device_id, const char* name) = 0;
   virtual std::unique_ptr<IAllocator> CreateROCMPinnedAllocator(const char* name) = 0;
-  virtual std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() = 0;
 
   virtual void rocm__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) = 0;
   virtual void rocm__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) = 0;
@@ -1256,9 +1253,7 @@ struct ProviderHost {
   virtual training::DistributedRunContext& GetDistributedRunContextInstance() = 0;
 #endif
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
   virtual PhiloxGenerator& PhiloxGenerator__Default() = 0;
-#endif
 
 #ifdef ENABLE_TRAINING_TORCH_INTEROP
   virtual void contrib__PythonOpBase__Init(contrib::PythonOpBase* p, const OpKernelInfo& info) = 0;
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index d7c6dab72fde8..3a694ac6f8e5e 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -258,10 +258,8 @@ struct ProviderHostImpl : ProviderHost {
   void* CPUAllocator__Alloc(CPUAllocator* p, size_t size) override { return p->CPUAllocator::Alloc(size); }
   void CPUAllocator__Free(CPUAllocator* p, void* allocation) override { return p->CPUAllocator::Free(allocation); }
 
-#ifdef USE_CUDA
   std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA().CreateCUDAAllocator(device_id, name); }
   std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(const char* name) override { return GetProviderInfo_CUDA().CreateCUDAPinnedAllocator(name); }
-  std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() override { return GetProviderInfo_CUDA().CreateGPUDataTransfer(); }
 
   void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }
   void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }
@@ -271,7 +269,6 @@ struct ProviderHostImpl : ProviderHost {
   Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return GetProviderInfo_CUDA().CudaCall_false(retCode, exprString, libName, successCode, msg, file, line); }
   void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { GetProviderInfo_CUDA().CudaCall_true(retCode, exprString, libName, successCode, msg, file, line); }
-#endif
 
 #ifdef USE_MIGRAPHX
   std::unique_ptr<IAllocator> CreateMIGraphXAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_MIGraphX().CreateMIGraphXAllocator(device_id, name); }
@@ -291,6 +288,8 @@ struct ProviderHostImpl : ProviderHost {
   Status RocmCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return GetProviderInfo_ROCM().RocmCall_false(retCode, exprString, libName, successCode, msg, file, line); }
   void RocmCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { GetProviderInfo_ROCM().RocmCall_true(retCode, exprString, libName, successCode, msg, file, line); }
+#else
+  std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() override { return GetProviderInfo_CUDA().CreateGPUDataTransfer(); }
 #endif
 
   std::string GetEnvironmentVar(const std::string& var_name) override { return Env::Default().GetEnvironmentVar(var_name); }
@@ -1560,9 +1559,7 @@ struct ProviderHostImpl : ProviderHost {
   training::DistributedRunContext& GetDistributedRunContextInstance() override { return training::DistributedRunContext::GetInstance(); }
 #endif
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
   PhiloxGenerator& PhiloxGenerator__Default() override { return PhiloxGenerator::Default(); }
-#endif
 
 #ifdef ENABLE_TRAINING_TORCH_INTEROP
   void contrib__PythonOpBase__Init(contrib::PythonOpBase* p, const OpKernelInfo& info) override { p->PythonOpBase::Init(info); }
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index bce7552854a4c..cc733f859fe0b 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -782,6 +782,12 @@ def convert_arg_line_to_args(self, arg_line):
     parser.add_argument("--use_triton_kernel", action="store_true", help="Use triton compiled kernels")
     parser.add_argument("--use_lock_free_queue", action="store_true", help="Use lock-free task queue for threadpool.")
 
+    parser.add_argument(
+        "--enable_generic_interface",
+        action="store_true",
+        help="Build the ORT shared library and compatible bridge with primary EPs (TensorRT, OpenVINO, QNN, VitisAI) but no tests.",
+    )
+
     if not is_windows():
         parser.add_argument(
             "--allow_running_as_root",
@@ -1042,6 +1048,12 @@ def generate_build_tree(
         "-Donnxruntime_USE_TENSORRT=" + ("ON" if args.use_tensorrt else "OFF"),
         "-Donnxruntime_USE_TENSORRT_BUILTIN_PARSER="
         + ("ON" if args.use_tensorrt_builtin_parser and not args.use_tensorrt_oss_parser else "OFF"),
+        # The *_INTERFACE variables are used only for building onnxruntime/onnxruntime_shared.dll, not the EPs.
+        "-Donnxruntime_USE_TENSORRT_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
+        "-Donnxruntime_USE_CUDA_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
+        "-Donnxruntime_USE_OPENVINO_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
+        "-Donnxruntime_USE_VITISAI_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
+        "-Donnxruntime_USE_QNN_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
         # set vars for migraphx
         "-Donnxruntime_USE_MIGRAPHX=" + ("ON" if args.use_migraphx else "OFF"),
         "-Donnxruntime_DISABLE_CONTRIB_OPS=" + ("ON" if args.disable_contrib_ops else "OFF"),
@@ -1372,6 +1384,8 @@ def generate_build_tree(
             cmake_args += ["-Donnxruntime_BUILD_QNN_EP_STATIC_LIB=ON"]
         if args.android and args.use_qnn != "static_lib":
             raise BuildError("Only support Android + QNN builds with QNN EP built as a static library.")
+        if args.use_qnn == "static_lib" and args.enable_generic_interface:
+            raise BuildError("Generic ORT interface only supported with QNN EP built as a shared library.")
 
     if args.use_coreml:
         cmake_args += ["-Donnxruntime_USE_COREML=ON"]
@@ -1529,6 +1543,12 @@ def generate_build_tree(
             "-Donnxruntime_USE_FULL_PROTOBUF=ON",
         ]
 
+    # When this flag is enabled, we only build the ONNXRuntime shared library, expecting a compatible EP
+    # shared lib to be built in a separate process. Skip the tests for now, as an ONNXRuntime shared lib built
+    # under this flag is not expected to work on its own.
+    if args.enable_generic_interface:
+        cmake_args += ["-Donnxruntime_BUILD_UNIT_TESTS=OFF"]
+
     if args.enable_lazy_tensor:
         import torch
 
@@ -2649,6 +2669,9 @@ def main():
         # Disable ONNX Runtime's builtin memory checker
         args.disable_memleak_checker = True
 
+    if args.enable_generic_interface:
+        args.test = False
+
     # If there was no explicit argument saying what to do, default
     # to update, build and test (for native builds).
     if not (args.update or args.clean or args.build or args.test or args.gen_doc):
@@ -2752,7 +2775,10 @@ def main():
     source_dir = os.path.normpath(os.path.join(script_dir, "..", ".."))
 
     # if using cuda, setup cuda paths and env vars
-    cuda_home, cudnn_home = setup_cuda_vars(args)
+    cuda_home = ""
+    cudnn_home = ""
+    if args.use_cuda:
+        cuda_home, cudnn_home = setup_cuda_vars(args)
 
     mpi_home = args.mpi_home
     nccl_home = args.nccl_home
@@ -2765,10 +2791,14 @@ def main():
     armnn_home = args.armnn_home
     armnn_libs = args.armnn_libs
 
-    qnn_home = args.qnn_home
+    qnn_home = ""
+    if args.use_qnn:
+        qnn_home = args.qnn_home
 
     # if using tensorrt, setup tensorrt paths
-    tensorrt_home = setup_tensorrt_vars(args)
+    tensorrt_home = ""
+    if args.use_tensorrt:
+        tensorrt_home = setup_tensorrt_vars(args)
 
     # if using migraphx, setup migraphx paths
     migraphx_home = setup_migraphx_vars(args)
@@ -2853,9 +2883,9 @@ def main():
                 toolset = "host=" + host_arch + ",version=" + args.msvc_toolset
             else:
                 toolset = "host=" + host_arch
-            if args.cuda_version:
+            if args.use_cuda and args.cuda_version:
                 toolset += ",cuda=" + args.cuda_version
-            elif args.cuda_home:
+            elif args.use_cuda and args.cuda_home:
                 toolset += ",cuda=" + args.cuda_home
             if args.windows_sdk_version:
                 target_arch += ",version=" + args.windows_sdk_version
diff --git a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml
index 94c2d35a563b6..d96f1cb68c388 100644
--- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml
@@ -177,6 +177,25 @@ stages:
       WITH_CACHE: false
       MachinePool: 'onnxruntime-Win-CPU-2022'
 
+- stage: x64_release_ep_generic_interface
+  dependsOn: []
+  jobs:
+  - template: templates/jobs/win-ci-vs-2022-job.yml
+    parameters:
+      BuildConfig: 'RelWithDebInfo'
+      buildArch: x64
+      additionalBuildFlags: --enable_generic_interface
+      msbuildPlatform: x64
+      isX86: false
+      job_name_suffix: x64_release_ep_generic_interface
+      RunOnnxRuntimeTests: false # --enable_generic_interface does not build tests
+      EnablePython: false
+      isTraining: false
+      ORT_EP_NAME: CPU
+      GenerateDocumentation: false
+      WITH_CACHE: false
+      MachinePool: 'onnxruntime-Win-CPU-2022'
+
 - stage: x86_release
   dependsOn: []
   jobs:
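For reference, the sketch below is a rough local equivalent of the new x64_release_ep_generic_interface stage. It is illustrative only: the CI job template passes additional flags, and the --config and --build_dir values here are assumptions; --enable_generic_interface is the flag added in this patch. Per the build.py changes above, the flag switches on the five *_INTERFACE CMake options, disables unit tests and the test phase, and the CUDA/TensorRT/QNN home-directory setup now runs only when the corresponding --use_* flags are passed.

```python
# Rough local equivalent of the new CI stage (a sketch under assumptions, not part of the PR).
import subprocess
import sys

subprocess.check_call([
    sys.executable,
    "tools/ci_build/build.py",
    "--config", "RelWithDebInfo",      # assumed; matches the CI stage's BuildConfig
    "--build_dir", "build/Windows",    # assumed build directory
    "--build_shared_lib",
    "--parallel",
    "--enable_generic_interface",      # new flag from this PR; also disables tests
])
```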