[onnxruntime/build] Add new flag enable_generic_interface to build primary EPs by default (#23342)

### Description
- Add a new build flag to build.py that builds onnxruntime.dll with interface support for all primary EPs (QNN, TensorRT, OpenVINO, VitisAI); see the sketch after this list.
- Modify the onnxruntime.dll/onnxruntime_shared.dll build settings so the IHV SDK toolsets no longer need to be installed on the system.
- Make the CMake variables explicit about whether an EP or ORT itself is being built (e.g. onnxruntime_USE_TENSORRT vs onnxruntime_USE_TENSORRT_INTERFACE), so the build system can evolve toward building ORT independently of the EPs.
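
A minimal sketch of how the new flag might be invoked. The flag name comes from this change; the configuration and the other options are illustrative only, not prescribed by the commit:

```python
# Hypothetical invocation of the modified build script from the repo root.
import subprocess

subprocess.run(
    [
        "python",
        "tools/ci_build/build.py",
        "--config", "RelWithDebInfo",
        "--build_shared_lib",           # produce onnxruntime.dll / libonnxruntime.so
        "--enable_generic_interface",   # new flag: turn on all *_INTERFACE CMake options; tests are skipped
    ],
    check=True,
)
```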



### Motivation and Context
Build-system changes required to evolve the repo toward building its components independently while removing unnecessary dependencies.

---------

Co-authored-by: Lei Cao <[email protected]>
Co-authored-by: Karim Vadsariya <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
4 people authored Jan 28, 2025
1 parent a770a8d commit 655a23f
Showing 5 changed files with 71 additions and 24 deletions.
24 changes: 15 additions & 9 deletions cmake/CMakeLists.txt
@@ -259,6 +259,12 @@ option(onnxruntime_USE_AZURE "Build with azure inferencing support" OFF)
option(onnxruntime_USE_LOCK_FREE_QUEUE "Build with lock-free task queue for threadpool." OFF)
option(onnxruntime_FORCE_GENERIC_ALGORITHMS "Disable optimized arch-specific algorithms. Use only for testing and debugging generic algorithms." OFF)

option(onnxruntime_USE_TENSORRT_INTERFACE "Build ONNXRuntime shared lib which is compatible with TensorRT EP interface" OFF)
option(onnxruntime_USE_CUDA_INTERFACE "Build ONNXRuntime shared lib which is compatible with Cuda EP interface" OFF)
option(onnxruntime_USE_OPENVINO_INTERFACE "Build ONNXRuntime shared lib which is compatible with OpenVINO EP interface" OFF)
option(onnxruntime_USE_VITISAI_INTERFACE "Build ONNXRuntime shared lib which is compatible with Vitis-AI EP interface" OFF)
option(onnxruntime_USE_QNN_INTERFACE "Build ONNXRuntime shared lib which is compatible with QNN EP interface" OFF)

# ENABLE_TRAINING includes all training functionality
# The following 2 entry points
# 1. ORTModule
@@ -703,7 +709,7 @@ if (WIN32)
# structure was padded due to __declspec(align())
list(APPEND ORT_WARNING_FLAGS "/wd4324")
# warning C4800: Implicit conversion from 'X' to bool. Possible information loss
if (onnxruntime_USE_OPENVINO)
if (onnxruntime_USE_OPENVINO OR onnxruntime_USE_OPENVINO_INTERFACE)
list(APPEND ORT_WARNING_FLAGS "/wd4800")
endif()
# operator 'operator-name': deprecated between enumerations of different types
@@ -864,7 +870,7 @@ else()
set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
endif()

if (onnxruntime_USE_CUDA)
if (onnxruntime_USE_CUDA OR onnxruntime_USE_CUDA_INTERFACE)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_CUDA=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_CUDA=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES cuda)
@@ -888,7 +894,7 @@ if (onnxruntime_USE_CUDA)
endif()
endif()

if (onnxruntime_USE_VITISAI)
if (onnxruntime_USE_VITISAI OR onnxruntime_USE_VITISAI_INTERFACE)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_VITISAI=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_VITISAI=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES vitisai)
@@ -898,12 +904,12 @@ if (onnxruntime_USE_DNNL)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES dnnl)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_DNNL=1)
endif()
if (onnxruntime_USE_OPENVINO)
if (onnxruntime_USE_OPENVINO OR onnxruntime_USE_OPENVINO_INTERFACE)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_OPENVINO=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES openvino)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_OPENVINO=1)
endif()
if (onnxruntime_USE_TENSORRT)
if (onnxruntime_USE_TENSORRT OR onnxruntime_USE_TENSORRT_INTERFACE)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_TENSORRT=1)
#TODO: remove the following line and change the test code in onnxruntime_shared_lib_test to use the new EP API.
list(APPEND ONNXRUNTIME_PROVIDER_NAMES tensorrt)
@@ -929,7 +935,7 @@ if (onnxruntime_USE_JSEP)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_JSEP=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES js)
endif()
if (onnxruntime_USE_QNN)
if (onnxruntime_USE_QNN OR onnxruntime_USE_QNN_INTERFACE)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_QNN=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_QNN=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES qnn)
@@ -957,7 +963,7 @@ if (onnxruntime_USE_QNN)
endif()
endif()

if (MSVC OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
if ((NOT onnxruntime_USE_QNN_INTERFACE) AND (MSVC OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux"))
file(GLOB QNN_LIB_FILES LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/libQnn*.so"
"${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/Qnn*.dll"
"${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/libHtpPrepare.so"
@@ -1416,7 +1422,7 @@ if (onnxruntime_ENABLE_TRAINING_APIS)
)
endif()

if (onnxruntime_USE_OPENVINO)
if (onnxruntime_USE_OPENVINO OR onnxruntime_USE_OPENVINO_INTERFACE)

add_definitions(-DUSE_OPENVINO=1)

@@ -1429,7 +1435,7 @@ if (onnxruntime_USE_OPENVINO)
add_definitions(-DOPENVINO_CONFIG_GPU=1)
endif()

if (onnxruntime_USE_OPENVINO_CPU)
if (onnxruntime_USE_OPENVINO_CPU OR onnxruntime_USE_OPENVINO_INTERFACE) # OpenVino CPU interface is default built.
add_definitions(-DOPENVINO_CONFIG_CPU=1)
endif()

5 changes: 0 additions & 5 deletions onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -178,7 +178,6 @@ struct ProviderHost {
virtual std::string demangle(const char* name) = 0;
virtual std::string demangle(const std::string& name) = 0;

#ifdef USE_CUDA
virtual std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) = 0;
virtual std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(const char* name) = 0;
virtual std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() = 0;
@@ -190,7 +189,6 @@ struct ProviderHost {

virtual Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0;
virtual void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0;
#endif

#ifdef USE_MIGRAPHX
virtual std::unique_ptr<IAllocator> CreateMIGraphXAllocator(int16_t device_id, const char* name) = 0;
@@ -200,7 +198,6 @@ struct ProviderHost {
#ifdef USE_ROCM
virtual std::unique_ptr<IAllocator> CreateROCMAllocator(int16_t device_id, const char* name) = 0;
virtual std::unique_ptr<IAllocator> CreateROCMPinnedAllocator(const char* name) = 0;
virtual std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() = 0;

virtual void rocm__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) = 0;
virtual void rocm__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) = 0;
@@ -1256,9 +1253,7 @@ struct ProviderHost {
virtual training::DistributedRunContext& GetDistributedRunContextInstance() = 0;
#endif

#if defined(USE_CUDA) || defined(USE_ROCM)
virtual PhiloxGenerator& PhiloxGenerator__Default() = 0;
#endif

#ifdef ENABLE_TRAINING_TORCH_INTEROP
virtual void contrib__PythonOpBase__Init(contrib::PythonOpBase* p, const OpKernelInfo& info) = 0;
7 changes: 2 additions & 5 deletions onnxruntime/core/session/provider_bridge_ort.cc
@@ -258,10 +258,8 @@ struct ProviderHostImpl : ProviderHost {
void* CPUAllocator__Alloc(CPUAllocator* p, size_t size) override { return p->CPUAllocator::Alloc(size); }
void CPUAllocator__Free(CPUAllocator* p, void* allocation) override { return p->CPUAllocator::Free(allocation); }

#ifdef USE_CUDA
std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA().CreateCUDAAllocator(device_id, name); }
std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(const char* name) override { return GetProviderInfo_CUDA().CreateCUDAPinnedAllocator(name); }
std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() override { return GetProviderInfo_CUDA().CreateGPUDataTransfer(); }

void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }
void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }
@@ -271,7 +269,6 @@ struct ProviderHostImpl : ProviderHost {

Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return GetProviderInfo_CUDA().CudaCall_false(retCode, exprString, libName, successCode, msg, file, line); }
void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { GetProviderInfo_CUDA().CudaCall_true(retCode, exprString, libName, successCode, msg, file, line); }
#endif

#ifdef USE_MIGRAPHX
std::unique_ptr<IAllocator> CreateMIGraphXAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_MIGraphX().CreateMIGraphXAllocator(device_id, name); }
@@ -291,6 +288,8 @@ struct ProviderHostImpl : ProviderHost {

Status RocmCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return GetProviderInfo_ROCM().RocmCall_false(retCode, exprString, libName, successCode, msg, file, line); }
void RocmCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { GetProviderInfo_ROCM().RocmCall_true(retCode, exprString, libName, successCode, msg, file, line); }
#else
std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() override { return GetProviderInfo_CUDA().CreateGPUDataTransfer(); }
#endif

std::string GetEnvironmentVar(const std::string& var_name) override { return Env::Default().GetEnvironmentVar(var_name); }
@@ -1560,9 +1559,7 @@ struct ProviderHostImpl : ProviderHost {
training::DistributedRunContext& GetDistributedRunContextInstance() override { return training::DistributedRunContext::GetInstance(); }
#endif

#if defined(USE_CUDA) || defined(USE_ROCM)
PhiloxGenerator& PhiloxGenerator__Default() override { return PhiloxGenerator::Default(); }
#endif

#ifdef ENABLE_TRAINING_TORCH_INTEROP
void contrib__PythonOpBase__Init(contrib::PythonOpBase* p, const OpKernelInfo& info) override { p->PythonOpBase::Init(info); }
40 changes: 35 additions & 5 deletions tools/ci_build/build.py
@@ -782,6 +782,12 @@ def convert_arg_line_to_args(self, arg_line):
parser.add_argument("--use_triton_kernel", action="store_true", help="Use triton compiled kernels")
parser.add_argument("--use_lock_free_queue", action="store_true", help="Use lock-free task queue for threadpool.")

parser.add_argument(
"--enable_generic_interface",
action="store_true",
help="build ORT shared library and compatible bridge with primary EPs(tensorRT, OpenVino, Qnn, vitisai) but not tests",
)

if not is_windows():
parser.add_argument(
"--allow_running_as_root",
@@ -1042,6 +1048,12 @@ def generate_build_tree(
"-Donnxruntime_USE_TENSORRT=" + ("ON" if args.use_tensorrt else "OFF"),
"-Donnxruntime_USE_TENSORRT_BUILTIN_PARSER="
+ ("ON" if args.use_tensorrt_builtin_parser and not args.use_tensorrt_oss_parser else "OFF"),
# interface variables are used only for building onnxruntime/onnxruntime_shared.dll but not EPs
"-Donnxruntime_USE_TENSORRT_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
"-Donnxruntime_USE_CUDA_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
"-Donnxruntime_USE_OPENVINO_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
"-Donnxruntime_USE_VITISAI_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
"-Donnxruntime_USE_QNN_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
# set vars for migraphx
"-Donnxruntime_USE_MIGRAPHX=" + ("ON" if args.use_migraphx else "OFF"),
"-Donnxruntime_DISABLE_CONTRIB_OPS=" + ("ON" if args.disable_contrib_ops else "OFF"),
@@ -1372,6 +1384,8 @@ def generate_build_tree(
cmake_args += ["-Donnxruntime_BUILD_QNN_EP_STATIC_LIB=ON"]
if args.android and args.use_qnn != "static_lib":
raise BuildError("Only support Android + QNN builds with QNN EP built as a static library.")
if args.use_qnn == "static_lib" and args.enable_generic_interface:
raise BuildError("Generic ORT interface only supported with QNN EP built as a shared library.")

if args.use_coreml:
cmake_args += ["-Donnxruntime_USE_COREML=ON"]
@@ -1529,6 +1543,12 @@ def generate_build_tree(
"-Donnxruntime_USE_FULL_PROTOBUF=ON",
]

# When this flag is enabled, that means we only build ONNXRuntime shared library, expecting some compatible EP
# shared lib being build in a seperate process. So we skip the test for now as ONNXRuntime shared lib built under
# this flag is not expected to work alone
if args.enable_generic_interface:
cmake_args += ["-Donnxruntime_BUILD_UNIT_TESTS=OFF"]

if args.enable_lazy_tensor:
import torch

@@ -2649,6 +2669,9 @@ def main():
# Disable ONNX Runtime's builtin memory checker
args.disable_memleak_checker = True

if args.enable_generic_interface:
args.test = False

# If there was no explicit argument saying what to do, default
# to update, build and test (for native builds).
if not (args.update or args.clean or args.build or args.test or args.gen_doc):
@@ -2752,7 +2775,10 @@ def main():
source_dir = os.path.normpath(os.path.join(script_dir, "..", ".."))

# if using cuda, setup cuda paths and env vars
cuda_home, cudnn_home = setup_cuda_vars(args)
cuda_home = ""
cudnn_home = ""
if args.use_cuda:
cuda_home, cudnn_home = setup_cuda_vars(args)

mpi_home = args.mpi_home
nccl_home = args.nccl_home
@@ -2765,10 +2791,14 @@ def main():
armnn_home = args.armnn_home
armnn_libs = args.armnn_libs

qnn_home = args.qnn_home
qnn_home = ""
if args.use_qnn:
qnn_home = args.qnn_home

# if using tensorrt, setup tensorrt paths
tensorrt_home = setup_tensorrt_vars(args)
tensorrt_home = ""
if args.use_tensorrt:
tensorrt_home = setup_tensorrt_vars(args)

# if using migraphx, setup migraphx paths
migraphx_home = setup_migraphx_vars(args)
@@ -2853,9 +2883,9 @@ def main():
toolset = "host=" + host_arch + ",version=" + args.msvc_toolset
else:
toolset = "host=" + host_arch
if args.cuda_version:
if args.use_cuda and args.cuda_version:
toolset += ",cuda=" + args.cuda_version
elif args.cuda_home:
elif args.use_cuda and args.cuda_home:
toolset += ",cuda=" + args.cuda_home
if args.windows_sdk_version:
target_arch += ",version=" + args.windows_sdk_version
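The comment added in generate_build_tree above spells out the intent: with --enable_generic_interface, only the ORT shared library is built, and compatible EP shared libraries are expected to come from separate builds. A hedged sketch of that two-step flow; the paths, configuration, and choice of TensorRT are illustrative assumptions, not part of the commit:

```python
# Sketch of the split build this flag enables: step 1 needs no IHV SDKs,
# step 2 is a separate invocation that still requires the vendor toolkit.
import subprocess

def build(*extra_args: str) -> None:
    subprocess.run(
        ["python", "tools/ci_build/build.py", "--config", "RelWithDebInfo", *extra_args],
        check=True,
    )

# Step 1: generic ORT shared library exposing all primary EP interfaces.
build("--build_shared_lib", "--enable_generic_interface")

# Step 2 (separate process): build an EP bridge, e.g. TensorRT, against a
# hypothetical SDK location.
build("--use_tensorrt", "--tensorrt_home", "/opt/tensorrt")
```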
19 changes: 19 additions & 0 deletions tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml
@@ -177,6 +177,25 @@ stages:
WITH_CACHE: false
MachinePool: 'onnxruntime-Win-CPU-2022'

- stage: x64_release_ep_generic_interface
dependsOn: []
jobs:
- template: templates/jobs/win-ci-vs-2022-job.yml
parameters:
BuildConfig: 'RelWithDebInfo'
buildArch: x64
additionalBuildFlags: --enable_generic_interface
msbuildPlatform: x64
isX86: false
job_name_suffix: x64_release_ep_generic_interface
RunOnnxRuntimeTests: false # --enable_generic_interface does not build tests
EnablePython: false
isTraining: false
ORT_EP_NAME: CPU
GenerateDocumentation: false
WITH_CACHE: false
MachinePool: 'onnxruntime-Win-CPU-2022'

- stage: x86_release
dependsOn: []
jobs:
