Commits (60 total; changes shown from 57)
36c0dc1  plugin TRT EP init (chilo-ms, Jun 23, 2025)
ed65a9f  clean up GetCapabilityImpl and make it pass compiler for now (chilo-ms, Jun 23, 2025)
3269f73  Clean up CompileImpl (chilo-ms, Jun 23, 2025)
4da9f90  update ep factory (chilo-ms, Jun 25, 2025)
1928767  update ep factory (chilo-ms, Jun 25, 2025)
4f5ffcb  update ep factory (chilo-ms, Jun 25, 2025)
bc64bdc  clean up and add back onnx_ctx_model_helper.cc (chilo-ms, Jun 25, 2025)
c4437a2  clean up (chilo-ms, Jun 25, 2025)
a5a294e  remove onnxruntime namespace (chilo-ms, Jun 25, 2025)
f990a7b  update (chilo-ms, Jun 25, 2025)
7851a1c  Add TRTEpNodeComputeInfo (chilo-ms, Jun 28, 2025)
be453b1  add allocator and data transfer (chilo-ms, Jul 2, 2025)
3d6fa57  fix a lot of compile errors (chilo-ms, Jul 2, 2025)
c8e3d6f  call EpDevice_AddAllocatorInfo in GetSupportedDevicesImpl (chilo-ms, Jul 2, 2025)
3c43029  temporary way to get provider option without proper API (chilo-ms, Jul 3, 2025)
549b29d  Clean up cmake file to remove dependencies that built with ORT (chilo-ms, Jul 7, 2025)
3ad7736  Update CompileImpl (chilo-ms, Jul 8, 2025)
3ced4cf  add ort_graph_to_proto.h and leverage OrtGraphToProto utilities (chilo-ms, Jul 8, 2025)
081de36  update EP context model helper (chilo-ms, Jul 10, 2025)
75240a4  Convert onnxruntime::Status to OrtStatus (chilo-ms, Jul 10, 2025)
f73420f  remove unused files (chilo-ms, Jul 10, 2025)
938a3fe  use GetSessionOptionsConfigEntries to get provider options (chilo-ms, Jul 10, 2025)
731ed72  fix a bunch of compile errors (chilo-ms, Jul 11, 2025)
30e0f91  update memory info and data transfer in TRT EP's factor to accommodat… (chilo-ms, Jul 14, 2025)
f443a33  update cuda/pinned allocator to make compiler happy (chilo-ms, Jul 14, 2025)
95dd71e  add GetVersionImpl in factory (chilo-ms, Jul 14, 2025)
35b0cf1  update data transfer initialization in TRT EP (chilo-ms, Jul 14, 2025)
a65908f  Fix compile errors/issues (chilo-ms, Jul 14, 2025)
c77391f  fix to use correct API (chilo-ms, Jul 14, 2025)
c5363e6  fix bug for gpu data transfer implementation (chilo-ms, Jul 15, 2025)
09138ee  clean up (chilo-ms, Jul 15, 2025)
a8dde45  remove unnecessary files (chilo-ms, Jul 15, 2025)
b911754  Temporarily manually creates cudaStream to run (chilo-ms, Jul 15, 2025)
0c817ac  Temporary make plugin TRT links against the protobuf, onnx, flatbuffe… (chilo-ms, Jul 15, 2025)
da729f9  fix the issue of error LNK2038: mismatch detected for 'RuntimeLibrary… (chilo-ms, Jul 15, 2025)
6fd38c3  refactor memory info stored in factory (chilo-ms, Jul 15, 2025)
7467c65  update as onnxruntime_ep_c_api.h changes (chilo-ms, Jul 16, 2025)
da0f9c6  Add support for dump and run EP Context model (chilo-ms, Jul 23, 2025)
ccf20da  update and sync with latest ep c api (chilo-ms, Jul 23, 2025)
cca956d  remove delete resource in TRTEpDataTransfer::ReleaseImpl (chilo-ms, Jul 24, 2025)
404cd4e  update cmake file to force dynamic release CRT globally for all depen… (chilo-ms, Jul 29, 2025)
c58130b  use updated Value_GetMemoryDevice API (chilo-ms, Aug 11, 2025)
5828e10  update ort to graph util (chilo-ms, Aug 11, 2025)
832a7f4  Add EP API Stream support (chilo-ms, Aug 11, 2025)
edd4b34  Update CMakeLists.txt (chilo-ms, Aug 11, 2025)
5f46b68  fix mem leak for OrtAllocator (chilo-ms, Aug 12, 2025)
e81d395  add missing header file (chilo-ms, Aug 19, 2025)
1211cd6  fix build issue on Linux (chilo-ms, Aug 20, 2025)
0a8be0d  lintrunner -a (chilo-ms, Aug 22, 2025)
e4c2405  Update to use new API OpAttr_GetTensorAttributeAsOrtValue (chilo-ms, Aug 29, 2025)
2472a15  remove unnecessary files (chilo-ms, Sep 8, 2025)
ab8cd70  Add default logger for TRT logger (chilo-ms, Sep 10, 2025)
12d2306  Add default logger for TRT EP (chilo-ms, Sep 10, 2025)
c6ae7b6  update include path in utility function header (chilo-ms, Sep 10, 2025)
6b180a4  Add default logger for TRT EP (cont.) (chilo-ms, Sep 10, 2025)
b3ac797  put code under namespace trt_ep (chilo-ms, Sep 16, 2025)
632d224  remove unnecessary files (chilo-ms, Sep 18, 2025)
4d32867  update GetCapabilityImpl() (chilo-ms, Sep 25, 2025)
ae9686f  Add code for updating cache path for EPContext node (chilo-ms, Sep 26, 2025)
c8a6ae6  add onnx_external_data_bytestream support for refitting the engine (chilo-ms, Sep 26, 2025)
161 changes: 161 additions & 0 deletions plugin_execution_providers/tensorrt/CMakeLists.txt
@@ -0,0 +1,161 @@
# usage:
# cd build/
# cmake -S ../ -B ./ -DCMAKE_BUILD_TYPE=Debug -DORT_HOME=/home/lochi/onnxruntime-win-x64-gpu-1.23.0 \
#       -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
#       -DTENSORRT_HOME=/home/lochi/tensorrt/TensorRT-10.3.0.26 -DCMAKE_POSITION_INDEPENDENT_CODE=ON
#       (pick CMAKE_CUDA_ARCHITECTURES from the result of "nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits")
Review comment (Contributor): nit: perhaps should replace lochi with a generic user or something like it

Review comment (Member): Could it be put in the c_cxx folder along with other C/C++ examples?

# cmake --build ./ --config Debug
cmake_minimum_required(VERSION 3.26)
project(TensorRTEp VERSION 1.0)
set(CMAKE_CXX_STANDARD 17)

enable_language(CUDA) # via nvcc to get the CUDA toolkit
file(TO_CMAKE_PATH "/usr/local/cuda" CUDAToolkit_ROOT)
find_package(CUDAToolkit REQUIRED)

# CMake config to force dynamic debug CRT or dynamic release CRT globally for all dependencies.
# This is to address the issue of:
# libprotobufd.lib(common.obj) : error LNK2038: mismatch detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value 'MDd_DynamicDebug' in unary_elementwise_ops_impl.obj
if (WIN32)
  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL" CACHE STRING "" FORCE) # /MDd
    set(BUILD_SHARED_LIBS OFF) # Build protobuf as static .lib, but using dynamic runtime
  endif()

  if(CMAKE_BUILD_TYPE STREQUAL "Release")
    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL" CACHE STRING "" FORCE) # /MD
    set(BUILD_SHARED_LIBS OFF) # Build protobuf as static .lib, but using dynamic runtime
  endif()
endif()

add_definitions(-DONNX_NAMESPACE=onnx)
add_definitions(-DONNX_ML)
add_definitions(-DNV_TENSORRT_MAJOR=10)
add_definitions(-DNOMINMAX)
file(GLOB tensorrt_src "./*.cc" "./utils/*.cc" "./cuda/unary_elementwise_ops_impl.cu" "./*.h")
add_library(TensorRTEp SHARED ${tensorrt_src})

if (NOT ORT_HOME)
  message(FATAL_ERROR "Please specify ORT_HOME, e.g. -DORT_HOME=/path/to/ort/")
endif()

if (NOT TENSORRT_HOME)
  message(FATAL_ERROR "Please specify TENSORRT_HOME, e.g. -DTENSORRT_HOME=/path/to/trt/")
endif()

# Use release mode if not specified
if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
endif()

# Add dependencies
include(FetchContent)

# Add protobuf
FetchContent_Declare(
  protobuf
  GIT_REPOSITORY https://github.com/protocolbuffers/protobuf.git
  GIT_TAG v21.12 # Use a specific tag or commit
)

if (WIN32)
  # Sometimes, protobuf ignores CMAKE_MSVC_RUNTIME_LIBRARY. To ensure it works:
  set(protobuf_MSVC_STATIC_RUNTIME OFF CACHE BOOL "" FORCE)
endif()

FetchContent_MakeAvailable(protobuf)

# Add ONNX
FetchContent_Declare(
onnx
GIT_REPOSITORY https://github.com/onnx/onnx.git
GIT_TAG v1.18.0 # Use a specific tag or commit
)

FetchContent_MakeAvailable(onnx)

# Add GSL
FetchContent_Declare(
gsl
GIT_REPOSITORY https://github.com/microsoft/GSL.git
GIT_TAG v4.0.0 # Use a specific tag or commit
)

FetchContent_MakeAvailable(gsl)

# Add flatbuffers
FetchContent_Declare(
flatbuffers
GIT_REPOSITORY https://github.com/google/flatbuffers.git
GIT_TAG v23.5.26 # Use a specific tag or commit
)

FetchContent_MakeAvailable(flatbuffers)

set(DEPS_PATH "${CMAKE_BINARY_DIR}/_deps")

if (WIN32) # Windows
  set(ORT_LIB "${ORT_HOME}/lib/onnxruntime.lib")
  set(TRT_LIBS "${TENSORRT_HOME}/lib/nvinfer_10.lib"
               "${TENSORRT_HOME}/lib/nvinfer_plugin_10.lib"
               "${TENSORRT_HOME}/lib/nvonnxparser_10.lib")

  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotobufd.lib"
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotocd.lib")
  else()
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotobuf.lib"
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotoc.lib")
  endif()

  # Append (not overwrite) the flatbuffers/onnx libs so the protobuf libs above are kept.
  set(DEPS_LIBS ${DEPS_LIBS}
      "${DEPS_PATH}/flatbuffers-build/${CMAKE_BUILD_TYPE}/flatbuffers.lib"
      "${DEPS_PATH}/onnx-build/${CMAKE_BUILD_TYPE}/onnx.lib"
      "${DEPS_PATH}/onnx-build/${CMAKE_BUILD_TYPE}/onnx_proto.lib")

  set(TRT_EP_LIB_LINK_FLAG
      "-DEF:${CMAKE_SOURCE_DIR}/tensorrt_execution_provider.def")

else() # Linux
  set(ORT_LIB "${ORT_HOME}/lib/libonnxruntime.so")
  set(TRT_LIBS "${TENSORRT_HOME}/lib/libnvinfer.so"
               "${TENSORRT_HOME}/lib/libnvinfer_plugin.so"
               "${TENSORRT_HOME}/lib/libnvonnxparser.so")
  set(DEPS_LIBS "${DEPS_PATH}/flatbuffers-build/libflatbuffers.a"
                "${DEPS_PATH}/onnx-build/libonnx.a"
                "${DEPS_PATH}/onnx-build/libonnx_proto.a")

  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/libprotobufd.a"
        "${DEPS_PATH}/protobuf-build/libprotocd.a")
  else()
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/libprotobuf.a"
        "${DEPS_PATH}/protobuf-build/libprotoc.a")
  endif()
endif()

message(STATUS "Looking for the following dependencies ...")
message(STATUS "ORT lib  : ${ORT_LIB}")
message(STATUS "TRT libs : ${TRT_LIBS}")
message(STATUS "Deps libs: ${DEPS_LIBS}")

set_property(TARGET TensorRTEp APPEND_STRING PROPERTY LINK_FLAGS
             ${TRT_EP_LIB_LINK_FLAG})

target_include_directories(TensorRTEp PUBLIC "${ORT_HOME}/include"
                                             "./utils"
                                             "/usr/local/cuda/include"
                                             "${TENSORRT_HOME}/include"
                                             "${DEPS_PATH}/flatbuffers-src/include"
                                             "${DEPS_PATH}/gsl-src/include" # GSL is header-only
                                             "${DEPS_PATH}/onnx-src"
                                             "${DEPS_PATH}/onnx-build"
                                             "${DEPS_PATH}/protobuf-src/src"
)

target_link_libraries(TensorRTEp PUBLIC #${DEPS_LIBS}
                      protobuf::libprotobuf onnx flatbuffers
                      ${ORT_LIB}
                      ${TRT_LIBS}
                      CUDA::cudart
)
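
For context on how the built library is consumed, below is a minimal host-side sketch of loading the plugin EP through the ONNX Runtime plugin-EP C API. The registration name, library path, and error handling are placeholders, not part of this PR; verify the exact API names against the onnxruntime_c_api.h shipped in the ORT_HOME package.

// Hypothetical sketch: register the plugin EP library with an OrtEnv so its
// devices become visible to session creation.
#include <onnxruntime_c_api.h>
#include <stdio.h>

int main() {
  const OrtApi* ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  OrtEnv* env = nullptr;
  ort->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "trt_plugin_demo", &env);

  // The registration name is arbitrary; the path points at the library built
  // by this CMakeLists.txt (TensorRTEp.dll on Windows).
  OrtStatus* st = ort->RegisterExecutionProviderLibrary(env, "TensorRTEp",
                                                        ORT_TSTR("./libTensorRTEp.so"));
  if (st != nullptr) {
    fprintf(stderr, "register failed: %s\n", ort->GetErrorMessage(st));
    ort->ReleaseStatus(st);
    ort->ReleaseEnv(env);
    return 1;
  }

  // ... enumerate OrtEpDevice entries with GetEpDevices(), append the plugin's
  // device via SessionOptionsAppendExecutionProvider_V2(), create and run a
  // session, then clean up ...

  ort->UnregisterExecutionProviderLibrary(env, "TensorRTEp");
  ort->ReleaseEnv(env);
  return 0;
}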
76 changes: 76 additions & 0 deletions plugin_execution_providers/tensorrt/cuda/cu_inc/unary_elementwise_impl.cuh
@@ -0,0 +1,76 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include <stdint.h>

namespace cuda {

// We would like to use 64-bit integers to support large matrices. However, CUDA seems to support
// only 32-bit integers. For now, use int32_t so that both Linux and Windows see this as a 32-bit
// integer type.
#ifndef CUDA_LONG
#define CUDA_LONG int32_t
#endif

template <class INT, class INT2>
inline __host__ __device__ INT CeilDiv(INT a, INT2 b)  // ceil(a/b)
{
  // These size_t casts are necessary since b may be INT_MAX (for maxGridSize[]).
  return (INT)(((size_t)a + (size_t)b - 1) / (size_t)b);
}

struct GridDim {
  enum : CUDA_LONG {
    maxThreadsPerBlock = 256,  // max threads per block
    maxElementsPerThread = 4,  // max elements processed per thread
  };
};

template <typename InT, typename OutT, typename FuncT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void _UnaryElementWise(
    const InT* input_data,
    OutT* output_data,
    const FuncT functor,
    CUDA_LONG N) {
  CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
  InT value[NumElementsPerThread];

  CUDA_LONG id = start;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      value[i] = input_data[id];
      id += NumThreadsPerBlock;
    }
  }

  id = start;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      output_data[id] = functor(value[i]);
      id += NumThreadsPerBlock;
    }
  }
}

template <typename InT, typename OutT, typename FuncT>
void UnaryElementWiseImpl(
    cudaStream_t stream,
    const InT* input_data,
    OutT* output_data,
    const FuncT& func,
    size_t count) {
  if (count == 0)  // special case where there's a dim value of 0 in the shape
    return;

  int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
  CUDA_LONG N = static_cast<CUDA_LONG>(count);
  _UnaryElementWise<InT, OutT, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
          input_data,
          output_data,
          func,
          N);
}

}  // namespace cuda
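
To make the launch layout concrete, here is a small hypothetical example driving UnaryElementWiseImpl; OP_Neg and NegateOnStream are illustrative names that do not appear in this PR.

// Negate a float buffer using UnaryElementWiseImpl. Each block launches
// GridDim::maxThreadsPerBlock (256) threads, and each thread handles up to
// GridDim::maxElementsPerThread (4) elements.
#include <cuda_runtime.h>
#include "cu_inc/unary_elementwise_impl.cuh"

struct OP_Neg {
  __device__ __inline__ float operator()(const float& x) const { return -x; }
};

void NegateOnStream(cudaStream_t stream, const float* d_in, float* d_out, size_t count) {
  cuda::UnaryElementWiseImpl(stream, d_in, d_out, OP_Neg(), count);  // async launch on `stream`
}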
90 changes: 90 additions & 0 deletions plugin_execution_providers/tensorrt/cuda/unary_elementwise_ops_impl.cu
@@ -0,0 +1,90 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <cuda_runtime.h>
#include <type_traits>

#include "cu_inc/unary_elementwise_impl.cuh"

#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
#include "cuda_fp8.h"
#endif
#include <cuda_fp16.h>

namespace cuda {

// The postfix of the op name indicates the types supported by the op:
// B: uint8_t
// W: uint16_t
// U: uint32_t
// Z: uint64_t
// C: int8_t
// S: int16_t
// I: int32_t
// L: int64_t
// H: float16
// F: float
// D: double
// O: bool
// X: BFloat16

// When casting, half needs to be converted via the float type from most other types.
template <typename T>
struct ViaTypeMap {
  typedef T ViaT;
};

template <>
struct ViaTypeMap<half> {
  typedef float ViaT;
};

template <typename InT, typename OutT>
struct OP_Cast {
  __device__ __inline__ OutT operator()(const InT& a) const {
    const bool any_float16 = std::is_same<half, InT>::value || std::is_same<half, OutT>::value;
    typedef typename std::conditional<any_float16, half, OutT>::type T;
    typedef typename ViaTypeMap<T>::ViaT ViaT;
    return (OutT)((ViaT)a);
  }
};

#define IMPL_CAST_IMPL(InT, OutT)                                                                         \
  void Explicit_Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) { \
    UnaryElementWiseImpl(stream, input_data, output_data, OP_Cast<InT, OutT>(), count);                   \
  }

#define IMPL_CAST_IMPL_THROW(InT, OutT)                                                              \
  void Explicit_Impl_Cast(cudaStream_t /*stream*/, const InT* /*input_data*/, OutT* /*output_data*/, \
                          size_t /*count*/) {                                                        \
    ORT_THROW("Cast from " #InT " to " #OutT " must define saturate.");                              \
  }

#define IMPL_CAST_IMPL_FROM(T)  \
  IMPL_CAST_IMPL(T, half)       \
  IMPL_CAST_IMPL(T, float)      \
  IMPL_CAST_IMPL(T, double)     \
  IMPL_CAST_IMPL(T, int8_t)     \
  IMPL_CAST_IMPL(T, int16_t)    \
  IMPL_CAST_IMPL(T, int32_t)    \
  IMPL_CAST_IMPL(T, int64_t)    \
  IMPL_CAST_IMPL(T, uint8_t)    \
  IMPL_CAST_IMPL(T, uint16_t)   \
  IMPL_CAST_IMPL(T, uint32_t)   \
  IMPL_CAST_IMPL(T, uint64_t)   \
  IMPL_CAST_IMPL(T, bool)
// IMPL_CAST_IMPL(T, BFloat16)

IMPL_CAST_IMPL_FROM(half)
IMPL_CAST_IMPL_FROM(float)
IMPL_CAST_IMPL_FROM(double)
IMPL_CAST_IMPL_FROM(int8_t)
IMPL_CAST_IMPL_FROM(int16_t)
IMPL_CAST_IMPL_FROM(int32_t)
IMPL_CAST_IMPL_FROM(int64_t)
IMPL_CAST_IMPL_FROM(uint8_t)
IMPL_CAST_IMPL_FROM(uint16_t)
IMPL_CAST_IMPL_FROM(uint32_t)
IMPL_CAST_IMPL_FROM(uint64_t)
IMPL_CAST_IMPL_FROM(bool)
// IMPL_CAST_IMPL_FROM(BFloat16)

}  // namespace cuda
51 changes: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

namespace cuda {

// Cast

#define DECL_IMPL_CAST(InT, OutT) \
  void Explicit_Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count);

#define DECL_IMPL_CAST_FROM(T)  \
  DECL_IMPL_CAST(T, half)       \
  DECL_IMPL_CAST(T, float)      \
  DECL_IMPL_CAST(T, double)     \
  DECL_IMPL_CAST(T, int8_t)     \
  DECL_IMPL_CAST(T, int16_t)    \
  DECL_IMPL_CAST(T, int32_t)    \
  DECL_IMPL_CAST(T, int64_t)    \
  DECL_IMPL_CAST(T, uint8_t)    \
  DECL_IMPL_CAST(T, uint16_t)   \
  DECL_IMPL_CAST(T, uint32_t)   \
  DECL_IMPL_CAST(T, uint64_t)   \
  DECL_IMPL_CAST(T, bool)
// DECL_IMPL_CAST(T, BFloat16)

DECL_IMPL_CAST_FROM(half)
DECL_IMPL_CAST_FROM(float)
DECL_IMPL_CAST_FROM(double)
DECL_IMPL_CAST_FROM(int8_t)
DECL_IMPL_CAST_FROM(int16_t)
DECL_IMPL_CAST_FROM(int32_t)
DECL_IMPL_CAST_FROM(int64_t)
DECL_IMPL_CAST_FROM(uint8_t)
DECL_IMPL_CAST_FROM(uint16_t)
DECL_IMPL_CAST_FROM(uint32_t)
DECL_IMPL_CAST_FROM(uint64_t)
DECL_IMPL_CAST_FROM(bool)
// DECL_IMPL_CAST_FROM(BFloat16)

template <typename InT, typename OutT>
void Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) {
  Explicit_Impl_Cast(stream, input_data, output_data, count);
}

} // namespace cuda
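
As a usage sketch (the wrapper name and include path below are assumptions, not from this PR), casting a device buffer from half to float reduces to a single call:

// Illustrative only: cast `count` half values in d_in to float values in d_out.
// Both pointers are assumed to be device memory allocated with cudaMalloc.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include "unary_elementwise_ops_impl.h"  // the header above; exact path assumed

void CastHalfToFloat(cudaStream_t stream, const half* d_in, float* d_out, size_t count) {
  cuda::Impl_Cast<half, float>(stream, d_in, d_out, count);
  // The underlying kernel launch is asynchronous with respect to the host;
  // synchronize the stream before reading d_out on the CPU.
}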