Commits (60 total; changes shown from 57)
36c0dc1  plugin TRT EP init (chilo-ms, Jun 23, 2025)
ed65a9f  clean up GetCapabilityImpl and make it pass compiler for now (chilo-ms, Jun 23, 2025)
3269f73  Clean up CompileImpl (chilo-ms, Jun 23, 2025)
4da9f90  update ep factory (chilo-ms, Jun 25, 2025)
1928767  update ep factory (chilo-ms, Jun 25, 2025)
4f5ffcb  update ep factory (chilo-ms, Jun 25, 2025)
bc64bdc  clean up and add back onnx_ctx_model_helper.cc (chilo-ms, Jun 25, 2025)
c4437a2  clean up (chilo-ms, Jun 25, 2025)
a5a294e  remove onnxruntime namespace (chilo-ms, Jun 25, 2025)
f990a7b  update (chilo-ms, Jun 25, 2025)
7851a1c  Add TRTEpNodeComputeInfo (chilo-ms, Jun 28, 2025)
be453b1  add allocator and data transfer (chilo-ms, Jul 2, 2025)
3d6fa57  fix a lot of compile errors (chilo-ms, Jul 2, 2025)
c8e3d6f  call EpDevice_AddAllocatorInfo in GetSupportedDevicesImpl (chilo-ms, Jul 2, 2025)
3c43029  temporary way to get provider option without proper API (chilo-ms, Jul 3, 2025)
549b29d  Clean up cmake file to remove dependencies that built with ORT (chilo-ms, Jul 7, 2025)
3ad7736  Update CompileImpl (chilo-ms, Jul 8, 2025)
3ced4cf  add ort_graph_to_proto.h and leverage OrtGraphToProto utilities (chilo-ms, Jul 8, 2025)
081de36  update EP context model helper (chilo-ms, Jul 10, 2025)
75240a4  Convert onnxruntime::Status to OrtStatus (chilo-ms, Jul 10, 2025)
f73420f  remove unused files (chilo-ms, Jul 10, 2025)
938a3fe  use GetSessionOptionsConfigEntries to get provider options (chilo-ms, Jul 10, 2025)
731ed72  fix a bunch of compile errors (chilo-ms, Jul 11, 2025)
30e0f91  update memory info and data transfer in TRT EP's factor to accommodat… (chilo-ms, Jul 14, 2025)
f443a33  update cuda/pinned allocator to make compiler happy (chilo-ms, Jul 14, 2025)
95dd71e  add GetVersionImpl in factory (chilo-ms, Jul 14, 2025)
35b0cf1  update data transfer initialization in TRT EP (chilo-ms, Jul 14, 2025)
a65908f  Fix compile errors/issues (chilo-ms, Jul 14, 2025)
c77391f  fix to use correct API (chilo-ms, Jul 14, 2025)
c5363e6  fix bug for gpu data transfer implementation (chilo-ms, Jul 15, 2025)
09138ee  clean up (chilo-ms, Jul 15, 2025)
a8dde45  remove unnecessary files (chilo-ms, Jul 15, 2025)
b911754  Temporarily manually creates cudaStream to run (chilo-ms, Jul 15, 2025)
0c817ac  Temporary make plugin TRT links against the protobuf, onnx, flatbuffe… (chilo-ms, Jul 15, 2025)
da729f9  fix the issue of error LNK2038: mismatch detected for 'RuntimeLibrary… (chilo-ms, Jul 15, 2025)
6fd38c3  refactor memory info stored in factory (chilo-ms, Jul 15, 2025)
7467c65  update as onnxruntime_ep_c_api.h changes (chilo-ms, Jul 16, 2025)
da0f9c6  Add support for dump and run EP Context model (chilo-ms, Jul 23, 2025)
ccf20da  update and sync with latest ep c api (chilo-ms, Jul 23, 2025)
cca956d  remove delete resource in TRTEpDataTransfer::ReleaseImpl (chilo-ms, Jul 24, 2025)
404cd4e  update cmake file to force dynamic release CRT globally for all depen… (chilo-ms, Jul 29, 2025)
c58130b  use updated Value_GetMemoryDevice API (chilo-ms, Aug 11, 2025)
5828e10  update ort to graph util (chilo-ms, Aug 11, 2025)
832a7f4  Add EP API Stream support (chilo-ms, Aug 11, 2025)
edd4b34  Update CMakeLists.txt (chilo-ms, Aug 11, 2025)
5f46b68  fix mem leak for OrtAllocator (chilo-ms, Aug 12, 2025)
e81d395  add missing header file (chilo-ms, Aug 19, 2025)
1211cd6  fix build issue on Linux (chilo-ms, Aug 20, 2025)
0a8be0d  lintrunner -a (chilo-ms, Aug 22, 2025)
e4c2405  Update to use new API OpAttr_GetTensorAttributeAsOrtValue (chilo-ms, Aug 29, 2025)
2472a15  remove unnecessary files (chilo-ms, Sep 8, 2025)
ab8cd70  Add default logger for TRT logger (chilo-ms, Sep 10, 2025)
12d2306  Add default logger for TRT EP (chilo-ms, Sep 10, 2025)
c6ae7b6  update include path in utility function header (chilo-ms, Sep 10, 2025)
6b180a4  Add default logger for TRT EP (cont.) (chilo-ms, Sep 10, 2025)
b3ac797  put code under namespace trt_ep (chilo-ms, Sep 16, 2025)
632d224  remove unnecessary files (chilo-ms, Sep 18, 2025)
4d32867  update GetCapabilityImpl() (chilo-ms, Sep 25, 2025)
ae9686f  Add code for updating cache path for EPContext node (chilo-ms, Sep 26, 2025)
c8a6ae6  add onnx_external_data_bytestream support for refitting the engine (chilo-ms, Sep 26, 2025)
161 changes: 161 additions & 0 deletions plugin_execution_providers/tensorrt/CMakeLists.txt
@@ -0,0 +1,161 @@
# usage:
# cd build/
# cmake -S ../ -B ./ -DCMAKE_BUILD_TYPE=Debug -DORT_HOME=/home/lochi/onnxruntime-win-x64-gpu-1.23.0 \
#       -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
#       -DTENSORRT_HOME=/home/lochi/tensorrt/TensorRT-10.3.0.26 -DCMAKE_POSITION_INDEPENDENT_CODE=ON
#       (pick CMAKE_CUDA_ARCHITECTURES from the result of "nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits")
Review comment (Contributor): nit: perhaps should replace lochi with a generic user or something like it

Review comment (Member): Could it be put in the c_cxx folder along with other C/C++ examples?

# cmake --build ./ --config Debug
cmake_minimum_required(VERSION 3.26)
project(TensorRTEp VERSION 1.0)
set(CMAKE_CXX_STANDARD 17)

enable_language(CUDA) # via nvcc to get the CUDA toolkit
file(TO_CMAKE_PATH "/usr/local/cuda" CUDAToolkit_ROOT)
find_package(CUDAToolkit REQUIRED)

# CMake config to force dynamic debug CRT or dynamic release CRT globally for all dependencies.
# This is to address the issue of:
# libprotobufd.lib(common.obj) : error LNK2038: mismatch detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value 'MDd_DynamicDebug' in unary_elementwise_ops_impl.obj
if (WIN32)
  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL" CACHE STRING "" FORCE) # /MDd
    set(BUILD_SHARED_LIBS OFF) # Build protobuf as static .lib, but using dynamic runtime
  endif()

  if(CMAKE_BUILD_TYPE STREQUAL "Release")
    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL" CACHE STRING "" FORCE) # /MD
    set(BUILD_SHARED_LIBS OFF) # Build protobuf as static .lib, but using dynamic runtime
  endif()
endif()

add_definitions(-DONNX_NAMESPACE=onnx)
add_definitions(-DONNX_ML)
add_definitions(-DNV_TENSORRT_MAJOR=10)
add_definitions(-DNOMINMAX)
file(GLOB tensorrt_src "./*.cc" "./utils/*.cc" "./cuda/unary_elementwise_ops_impl.cu" "./*.h")
add_library(TensorRTEp SHARED ${tensorrt_src})

if (NOT ORT_HOME)
  message(FATAL_ERROR "Please specify ORT_HOME, e.g. -DORT_HOME=/path/to/ort/")
endif()

if (NOT TENSORRT_HOME)
  message(FATAL_ERROR "Please specify TENSORRT_HOME, e.g. -DTENSORRT_HOME=/path/to/trt/")
endif()

# Use release mode if not specified
if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
endif()

# Add dependencies
include(FetchContent)

# Add protobuf
FetchContent_Declare(
  protobuf
  GIT_REPOSITORY https://github.com/protocolbuffers/protobuf.git
  GIT_TAG v21.12 # Use a specific tag or commit
)

if (WIN32)
  # Sometimes, protobuf ignores CMAKE_MSVC_RUNTIME_LIBRARY. To ensure it works:
  set(protobuf_MSVC_STATIC_RUNTIME OFF CACHE BOOL "" FORCE)
endif()

FetchContent_MakeAvailable(protobuf)

# Add ONNX
FetchContent_Declare(
onnx
GIT_REPOSITORY https://github.com/onnx/onnx.git
GIT_TAG v1.18.0 # Use a specific tag or commit
)

FetchContent_MakeAvailable(onnx)

# Add GSL
FetchContent_Declare(
gsl
GIT_REPOSITORY https://github.com/microsoft/GSL.git
GIT_TAG v4.0.0 # Use a specific tag or commit
)

FetchContent_MakeAvailable(gsl)

# Add flatbuffers
FetchContent_Declare(
flatbuffers
GIT_REPOSITORY https://github.com/google/flatbuffers.git
GIT_TAG v23.5.26 # Use a specific tag or commit
)

FetchContent_MakeAvailable(flatbuffers)

set(DEPS_PATH "${CMAKE_BINARY_DIR}/_deps")

if (WIN32) # Windows
  set(ORT_LIB "${ORT_HOME}/lib/onnxruntime.lib")
  set(TRT_LIBS "${TENSORRT_HOME}/lib/nvinfer_10.lib"
               "${TENSORRT_HOME}/lib/nvinfer_plugin_10.lib"
               "${TENSORRT_HOME}/lib/nvonnxparser_10.lib")

  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotobufd.lib"
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotocd.lib")
  else()
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotobuf.lib"
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotoc.lib")
  endif()

  # Append (not overwrite) the flatbuffers/onnx libs so the protobuf libs above are kept.
  set(DEPS_LIBS ${DEPS_LIBS}
      "${DEPS_PATH}/flatbuffers-build/${CMAKE_BUILD_TYPE}/flatbuffers.lib"
      "${DEPS_PATH}/onnx-build/${CMAKE_BUILD_TYPE}/onnx.lib"
      "${DEPS_PATH}/onnx-build/${CMAKE_BUILD_TYPE}/onnx_proto.lib")

  set(TRT_EP_LIB_LINK_FLAG
      "-DEF:${CMAKE_SOURCE_DIR}/tensorrt_execution_provider.def")

else() # Linux
  set(ORT_LIB "${ORT_HOME}/lib/libonnxruntime.so")
  set(TRT_LIBS "${TENSORRT_HOME}/lib/libnvinfer.so"
               "${TENSORRT_HOME}/lib/libnvinfer_plugin.so"
               "${TENSORRT_HOME}/lib/libnvonnxparser.so")
  set(DEPS_LIBS "${DEPS_PATH}/flatbuffers-build/libflatbuffers.a"
                "${DEPS_PATH}/onnx-build/libonnx.a"
                "${DEPS_PATH}/onnx-build/libonnx_proto.a")

  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/libprotobufd.a"
        "${DEPS_PATH}/protobuf-build/libprotocd.a")
  else()
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/libprotobuf.a"
        "${DEPS_PATH}/protobuf-build/libprotoc.a")
  endif()
endif()

message(STATUS "Looking for the following dependencies ...")
message(STATUS "ORT lib  : ${ORT_LIB}")
message(STATUS "TRT libs : ${TRT_LIBS}")
message(STATUS "Deps libs: ${DEPS_LIBS}")

set_property(TARGET TensorRTEp APPEND_STRING PROPERTY LINK_FLAGS
             ${TRT_EP_LIB_LINK_FLAG})

target_include_directories(TensorRTEp PUBLIC "${ORT_HOME}/include"
                                             "./utils"
                                             "/usr/local/cuda/include"
                                             "${TENSORRT_HOME}/include"
                                             "${DEPS_PATH}/flatbuffers-src/include"
                                             "${DEPS_PATH}/gsl-src/include" # GSL is header-only
                                             "${DEPS_PATH}/onnx-src"
                                             "${DEPS_PATH}/onnx-build"
                                             "${DEPS_PATH}/protobuf-src/src"
)

target_link_libraries(TensorRTEp PUBLIC #${DEPS_LIBS}
                      protobuf::libprotobuf onnx flatbuffers
                      ${ORT_LIB}
                      ${TRT_LIBS}
                      CUDA::cudart
)
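
For context on how the built library is consumed, below is a minimal host-side sketch of loading the plugin EP through the ONNX Runtime plugin-EP C API. The registration name, library path, and error handling are placeholders, not part of this PR; verify the exact API names against the onnxruntime_c_api.h shipped in the ORT_HOME package.

// Hypothetical sketch: register the plugin EP library with an OrtEnv so its
// devices become visible to session creation.
#include <onnxruntime_c_api.h>
#include <stdio.h>

int main() {
  const OrtApi* ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  OrtEnv* env = nullptr;
  ort->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "trt_plugin_demo", &env);

  // The registration name is arbitrary; the path points at the library built
  // by this CMakeLists.txt (TensorRTEp.dll on Windows).
  OrtStatus* st = ort->RegisterExecutionProviderLibrary(env, "TensorRTEp",
                                                        ORT_TSTR("./libTensorRTEp.so"));
  if (st != nullptr) {
    fprintf(stderr, "register failed: %s\n", ort->GetErrorMessage(st));
    ort->ReleaseStatus(st);
    ort->ReleaseEnv(env);
    return 1;
  }

  // ... enumerate OrtEpDevice entries with GetEpDevices(), append the plugin's
  // device via SessionOptionsAppendExecutionProvider_V2(), create and run a
  // session, then clean up ...

  ort->UnregisterExecutionProviderLibrary(env, "TensorRTEp");
  ort->ReleaseEnv(env);
  return 0;
}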
76 changes: 76 additions & 0 deletions plugin_execution_providers/tensorrt/cuda/cu_inc/unary_elementwise_impl.cuh
@@ -0,0 +1,76 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include <stdint.h>

namespace cuda {

// We would like to use 64-bit integers to support large matrices. However, CUDA seems to support
// only 32-bit integers. For now, use int32_t so that both Linux and Windows see this as a 32-bit
// integer type.
#ifndef CUDA_LONG
#define CUDA_LONG int32_t
#endif

template <class INT, class INT2>
inline __host__ __device__ INT CeilDiv(INT a, INT2 b)  // ceil(a/b)
{
  // These size_t casts are necessary since b may be INT_MAX (for maxGridSize[]).
  return (INT)(((size_t)a + (size_t)b - 1) / (size_t)b);
}

struct GridDim {
  enum : CUDA_LONG {
    maxThreadsPerBlock = 256,  // max threads per block
    maxElementsPerThread = 4,  // max elements processed per thread
  };
};

template <typename InT, typename OutT, typename FuncT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void _UnaryElementWise(
    const InT* input_data,
    OutT* output_data,
    const FuncT functor,
    CUDA_LONG N) {
  CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
  InT value[NumElementsPerThread];

  CUDA_LONG id = start;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      value[i] = input_data[id];
      id += NumThreadsPerBlock;
    }
  }

  id = start;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      output_data[id] = functor(value[i]);
      id += NumThreadsPerBlock;
    }
  }
}

template <typename InT, typename OutT, typename FuncT>
void UnaryElementWiseImpl(
    cudaStream_t stream,
    const InT* input_data,
    OutT* output_data,
    const FuncT& func,
    size_t count) {
  if (count == 0)  // special case where there's a dim value of 0 in the shape
    return;

  int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
  CUDA_LONG N = static_cast<CUDA_LONG>(count);
  _UnaryElementWise<InT, OutT, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
          input_data,
          output_data,
          func,
          N);
}

}  // namespace cuda
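
To make the launch layout concrete, here is a small hypothetical example driving UnaryElementWiseImpl; OP_Neg and NegateOnStream are illustrative names that do not appear in this PR.

// Negate a float buffer using UnaryElementWiseImpl. Each block launches
// GridDim::maxThreadsPerBlock (256) threads, and each thread handles up to
// GridDim::maxElementsPerThread (4) elements.
#include <cuda_runtime.h>
#include "cu_inc/unary_elementwise_impl.cuh"

struct OP_Neg {
  __device__ __inline__ float operator()(const float& x) const { return -x; }
};

void NegateOnStream(cudaStream_t stream, const float* d_in, float* d_out, size_t count) {
  cuda::UnaryElementWiseImpl(stream, d_in, d_out, OP_Neg(), count);  // async launch on `stream`
}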
90 changes: 90 additions & 0 deletions plugin_execution_providers/tensorrt/cuda/unary_elementwise_ops_impl.cu
@@ -0,0 +1,90 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <cuda_runtime.h>
#include <type_traits>

#include "cu_inc/unary_elementwise_impl.cuh"

#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
#include "cuda_fp8.h"
#endif
#include <cuda_fp16.h>

namespace cuda {

// The postfix of the op name indicates the types supported by the op:
// B: uint8_t
// W: uint16_t
// U: uint32_t
// Z: uint64_t
// C: int8_t
// S: int16_t
// I: int32_t
// L: int64_t
// H: float16
// F: float
// D: double
// O: bool
// X: BFloat16

// When casting, half needs to be converted via the float type from most other types.
template <typename T>
struct ViaTypeMap {
  typedef T ViaT;
};

template <>
struct ViaTypeMap<half> {
  typedef float ViaT;
};

template <typename InT, typename OutT>
struct OP_Cast {
  __device__ __inline__ OutT operator()(const InT& a) const {
    const bool any_float16 = std::is_same<half, InT>::value || std::is_same<half, OutT>::value;
    typedef typename std::conditional<any_float16, half, OutT>::type T;
    typedef typename ViaTypeMap<T>::ViaT ViaT;
    return (OutT)((ViaT)a);
  }
};

#define IMPL_CAST_IMPL(InT, OutT)                                                                         \
  void Explicit_Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) { \
    UnaryElementWiseImpl(stream, input_data, output_data, OP_Cast<InT, OutT>(), count);                   \
  }

#define IMPL_CAST_IMPL_THROW(InT, OutT)                                                              \
  void Explicit_Impl_Cast(cudaStream_t /*stream*/, const InT* /*input_data*/, OutT* /*output_data*/, \
                          size_t /*count*/) {                                                        \
    ORT_THROW("Cast from " #InT " to " #OutT " must define saturate.");                              \
  }

#define IMPL_CAST_IMPL_FROM(T)  \
  IMPL_CAST_IMPL(T, half)       \
  IMPL_CAST_IMPL(T, float)      \
  IMPL_CAST_IMPL(T, double)     \
  IMPL_CAST_IMPL(T, int8_t)     \
  IMPL_CAST_IMPL(T, int16_t)    \
  IMPL_CAST_IMPL(T, int32_t)    \
  IMPL_CAST_IMPL(T, int64_t)    \
  IMPL_CAST_IMPL(T, uint8_t)    \
  IMPL_CAST_IMPL(T, uint16_t)   \
  IMPL_CAST_IMPL(T, uint32_t)   \
  IMPL_CAST_IMPL(T, uint64_t)   \
  IMPL_CAST_IMPL(T, bool)
// IMPL_CAST_IMPL(T, BFloat16)

IMPL_CAST_IMPL_FROM(half)
IMPL_CAST_IMPL_FROM(float)
IMPL_CAST_IMPL_FROM(double)
IMPL_CAST_IMPL_FROM(int8_t)
IMPL_CAST_IMPL_FROM(int16_t)
IMPL_CAST_IMPL_FROM(int32_t)
IMPL_CAST_IMPL_FROM(int64_t)
IMPL_CAST_IMPL_FROM(uint8_t)
IMPL_CAST_IMPL_FROM(uint16_t)
IMPL_CAST_IMPL_FROM(uint32_t)
IMPL_CAST_IMPL_FROM(uint64_t)
IMPL_CAST_IMPL_FROM(bool)
// IMPL_CAST_IMPL_FROM(BFloat16)

}  // namespace cuda
51 changes: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

namespace cuda {

// Cast

#define DECL_IMPL_CAST(InT, OutT) \
  void Explicit_Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count);

#define DECL_IMPL_CAST_FROM(T)  \
  DECL_IMPL_CAST(T, half)       \
  DECL_IMPL_CAST(T, float)      \
  DECL_IMPL_CAST(T, double)     \
  DECL_IMPL_CAST(T, int8_t)     \
  DECL_IMPL_CAST(T, int16_t)    \
  DECL_IMPL_CAST(T, int32_t)    \
  DECL_IMPL_CAST(T, int64_t)    \
  DECL_IMPL_CAST(T, uint8_t)    \
  DECL_IMPL_CAST(T, uint16_t)   \
  DECL_IMPL_CAST(T, uint32_t)   \
  DECL_IMPL_CAST(T, uint64_t)   \
  DECL_IMPL_CAST(T, bool)
// DECL_IMPL_CAST(T, BFloat16)

DECL_IMPL_CAST_FROM(half)
DECL_IMPL_CAST_FROM(float)
DECL_IMPL_CAST_FROM(double)
DECL_IMPL_CAST_FROM(int8_t)
DECL_IMPL_CAST_FROM(int16_t)
DECL_IMPL_CAST_FROM(int32_t)
DECL_IMPL_CAST_FROM(int64_t)
DECL_IMPL_CAST_FROM(uint8_t)
DECL_IMPL_CAST_FROM(uint16_t)
DECL_IMPL_CAST_FROM(uint32_t)
DECL_IMPL_CAST_FROM(uint64_t)
DECL_IMPL_CAST_FROM(bool)
// DECL_IMPL_CAST_FROM(BFloat16)

template <typename InT, typename OutT>
void Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) {
  Explicit_Impl_Cast(stream, input_data, output_data, count);
}

} // namespace cuda
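
As a usage sketch (the wrapper name and include path below are assumptions, not from this PR), casting a device buffer from half to float reduces to a single call:

// Illustrative only: cast `count` half values in d_in to float values in d_out.
// Both pointers are assumed to be device memory allocated with cudaMalloc.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include "unary_elementwise_ops_impl.h"  // the header above; exact path assumed

void CastHalfToFloat(cudaStream_t stream, const half* d_in, float* d_out, size_t count) {
  cuda::Impl_Cast<half, float>(stream, d_in, d_out, count);
  // The underlying kernel launch is asynchronous with respect to the host;
  // synchronize the stream before reading d_out on the CPU.
}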