OpenDAS / mmdeploy · Commit 546b4279
authored Jun 25, 2025 by limm

add csrc and mmdeploy module

parent 502f4fb9 · Pipeline #2810 canceled with stages · Changes: 447 · Pipelines: 1

Showing 20 changed files with 2841 additions and 0 deletions (+2841, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h (+15, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h (+86, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp (+77, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp (+155, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp (+97, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu (+267, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu (+495, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp (+125, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu (+164, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu (+107, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu (+76, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu (+141, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu (+81, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu (+95, -0)
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp (+263, -0)
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp (+81, -0)
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu (+172, -0)
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh (+174, -0)
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp (+20, -0)
csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp (+150, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include "kernel.h"

template <typename KeyT, typename ValueT>
size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) {
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedRadixSort::SortPairsDescending(
      (void *)NULL, temp_storage_bytes,
      (const KeyT *)NULL, (KeyT *)NULL,
      (const ValueT *)NULL, (ValueT *)NULL,
      num_items,     // # items
      num_segments,  // # segments
      (const int *)NULL, (const int *)NULL);
  return temp_storage_bytes;
}
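Note (illustrative, not part of the commit): cubSortPairsWorkspaceSize follows CUB's two-phase convention, where a call with a null temp-storage pointer only reports the bytes required. A minimal host-side sketch of the second phase, assuming float keys, int values, caller-allocated device arrays d_keys_in/d_keys_out/d_vals_in/d_vals_out, and num_segments + 1 segment offsets in d_offsets (all names hypothetical):

// Sketch: query the workspace size, allocate it, then run the real segmented sort.
size_t ws_bytes = cubSortPairsWorkspaceSize<float, int>(num_items, num_segments);
void *d_ws = nullptr;
cudaMalloc(&d_ws, ws_bytes);  // temp storage for CUB
cub::DeviceSegmentedRadixSort::SortPairsDescending(
    d_ws, ws_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, num_items, num_segments,
    d_offsets, d_offsets + 1, 0, sizeof(float) * 8, stream);
cudaFree(d_ws);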
csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#ifndef TRT_KERNEL_H
#define TRT_KERNEL_H

#include <cuda_runtime.h>

#include <cassert>
#include <cstdio>

#include "cublas_v2.h"
#include "trt_plugin_helper.hpp"

using namespace nvinfer1;

#define DEBUG_ENABLE 0

template <typename T>
struct Bbox {
  T xmin, ymin, xmax, ymax;
  Bbox(T xmin, T ymin, T xmax, T ymax) : xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {}
  Bbox() = default;
};

size_t get_cuda_arch(int devID);

int8_t *alignPtr(int8_t *ptr, uintptr_t to);

int8_t *nextWorkspacePtr(int8_t *ptr, uintptr_t previousWorkspaceSize);

void setUniformOffsets(cudaStream_t stream, int num_segments, int offset, int *d_offsets);

pluginStatus_t allClassNMS(cudaStream_t stream, int num, int num_classes, int num_preds_per_class,
                           int top_k, float nms_threshold, bool share_location, bool isNormalized,
                           DataType DT_SCORE, DataType DT_BBOX, void *bbox_data,
                           void *beforeNMS_scores, void *beforeNMS_index_array,
                           void *afterNMS_scores, void *afterNMS_index_array, bool flipXY = false);

pluginStatus_t allClassRotatedNMS(cudaStream_t stream, int num, int num_classes,
                                  int num_preds_per_class, int top_k, float nms_threshold,
                                  bool share_location, bool isNormalized, DataType DT_SCORE,
                                  DataType DT_BBOX, void *bbox_data, void *beforeNMS_scores,
                                  void *beforeNMS_index_array, void *afterNMS_scores,
                                  void *afterNMS_index_array, bool flipXY = false);

size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX);

size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX);

size_t sortScoresPerClassWorkspaceSize(int num, int num_classes, int num_preds_per_class,
                                       DataType DT_CONF);

size_t sortScoresPerImageWorkspaceSize(int num_images, int num_items_per_image, DataType DT_SCORE);

pluginStatus_t sortScoresPerImage(cudaStream_t stream, int num_images, int num_items_per_image,
                                  DataType DT_SCORE, void *unsorted_scores,
                                  void *unsorted_bbox_indices, void *sorted_scores,
                                  void *sorted_bbox_indices, void *workspace);

pluginStatus_t sortScoresPerClass(cudaStream_t stream, int num, int num_classes,
                                  int num_preds_per_class, int background_label_id,
                                  float confidence_threshold, DataType DT_SCORE,
                                  void *conf_scores_gpu, void *index_array_gpu, void *workspace);

size_t calculateTotalWorkspaceSize(size_t *workspaces, int count);

pluginStatus_t permuteData(cudaStream_t stream, int nthreads, int num_classes, int num_data,
                           int num_dim, DataType DT_DATA, bool confSigmoid, const void *data,
                           void *new_data);

size_t detectionForwardPreNMSSize(int N, int C2);

size_t detectionForwardPostNMSSize(int N, int numClasses, int topK);

pluginStatus_t gatherNMSOutputs(cudaStream_t stream, bool shareLocation, int numImages,
                                int numPredsPerClass, int numClasses, int topK, int keepTopK,
                                DataType DT_BBOX, DataType DT_SCORE, const void *indices,
                                const void *scores, const void *bboxData, void *nmsedDets,
                                void *nmsedLabels, void *nmsedIndex = nullptr,
                                bool clipBoxes = true, bool rotated = false);

size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses,
                                       int numPredsPerClass, int topK, DataType DT_BBOX,
                                       DataType DT_SCORE);

#endif
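Note (illustrative, not part of the commit): the sizing helpers declared above are meant to be summed once, up front, so that enqueue-time code can carve a single linear workspace instead of allocating per call. A hedged sketch of how a batched-NMS plugin might report its scratch needs (the function and parameter names are placeholders):

// Sketch: one call returns the total aligned scratch space for the whole NMS pipeline.
size_t workspaceBytesSketch(int N, int boxesSize, int scoresSize, int numClasses,
                            int numPredsPerClass, int topK) {
  return detectionInferenceWorkspaceSize(/*shareLocation=*/true, N, boxesSize, scoresSize,
                                         numClasses, numPredsPerClass, topK,
                                         nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kFLOAT);
}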
csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp (new file, mode 100644):

// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_PLUGIN_BASE_HPP
#define TRT_PLUGIN_BASE_HPP
#include "NvInferRuntime.h"
#include "NvInferVersion.h"
#include "trt_plugin_helper.hpp"

namespace mmdeploy {

#if NV_TENSORRT_MAJOR > 7
#define TRT_NOEXCEPT noexcept
#else
#define TRT_NOEXCEPT
#endif

class TRTPluginBase : public nvinfer1::IPluginV2DynamicExt {
 public:
  TRTPluginBase(const std::string &name) : mLayerName(name) {}
  // IPluginV2 Methods
  const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
  int initialize() TRT_NOEXCEPT override { return STATUS_SUCCESS; }
  void terminate() TRT_NOEXCEPT override {}
  void destroy() TRT_NOEXCEPT override { delete this; }
  void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override {
    mNamespace = pluginNamespace;
  }
  const char *getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

  virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
                               const nvinfer1::DynamicPluginTensorDesc *out,
                               int nbOutputs) TRT_NOEXCEPT override {}

  virtual size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                                  const nvinfer1::PluginTensorDesc *outputs,
                                  int nbOutputs) const TRT_NOEXCEPT override {
    return 0;
  }

  virtual void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext,
                               nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override {}

  virtual void detachFromContext() TRT_NOEXCEPT override {}

 protected:
  const std::string mLayerName;
  std::string mNamespace;

#if NV_TENSORRT_MAJOR < 8
 protected:
  // To prevent compiler warnings.
  using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
  using nvinfer1::IPluginV2DynamicExt::enqueue;
  using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
  using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
  using nvinfer1::IPluginV2DynamicExt::supportsFormat;
#endif
};

class TRTPluginCreatorBase : public nvinfer1::IPluginCreator {
 public:
  const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; };

  const nvinfer1::PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return &mFC; }

  void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override {
    mNamespace = pluginNamespace;
  }

  const char *getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

 protected:
  nvinfer1::PluginFieldCollection mFC;
  std::vector<nvinfer1::PluginField> mPluginAttributes;
  std::string mNamespace;
};
}  // namespace mmdeploy
#endif
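Note (illustrative, not part of the commit): TRTPluginBase centralizes the IPluginV2DynamicExt boilerplate (namespace handling, trivial lifecycle methods), so a concrete op only overrides what actually differs. A hypothetical skeleton; "MyOp" is made up, and the remaining pure-virtual methods would still need real bodies:

// Sketch: a custom op inheriting the plumbing from TRTPluginBase.
class MyOpPlugin : public mmdeploy::TRTPluginBase {
 public:
  explicit MyOpPlugin(const std::string &name) : TRTPluginBase(name) {}
  const char *getPluginType() const TRT_NOEXCEPT override { return "MyOp"; }
  int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
  // clone, getOutputDimensions, supportsFormatCombination, enqueue,
  // getOutputDataType, getSerializationSize and serialize omitted here.
};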
csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp (new file, mode 100644):

// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_PLUGIN_HELPER_HPP
#define TRT_PLUGIN_HELPER_HPP
#include <cudnn.h>

#include <iostream>
#include <stdexcept>

#include "NvInferRuntime.h"

cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t *cudnn_dtype);

// Enumerator for status
typedef enum {
  STATUS_SUCCESS = 0,
  STATUS_FAILURE = 1,
  STATUS_BAD_PARAM = 2,
  STATUS_NOT_SUPPORTED = 3,
  STATUS_NOT_INITIALIZED = 4
} pluginStatus_t;
#define ASSERT(assertion) \
{ \
if (!(assertion)) { \
std::cerr << "#assertion" << __FILE__ << "," << __LINE__ << std::endl; \
abort(); \
} \
}
#define CUASSERT(status_) \
{ \
auto s_ = status_; \
if (s_ != cudaSuccess) { \
std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << ", " << cudaGetErrorString(s_) \
<< std::endl; \
} \
}
#define CUBLASASSERT(status_) \
{ \
auto s_ = status_; \
if (s_ != CUBLAS_STATUS_SUCCESS) { \
std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \
} \
}
#define CUERRORMSG(status_) \
{ \
auto s_ = status_; \
if (s_ != 0) std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \
}
#ifndef DEBUG
#define CHECK(status) \
do { \
if (status != 0) abort(); \
} while (0)
#define ASSERT_PARAM(exp) \
do { \
if (!(exp)) return STATUS_BAD_PARAM; \
} while (0)
#define ASSERT_FAILURE(exp) \
do { \
if (!(exp)) return STATUS_FAILURE; \
} while (0)
#define CSC(call, err) \
do { \
cudaError_t cudaStatus = call; \
if (cudaStatus != cudaSuccess) { \
return err; \
} \
} while (0)
#define DEBUG_PRINTF(...) \
do { \
} while (0)
#else
#define ASSERT_PARAM(exp) \
do { \
if (!(exp)) { \
fprintf(stderr, "Bad param - " #exp ", %s:%d\n", __FILE__, __LINE__); \
return STATUS_BAD_PARAM; \
} \
} while (0)
#define ASSERT_FAILURE(exp) \
do { \
if (!(exp)) { \
fprintf(stderr, "Failure - " #exp ", %s:%d\n", __FILE__, __LINE__); \
return STATUS_FAILURE; \
} \
} while (0)
#define CSC(call, err) \
do { \
cudaError_t cudaStatus = call; \
if (cudaStatus != cudaSuccess) { \
printf("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaStatus)); \
return err; \
} \
} while (0)
#define CHECK(status) \
{ \
if (status != 0) { \
DEBUG_PRINTF("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \
abort(); \
} \
}
#define DEBUG_PRINTF(...) \
do { \
printf(__VA_ARGS__); \
} while (0)
#endif
namespace mmdeploy {

const int MAXTENSORDIMS = 10;

struct TensorDesc {
  int shape[MAXTENSORDIMS];
  int stride[MAXTENSORDIMS];
  int dim;
};

inline unsigned int getElementSize(nvinfer1::DataType t) {
  switch (t) {
    case nvinfer1::DataType::kINT32:
      return 4;
    case nvinfer1::DataType::kFLOAT:
      return 4;
    case nvinfer1::DataType::kHALF:
      return 2;
    // case nvinfer1::DataType::kBOOL:
    case nvinfer1::DataType::kINT8:
      return 1;
    default:
      throw std::runtime_error("Invalid DataType.");
  }
  throw std::runtime_error("Invalid DataType.");
  return 0;
}

inline size_t getAlignedSize(size_t origin_size, size_t aligned_number = 16) {
  return size_t((origin_size + aligned_number - 1) / aligned_number) * aligned_number;
}

}  // namespace mmdeploy
#endif  // TRT_PLUGIN_HELPER_HPP
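Note (illustrative, not part of the commit): getAlignedSize rounds a byte count up to the next multiple of aligned_number, for example:

// getAlignedSize(100)      == 112  (next multiple of the default 16)
// getAlignedSize(128)      == 128  (already aligned)
// getAlignedSize(100, 256) == 256  (explicit alignment)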
csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/common/serialize.hpp
#ifndef TRT_SERIALIZE_HPP
#define TRT_SERIALIZE_HPP
#include <cassert>
#include <cstring>
#include <type_traits>
#include <vector>

template <typename T>
inline void serialize_value(void **buffer, T const &value);

template <typename T>
inline void deserialize_value(void const **buffer, size_t *buffer_size, T *value);

namespace {

template <typename T, class Enable = void>
struct Serializer {};

template <typename T>
struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
                                             std::is_enum<T>::value ||
                                             std::is_pod<T>::value>::type> {
  static size_t serialized_size(T const &value) { return sizeof(T); }
  static void serialize(void **buffer, T const &value) {
    ::memcpy(*buffer, &value, sizeof(T));
    reinterpret_cast<char *&>(*buffer) += sizeof(T);
  }
  static void deserialize(void const **buffer, size_t *buffer_size, T *value) {
    assert(*buffer_size >= sizeof(T));
    ::memcpy(value, *buffer, sizeof(T));
    reinterpret_cast<char const *&>(*buffer) += sizeof(T);
    *buffer_size -= sizeof(T);
  }
};

template <>
struct Serializer<const char *> {
  static size_t serialized_size(const char *value) { return strlen(value) + 1; }
  static void serialize(void **buffer, const char *value) {
    ::strcpy(static_cast<char *>(*buffer), value);
    reinterpret_cast<char *&>(*buffer) += strlen(value) + 1;
  }
  static void deserialize(void const **buffer, size_t *buffer_size, const char **value) {
    *value = static_cast<char const *>(*buffer);
    size_t data_size = strnlen(*value, *buffer_size) + 1;
    assert(*buffer_size >= data_size);
    reinterpret_cast<char const *&>(*buffer) += data_size;
    *buffer_size -= data_size;
  }
};

template <typename T>
struct Serializer<std::vector<T>,
                  typename std::enable_if<std::is_arithmetic<T>::value ||
                                          std::is_enum<T>::value ||
                                          std::is_pod<T>::value>::type> {
  static size_t serialized_size(std::vector<T> const &value) {
    return sizeof(value.size()) + value.size() * sizeof(T);
  }
  static void serialize(void **buffer, std::vector<T> const &value) {
    serialize_value(buffer, value.size());
    size_t nbyte = value.size() * sizeof(T);
    ::memcpy(*buffer, value.data(), nbyte);
    reinterpret_cast<char *&>(*buffer) += nbyte;
  }
  static void deserialize(void const **buffer, size_t *buffer_size, std::vector<T> *value) {
    size_t size;
    deserialize_value(buffer, buffer_size, &size);
    value->resize(size);
    size_t nbyte = value->size() * sizeof(T);
    assert(*buffer_size >= nbyte);
    ::memcpy(value->data(), *buffer, nbyte);
    reinterpret_cast<char const *&>(*buffer) += nbyte;
    *buffer_size -= nbyte;
  }
};

}  // namespace

template <typename T>
inline size_t serialized_size(T const &value) {
  return Serializer<T>::serialized_size(value);
}

template <typename T>
inline void serialize_value(void **buffer, T const &value) {
  return Serializer<T>::serialize(buffer, value);
}

template <typename T>
inline void deserialize_value(void const **buffer, size_t *buffer_size, T *value) {
  return Serializer<T>::deserialize(buffer, buffer_size, value);
}
#endif  // TRT_SERIALIZE_HPP
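Note (illustrative, not part of the commit): these helpers are the usual way TensorRT plugins pack attributes into the engine blob and read them back in the deserialization constructor. A small host-only round-trip sketch with made-up values:

// Sketch: serialize an int and a std::vector<float>, then read them back.
int top_k = 1000;
std::vector<float> anchors = {0.5f, 1.0f, 2.0f};

size_t total = serialized_size(top_k) + serialized_size(anchors);
std::vector<char> blob(total);

void *w = blob.data();
serialize_value(&w, top_k);    // advances w by sizeof(int)
serialize_value(&w, anchors);  // writes size() followed by the raw elements

const void *r = blob.data();
size_t remaining = blob.size();
int top_k2;
std::vector<float> anchors2;
deserialize_value(&r, &remaining, &top_k2);
deserialize_value(&r, &remaining, &anchors2);  // remaining drops to 0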
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>

#include "nms/kernel.h"

const static int BS = 512;

template <typename T_BBOX>
__device__ T_BBOX bboxSize(const Bbox<T_BBOX> &bbox, const bool normalized, T_BBOX offset) {
  if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin) {
    // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
    return 0;
  } else {
    T_BBOX width = bbox.xmax - bbox.xmin;
    T_BBOX height = bbox.ymax - bbox.ymin;
    if (normalized) {
      return width * height;
    } else {
      // If bbox is not within range [0, 1].
      return (width + offset) * (height + offset);
    }
  }
}

template <typename T_BBOX>
__device__ void intersectBbox(const Bbox<T_BBOX> &bbox1, const Bbox<T_BBOX> &bbox2,
                              Bbox<T_BBOX> *intersect_bbox) {
  if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin || bbox2.ymin > bbox1.ymax ||
      bbox2.ymax < bbox1.ymin) {
    // Return [0, 0, 0, 0] if there is no intersection.
    intersect_bbox->xmin = T_BBOX(0);
    intersect_bbox->ymin = T_BBOX(0);
    intersect_bbox->xmax = T_BBOX(0);
    intersect_bbox->ymax = T_BBOX(0);
  } else {
    intersect_bbox->xmin = max(bbox1.xmin, bbox2.xmin);
    intersect_bbox->ymin = max(bbox1.ymin, bbox2.ymin);
    intersect_bbox->xmax = min(bbox1.xmax, bbox2.xmax);
    intersect_bbox->ymax = min(bbox1.ymax, bbox2.ymax);
  }
}

template <typename T_BBOX>
__device__ float jaccardOverlap(const Bbox<T_BBOX> &bbox1, const Bbox<T_BBOX> &bbox2,
                                const bool normalized, T_BBOX offset) {
  Bbox<T_BBOX> intersect_bbox;
  intersectBbox(bbox1, bbox2, &intersect_bbox);
  float intersect_width, intersect_height;
  if (normalized) {
    intersect_width = intersect_bbox.xmax - intersect_bbox.xmin;
    intersect_height = intersect_bbox.ymax - intersect_bbox.ymin;
  } else {
    intersect_width = intersect_bbox.xmax - intersect_bbox.xmin + offset;
    intersect_height = intersect_bbox.ymax - intersect_bbox.ymin + offset;
  }
  if (intersect_width > 0 && intersect_height > 0) {
    float intersect_size = intersect_width * intersect_height;
    float bbox1_size = bboxSize(bbox1, normalized, offset);
    float bbox2_size = bboxSize(bbox2, normalized, offset);
    return intersect_size / (bbox1_size + bbox2_size - intersect_size);
  } else {
    return 0.;
  }
}

/********** new NMS for only score and index array **********/
// clang-format off
template <typename T_SCORE, typename T_BBOX, int TSIZE>
__global__ void
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ == 620 || __CUDA_ARCH__ == 530
__launch_bounds__(512)
#endif
#endif
allClassNMS_kernel(const int num, const int num_classes, const int num_preds_per_class,
                   const int top_k, const float nms_threshold, const bool share_location,
                   const bool isNormalized,
                   T_BBOX *bbox_data,  // bbox_data should be float to preserve
                                       // location information
                   T_SCORE *beforeNMS_scores, int *beforeNMS_index_array,
                   T_SCORE *afterNMS_scores, int *afterNMS_index_array, bool flipXY = false) {
  // clang-format on
  //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE];
  __shared__ bool kept_bboxinfo_flag[TSIZE * BS];
  for (int i = 0; i < num; i++) {
    const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class;
    const int max_idx = offset + top_k;  // put top_k bboxes into NMS calculation
    const int bbox_idx_offset =
        share_location ? (i * num_preds_per_class) : (i * num_classes * num_preds_per_class);

    // local thread data
    int loc_bboxIndex[TSIZE];
    Bbox<T_BBOX> loc_bbox[TSIZE];

    // initialize Bbox, Bboxinfo, kept_bboxinfo_flag
    // Eliminate shared memory RAW hazard
    __syncthreads();
#pragma unroll
    for (int t = 0; t < TSIZE; t++) {
      const int cur_idx = threadIdx.x + blockDim.x * t;
      const int item_idx = offset + cur_idx;

      if (item_idx < max_idx) {
        loc_bboxIndex[t] = beforeNMS_index_array[item_idx];

        if (loc_bboxIndex[t] >= 0)
        // if (loc_bboxIndex[t] != -1)
        {
          const int bbox_data_idx =
              share_location ? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset)
                             : loc_bboxIndex[t];

          loc_bbox[t].xmin =
              flipXY ? bbox_data[bbox_data_idx * 4 + 1] : bbox_data[bbox_data_idx * 4 + 0];
          loc_bbox[t].ymin =
              flipXY ? bbox_data[bbox_data_idx * 4 + 0] : bbox_data[bbox_data_idx * 4 + 1];
          loc_bbox[t].xmax =
              flipXY ? bbox_data[bbox_data_idx * 4 + 3] : bbox_data[bbox_data_idx * 4 + 2];
          loc_bbox[t].ymax =
              flipXY ? bbox_data[bbox_data_idx * 4 + 2] : bbox_data[bbox_data_idx * 4 + 3];
          kept_bboxinfo_flag[cur_idx] = true;
        } else {
          kept_bboxinfo_flag[cur_idx] = false;
        }
      } else {
        kept_bboxinfo_flag[cur_idx] = false;
      }
    }

    // filter out overlapped boxes with lower scores
    int ref_item_idx = offset;
    int ref_bbox_idx =
        share_location
            ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
            : beforeNMS_index_array[ref_item_idx];

    while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) {
      Bbox<T_BBOX> ref_bbox;
      ref_bbox.xmin = flipXY ? bbox_data[ref_bbox_idx * 4 + 1] : bbox_data[ref_bbox_idx * 4 + 0];
      ref_bbox.ymin = flipXY ? bbox_data[ref_bbox_idx * 4 + 0] : bbox_data[ref_bbox_idx * 4 + 1];
      ref_bbox.xmax = flipXY ? bbox_data[ref_bbox_idx * 4 + 3] : bbox_data[ref_bbox_idx * 4 + 2];
      ref_bbox.ymax = flipXY ? bbox_data[ref_bbox_idx * 4 + 2] : bbox_data[ref_bbox_idx * 4 + 3];

      // Eliminate shared memory RAW hazard
      __syncthreads();

      for (int t = 0; t < TSIZE; t++) {
        const int cur_idx = threadIdx.x + blockDim.x * t;
        const int item_idx = offset + cur_idx;

        if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) {
          // TODO: may need to add bool normalized as argument, HERE true means
          // normalized
          if (jaccardOverlap(ref_bbox, loc_bbox[t], isNormalized, T_BBOX(0)) > nms_threshold) {
            kept_bboxinfo_flag[cur_idx] = false;
          }
        }
      }
      __syncthreads();

      do {
        ref_item_idx++;
      } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]);

      ref_bbox_idx =
          share_location
              ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
              : beforeNMS_index_array[ref_item_idx];
    }

    // store data
    for (int t = 0; t < TSIZE; t++) {
      const int cur_idx = threadIdx.x + blockDim.x * t;
      const int read_item_idx = offset + cur_idx;
      const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx;
      /*
       * If not not keeping the bbox
       * Set the score to 0
       * Set the bounding box index to -1
       */
      if (read_item_idx < max_idx) {
        afterNMS_scores[write_item_idx] =
            kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f;
        afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1;
      }
    }
  }
}

template <typename T_SCORE, typename T_BBOX>
pluginStatus_t allClassNMS_gpu(cudaStream_t stream, const int num, const int num_classes,
                               const int num_preds_per_class, const int top_k,
                               const float nms_threshold, const bool share_location,
                               const bool isNormalized, void *bbox_data, void *beforeNMS_scores,
                               void *beforeNMS_index_array, void *afterNMS_scores,
                               void *afterNMS_index_array, bool flipXY = false) {
#define P(tsize) allClassNMS_kernel<T_SCORE, T_BBOX, (tsize)>

  void (*kernel[10])(const int, const int, const int, const int, const float, const bool,
                     const bool, float *, T_SCORE *, int *, T_SCORE *, int *, bool) = {
      P(1), P(2), P(3), P(4), P(5), P(6), P(7), P(8), P(9), P(10),
  };

  const int GS = num_classes;
  const int t_size = (top_k + BS - 1) / BS;

  ASSERT(t_size <= 10);
  kernel[t_size - 1]<<<GS, BS, 0, stream>>>(
      num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized,
      (T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array,
      (T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array, flipXY);

  cudaError_t code = cudaGetLastError();
  CUASSERT(code);
  CSC(code, STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// allClassNMS LAUNCH CONFIG
typedef pluginStatus_t (*nmsFunc)(cudaStream_t, const int, const int, const int, const int,
                                  const float, const bool, const bool, void *, void *, void *,
                                  void *, void *, bool);

struct nmsLaunchConfigSSD {
  DataType t_score;
  DataType t_bbox;
  nmsFunc function;

  nmsLaunchConfigSSD(DataType t_score, DataType t_bbox) : t_score(t_score), t_bbox(t_bbox) {}
  nmsLaunchConfigSSD(DataType t_score, DataType t_bbox, nmsFunc function)
      : t_score(t_score), t_bbox(t_bbox), function(function) {}
  bool operator==(const nmsLaunchConfigSSD &other) {
    return t_score == other.t_score && t_bbox == other.t_bbox;
  }
};

static std::vector<nmsLaunchConfigSSD> nmsFuncVec;

bool nmsInit() {
  nmsFuncVec.push_back(
      nmsLaunchConfigSSD(DataType::kFLOAT, DataType::kFLOAT, allClassNMS_gpu<float, float>));
  return true;
}

static bool initialized = nmsInit();

pluginStatus_t allClassNMS(cudaStream_t stream, const int num, const int num_classes,
                           const int num_preds_per_class, const int top_k,
                           const float nms_threshold, const bool share_location,
                           const bool isNormalized, const DataType DT_SCORE,
                           const DataType DT_BBOX, void *bbox_data, void *beforeNMS_scores,
                           void *beforeNMS_index_array, void *afterNMS_scores,
                           void *afterNMS_index_array, bool flipXY) {
  nmsLaunchConfigSSD lc(DT_SCORE, DT_BBOX);
  for (unsigned i = 0; i < nmsFuncVec.size(); ++i) {
    if (lc == nmsFuncVec[i]) {
      DEBUG_PRINTF("all class nms kernel %d\n", i);
      return nmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k,
                                    nms_threshold, share_location, isNormalized, bbox_data,
                                    beforeNMS_scores, beforeNMS_index_array, afterNMS_scores,
                                    afterNMS_index_array, flipXY);
    }
  }
  return STATUS_BAD_PARAM;
}
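Note (illustrative, not part of the commit): as a quick sanity check of jaccardOverlap, take normalized boxes (0, 0, 1, 1) and (0.5, 0, 1.5, 1). The intersection is 0.5 × 1 = 0.5 and each box has area 1, so IoU = 0.5 / (1 + 1 − 0.5) = 1/3 ≈ 0.33; with nms_threshold = 0.5 the lower-scored box survives, while nms_threshold = 0.3 would suppress it.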
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu (new file, mode 100644):

// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
#include <cmath>
#include <vector>

#include "nms/kernel.h"

template <typename T>
struct RotatedBox {
  T x_ctr, y_ctr, w, h, a;
};

template <typename T>
struct Point {
  T x, y;
  __host__ __device__ __forceinline__ Point(const T &px = 0, const T &py = 0) : x(px), y(py) {}
  __host__ __device__ __forceinline__ Point operator+(const Point &p) const {
    return Point(x + p.x, y + p.y);
  }
  __host__ __device__ __forceinline__ Point &operator+=(const Point &p) {
    x += p.x;
    y += p.y;
    return *this;
  }
  __host__ __device__ __forceinline__ Point operator-(const Point &p) const {
    return Point(x - p.x, y - p.y);
  }
  __host__ __device__ __forceinline__ Point operator*(const T coeff) const {
    return Point(x * coeff, y * coeff);
  }
};

template <typename T>
__host__ __device__ __forceinline__ T dot_2d(const Point<T> &A, const Point<T> &B) {
  return A.x * B.x + A.y * B.y;
}

template <typename T>
__host__ __device__ __forceinline__ T cross_2d(const Point<T> &A, const Point<T> &B) {
  return A.x * B.y - B.x * A.y;
}

template <typename T>
__host__ __device__ __forceinline__ void get_rotated_vertices(const RotatedBox<T> &box,
                                                              Point<T> (&pts)[4]) {
  // M_PI / 180. == 0.01745329251
  // double theta = box.a * 0.01745329251;
  // MODIFIED
  double theta = box.a;
  T cosTheta2 = (T)cos(theta) * 0.5f;
  T sinTheta2 = (T)sin(theta) * 0.5f;

  // y: top --> down; x: left --> right
  pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
  pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
  pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
  pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
  pts[2].x = 2 * box.x_ctr - pts[0].x;
  pts[2].y = 2 * box.y_ctr - pts[0].y;
  pts[3].x = 2 * box.x_ctr - pts[1].x;
  pts[3].y = 2 * box.y_ctr - pts[1].y;
}

template <typename T>
__host__ __device__ __forceinline__ int get_intersection_points(const Point<T> (&pts1)[4],
                                                                const Point<T> (&pts2)[4],
                                                                Point<T> (&intersections)[24]) {
  // Line vector
  // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
  Point<T> vec1[4], vec2[4];
  for (int i = 0; i < 4; i++) {
    vec1[i] = pts1[(i + 1) % 4] - pts1[i];
    vec2[i] = pts2[(i + 1) % 4] - pts2[i];
  }

  // Line test - test all line combos for intersection
  int num = 0;  // number of intersections
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // Solve for 2x2 Ax=b
      T det = cross_2d<T>(vec2[j], vec1[i]);

      // This takes care of parallel lines
      if (fabs(det) <= 1e-14) {
        continue;
      }

      auto vec12 = pts2[j] - pts1[i];

      T t1 = cross_2d<T>(vec2[j], vec12) / det;
      T t2 = cross_2d<T>(vec1[i], vec12) / det;

      if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
        intersections[num++] = pts1[i] + vec1[i] * t1;
      }
    }
  }

  // Check for vertices of rect1 inside rect2
  {
    const auto &AB = vec2[0];
    const auto &DA = vec2[3];
    auto ABdotAB = dot_2d<T>(AB, AB);
    auto ADdotAD = dot_2d<T>(DA, DA);
    for (int i = 0; i < 4; i++) {
      // assume ABCD is the rectangle, and P is the point to be judged
      // P is inside ABCD iff. P's projection on AB lies within AB
      // and P's projection on AD lies within AD
      auto AP = pts1[i] - pts2[0];
      auto APdotAB = dot_2d<T>(AP, AB);
      auto APdotAD = -dot_2d<T>(AP, DA);
      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
        intersections[num++] = pts1[i];
      }
    }
  }

  // Reverse the check - check for vertices of rect2 inside rect1
  {
    const auto &AB = vec1[0];
    const auto &DA = vec1[3];
    auto ABdotAB = dot_2d<T>(AB, AB);
    auto ADdotAD = dot_2d<T>(DA, DA);
    for (int i = 0; i < 4; i++) {
      auto AP = pts2[i] - pts1[0];
      auto APdotAB = dot_2d<T>(AP, AB);
      auto APdotAD = -dot_2d<T>(AP, DA);
      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
        intersections[num++] = pts2[i];
      }
    }
  }

  return num;
}

template <typename T>
__host__ __device__ __forceinline__ int convex_hull_graham(const Point<T> (&p)[24],
                                                           const int &num_in, Point<T> (&q)[24],
                                                           bool shift_to_zero = false) {
  assert(num_in >= 2);

  // Step 1:
  // Find point with minimum y
  // if more than 1 points have the same minimum y,
  // pick the one with the minimum x.
  int t = 0;
  for (int i = 1; i < num_in; i++) {
    if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
      t = i;
    }
  }
  auto &start = p[t];  // starting point

  // Step 2:
  // Subtract starting point from every points (for sorting in the next step)
  for (int i = 0; i < num_in; i++) {
    q[i] = p[i] - start;
  }

  // Swap the starting point to position 0
  auto tmp = q[0];
  q[0] = q[t];
  q[t] = tmp;

  // Step 3:
  // Sort point 1 ~ num_in according to their relative cross-product values
  // (essentially sorting according to angles)
  // If the angles are the same, sort according to their distance to origin
  T dist[24];
  for (int i = 0; i < num_in; i++) {
    dist[i] = dot_2d<T>(q[i], q[i]);
  }

  for (int i = 1; i < num_in - 1; i++) {
    for (int j = i + 1; j < num_in; j++) {
      T crossProduct = cross_2d<T>(q[i], q[j]);
      if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
        auto q_tmp = q[i];
        q[i] = q[j];
        q[j] = q_tmp;
        auto dist_tmp = dist[i];
        dist[i] = dist[j];
        dist[j] = dist_tmp;
      }
    }
  }

  // Step 4:
  // Make sure there are at least 2 points (that don't overlap with each other)
  // in the stack
  int k;  // index of the non-overlapped second point
  for (k = 1; k < num_in; k++) {
    if (dist[k] > 1e-8) {
      break;
    }
  }
  if (k == num_in) {
    // We reach the end, which means the convex hull is just one point
    q[0] = p[t];
    return 1;
  }
  q[1] = q[k];
  int m = 2;  // 2 points in the stack

  // Step 5:
  // Finally we can start the scanning process.
  // When a non-convex relationship between the 3 points is found
  // (either concave shape or duplicated points),
  // we pop the previous point from the stack
  // until the 3-point relationship is convex again, or
  // until the stack only contains two points
  for (int i = k + 1; i < num_in; i++) {
    while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
      m--;
    }
    q[m++] = q[i];
  }

  // Step 6 (Optional):
  // In general sense we need the original coordinates, so we
  // need to shift the points back (reverting Step 2)
  // But if we're only interested in getting the area/perimeter of the shape
  // We can simply return.
  if (!shift_to_zero) {
    for (int i = 0; i < m; i++) {
      q[i] += start;
    }
  }

  return m;
}

template <typename T>
__host__ __device__ __forceinline__ T polygon_area(const Point<T> (&q)[24], const int &m) {
  if (m <= 2) {
    return 0;
  }

  T area = 0;
  for (int i = 1; i < m - 1; i++) {
    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
  }

  return area / 2.0;
}

template <typename T>
__host__ __device__ __forceinline__ T rotated_boxes_intersection(const RotatedBox<T> &box1,
                                                                 const RotatedBox<T> &box2) {
  // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
  // from rotated_rect_intersection_pts
  Point<T> intersectPts[24], orderedPts[24];

  Point<T> pts1[4];
  Point<T> pts2[4];
  get_rotated_vertices<T>(box1, pts1);
  get_rotated_vertices<T>(box2, pts2);

  int num = get_intersection_points<T>(pts1, pts2, intersectPts);

  if (num <= 2) {
    return 0.0;
  }

  // Convex Hull to order the intersection points in clockwise order and find
  // the contour area.
  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
  return polygon_area<T>(orderedPts, num_convex);
}

template <typename T>
__host__ __device__ __forceinline__ T single_box_iou_rotated(T const *const box1_raw,
                                                             T const *const box2_raw) {
  // shift center to the middle point to achieve higher precision in result
  RotatedBox<T> box1, box2;
  auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
  auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
  box1.x_ctr = box1_raw[0] - center_shift_x;
  box1.y_ctr = box1_raw[1] - center_shift_y;
  box1.w = box1_raw[2];
  box1.h = box1_raw[3];
  box1.a = box1_raw[4];
  box2.x_ctr = box2_raw[0] - center_shift_x;
  box2.y_ctr = box2_raw[1] - center_shift_y;
  box2.w = box2_raw[2];
  box2.h = box2_raw[3];
  box2.a = box2_raw[4];

  const T area1 = box1.w * box1.h;
  const T area2 = box2.w * box2.h;
  if (area1 < 1e-14 || area2 < 1e-14) {
    return 1.0f;
  }

  const T intersection = rotated_boxes_intersection<T>(box1, box2);
  T baseS = 1.0;
  baseS = (area1 + area2 - intersection);
  const T iou = intersection / baseS;
  return iou;
}

/********** new NMS for only score and index array **********/
template <typename T_SCORE, typename T_BBOX, int TSIZE>
__global__ void allClassRotatedNMS_kernel(const int num, const int num_classes,
                                          const int num_preds_per_class, const int top_k,
                                          const float nms_threshold, const bool share_location,
                                          const bool isNormalized,
                                          T_BBOX *bbox_data,  // bbox_data should be float to
                                                              // preserve location information
                                          T_SCORE *beforeNMS_scores, int *beforeNMS_index_array,
                                          T_SCORE *afterNMS_scores, int *afterNMS_index_array) {
  //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE];
  extern __shared__ bool kept_bboxinfo_flag[];
  for (int i = 0; i < num; i++) {
    const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class;
    const int max_idx = offset + top_k;  // put top_k bboxes into NMS calculation
    const int bbox_idx_offset =
        share_location ? (i * num_preds_per_class) : (i * num_classes * num_preds_per_class);

    // local thread data
    int loc_bboxIndex[TSIZE];
    T_BBOX loc_bbox[TSIZE * 5];

    // initialize Bbox, Bboxinfo, kept_bboxinfo_flag
    // Eliminate shared memory RAW hazard
    __syncthreads();
#pragma unroll
    for (int t = 0; t < TSIZE; t++) {
      const int cur_idx = threadIdx.x + blockDim.x * t;
      const int item_idx = offset + cur_idx;

      if (item_idx < max_idx) {
        loc_bboxIndex[t] = beforeNMS_index_array[item_idx];

        if (loc_bboxIndex[t] >= 0)
        // if (loc_bboxIndex[t] != -1)
        {
          const int bbox_data_idx =
              share_location ? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset)
                             : loc_bboxIndex[t];
          memcpy(&loc_bbox[t * 5], &bbox_data[bbox_data_idx * 5], 5 * sizeof(T_BBOX));
          kept_bboxinfo_flag[cur_idx] = true;
        } else {
          kept_bboxinfo_flag[cur_idx] = false;
        }
      } else {
        kept_bboxinfo_flag[cur_idx] = false;
      }
    }

    // filter out overlapped boxes with lower scores
    int ref_item_idx = offset;
    int ref_bbox_idx =
        share_location
            ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
            : beforeNMS_index_array[ref_item_idx];

    while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) {
      T_BBOX ref_bbox[5];
      memcpy(&ref_bbox[0], &bbox_data[ref_bbox_idx * 5], 5 * sizeof(T_BBOX));

      // Eliminate shared memory RAW hazard
      __syncthreads();

      for (int t = 0; t < TSIZE; t++) {
        const int cur_idx = threadIdx.x + blockDim.x * t;
        const int item_idx = offset + cur_idx;

        if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) {
          // TODO: may need to add bool normalized as argument, HERE true means
          // normalized
          if (single_box_iou_rotated(&ref_bbox[0], loc_bbox + t * 5) > nms_threshold) {
            kept_bboxinfo_flag[cur_idx] = false;
          }
        }
      }
      __syncthreads();

      do {
        ref_item_idx++;
      } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]);

      ref_bbox_idx =
          share_location
              ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
              : beforeNMS_index_array[ref_item_idx];
    }

    // store data
    for (int t = 0; t < TSIZE; t++) {
      const int cur_idx = threadIdx.x + blockDim.x * t;
      const int read_item_idx = offset + cur_idx;
      const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx;
      /*
       * If not not keeping the bbox
       * Set the score to 0
       * Set the bounding box index to -1
       */
      if (read_item_idx < max_idx) {
        afterNMS_scores[write_item_idx] =
            kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f;
        afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1;
      }
    }
  }
}

template <typename T_SCORE, typename T_BBOX>
pluginStatus_t allClassRotatedNMS_gpu(cudaStream_t stream, const int num, const int num_classes,
                                      const int num_preds_per_class, const int top_k,
                                      const float nms_threshold, const bool share_location,
                                      const bool isNormalized, void *bbox_data,
                                      void *beforeNMS_scores, void *beforeNMS_index_array,
                                      void *afterNMS_scores, void *afterNMS_index_array) {
#define P(tsize) allClassRotatedNMS_kernel<T_SCORE, T_BBOX, (tsize)>

  void (*kernel[10])(const int, const int, const int, const int, const float, const bool,
                     const bool, float *, T_SCORE *, int *, T_SCORE *, int *) = {
      P(1), P(2), P(3), P(4), P(5), P(6), P(7), P(8), P(9), P(10),
  };

  const int BS = 512;
  const int GS = num_classes;
  const int t_size = (top_k + BS - 1) / BS;

  ASSERT(t_size <= 10);
  kernel[t_size - 1]<<<GS, BS, BS * t_size * sizeof(bool), stream>>>(
      num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized,
      (T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array,
      (T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array);

  CSC(cudaGetLastError(), STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// allClassNMS LAUNCH CONFIG
typedef pluginStatus_t (*rotatedNmsFunc)(cudaStream_t, const int, const int, const int, const int,
                                         const float, const bool, const bool, void *, void *,
                                         void *, void *, void *);

struct rotatedNmsLaunchConfig {
  DataType t_score;
  DataType t_bbox;
  rotatedNmsFunc function;

  rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox) : t_score(t_score), t_bbox(t_bbox) {}
  rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox, rotatedNmsFunc function)
      : t_score(t_score), t_bbox(t_bbox), function(function) {}
  bool operator==(const rotatedNmsLaunchConfig &other) {
    return t_score == other.t_score && t_bbox == other.t_bbox;
  }
};

static std::vector<rotatedNmsLaunchConfig> rotatedNmsFuncVec;

bool rotatedNmsInit() {
  rotatedNmsFuncVec.push_back(rotatedNmsLaunchConfig(DataType::kFLOAT, DataType::kFLOAT,
                                                     allClassRotatedNMS_gpu<float, float>));
  return true;
}

static bool initialized = rotatedNmsInit();

pluginStatus_t allClassRotatedNMS(cudaStream_t stream, const int num, const int num_classes,
                                  const int num_preds_per_class, const int top_k,
                                  const float nms_threshold, const bool share_location,
                                  const bool isNormalized, const DataType DT_SCORE,
                                  const DataType DT_BBOX, void *bbox_data, void *beforeNMS_scores,
                                  void *beforeNMS_index_array, void *afterNMS_scores,
                                  void *afterNMS_index_array, bool) {
  auto __cuda_arch__ = get_cuda_arch(0);  // assume there is only one arch 7.2 device
  if (__cuda_arch__ == 720 && top_k >= 1000) {
    printf("Warning: pre_top_k need to be reduced for devices with arch 7.2, got pre_top_k=%d\n",
           top_k);
  }
  rotatedNmsLaunchConfig lc(DT_SCORE, DT_BBOX);

  for (unsigned i = 0; i < rotatedNmsFuncVec.size(); ++i) {
    if (lc == rotatedNmsFuncVec[i]) {
      DEBUG_PRINTF("all class rotated nms kernel %d\n", i);
      return rotatedNmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k,
                                           nms_threshold, share_location, isNormalized, bbox_data,
                                           beforeNMS_scores, beforeNMS_index_array,
                                           afterNMS_scores, afterNMS_index_array);
    }
  }
  return STATUS_BAD_PARAM;
}
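Note (illustrative, not part of the commit): because the rotated-IoU helpers above are marked __host__ __device__, they can also be exercised from host code when compiled with nvcc, which is convenient for unit-testing the polygon clipping. A tiny sketch using the (x_ctr, y_ctr, w, h, angle_rad) layout the kernel assumes:

// Sketch: two identical 2x1 boxes rotated by 45 degrees should give IoU == 1.
float a[5] = {0.f, 0.f, 2.f, 1.f, 0.7853981634f};
float b[5] = {0.f, 0.f, 2.f, 1.f, 0.7853981634f};
float iou = single_box_iou_rotated<float>(a, b);  // expect ~1.0f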
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include "nms/batched_nms_kernel.hpp"

pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int perBatchBoxesSize,
                            const int perBatchScoresSize, const bool shareLocation,
                            const int backgroundLabelId, const int numPredsPerClass,
                            const int numClasses, const int topK, const int keepTopK,
                            const float scoreThreshold, const float iouThreshold,
                            const DataType DT_BBOX, const void *locData, const DataType DT_SCORE,
                            const void *confData, void *nmsedDets, void *nmsedLabels,
                            void *nmsedIndex, void *workspace, bool isNormalized, bool confSigmoid,
                            bool clipBoxes, bool rotated) {
  const int topKVal = topK < 0 ? numPredsPerClass : topK;
  const int keepTopKVal = keepTopK < 0 ? numPredsPerClass : keepTopK;
  // locCount = batch_size * number_boxes_per_sample * 4
  const int locCount = N * perBatchBoxesSize;
  /*
   * shareLocation
   * Bounding box are shared among all classes, i.e., a bounding box could be
   * classified as any candidate class. Otherwise Bounding box are designed for
   * specific classes, i.e., a bounding box could be classified as one certain
   * class or not (binary classification).
   */
  const int numLocClasses = shareLocation ? 1 : numClasses;

  size_t bboxDataSize = detectionForwardBBoxDataSize(N, perBatchBoxesSize, DataType::kFLOAT);
  void *bboxDataRaw = workspace;
  cudaMemcpyAsync(bboxDataRaw, locData, bboxDataSize, cudaMemcpyDeviceToDevice, stream);
  pluginStatus_t status;

  /*
   * bboxDataRaw format:
   * [batch size, numPriors (per sample), numLocClasses, 4]
   */
  // float for now
  void *bboxData;
  size_t bboxPermuteSize =
      detectionForwardBBoxPermuteSize(shareLocation, N, perBatchBoxesSize, DataType::kFLOAT);
  void *bboxPermute = nextWorkspacePtr((int8_t *)bboxDataRaw, bboxDataSize);

  /*
   * After permutation, bboxData format:
   * [batch_size, numLocClasses, numPriors (per sample) (numPredsPerClass), 4]
   * This is equivalent to swapping axis
   */
  if (!shareLocation) {
    status = permuteData(stream, locCount, numLocClasses, numPredsPerClass, rotated ? 5 : 4,
                         DataType::kFLOAT, false, bboxDataRaw, bboxPermute);
    ASSERT_FAILURE(status == STATUS_SUCCESS);
    bboxData = bboxPermute;
  }
  /*
   * If shareLocation, numLocClasses = 1
   * No need to permute data on linear memory
   */
  else {
    bboxData = bboxDataRaw;
  }

  /*
   * Conf data format
   * [batch size, numPriors * param.numClasses, 1, 1]
   */
  const int numScores = N * perBatchScoresSize;
  size_t totalScoresSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
  void *scores = nextWorkspacePtr((int8_t *)bboxPermute, bboxPermuteSize);

  // need a conf_scores
  /*
   * After permutation, bboxData format:
   * [batch_size, numClasses, numPredsPerClass, 1]
   */
  status = permuteData(stream, numScores, numClasses, numPredsPerClass, 1, DataType::kFLOAT,
                       confSigmoid, confData, scores);
  ASSERT_FAILURE(status == STATUS_SUCCESS);

  size_t indicesSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
  void *indices = nextWorkspacePtr((int8_t *)scores, totalScoresSize);

  size_t postNMSScoresSize = detectionForwardPostNMSSize(N, numClasses, topKVal);
  size_t postNMSIndicesSize = detectionForwardPostNMSSize(N, numClasses, topKVal);
  void *postNMSScores = nextWorkspacePtr((int8_t *)indices, indicesSize);
  void *postNMSIndices = nextWorkspacePtr((int8_t *)postNMSScores, postNMSScoresSize);

  void *sortingWorkspace = nextWorkspacePtr((int8_t *)postNMSIndices, postNMSIndicesSize);
  // Sort the scores so that the following NMS could be applied.
  status = sortScoresPerClass(stream, N, numClasses, numPredsPerClass, backgroundLabelId,
                              scoreThreshold, DataType::kFLOAT, scores, indices, sortingWorkspace);
  ASSERT_FAILURE(status == STATUS_SUCCESS);

  // This is set to true as the input bounding boxes are of the format [ymin,
  // xmin, ymax, xmax]. The default implementation assumes [xmin, ymin, xmax,
  // ymax]
  bool flipXY = false;
  // NMS
  if (rotated) {
    status = allClassRotatedNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold,
                                shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT,
                                bboxData, scores, indices, postNMSScores, postNMSIndices, flipXY);
  } else {
    status = allClassNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold,
                         shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT, bboxData,
                         scores, indices, postNMSScores, postNMSIndices, flipXY);
  }
  ASSERT_FAILURE(status == STATUS_SUCCESS);

  // Sort the bounding boxes after NMS using scores
  status = sortScoresPerImage(stream, N, numClasses * topKVal, DataType::kFLOAT, postNMSScores,
                              postNMSIndices, scores, indices, sortingWorkspace);
  ASSERT_FAILURE(status == STATUS_SUCCESS);

  // Gather data from the sorted bounding boxes after NMS
  status = gatherNMSOutputs(stream, shareLocation, N, numPredsPerClass, numClasses, topKVal,
                            keepTopKVal, DataType::kFLOAT, DataType::kFLOAT, indices, scores,
                            bboxData, nmsedDets, nmsedLabels, nmsedIndex, clipBoxes, rotated);
  ASSERT_FAILURE(status == STATUS_SUCCESS);

  return STATUS_SUCCESS;
}
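Note (illustrative, not part of the commit): nmsInference expects the caller to hand it one linear workspace, which it then carves up with nextWorkspacePtr in the order shown above. A hedged sketch of a caller (device pointers d_boxes/d_scores/d_dets/d_labels/d_index and the shape variables are placeholders):

// Sketch: allocate one linear workspace, then run the full batched-NMS pipeline.
size_t ws = detectionInferenceWorkspaceSize(
    /*shareLocation=*/true, N, perBatchBoxesSize, perBatchScoresSize,
    numClasses, numPredsPerClass, topK, DataType::kFLOAT, DataType::kFLOAT);
void *workspace = nullptr;
cudaMalloc(&workspace, ws);
pluginStatus_t s = nmsInference(stream, N, perBatchBoxesSize, perBatchScoresSize,
                                /*shareLocation=*/true, /*backgroundLabelId=*/-1,
                                numPredsPerClass, numClasses, topK, keepTopK,
                                scoreThreshold, iouThreshold, DataType::kFLOAT, d_boxes,
                                DataType::kFLOAT, d_scores, d_dets, d_labels, d_index,
                                workspace, /*isNormalized=*/true, /*confSigmoid=*/false,
                                /*clipBoxes=*/true, /*rotated=*/false);
cudaFree(workspace);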
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>

#include "nms/kernel.h"
#include "trt_plugin_helper.hpp"

template <typename T_BBOX, typename T_SCORE, bool rotated, unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
    void gatherNMSOutputs_kernel(const bool shareLocation, const int numImages,
                                 const int numPredsPerClass, const int numClasses, const int topK,
                                 const int keepTopK, const int *indices, const T_SCORE *scores,
                                 const T_BBOX *bboxData, T_BBOX *nmsedDets, int *nmsedLabels,
                                 int *nmsedIndex, bool clipBoxes) {
  if (keepTopK > topK) return;
  for (int i = blockIdx.x * nthds_per_cta + threadIdx.x; i < numImages * keepTopK;
       i += gridDim.x * nthds_per_cta) {
    const int imgId = i / keepTopK;
    const int detId = i % keepTopK;
    const int offset = imgId * numClasses * topK;
    const int index = indices[offset + detId];
    const T_SCORE score = scores[offset + detId];
    if (index == -1) {
      nmsedLabels[i] = -1;
      if (nmsedIndex != nullptr) {
        nmsedIndex[i] = -1;
      }
      if (rotated) {
        nmsedDets[i * 6] = 0;
        nmsedDets[i * 6 + 1] = 0;
        nmsedDets[i * 6 + 2] = 0;
        nmsedDets[i * 6 + 3] = 0;
        nmsedDets[i * 6 + 4] = 0;
        nmsedDets[i * 6 + 5] = 0;
      } else {
        nmsedDets[i * 5] = 0;
        nmsedDets[i * 5 + 1] = 0;
        nmsedDets[i * 5 + 2] = 0;
        nmsedDets[i * 5 + 3] = 0;
        nmsedDets[i * 5 + 4] = 0;
      }
    } else {
      const int bboxOffset =
          imgId * (shareLocation ? numPredsPerClass : (numClasses * numPredsPerClass));
      nmsedLabels[i] = (index % (numClasses * numPredsPerClass)) / numPredsPerClass;  // label
      if (rotated) {
        const int bboxId =
            ((shareLocation ? (index % numPredsPerClass)
                            : index % (numClasses * numPredsPerClass)) +
             bboxOffset) *
            5;
        if (nmsedIndex != nullptr) {
          nmsedIndex[i] = bboxId / 5 - bboxOffset;
        }
        // clipped bbox xmin
        nmsedDets[i * 6] =
            clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId];
        // clipped bbox ymin
        nmsedDets[i * 6 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 1];
        // clipped bbox xmax
        nmsedDets[i * 6 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 2];
        // clipped bbox ymax
        nmsedDets[i * 6 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 3];
        // clipped bbox angle
        nmsedDets[i * 6 + 4] = clipBoxes ? max(min(bboxData[bboxId + 4], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 4];
        nmsedDets[i * 6 + 5] = score;
      } else {
        const int bboxId =
            ((shareLocation ? (index % numPredsPerClass)
                            : index % (numClasses * numPredsPerClass)) +
             bboxOffset) *
            4;
        if (nmsedIndex != nullptr) {
          nmsedIndex[i] = bboxId / 4 - bboxOffset;
        }
        // clipped bbox xmin
        nmsedDets[i * 5] =
            clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId];
        // clipped bbox ymin
        nmsedDets[i * 5 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 1];
        // clipped bbox xmax
        nmsedDets[i * 5 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 2];
        // clipped bbox ymax
        nmsedDets[i * 5 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 3];
        nmsedDets[i * 5 + 4] = score;
      }
    }
  }
}

template <typename T_BBOX, typename T_SCORE, bool rotated>
pluginStatus_t gatherNMSOutputs_gpu(cudaStream_t stream, const bool shareLocation,
                                    const int numImages, const int numPredsPerClass,
                                    const int numClasses, const int topK, const int keepTopK,
                                    const void *indices, const void *scores, const void *bboxData,
                                    void *nmsedDets, void *nmsedLabels, void *nmsedIndex,
                                    bool clipBoxes) {
  const int BS = 32;
  const int GS = 32;
  gatherNMSOutputs_kernel<T_BBOX, T_SCORE, rotated, BS><<<GS, BS, 0, stream>>>(
      shareLocation, numImages, numPredsPerClass, numClasses, topK, keepTopK, (int *)indices,
      (T_SCORE *)scores, (T_BBOX *)bboxData, (T_BBOX *)nmsedDets, (int *)nmsedLabels,
      (int *)nmsedIndex, clipBoxes);

  CSC(cudaGetLastError(), STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// gatherNMSOutputs LAUNCH CONFIG {{{
typedef pluginStatus_t (*nmsOutFunc)(cudaStream_t, const bool, const int, const int, const int,
                                     const int, const int, const void *, const void *,
                                     const void *, void *, void *, void *, bool);
struct nmsOutLaunchConfig {
  DataType t_bbox;
  DataType t_score;
  bool rotated;
  nmsOutFunc function;

  nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated)
      : t_bbox(t_bbox), t_score(t_score), rotated(rotated) {}
  nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated, nmsOutFunc function)
      : t_bbox(t_bbox), t_score(t_score), rotated(rotated), function(function) {}
  bool operator==(const nmsOutLaunchConfig &other) {
    return t_bbox == other.t_bbox && t_score == other.t_score && rotated == other.rotated;
  }
};

using nvinfer1::DataType;

static std::vector<nmsOutLaunchConfig> nmsOutFuncVec;

bool nmsOutputInit() {
  nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, false,
                                             gatherNMSOutputs_gpu<float, float, false>));
  nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, true,
                                             gatherNMSOutputs_gpu<float, float, true>));
  return true;
}

static bool initialized = nmsOutputInit();

pluginStatus_t gatherNMSOutputs(cudaStream_t stream, const bool shareLocation, const int numImages,
                                const int numPredsPerClass, const int numClasses, const int topK,
                                const int keepTopK, const DataType DT_BBOX,
                                const DataType DT_SCORE, const void *indices, const void *scores,
                                const void *bboxData, void *nmsedDets, void *nmsedLabels,
                                void *nmsedIndex, bool clipBoxes, bool rotated) {
  nmsOutLaunchConfig lc = nmsOutLaunchConfig(DT_BBOX, DT_SCORE, rotated);
  for (unsigned i = 0; i < nmsOutFuncVec.size(); ++i) {
    if (lc == nmsOutFuncVec[i]) {
      DEBUG_PRINTF("gatherNMSOutputs kernel %d\n", i);
      return nmsOutFuncVec[i].function(stream, shareLocation, numImages, numPredsPerClass,
                                       numClasses, topK, keepTopK, indices, scores, bboxData,
                                       nmsedDets, nmsedLabels, nmsedIndex, clipBoxes);
    }
  }
  return STATUS_BAD_PARAM;
}
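Note (illustrative, not part of the commit): the label/box decoding in gatherNMSOutputs_kernel is plain modular arithmetic. For example, with numClasses = 80 and numPredsPerClass = 1000, a flat index of 163500 yields label (163500 % 80000) / 1000 = 3, and with shareLocation the per-image box slot is 163500 % 1000 = 500.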
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu
0 → 100644
View file @
546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <stdint.h>
#include <cub/cub.cuh>
#include "cublas_v2.h"
#include "nms/kernel.h"
#include "trt_plugin_helper.hpp"
#define CUDA_MEM_ALIGN 256
// return cuda arch
size_t get_cuda_arch(int devID) {
  int computeMode = -1, major = 0, minor = 0;
  CUASSERT(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
  CUASSERT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
  CUASSERT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
  return major * 100 + minor * 10;
}

// ALIGNPTR
int8_t *alignPtr(int8_t *ptr, uintptr_t to) {
  uintptr_t addr = (uintptr_t)ptr;
  if (addr % to) {
    addr += to - addr % to;
  }
  return (int8_t *)addr;
}

// NEXTWORKSPACEPTR
int8_t *nextWorkspacePtr(int8_t *ptr, uintptr_t previousWorkspaceSize) {
  uintptr_t addr = (uintptr_t)ptr;
  addr += previousWorkspaceSize;
  return alignPtr((int8_t *)addr, CUDA_MEM_ALIGN);
}

// CALCULATE TOTAL WORKSPACE SIZE
size_t calculateTotalWorkspaceSize(size_t *workspaces, int count) {
  size_t total = 0;
  for (int i = 0; i < count; i++) {
    total += workspaces[i];
    if (workspaces[i] % CUDA_MEM_ALIGN) {
      total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN);
    }
  }
  return total;
}

using nvinfer1::DataType;

template <unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
    void setUniformOffsets_kernel(const int num_segments, const int offset, int *d_offsets) {
  const int idx = blockIdx.x * nthds_per_cta + threadIdx.x;
  if (idx <= num_segments) d_offsets[idx] = idx * offset;
}

void setUniformOffsets(cudaStream_t stream, const int num_segments, const int offset,
                       int *d_offsets) {
  const int BS = 32;
  const int GS = (num_segments + 1 + BS - 1) / BS;
  setUniformOffsets_kernel<BS><<<GS, BS, 0, stream>>>(num_segments, offset, d_offsets);
}

size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX) {
  if (DT_BBOX == DataType::kFLOAT) {
    return N * C1 * sizeof(float);
  }
  printf("Only FP32 type bounding boxes are supported.\n");
  return (size_t)-1;
}

size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX) {
  if (DT_BBOX == DataType::kFLOAT) {
    return shareLocation ? 0 : N * C1 * sizeof(float);
  }
  printf("Only FP32 type bounding boxes are supported.\n");
  return (size_t)-1;
}

size_t detectionForwardPreNMSSize(int N, int C2) {
  ASSERT(sizeof(float) == sizeof(int));
  return N * C2 * sizeof(float);
}

size_t detectionForwardPostNMSSize(int N, int numClasses, int topK) {
  ASSERT(sizeof(float) == sizeof(int));
  return N * numClasses * topK * sizeof(float);
}

size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses,
                                       int numPredsPerClass, int topK, DataType DT_BBOX,
                                       DataType DT_SCORE) {
  size_t wss[7];
  wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX);
  wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX);
  wss[2] = detectionForwardPreNMSSize(N, C2);
  wss[3] = detectionForwardPreNMSSize(N, C2);
  wss[4] = detectionForwardPostNMSSize(N, numClasses, topK);
  wss[5] = detectionForwardPostNMSSize(N, numClasses, topK);
  wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE),
                    sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE));
  return calculateTotalWorkspaceSize(wss, 7);
}
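kernel.cu rounds every sub-buffer of the detection workspace up to CUDA_MEM_ALIGN (256 bytes), so each chunk handed out by nextWorkspacePtr starts on an aligned address. A host-only sketch of the same arithmetic; align_up and total_workspace are illustrative names.

// Host-only sketch of the 256-byte workspace padding from kernel.cu.
#include <cassert>
#include <cstdint>

constexpr uintptr_t kAlign = 256;  // CUDA_MEM_ALIGN

uintptr_t align_up(uintptr_t addr) {
  return (addr % kAlign) ? addr + kAlign - addr % kAlign : addr;
}

size_t total_workspace(const size_t *ws, int count) {
  size_t total = 0;
  for (int i = 0; i < count; ++i) {
    total += ws[i];
    if (ws[i] % kAlign) total += kAlign - ws[i] % kAlign;  // pad each chunk to the boundary
  }
  return total;
}

int main() {
  size_t ws[3] = {100, 256, 300};
  assert(align_up(100) == 256);
  assert(total_workspace(ws, 3) == 256 + 256 + 512);  // each chunk rounded up separately
  return 0;
}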
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu
0 → 100644
View file @ 546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "nms/kernel.h"
template <typename Dtype, unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
    void permuteData_kernel(const int nthreads, const int num_classes, const int num_data,
                            const int num_dim, bool confSigmoid, const Dtype *data,
                            Dtype *new_data) {
  // data format: [batch_size, num_data, num_classes, num_dim]
  for (int index = blockIdx.x * nthds_per_cta + threadIdx.x; index < nthreads;
       index += nthds_per_cta * gridDim.x) {
    const int i = index % num_dim;
    const int c = (index / num_dim) % num_classes;
    const int d = (index / num_dim / num_classes) % num_data;
    const int n = index / num_dim / num_classes / num_data;
    const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i;
    float result = data[index];
    if (confSigmoid) result = exp(result) / (1 + exp(result));
    new_data[new_index] = result;
  }
  // new data format: [batch_size, num_classes, num_data, num_dim]
}

template <typename Dtype>
pluginStatus_t permuteData_gpu(cudaStream_t stream, const int nthreads, const int num_classes,
                               const int num_data, const int num_dim, bool confSigmoid,
                               const void *data, void *new_data) {
  const int BS = 512;
  const int GS = (nthreads + BS - 1) / BS;
  permuteData_kernel<Dtype, BS><<<GS, BS, 0, stream>>>(nthreads, num_classes, num_data, num_dim,
                                                       confSigmoid, (const Dtype *)data,
                                                       (Dtype *)new_data);
  CSC(cudaGetLastError(), STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// permuteData LAUNCH CONFIG
typedef pluginStatus_t (*pdFunc)(cudaStream_t, const int, const int, const int, const int, bool,
                                 const void *, void *);

struct pdLaunchConfig {
  DataType t_data;
  pdFunc function;
  pdLaunchConfig(DataType t_data) : t_data(t_data) {}
  pdLaunchConfig(DataType t_data, pdFunc function) : t_data(t_data), function(function) {}
  bool operator==(const pdLaunchConfig &other) { return t_data == other.t_data; }
};

static std::vector<pdLaunchConfig> pdFuncVec;

bool permuteDataInit() {
  pdFuncVec.push_back(pdLaunchConfig(DataType::kFLOAT, permuteData_gpu<float>));
  return true;
}

static bool initialized = permuteDataInit();

pluginStatus_t permuteData(cudaStream_t stream, const int nthreads, const int num_classes,
                           const int num_data, const int num_dim, const DataType DT_DATA,
                           bool confSigmoid, const void *data, void *new_data) {
  pdLaunchConfig lc = pdLaunchConfig(DT_DATA);
  for (unsigned i = 0; i < pdFuncVec.size(); ++i) {
    if (lc == pdFuncVec[i]) {
      DEBUG_PRINTF("permuteData kernel %d\n", i);
      return pdFuncVec[i].function(stream, nthreads, num_classes, num_data, num_dim, confSigmoid,
                                   data, new_data);
    }
  }
  return STATUS_BAD_PARAM;
}
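permuteData_kernel converts a [batch_size, num_data, num_classes, num_dim] layout into [batch_size, num_classes, num_data, num_dim] by decomposing each flat index and recomposing it with the two middle axes swapped. The same index math can be verified on the host; this sketch is illustrative only.

// Host-only check of the index remapping in permuteData_kernel.
#include <cassert>
#include <vector>

int main() {
  const int N = 2, D = 3, C = 4, K = 5;  // batch, num_data, num_classes, num_dim
  std::vector<float> src(N * D * C * K), dst(src.size());
  for (size_t v = 0; v < src.size(); ++v) src[v] = float(v);
  for (int index = 0; index < int(src.size()); ++index) {
    const int i = index % K;
    const int c = (index / K) % C;
    const int d = (index / K / C) % D;
    const int n = index / K / C / D;
    const int new_index = ((n * C + c) * D + d) * K + i;  // same formula as the kernel
    dst[new_index] = src[index];
  }
  // element (n=1, d=2, c=3, i=4) must move to (n=1, c=3, d=2, i=4)
  assert(dst[((1 * C + 3) * D + 2) * K + 4] == src[((1 * D + 2) * C + 3) * K + 4]);
  return 0;
}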
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu
0 → 100644
View file @ 546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "cub/cub.cuh"
#include "nms/cub_helper.h"
#include "nms/kernel.h"
#include "trt_plugin_helper.hpp"
template <typename T_SCORE, unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
    void prepareSortData(const int num, const int num_classes, const int num_preds_per_class,
                         const int background_label_id, const float confidence_threshold,
                         T_SCORE *conf_scores_gpu, T_SCORE *temp_scores, int *temp_idx,
                         int *d_offsets) {
  // Prepare scores data for sort
  const int cur_idx = blockIdx.x * nthds_per_cta + threadIdx.x;
  const int numPredsPerBatch = num_classes * num_preds_per_class;
  if (cur_idx < numPredsPerBatch) {
    const int class_idx = cur_idx / num_preds_per_class;
    for (int i = 0; i < num; i++) {
      const int targetIdx = i * numPredsPerBatch + cur_idx;
      const T_SCORE score = conf_scores_gpu[targetIdx];

      // "Clear" background labeled score and index
      // Because we do not care about background
      if (class_idx == background_label_id) {
        // Set scores to 0
        // Set label = -1
        temp_scores[targetIdx] = 0.0f;
        temp_idx[targetIdx] = -1;
        conf_scores_gpu[targetIdx] = 0.0f;
      }
      // "Clear" scores lower than threshold
      else {
        if (score > confidence_threshold) {
          temp_scores[targetIdx] = score;
          temp_idx[targetIdx] = cur_idx + i * numPredsPerBatch;
        } else {
          // Set scores to 0
          // Set label = -1
          temp_scores[targetIdx] = 0.0f;
          temp_idx[targetIdx] = -1;
          conf_scores_gpu[targetIdx] = 0.0f;
          // TODO: HERE writing memory too many times
        }
      }

      if ((cur_idx % num_preds_per_class) == 0) {
        const int offset_ct = i * num_classes + cur_idx / num_preds_per_class;
        d_offsets[offset_ct] = offset_ct * num_preds_per_class;
        // set the last element in d_offset
        if (blockIdx.x == 0 && threadIdx.x == 0)
          d_offsets[num * num_classes] = num * numPredsPerBatch;
      }
    }
  }
}

template <typename T_SCORE>
pluginStatus_t sortScoresPerClass_gpu(cudaStream_t stream, const int num, const int num_classes,
                                      const int num_preds_per_class,
                                      const int background_label_id,
                                      const float confidence_threshold, void *conf_scores_gpu,
                                      void *index_array_gpu, void *workspace) {
  const int num_segments = num * num_classes;
  void *temp_scores = workspace;
  const int arrayLen = num * num_classes * num_preds_per_class;
  void *temp_idx = nextWorkspacePtr((int8_t *)temp_scores, arrayLen * sizeof(T_SCORE));
  void *d_offsets = nextWorkspacePtr((int8_t *)temp_idx, arrayLen * sizeof(int));
  size_t cubOffsetSize = (num_segments + 1) * sizeof(int);
  void *cubWorkspace = nextWorkspacePtr((int8_t *)d_offsets, cubOffsetSize);

  const int BS = 512;
  const int GS = (num_classes * num_preds_per_class + BS - 1) / BS;
  prepareSortData<T_SCORE, BS><<<GS, BS, 0, stream>>>(
      num, num_classes, num_preds_per_class, background_label_id, confidence_threshold,
      (T_SCORE *)conf_scores_gpu, (T_SCORE *)temp_scores, (int *)temp_idx, (int *)d_offsets);

  size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_segments);
  cub::DeviceSegmentedRadixSort::SortPairsDescending(
      cubWorkspace, temp_storage_bytes, (const T_SCORE *)(temp_scores),
      (T_SCORE *)(conf_scores_gpu), (const int *)(temp_idx), (int *)(index_array_gpu), arrayLen,
      num_segments, (const int *)d_offsets, (const int *)d_offsets + 1, 0, sizeof(T_SCORE) * 8,
      stream);
  CSC(cudaGetLastError(), STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// sortScoresPerClass LAUNCH CONFIG
typedef pluginStatus_t (*sspcFunc)(cudaStream_t, const int, const int, const int, const int,
                                   const float, void *, void *, void *);

struct sspcLaunchConfig {
  DataType t_score;
  sspcFunc function;
  sspcLaunchConfig(DataType t_score) : t_score(t_score) {}
  sspcLaunchConfig(DataType t_score, sspcFunc function) : t_score(t_score), function(function) {}
  bool operator==(const sspcLaunchConfig &other) { return t_score == other.t_score; }
};

static std::vector<sspcLaunchConfig> sspcFuncVec;

bool sspcInit() {
  sspcFuncVec.push_back(sspcLaunchConfig(DataType::kFLOAT, sortScoresPerClass_gpu<float>));
  return true;
}

static bool initialized = sspcInit();

pluginStatus_t sortScoresPerClass(cudaStream_t stream, const int num, const int num_classes,
                                  const int num_preds_per_class, const int background_label_id,
                                  const float confidence_threshold, const DataType DT_SCORE,
                                  void *conf_scores_gpu, void *index_array_gpu, void *workspace) {
  sspcLaunchConfig lc = sspcLaunchConfig(DT_SCORE);
  for (unsigned i = 0; i < sspcFuncVec.size(); ++i) {
    if (lc == sspcFuncVec[i]) {
      DEBUG_PRINTF("sortScoresPerClass kernel %d\n", i);
      return sspcFuncVec[i].function(stream, num, num_classes, num_preds_per_class,
                                     background_label_id, confidence_threshold, conf_scores_gpu,
                                     index_array_gpu, workspace);
    }
  }
  return STATUS_BAD_PARAM;
}

size_t sortScoresPerClassWorkspaceSize(const int num, const int num_classes,
                                       const int num_preds_per_class, const DataType DT_CONF) {
  size_t wss[4];
  const int arrayLen = num * num_classes * num_preds_per_class;
  wss[0] = arrayLen * mmdeploy::getElementSize(DT_CONF);  // temp scores
  wss[1] = arrayLen * sizeof(int);                        // temp indices
  wss[2] = (num * num_classes + 1) * sizeof(int);         // offsets
  if (DT_CONF == DataType::kFLOAT) {
    wss[3] = cubSortPairsWorkspaceSize<float, int>(arrayLen, num * num_classes);  // cub workspace
  } else {
    printf("SCORE type not supported\n");
    return (size_t)-1;
  }
  return calculateTotalWorkspaceSize(wss, 4);
}
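sortScoresPerClass treats each (image, class) pair as one segment of num_preds_per_class scores, so cub::DeviceSegmentedRadixSort receives an offsets array with num * num_classes + 1 entries where segment s spans [offsets[s], offsets[s + 1]). A host-only sketch of that offsets layout, illustrative only.

// Host-only sketch of the segment offsets consumed by the segmented radix sort.
#include <cassert>
#include <vector>

int main() {
  const int num = 2, num_classes = 3, num_preds_per_class = 4;
  const int num_segments = num * num_classes;
  std::vector<int> offsets(num_segments + 1);
  for (int s = 0; s <= num_segments; ++s) offsets[s] = s * num_preds_per_class;
  // the final entry closes the last segment, matching prepareSortData's last-element write
  assert(offsets.back() == num * num_classes * num_preds_per_class);
  return 0;
}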
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu
0 → 100644
View file @ 546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "cub/cub.cuh"
#include "nms/cub_helper.h"
#include "nms/kernel.h"
template <typename T_SCORE>
pluginStatus_t sortScoresPerImage_gpu(cudaStream_t stream, const int num_images,
                                      const int num_items_per_image, void *unsorted_scores,
                                      void *unsorted_bbox_indices, void *sorted_scores,
                                      void *sorted_bbox_indices, void *workspace) {
  void *d_offsets = workspace;
  void *cubWorkspace = nextWorkspacePtr((int8_t *)d_offsets, (num_images + 1) * sizeof(int));

  setUniformOffsets(stream, num_images, num_items_per_image, (int *)d_offsets);

  const int arrayLen = num_images * num_items_per_image;
  size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_images);
  cub::DeviceSegmentedRadixSort::SortPairsDescending(
      cubWorkspace, temp_storage_bytes, (const T_SCORE *)(unsorted_scores),
      (T_SCORE *)(sorted_scores), (const int *)(unsorted_bbox_indices),
      (int *)(sorted_bbox_indices), arrayLen, num_images, (const int *)d_offsets,
      (const int *)d_offsets + 1, 0, sizeof(T_SCORE) * 8, stream);
  CSC(cudaGetLastError(), STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// sortScoresPerImage LAUNCH CONFIG
typedef pluginStatus_t (*sspiFunc)(cudaStream_t, const int, const int, void *, void *, void *,
                                   void *, void *);

struct sspiLaunchConfig {
  DataType t_score;
  sspiFunc function;
  sspiLaunchConfig(DataType t_score) : t_score(t_score) {}
  sspiLaunchConfig(DataType t_score, sspiFunc function) : t_score(t_score), function(function) {}
  bool operator==(const sspiLaunchConfig &other) { return t_score == other.t_score; }
};

static std::vector<sspiLaunchConfig> sspiFuncVec;

bool sspiInit() {
  sspiFuncVec.push_back(sspiLaunchConfig(DataType::kFLOAT, sortScoresPerImage_gpu<float>));
  return true;
}

static bool initialized = sspiInit();

pluginStatus_t sortScoresPerImage(cudaStream_t stream, const int num_images,
                                  const int num_items_per_image, const DataType DT_SCORE,
                                  void *unsorted_scores, void *unsorted_bbox_indices,
                                  void *sorted_scores, void *sorted_bbox_indices,
                                  void *workspace) {
  sspiLaunchConfig lc = sspiLaunchConfig(DT_SCORE);
  for (unsigned i = 0; i < sspiFuncVec.size(); ++i) {
    if (lc == sspiFuncVec[i]) {
      DEBUG_PRINTF("sortScoresPerImage kernel %d\n", i);
      return sspiFuncVec[i].function(stream, num_images, num_items_per_image, unsorted_scores,
                                     unsorted_bbox_indices, sorted_scores, sorted_bbox_indices,
                                     workspace);
    }
  }
  return STATUS_BAD_PARAM;
}

size_t sortScoresPerImageWorkspaceSize(const int num_images, const int num_items_per_image,
                                       const DataType DT_SCORE) {
  const int arrayLen = num_images * num_items_per_image;
  size_t wss[2];
  wss[0] = (num_images + 1) * sizeof(int);  // offsets
  if (DT_SCORE == DataType::kFLOAT) {
    wss[1] = cubSortPairsWorkspaceSize<float, int>(arrayLen, num_images);  // cub workspace
  } else {
    printf("SCORE type not supported.\n");
    return (size_t)-1;
  }
  return calculateTotalWorkspaceSize(wss, 2);
}
csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
0 → 100644
View file @ 546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#include "common_cuda_helper.hpp"
#include "trt_plugin_helper.hpp"
using mmdeploy::TensorDesc;

template <class scalar_t>
__global__ void copy_permute_kernel(scalar_t *__restrict__ dst, const scalar_t *__restrict__ src,
                                    int n, TensorDesc ts_src_stride, TensorDesc ts_dst_stride,
                                    TensorDesc ts_permute) {
  const int src_dim = ts_src_stride.dim;
  const auto src_stride = ts_src_stride.stride;
  const auto dst_stride = ts_dst_stride.stride;
  const auto permute = ts_permute.shape;
  CUDA_1D_KERNEL_LOOP(index, n) {
    size_t dst_index = index;
    size_t src_index = 0;
    for (int i = 0; i < src_dim; ++i) {
      int dim_index = dst_index / dst_stride[i];
      dst_index = dst_index % dst_stride[i];
      src_index += dim_index * src_stride[permute[i]];
    }
    dst[index] = src[src_index];
  }
}

template <class scalar_t>
void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size, int *permute, int src_dim,
                   cudaStream_t stream) {
  size_t copy_size = 1;
  TensorDesc ts_permute;
  memcpy(&(ts_permute.shape[0]), permute, src_dim * sizeof(int));

  TensorDesc ts_src_stride;
  TensorDesc ts_dst_stride;
  ts_src_stride.dim = src_dim;
  ts_dst_stride.dim = src_dim;
  int *src_stride = &(ts_src_stride.stride[0]);
  int *dst_stride = &(ts_dst_stride.stride[0]);
  int *dst_size = &(ts_dst_stride.shape[0]);
  src_stride[src_dim - 1] = 1;
  dst_stride[src_dim - 1] = 1;

  for (int i = src_dim - 1; i >= 0; --i) {
    dst_size[i] = src_size[permute[i]];
    if (i < src_dim - 1) {
      src_stride[i] = src_stride[i + 1] * src_size[i + 1];
    }
  }

  for (int i = src_dim - 1; i >= 0; --i) {
    copy_size *= dst_size[i];
    if (i < src_dim - 1) {
      dst_stride[i] = dst_stride[i + 1] * dst_size[i + 1];
    }
  }

  copy_permute_kernel<scalar_t><<<GET_BLOCKS(copy_size), THREADS_PER_BLOCK, 0, stream>>>(
      dst, src, copy_size, ts_src_stride, ts_dst_stride, ts_permute);
}

template void memcpyPermute<float>(float *dst, const float *src, int *src_size, int *permute,
                                   int src_dim, cudaStream_t stream);
template void memcpyPermute<half>(half *dst, const half *src, int *src_size, int *permute,
                                  int src_dim, cudaStream_t stream);

cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t *cudnn_dtype) {
  switch (trt_dtype) {
    case nvinfer1::DataType::kFLOAT:
      *cudnn_dtype = CUDNN_DATA_FLOAT;
      break;
    case nvinfer1::DataType::kHALF:
      *cudnn_dtype = CUDNN_DATA_HALF;
      break;
    default:
      return CUDNN_STATUS_BAD_PARAM;
  }
  return CUDNN_STATUS_SUCCESS;
}

template <>
cublasStatus_t cublasGemmWrap<float>(cublasHandle_t handle, cublasOperation_t transa,
                                     cublasOperation_t transb, int m, int n, int k,
                                     const float *alpha, const float *A, int lda, const float *B,
                                     int ldb, const float *beta, float *C, int ldc) {
  return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}

template <>
cublasStatus_t cublasGemmWrap<half>(cublasHandle_t handle, cublasOperation_t transa,
                                    cublasOperation_t transb, int m, int n, int k,
                                    const half *alpha, const half *A, int lda, const half *B,
                                    int ldb, const half *beta, half *C, int ldc) {
  return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
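cublasGemmWrap selects the cuBLAS entry point at compile time through explicit template specialization: the primary template is only declared, and each defined specialization forwards to the type-specific backend (cublasSgemm for float, cublasHgemm for half). A host-only sketch of the pattern with printf stand-ins for the cuBLAS calls; gemm_wrap is an illustrative name.

// Host-only sketch of dispatch-by-explicit-specialization, as in cublasGemmWrap.
#include <cstdio>

template <typename T>
int gemm_wrap(int m, int n, int k);  // primary template: declared, never defined

template <>
int gemm_wrap<float>(int m, int n, int k) {
  std::printf("float gemm %dx%dx%d\n", m, n, k);  // the real code calls cublasSgemm here
  return 0;
}

template <>
int gemm_wrap<double>(int m, int n, int k) {
  std::printf("double gemm %dx%dx%d\n", m, n, k);  // the real code specializes half instead
  return 0;
}

int main() { return gemm_wrap<float>(4, 4, 4); }  // unsupported types fail to link by design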
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp
0 → 100644
View file @ 546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_deform_conv.hpp"
#include <assert.h>
#include <chrono>
#include "trt_deform_conv_kernel.hpp"
#include "trt_serialize.hpp"
using namespace nvinfer1;

namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVDeformConv2d"};
}  // namespace

DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string &name,
                                                         const nvinfer1::Dims stride,
                                                         const nvinfer1::Dims padding,
                                                         const nvinfer1::Dims dilation,
                                                         const int deformableGroup,
                                                         const int group)
    : TRTPluginBase(name),
      mStride(stride),
      mPadding(padding),
      mDilation(dilation),
      mDeformableGroup(deformableGroup),
      mGroup(group) {}

DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string name, const void *data,
                                                         size_t length)
    : TRTPluginBase(name) {
  deserialize_value(&data, &length, &mStride);
  deserialize_value(&data, &length, &mPadding);
  deserialize_value(&data, &length, &mDilation);
  deserialize_value(&data, &length, &mDeformableGroup);
  deserialize_value(&data, &length, &mGroup);
}

DeformableConvPluginDynamic::~DeformableConvPluginDynamic() {}

nvinfer1::IPluginV2DynamicExt *DeformableConvPluginDynamic::clone() const TRT_NOEXCEPT {
  DeformableConvPluginDynamic *plugin = new DeformableConvPluginDynamic(
      mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
  // input[0] == input
  // input[1] == offset
  // input[2] == weight
  nvinfer1::DimsExprs ret;
  ret.nbDims = 4;
  ret.d[0] = inputs[0].d[0];
  ret.d[1] = inputs[2].d[0];
  ret.d[2] = inputs[1].d[2];
  ret.d[3] = inputs[1].d[3];
  return ret;
}

bool DeformableConvPluginDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT {
  if (pos == 0) {
    return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT ||
             ioDesc[pos].type == nvinfer1::DataType::kHALF) &&
            ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
  } else {
    return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
  }
}

void DeformableConvPluginDynamic::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs,
                                                  int nbInputs,
                                                  const nvinfer1::DynamicPluginTensorDesc *outputs,
                                                  int nbOutputs) TRT_NOEXCEPT {}

size_t DeformableConvPluginDynamic::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
                                                     int nbInputs,
                                                     const nvinfer1::PluginTensorDesc *outputs,
                                                     int nbOutputs) const TRT_NOEXCEPT {
  int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type);

  int batch_size = inputs[0].dims.d[0];
  int nInputPlane = inputs[0].dims.d[1];
  int inputHeight = inputs[0].dims.d[2];
  int inputWidth = inputs[0].dims.d[3];

  int nOutputPlane = outputs[0].dims.d[1];
  int outputHeight = outputs[0].dims.d[2];
  int outputWidth = outputs[0].dims.d[3];

  int kW = inputs[2].dims.d[2];
  int kH = inputs[2].dims.d[3];
  int im2col_step = std::min(32, batch_size);

  size_t col_size = mmdeploy::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight *
                                             outputWidth * sizeof_dtype);

  size_t out_size = 0;
  if (im2col_step != 1)
    out_size = mmdeploy::getAlignedSize(batch_size * nOutputPlane * outputHeight * outputWidth *
                                        sizeof_dtype);

  return col_size + out_size;
}

int DeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
                                         const nvinfer1::PluginTensorDesc *outputDesc,
                                         const void *const *inputs, void *const *outputs,
                                         void *workSpace, cudaStream_t stream) TRT_NOEXCEPT {
  int batch = inputDesc[0].dims.d[0];
  int channels = inputDesc[0].dims.d[1];
  int height = inputDesc[0].dims.d[2];
  int width = inputDesc[0].dims.d[3];
  int channels_out = outputDesc[0].dims.d[1];
  int kernel_h = inputDesc[2].dims.d[2];
  int kernel_w = inputDesc[2].dims.d[3];

  const void *x = inputs[0];
  const void *offset = inputs[1];
  const void *weight = inputs[2];
  void *output = outputs[0];
  int im2col_step = std::min(batch, 32);

  auto data_type = inputDesc[0].type;
  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
      deform_conv<float>((float *)x, (float *)weight, (float *)offset, (float *)output, workSpace,
                         batch, channels, height, width, channels_out, kernel_w, kernel_h,
                         mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0],
                         mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle,
                         stream);
      break;
    case nvinfer1::DataType::kHALF:
      deform_conv<half>((half *)x, (half *)weight, (half *)offset, (half *)output, workSpace,
                        batch, channels, height, width, channels_out, kernel_w, kernel_h,
                        mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0],
                        mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle,
                        stream);
      break;
    default:
      return 1;
  }

  return 0;
}

nvinfer1::DataType DeformableConvPluginDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT {
  return inputTypes[0];
}

// IPluginV2 Methods
const char *DeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }

const char *DeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}

int DeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; }

size_t DeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT {
  return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) +
         serialized_size(mDeformableGroup) + serialized_size(mGroup);
}

void DeformableConvPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT {
  serialize_value(&buffer, mStride);
  serialize_value(&buffer, mPadding);
  serialize_value(&buffer, mDilation);
  serialize_value(&buffer, mDeformableGroup);
  serialize_value(&buffer, mGroup);
}

void DeformableConvPluginDynamic::attachToContext(
    cudnnContext *cudnnContext, cublasContext *cublasContext,
    nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT {
  m_cublas_handle = cublasContext;
}

void DeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {}

////////////////////// creator /////////////////////////////

DeformableConvPluginDynamicCreator::DeformableConvPluginDynamicCreator() {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *DeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT {
  return PLUGIN_NAME;
}

const char *DeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}

nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  nvinfer1::Dims stride{2, {1, 1}};
  nvinfer1::Dims padding{2, {0, 0}};
  nvinfer1::Dims dilation{2, {1, 1}};
  int deformableGroup = 1;
  int group = 1;

  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);

    if (field_name.compare("deform_groups") == 0) {
      deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("groups") == 0) {
      group = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("stride") == 0) {
      stride.nbDims = 2;
      stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
    }

    if (field_name.compare("padding") == 0) {
      padding.nbDims = 2;
      padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
    }

    if (field_name.compare("dilation") == 0) {
      dilation.nbDims = 2;
      dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
    }
  }

  DeformableConvPluginDynamic *plugin =
      new DeformableConvPluginDynamic(name, stride, padding, dilation, deformableGroup, group);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
  auto plugin = new DeformableConvPluginDynamic(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

REGISTER_TENSORRT_PLUGIN(DeformableConvPluginDynamicCreator);
}  // namespace mmdeploy
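serialize and the deserializing constructor above must agree on field order: mStride, mPadding, mDilation, mDeformableGroup and mGroup are written and read back in exactly the same sequence. A host-only sketch of that order-sensitive pairing; write_value and read_value are illustrative stand-ins for serialize_value and deserialize_value.

// Host-only sketch of the order-sensitive serialize/deserialize pairing.
#include <cassert>
#include <cstring>

template <typename T>
void write_value(char **buf, const T &v) {
  std::memcpy(*buf, &v, sizeof(T));
  *buf += sizeof(T);
}

template <typename T>
void read_value(const char **buf, T *v) {
  std::memcpy(v, *buf, sizeof(T));
  *buf += sizeof(T);
}

int main() {
  char storage[sizeof(int) * 2];
  char *w = storage;
  write_value(&w, 7);  // e.g. mDeformableGroup
  write_value(&w, 2);  // e.g. mGroup
  const char *r = storage;
  int dg = 0, g = 0;
  read_value(&r, &dg);  // reads must mirror the write order exactly
  read_value(&r, &g);
  assert(dg == 7 && g == 2);
  return 0;
}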
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp
0 → 100644
View file @ 546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_DEFORM_CONV_HPP
#define TRT_DEFORM_CONV_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class DeformableConvPluginDynamic : public TRTPluginBase {
 public:
  DeformableConvPluginDynamic(const std::string &name, const nvinfer1::Dims stride,
                              const nvinfer1::Dims padding, const nvinfer1::Dims dilation,
                              const int deformableGroup, const int group);

  DeformableConvPluginDynamic(const std::string name, const void *data, size_t length);

  DeformableConvPluginDynamic() = delete;

  ~DeformableConvPluginDynamic() TRT_NOEXCEPT override;

  // IPluginV2DynamicExt Methods
  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
                                          int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
      TRT_NOEXCEPT override;
  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
                                 int nbOutputs) TRT_NOEXCEPT override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc *out,
                       int nbOutputs) TRT_NOEXCEPT override;
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                          const nvinfer1::PluginTensorDesc *outputs,
                          int nbOutputs) const TRT_NOEXCEPT override;
  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
  void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext,
                       nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override;
  void detachFromContext() TRT_NOEXCEPT override;

  // IPluginV2Ext Methods
  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
                                       int nbInputs) const TRT_NOEXCEPT override;

  // IPluginV2 Methods
  const char *getPluginType() const TRT_NOEXCEPT override;
  const char *getPluginVersion() const TRT_NOEXCEPT override;
  int getNbOutputs() const TRT_NOEXCEPT override;
  size_t getSerializationSize() const TRT_NOEXCEPT override;
  void serialize(void *buffer) const TRT_NOEXCEPT override;

 private:
  nvinfer1::Dims mStride;
  nvinfer1::Dims mPadding;
  nvinfer1::Dims mDilation;
  int mDeformableGroup;
  int mGroup;

  cublasHandle_t m_cublas_handle;
};

class DeformableConvPluginDynamicCreator : public TRTPluginCreatorBase {
 public:
  DeformableConvPluginDynamicCreator();

  const char *getPluginName() const TRT_NOEXCEPT override;

  const char *getPluginVersion() const TRT_NOEXCEPT override;

  nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
      TRT_NOEXCEPT override;

  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
                                         size_t serialLength) TRT_NOEXCEPT override;
};
}  // namespace mmdeploy
#endif  // TRT_DEFORM_CONV_HPP
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu
0 → 100644
View file @ 546b4279
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer
*****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer
*********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
#include "common_cuda_helper.hpp"
#include "trt_deform_conv_kernel.cuh"
#include "trt_deform_conv_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
void deform_conv_im2col(const scalar_t *input, const scalar_t *offset, scalar_t *column,
                        const int channels, const int height, const int width, const int ksize_h,
                        const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
                        const int stride_w, const int dilation_h, const int dilation_w,
                        const int parallel_imgs, const int deformable_group, cudaStream_t stream) {
  int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
  int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
  int num_kernels = channels * height_col * width_col * parallel_imgs;
  int channel_per_deformable_group = channels / deformable_group;

  deformable_im2col_gpu_kernel<scalar_t>
      <<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
          num_kernels, input, offset, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
          stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, channels,
          deformable_group, height_col, width_col, column);

  cudaCheckError();
}

template <typename scalar_t>
void deform_conv(const scalar_t *input, const scalar_t *weight, const scalar_t *offset,
                 scalar_t *output, void *workspace, int batchSize, int nInputPlane,
                 int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH,
                 int padW, int padH, int dilationW, int dilationH, int group,
                 int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
                 cudaStream_t stream) {
  size_t word_size = sizeof(scalar_t);
  im2col_step = std::min(int(batchSize), im2col_step);
  long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  long outputHW = outputHeight * outputWidth;
  long kHW = kH * kW;
  long columns_size =
      mmdeploy::getAlignedSize(nInputPlane * kHW * im2col_step * outputHW * word_size);

  // column buffer for img2col
  char *workspace_ptr = reinterpret_cast<char *>(workspace);
  scalar_t *columns = reinterpret_cast<scalar_t *>(workspace_ptr);
  workspace_ptr = workspace_ptr + columns_size;

  scalar_t *output_buffer;
  if (im2col_step == 1) {
    output_buffer = output;
  } else {
    // output need permute when im2col_step != 1
    output_buffer = reinterpret_cast<scalar_t *>(workspace_ptr);
  }

  long input_elt_step = im2col_step * nInputPlane * inputHeight * inputWidth;
  long offset_elt_step = im2col_step * deformable_group * 2 * kHW * outputHW;
  long out_buffer_step = nOutputPlane * im2col_step * outputHW;
  long col_g_step = nInputPlane * kHW * im2col_step * outputHW / group;
  long weight_g_step = nOutputPlane * nInputPlane * kHW / (group * group);
  long out_buffer_g_step = out_buffer_step / group;
  int m = nOutputPlane / group;
  int n = im2col_step * outputHW;
  int k = nInputPlane * kHW / group;
  scalar_t alpha = 1.f;
  scalar_t beta = 0.f;

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    const scalar_t *input_start = input + elt * input_elt_step;
    const scalar_t *offset_start = offset + elt * offset_elt_step;

    deform_conv_im2col<scalar_t>(input_start, offset_start, columns, nInputPlane, inputHeight,
                                 inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW,
                                 im2col_step, deformable_group, stream);

    for (int g = 0; g < group; ++g) {
      const scalar_t *weight_start = weight + g * weight_g_step;
      scalar_t *col_start = columns + g * col_g_step;
      scalar_t *out_buffer_start = output_buffer + elt * out_buffer_step + g * out_buffer_g_step;

      cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start,
                               n, weight_start, k, &beta, out_buffer_start, n);
      cudaCheckError();
    }
  }

  if (im2col_step != 1) {
    int output_buffer_shape[5] = {batchSize / im2col_step, nOutputPlane, im2col_step,
                                  static_cast<int>(outputHeight), static_cast<int>(outputWidth)};
    int output_buffer_permute[5] = {0, 2, 1, 3, 4};
    memcpyPermute<scalar_t>(output, output_buffer, &output_buffer_shape[0],
                            &output_buffer_permute[0], 5, stream);
  }
}

template void deform_conv<float>(const float *input, const float *weight, const float *offset,
                                 float *output, void *workspace, int batchSize, int nInputPlane,
                                 int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH,
                                 int dW, int dH, int padW, int padH, int dilationW, int dilationH,
                                 int group, int deformable_group, int im2col_step,
                                 cublasHandle_t cublas_handle, cudaStream_t stream);

template void deform_conv<__half>(const __half *input, const __half *weight, const __half *offset,
                                  __half *output, void *workspace, int batchSize, int nInputPlane,
                                  int inputHeight, int inputWidth, int nOutputPlane, int kW,
                                  int kH, int dW, int dH, int padW, int padH, int dilationW,
                                  int dilationH, int group, int deformable_group, int im2col_step,
                                  cublasHandle_t cublas_handle, cudaStream_t stream);
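deform_conv derives the output spatial extent with the standard convolution formula out = (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1, and the plugin's workspace sizing depends on the same numbers. A small host-only check of the formula, illustrative only.

// Host-only check of the output-size arithmetic used in deform_conv.
#include <cassert>

int conv_out(int in, int pad, int dilation, int k, int stride) {
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
}

int main() {
  assert(conv_out(56, 1, 1, 3, 1) == 56);  // 3x3, stride 1, pad 1 preserves the size
  assert(conv_out(56, 1, 1, 3, 2) == 28);  // stride 2 halves it (integer division)
  return 0;
}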
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh
0 → 100644
View file @ 546b4279
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer
*****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer
*********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
template <typename scalar_t>
__device__ __forceinline__ scalar_t deformable_im2col_bilinear(const scalar_t *__restrict__ input,
                                                               const int height, const int width,
                                                               float h, float w) {
  if (h <= -1 || height <= h || w <= -1 || width <= w) {
    return 0;
  }

  const int h_low = floorf(h);
  const int w_low = floorf(w);

  input += h_low * width;
  const scalar_t v1 = (h_low >= 0 && w_low >= 0) ? input[w_low] : static_cast<scalar_t>(0.0f);
  const int w_high = w_low + 1;
  const scalar_t v2 =
      (h_low >= 0 && w_high <= width - 1) ? input[w_high] : static_cast<scalar_t>(0.0f);
  const scalar_t lw = w - w_low;
  const scalar_t v_low = fmaf(v2 - v1, lw, v1);

  input += width;
  const scalar_t v3 =
      (h_low <= height - 2 && w_low >= 0) ? input[w_low] : static_cast<scalar_t>(0.0f);
  const scalar_t v4 =
      (h_low <= height - 2 && w_high <= width - 1) ? input[w_high] : static_cast<scalar_t>(0.0f);
  const scalar_t v_high = fmaf(v4 - v3, lw, v3);
  const scalar_t lh = h - h_low;
  const scalar_t val = fmaf(v_high - v_low, lh, v_low);
  return val;
}

template <>
__device__ __forceinline__ __half deformable_im2col_bilinear(const __half *__restrict__ input,
                                                             const int height, const int width,
                                                             float h, float w) {
  if (h <= -1 || height <= h || w <= -1 || width <= w) {
    return 0;
  }

  const int h_low = floorf(h);
  const int w_low = floorf(w);

  input += h_low * width;
  const float v1 = (h_low >= 0 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f;
  const int w_high = w_low + 1;
  const float v2 = (h_low >= 0 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f;
  const float lw = w - w_low;
  const float v_low = fmaf(v2 - v1, lw, v1);

  input += width;
  const float v3 = (h_low <= height - 2 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f;
  const float v4 =
      (h_low <= height - 2 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f;
  const float v_high = fmaf(v4 - v3, lw, v3);
  const float lh = h - h_low;
  const float val = fmaf(v_high - v_low, lh, v_low);
  return __float2half(val);
}

template <typename scalar_t>
__global__ void deformable_im2col_gpu_kernel(
    const int n, const scalar_t *__restrict__ data_im, const scalar_t *__restrict__ data_offset,
    const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col, const int width_col,
    scalar_t *__restrict__ data_col) {
  const int hw_col = height_col * width_col;
  const int data_col_step = batch_size * hw_col;

  CUDA_1D_KERNEL_LOOP(index, n) {
    // index index of output matrix
    int tmp_index = index;
    const int w_col = tmp_index % width_col;
    tmp_index /= width_col;
    const int h_col = tmp_index % height_col;
    tmp_index /= height_col;
    const int b_col = tmp_index % batch_size;
    const int c_im = tmp_index / batch_size;
    const int c_col = c_im * kernel_h * kernel_w;

    // compute deformable group index
    const int deformable_group_index = c_im / channel_per_deformable_group;

    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    scalar_t *__restrict__ data_col_ptr = data_col + c_col * data_col_step + index % data_col_step;
    const scalar_t *__restrict__ data_im_ptr =
        data_im + (b_col * num_channels + c_im) * height * width;
    const scalar_t *__restrict__ data_offset_ptr =
        data_offset +
        ((b_col * deformable_group + deformable_group_index) << 1) * kernel_h * kernel_w * hw_col +
        h_col * width_col + w_col;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        const int data_offset_h = (i * kernel_w + j) * hw_col << 1;
        const scalar_t offset_h = data_offset_ptr[data_offset_h];
        const int data_offset_w = data_offset_h + hw_col;
        const scalar_t offset_w = data_offset_ptr[data_offset_w];
        const scalar_t h_im = h_in + i * dilation_h + (float)offset_h;
        const scalar_t w_im = w_in + j * dilation_w + (float)offset_w;
        const scalar_t val = deformable_im2col_bilinear(data_im_ptr, height, width, h_im, w_im);
        *data_col_ptr = val;
        data_col_ptr += data_col_step;
      }
    }
  }
}
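deformable_im2col_bilinear samples the input at a fractional (h, w) with bilinear interpolation, treating any corner that falls outside the image as zero. A host-only sketch of the same sampling rule (the per-corner guards are folded into one bounds check here and the fmaf calls are expanded); illustrative only.

// Host-only sketch of zero-padded bilinear sampling, as in deformable_im2col_bilinear.
#include <cassert>
#include <cmath>

float bilinear(const float *img, int height, int width, float h, float w) {
  if (h <= -1 || height <= h || w <= -1 || width <= w) return 0.0f;
  const int h_low = (int)std::floor(h), w_low = (int)std::floor(w);
  const float lh = h - h_low, lw = w - w_low;
  auto at = [&](int y, int x) -> float {  // out-of-range corners contribute zero
    return (y >= 0 && y < height && x >= 0 && x < width) ? img[y * width + x] : 0.0f;
  };
  const float v_low = at(h_low, w_low) + lw * (at(h_low, w_low + 1) - at(h_low, w_low));
  const float v_high =
      at(h_low + 1, w_low) + lw * (at(h_low + 1, w_low + 1) - at(h_low + 1, w_low));
  return v_low + lh * (v_high - v_low);
}

int main() {
  const float img[4] = {0.f, 1.f, 2.f, 3.f};  // 2x2 image
  assert(std::fabs(bilinear(img, 2, 2, 0.5f, 0.5f) - 1.5f) < 1e-6f);  // center = average
  return 0;
}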
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp
0 → 100644
View file @ 546b4279
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_DEFORM_CONV_KERNEL_HPP
#define TRT_DEFORM_CONV_KERNEL_HPP
#include <cublas_v2.h>
#include <cuda_runtime.h>
template <typename scalar_t>
void deform_conv_im2col(const scalar_t *input, const scalar_t *offset, scalar_t *column,
                        const int channels, const int height, const int width, const int ksize_h,
                        const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
                        const int stride_w, const int dilation_h, const int dilation_w,
                        const int parallel_imgs, const int deformable_group, cudaStream_t stream);

template <typename scalar_t>
void deform_conv(const scalar_t *input, const scalar_t *weight, const scalar_t *offset,
                 scalar_t *output, void *workspace, int batchSize, int nInputPlane,
                 int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH,
                 int padW, int padH, int dilationW, int dilationH, int group,
                 int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
                 cudaStream_t stream);

#endif  // TRT_DEFORM_CONV_KERNEL_HPP
csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp
0 → 100644
View file @ 546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#include "gather_topk.hpp"
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include "NvInferVersion.h"
#include "gather_topk_kernel.hpp"
#include "trt_serialize.hpp"
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"GatherTopk"};
}  // namespace

GatherTopk::GatherTopk(const std::string &name) : TRTPluginBase(name) {}

GatherTopk::GatherTopk(const std::string name, const void *data, size_t length)
    : TRTPluginBase(name) {}

nvinfer1::IPluginV2DynamicExt *GatherTopk::clone() const TRT_NOEXCEPT {
  GatherTopk *plugin = new GatherTopk(mLayerName);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::DimsExprs GatherTopk::getOutputDimensions(int outputIndex,
                                                    const nvinfer1::DimsExprs *inputs,
                                                    int nbInputs,
                                                    nvinfer1::IExprBuilder &exprBuilder)
    TRT_NOEXCEPT {
  assert(inputs[0].nbDims >= inputs[1].nbDims);
  nvinfer1::DimsExprs ret;
  ret.nbDims = inputs[0].nbDims;
  for (int i = 0; i < inputs[1].nbDims; ++i) {
    ret.d[i] = inputs[1].d[i];
  }
  for (int i = inputs[1].nbDims; i < inputs[0].nbDims; ++i) {
    ret.d[i] = inputs[0].d[i];
  }
  return ret;
}

bool GatherTopk::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
                                           int nbInputs, int nbOutputs) TRT_NOEXCEPT {
  switch (pos) {
    case 0:
      // data
      return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
              ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) ||
             (ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
              ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
    case 1:
      // indices
      return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
             ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
    case 2:
      // output
      return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
    default:
      return true;
  }
  return true;
}

void GatherTopk::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
                                 const nvinfer1::DynamicPluginTensorDesc *outputs,
                                 int nbOutputs) TRT_NOEXCEPT {}

size_t GatherTopk::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                                    const nvinfer1::PluginTensorDesc *outputs,
                                    int nbOutputs) const TRT_NOEXCEPT {
  return 0;
}

int GatherTopk::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
                        const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
                        void *const *outputs, void *workSpace, cudaStream_t stream) TRT_NOEXCEPT {
  const int *dims = &(inputDesc[0].dims.d[0]);
  const int *indices_dims = &(inputDesc[1].dims.d[0]);
  int nbDims = inputDesc[0].dims.nbDims;
  int indice_nbDims = inputDesc[1].dims.nbDims;

  const void *data = inputs[0];
  const void *indices = inputs[1];
  void *output = outputs[0];

  auto data_type = inputDesc[0].type;

  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
      gather_topk_impl<float>((float *)data, (int *)indices, dims, nbDims, indices_dims,
                              indice_nbDims, (float *)output, stream);
      break;
    case nvinfer1::DataType::kINT32:
      gather_topk_impl<int>((int *)data, (int *)indices, dims, nbDims, indices_dims,
                            indice_nbDims, (int *)output, stream);
      break;
    default:
      break;
  }

  return 0;
}

nvinfer1::DataType GatherTopk::getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
                                                 int nbInputs) const TRT_NOEXCEPT {
  return inputTypes[0];
}

// IPluginV2 Methods
const char *GatherTopk::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }

const char *GatherTopk::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }

int GatherTopk::getNbOutputs() const TRT_NOEXCEPT { return 1; }

size_t GatherTopk::getSerializationSize() const TRT_NOEXCEPT { return 0; }

void GatherTopk::serialize(void *buffer) const TRT_NOEXCEPT {}

GatherTopkCreator::GatherTopkCreator() {
  mPluginAttributes.clear();
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *GatherTopkCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }

const char *GatherTopkCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }

nvinfer1::IPluginV2 *GatherTopkCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  auto *plugin = new GatherTopk(name);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::IPluginV2 *GatherTopkCreator::deserializePlugin(const char *name, const void *serialData,
                                                          size_t serialLength) TRT_NOEXCEPT {
  auto plugin = new GatherTopk(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

REGISTER_TENSORRT_PLUGIN(GatherTopkCreator);
}  // namespace mmdeploy
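GatherTopk::getOutputDimensions builds the output shape from the indices' leading dimensions followed by the data's remaining trailing dimensions, so gathering indices of shape [batch, topk] from data of shape [batch, num_boxes, 4] yields [batch, topk, 4]. A host-only sketch of that shape rule, illustrative only.

// Host-only sketch of the GatherTopk output-shape rule.
#include <cassert>
#include <vector>

std::vector<int> gather_topk_out_dims(const std::vector<int> &data,
                                      const std::vector<int> &indices) {
  assert(data.size() >= indices.size());  // mirrors the assert in getOutputDimensions
  std::vector<int> out(data.size());
  for (size_t i = 0; i < indices.size(); ++i) out[i] = indices[i];
  for (size_t i = indices.size(); i < data.size(); ++i) out[i] = data[i];
  return out;
}

int main() {
  auto out = gather_topk_out_dims({2, 1000, 4}, {2, 100});
  assert((out == std::vector<int>{2, 100, 4}));
  return 0;
}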