Commit 546b4279 authored by limm
add csrc and mmdeploy module

// Copyright (c) OpenMMLab. All rights reserved
#include "modulated_deform_conv.h"
#include <cmath>
#include <cstring>
#include <thread>
#include <vector>
#include "modulated_deform_conv/modulated_deform_conv_cpu.h"
#include "ort_utils.h"
namespace mmdeploy {
void parallel_unroll_gemm(const float *A, const float *B, const float *V, const float *H,
const int32_t M, const int32_t N, const int32_t K, const float alpha,
const float beta, float *Y, const int32_t start_row,
const int32_t end_row) {
std::vector<float> tmp(N);
for (int32_t m = start_row; m < end_row; ++m) {
for (int32_t n = 0; n < N; n++) {
tmp[n] = 0;
}
{
int32_t remainder = K % 8; // unroll
for (int32_t k = 0; k < K - remainder; k += 8) {  // unrolled main loop stops before the tail
for (int32_t n = 0; n < N; n++) {
tmp[n] += A[m * K + k] * B[k * N + n];
tmp[n] += A[m * K + k + 1] * B[k * N + N + n];
tmp[n] += A[m * K + k + 2] * B[k * N + 2 * N + n];
tmp[n] += A[m * K + k + 3] * B[k * N + 3 * N + n];
tmp[n] += A[m * K + k + 4] * B[k * N + 4 * N + n];
tmp[n] += A[m * K + k + 5] * B[k * N + 5 * N + n];
tmp[n] += A[m * K + k + 6] * B[k * N + 6 * N + n];
tmp[n] += A[m * K + k + 7] * B[k * N + 7 * N + n];
}
}
for (int32_t k = K - remainder; k < K; k++) {
for (int32_t n = 0; n < N; n++) {
tmp[n] += A[m * K + k] * B[k * N + n];
}
}
}
for (int32_t n = 0; n < N; n++) {
tmp[n] *= alpha;
if (V) tmp[n] += beta * V[n];
if (H) tmp[n] += beta * H[m * N + n];
Y[m * N + n] = tmp[n];
}
}
}
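// Illustrative note: for each row m in [start_row, end_row) the routine above computes
//   Y[m][n] = alpha * sum_k A[m][k] * B[k][n] (+ beta * V[n]) (+ beta * H[m][n]),
// where the V and H terms are applied only when the corresponding pointer is non-null;
// the inner k loop is unrolled by 8 purely as a CPU optimization.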
void deformable_conv2d_ref_fp32(const float *src, const float *offset, const float *mask,
const float *filter, const float *bias, const int64_t batch,
const int64_t src_c, const int64_t src_h, const int64_t src_w,
const int64_t dst_c, const int64_t dst_h, const int64_t dst_w,
const int64_t group, const int64_t offset_group,
const int64_t channels, const int64_t num_output,
const int64_t kernel_h, const int64_t kernel_w,
const int64_t stride_h, const int64_t stride_w, const int64_t pad_h,
const int64_t pad_w, const int64_t dilation_h,
const int64_t dilation_w, float *columns, float *dst) {
const int64_t ic_per_gp = channels / group;
const int64_t oc_per_gp = num_output / group;
// Set up for launching threads
std::size_t num_threads = std::thread::hardware_concurrency();
std::vector<std::thread> threads;
threads.reserve(num_threads);
for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col_2d<float>(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, src_h, src_w, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, ic_per_gp,
offset_group, dst_h, dst_w, mask != nullptr, columns);
float *dst_ptr = dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
if (bias != nullptr) {
const float *bias_ptr = bias + g * oc_per_gp;
for (int64_t oc = 0; oc < oc_per_gp; ++oc) {
for (int64_t hw = 0; hw < dst_h * dst_w; ++hw) {
dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc];
}
}
} else {
memset(dst_ptr, 0, sizeof(float) * oc_per_gp * dst_h * dst_w);
}
if (num_threads > 1) {
// Calculate values to pass to threads
int32_t n_rows = (oc_per_gp + num_threads - 1) / num_threads;
int32_t end_row = 0;
for (int32_t i = 0; i < num_threads; i++) {
auto start_row = i * n_rows;
end_row = start_row + n_rows;
if (end_row > oc_per_gp) end_row = oc_per_gp;
std::thread t(parallel_unroll_gemm,
filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns, nullptr,
dst_ptr, oc_per_gp, dst_h * dst_w, ic_per_gp * kernel_h * kernel_w, 1.0f,
1.0f, dst_ptr, start_row, end_row);
threads.emplace_back(std::move(t));
}
// Wait for all threads to complete
for (auto &t : threads) t.join();
threads.clear();
} else {  // the parallel gemm degrades to a serial gemm with start_row = 0 and end_row = oc_per_gp
parallel_unroll_gemm(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns,
nullptr, dst_ptr, oc_per_gp, dst_h * dst_w,
ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr, 0, oc_per_gp);
}
}
}
}
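// Worked example (illustrative): with oc_per_gp = 10 output channels per group and
// hardware_concurrency() = 4, n_rows = (10 + 4 - 1) / 4 = 3, so the GEMM rows are
// partitioned across threads as [0, 3), [3, 6), [6, 9) and [9, 10).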
MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(const OrtApi &api,
const OrtKernelInfo *info)
: ort_(api), info_(info) {
std::vector<int64_t> stride = ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
stride_height_ = stride[0];
stride_width_ = stride[1];
std::vector<int64_t> padding = ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
padding_height_ = padding[0];
padding_width_ = padding[1];
std::vector<int64_t> dilation =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
dilation_height_ = dilation[0];
dilation_width_ = dilation[1];
deformable_group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext *context) {
const int64_t stride_height = stride_height_;
const int64_t stride_width = stride_width_;
const int64_t padding_height = padding_height_;
const int64_t padding_width = padding_width_;
const int64_t dilation_height = dilation_height_;
const int64_t dilation_width = dilation_width_;
const int64_t deformable_group = deformable_group_;
const int64_t group = group_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
const float *offset_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
const OrtValue *mask = ort_.KernelContext_GetInput(context, 2);
const float *mask_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(mask));
const OrtValue *filter = ort_.KernelContext_GetInput(context, 3);
const float *filter_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
const OrtValue *bias = ort_.KernelContext_GetInput(context, 4);
const float *bias_data = (bias != nullptr)
? reinterpret_cast<const float *>(ort_.GetTensorData<float>(bias))
: nullptr;
// const float *bias_data = nullptr;
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions filter_dims(ort_, filter);
int64_t batch = input_dims[0];
int64_t channels = input_dims[1];
int64_t in_height = input_dims[2];
int64_t in_width = input_dims[3];
int64_t num_output = filter_dims[0];
int64_t kernel_height = filter_dims[2];
int64_t kernel_width = filter_dims[3];
// get output memory
int64_t out_height = floor(
(in_height + 2 * padding_height - dilation_height * (kernel_height - 1) - 1) / stride_height +
1);
int64_t out_width = floor(
(in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) / stride_width + 1);
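// Example (illustrative): in_height = 224, padding = 1, dilation = 1, kernel = 3 and
// stride = 2 give out_height = (224 + 2 - 2 - 1) / 2 + 1 = 112 (integer division).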
std::vector<int64_t> output_dims = {batch, num_output, out_height, out_width};
OrtValue *output =
ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
// allocate tmp memory
int64_t column_len = (channels / group) * kernel_height * kernel_width * out_height * out_width;
float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
deformable_conv2d_ref_fp32(input_data, offset_data, mask_data, filter_data, bias_data, batch,
channels, in_height, in_width, num_output, out_height, out_width,
group, deformable_group, channels, num_output, kernel_height,
kernel_width, stride_height, stride_width, padding_height,
padding_width, dilation_height, dilation_width, columns, out_ptr);
allocator_.Free(columns);
}
REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVModulatedDeformConvOp);
REGISTER_ONNXRUNTIME_OPS(mmcv, MMCVModulatedDeformConvOp);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#define ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#include <onnxruntime_cxx_api.h>
namespace mmdeploy {
struct MMCVModulatedDeformConvKernel {
MMCVModulatedDeformConvKernel(const OrtApi &api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t stride_height_;
int64_t stride_width_;
int64_t padding_height_;
int64_t padding_width_;
int64_t dilation_height_;
int64_t dilation_width_;
int64_t deformable_group_;
int64_t group_;
};
struct MMCVModulatedDeformConvOp
: Ort::CustomOpBase<MMCVModulatedDeformConvOp, MMCVModulatedDeformConvKernel> {
void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const {
return new MMCVModulatedDeformConvKernel(api, info);
}
const char *GetName() const { return "MMCVModulatedDeformConv2d"; };
size_t GetInputTypeCount() const { return 5; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t index) const {
// The last input (index == 4), the bias, is optional
if (index == 4) return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL;
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
}
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const { return "CPUExecutionProvider"; };
};
} // namespace mmdeploy
#endif
// Copyright (c) OpenMMLab. All rights reserved
#include "nms_match.h"
#include <assert.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ort_utils.h"
namespace mmdeploy {
struct Box {
float x1, y1, x2, y2;
};
float nms_match_iou(Box box1, Box box2) {
auto inter_x1 = std::max(box1.x1, box2.x1);
auto inter_y1 = std::max(box1.y1, box2.y1);
auto inter_x2 = std::min(box1.x2, box2.x2);
auto inter_y2 = std::min(box1.y2, box2.y2);
auto eps = 1e-10;
auto w = std::max(static_cast<float>(0), inter_x2 - inter_x1);
auto h = std::max(static_cast<float>(0), inter_y2 - inter_y1);
auto area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
auto area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
auto inter = w * h;
auto ovr = inter / (area1 + area2 - inter + eps);
return ovr;
}
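// Worked example (illustrative): box1 = (0, 0, 2, 2) and box2 = (1, 1, 3, 3) overlap
// on the unit square (1, 1, 2, 2), so inter = 1, area1 = area2 = 4 and the IoU is
// 1 / (4 + 4 - 1) ~= 0.143.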
NMSMatchKernel::NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info)
: ort_(api), info_(info) {
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void NMSMatchKernel::Compute(OrtKernelContext* context) {
const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0);
const float* boxes_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(boxes));
const OrtValue* scores = ort_.KernelContext_GetInput(context, 1);
const float* scores_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(scores));
const OrtValue* iou_threshold_ = ort_.KernelContext_GetInput(context, 2);
const float iou_threshold_data = ort_.GetTensorData<float>(iou_threshold_)[0];
const OrtValue* score_threshold_ = ort_.KernelContext_GetInput(context, 3);
const float score_threshold_data = ort_.GetTensorData<float>(score_threshold_)[0];
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
// loop over batch
int64_t nbatch = boxes_dim[0];
int64_t nboxes = boxes_dim[1];
int64_t nclass = scores_dim[1];
assert(boxes_dim[2] == 4);  // (x1, y1, x2, y2)
// alloc some temp memory
bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes);
std::vector<int64_t> res_order;
for (int64_t k = 0; k < nbatch; k++) {
for (int64_t g = 0; g < nclass; g++) {
for (int64_t i = 0; i < nboxes; i++) {
select[i] = true;
}
// scores: the offset k * nboxes * nclass selects the batch and g * nboxes
// selects the class (e.g. batch = 2, boxes = 3, classes = 4)
std::vector<float> tmp_sc;
// get the class scores
for (int i = 0; i < nboxes; i++) {
tmp_sc.push_back(scores_data[k * nboxes * nclass + g * nboxes + i]);
}
std::vector<int64_t> order(tmp_sc.size());
std::iota(order.begin(), order.end(), 0);
std::sort(order.begin(), order.end(),
[&tmp_sc](int64_t id1, int64_t id2) { return tmp_sc[id1] > tmp_sc[id2]; });
for (int64_t _i = 0; _i < nboxes; _i++) {
auto i = order[_i];
if (select[i] == false) continue;
std::vector<int64_t> v_i;
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
auto j = order[_j];
if (select[j] == false) continue;
Box vbox1, vbox2;
vbox1.x1 = boxes_data[k * nboxes * 4 + i * 4];
vbox1.y1 = boxes_data[k * nboxes * 4 + i * 4 + 1];
vbox1.x2 = boxes_data[k * nboxes * 4 + i * 4 + 2];
vbox1.y2 = boxes_data[k * nboxes * 4 + i * 4 + 3];
vbox2.x1 = boxes_data[k * nboxes * 4 + j * 4];
vbox2.y1 = boxes_data[k * nboxes * 4 + j * 4 + 1];
vbox2.x2 = boxes_data[k * nboxes * 4 + j * 4 + 2];
vbox2.y2 = boxes_data[k * nboxes * 4 + j * 4 + 3];
auto ovr = nms_match_iou(vbox1, vbox2);
if (ovr >= iou_threshold_data) {
select[j] = false;
v_i.push_back(j);
}
}
if (tmp_sc[i] > score_threshold_data && v_i.size() != 0) {
for (int v_i_idx = 0; v_i_idx < v_i.size(); v_i_idx++) {
res_order.push_back(k);
res_order.push_back(g);
res_order.push_back(i);
res_order.push_back(v_i[v_i_idx]);
}
}
}
}
}
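// Note (illustrative): res_order now holds flattened quadruples of
// (batch index, class index, kept box index, matched box index), so the output
// tensor below is shaped (N, 4).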
std::vector<int64_t> inds_dims({(int64_t)res_order.size() / 4, 4});
OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size());
int64_t* res_data = ort_.GetTensorMutableData<int64_t>(res);
memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
allocator_.Free(select);
}
REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSMatchOp);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef ONNXRUNTIME_NMS_MATCH_H
#define ONNXRUNTIME_NMS_MATCH_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
namespace mmdeploy {
struct NMSMatchKernel {
NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info);
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
const OrtKernelInfo* info_;
Ort::AllocatorWithDefaultOptions allocator_;
};
struct NMSMatchOp : Ort::CustomOpBase<NMSMatchOp, NMSMatchKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new NMSMatchKernel(api, info);
}
const char* GetName() const { return "NMSMatch"; }
size_t GetInputTypeCount() const { return 4; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
// force cpu
const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
};
} // namespace mmdeploy
#endif // ONNXRUNTIME_NMS_MATCH_H
// Copyright (c) OpenMMLab. All rights reserved
#include "nms_rotated.h"
#include <assert.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <iterator>
#include <numeric> // std::iota
#include <vector>
#include "ort_utils.h"
namespace mmdeploy {
namespace {
struct RotatedBox {
float x_ctr, y_ctr, w, h, a;
};
struct Point {
float x, y;
Point(const float& px = 0, const float& py = 0) : x(px), y(py) {}
Point operator+(const Point& p) const { return Point(x + p.x, y + p.y); }
Point& operator+=(const Point& p) {
x += p.x;
y += p.y;
return *this;
}
Point operator-(const Point& p) const { return Point(x - p.x, y - p.y); }
Point operator*(const float coeff) const { return Point(x * coeff, y * coeff); }
};
float dot_2d(const Point& A, const Point& B) { return A.x * B.x + A.y * B.y; }
float cross_2d(const Point& A, const Point& B) { return A.x * B.y - B.x * A.y; }
} // namespace
void get_rotated_vertices(const RotatedBox& box, Point (&pts)[4]) {
// MODIFIED: box.a is already given in radians, so the degree-to-radian
// conversion (M_PI / 180. == 0.01745329251) is intentionally skipped.
// double theta = box.a * 0.01745329251;
double theta = box.a;
float cosTheta2 = (float)cos(theta) * 0.5f;
float sinTheta2 = (float)sin(theta) * 0.5f;
// y: top --> down; x: left --> right
pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
pts[2].x = 2 * box.x_ctr - pts[0].x;
pts[2].y = 2 * box.y_ctr - pts[0].y;
pts[3].x = 2 * box.x_ctr - pts[1].x;
pts[3].y = 2 * box.y_ctr - pts[1].y;
}
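// Worked example (illustrative): for box = {x_ctr = 0, y_ctr = 0, w = 2, h = 2, a = 0},
// cosTheta2 = 0.5 and sinTheta2 = 0, so the vertices are (-1, 1), (-1, -1), (1, -1)
// and (1, 1), i.e. the axis-aligned corners of a 2 x 2 square.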
int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4],
Point (&intersections)[24]) {
// Line vector
// A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
Point vec1[4], vec2[4];
for (int i = 0; i < 4; i++) {
vec1[i] = pts1[(i + 1) % 4] - pts1[i];
vec2[i] = pts2[(i + 1) % 4] - pts2[i];
}
// Line test - test all line combos for intersection
int num = 0; // number of intersections
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Solve for 2x2 Ax=b
float det = cross_2d(vec2[j], vec1[i]);
// This takes care of parallel lines
if (fabs(det) <= 1e-14) {
continue;
}
auto vec12 = pts2[j] - pts1[i];
float t1 = cross_2d(vec2[j], vec12) / det;
float t2 = cross_2d(vec1[i], vec12) / det;
if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
intersections[num++] = pts1[i] + vec1[i] * t1;
}
}
}
// Check for vertices of rect1 inside rect2
{
const auto& AB = vec2[0];
const auto& DA = vec2[3];
auto ABdotAB = dot_2d(AB, AB);
auto ADdotAD = dot_2d(DA, DA);
for (int i = 0; i < 4; i++) {
// assume ABCD is the rectangle, and P is the point to be judged
// P is inside ABCD iff. P's projection on AB lies within AB
// and P's projection on AD lies within AD
auto AP = pts1[i] - pts2[0];
auto APdotAB = dot_2d(AP, AB);
auto APdotAD = -dot_2d(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
intersections[num++] = pts1[i];
}
}
}
// Reverse the check - check for vertices of rect2 inside rect1
{
const auto& AB = vec1[0];
const auto& DA = vec1[3];
auto ABdotAB = dot_2d(AB, AB);
auto ADdotAD = dot_2d(DA, DA);
for (int i = 0; i < 4; i++) {
auto AP = pts2[i] - pts1[0];
auto APdotAB = dot_2d(AP, AB);
auto APdotAD = -dot_2d(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
intersections[num++] = pts2[i];
}
}
}
return num;
}
int convex_hull_graham(const Point (&p)[24], const int& num_in, Point (&q)[24],
bool shift_to_zero = false) {
assert(num_in >= 2);
// Step 1:
// Find point with minimum y
// if more than 1 points have the same minimum y,
// pick the one with the minimum x.
int t = 0;
for (int i = 1; i < num_in; i++) {
if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
t = i;
}
}
auto& start = p[t]; // starting point
// Step 2:
// Subtract starting point from every points (for sorting in the next step)
for (int i = 0; i < num_in; i++) {
q[i] = p[i] - start;
}
// Swap the starting point to position 0
auto tmp = q[0];
q[0] = q[t];
q[t] = tmp;
// Step 3:
// Sort point 1 ~ num_in according to their relative cross-product values
// (essentially sorting according to angles)
// If the angles are the same, sort according to their distance to origin
float dist[24];
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d(q[i], q[i]);
}
// CPU version
std::sort(q + 1, q + num_in, [](const Point& A, const Point& B) -> bool {
float temp = cross_2d(A, B);
if (fabs(temp) < 1e-6) {
return dot_2d(A, A) < dot_2d(B, B);
} else {
return temp > 0;
}
});
// compute distance to origin after sort, since the points are now different.
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d(q[i], q[i]);
}
// Step 4:
// Make sure there are at least 2 points (that don't overlap with each other)
// in the stack
int k; // index of the non-overlapped second point
for (k = 1; k < num_in; k++) {
if (dist[k] > 1e-8) {
break;
}
}
if (k == num_in) {
// We reach the end, which means the convex hull is just one point
q[0] = p[t];
return 1;
}
q[1] = q[k];
int m = 2; // 2 points in the stack
// Step 5:
// Finally we can start the scanning process.
// When a non-convex relationship between the 3 points is found
// (either concave shape or duplicated points),
// we pop the previous point from the stack
// until the 3-point relationship is convex again, or
// until the stack only contains two points
for (int i = k + 1; i < num_in; i++) {
while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
m--;
}
q[m++] = q[i];
}
// Step 6 (Optional):
// In general sense we need the original coordinates, so we
// need to shift the points back (reverting Step 2)
// But if we're only interested in getting the area/perimeter of the shape
// We can simply return.
if (!shift_to_zero) {
for (int i = 0; i < m; i++) {
q[i] += start;
}
}
return m;
}
float polygon_area(const Point (&q)[24], const int& m) {
if (m <= 2) {
return 0;
}
float area = 0;
for (int i = 1; i < m - 1; i++) {
area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
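// Worked example (illustrative): for the unit square q = (0, 0), (1, 0), (1, 1), (0, 1)
// with m = 4, the two fan triangles each contribute |cross| = 1, so the function
// returns (1 + 1) / 2 = 1.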
float rotated_boxes_intersection(const RotatedBox& box1, const RotatedBox& box2) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point intersectPts[24], orderedPts[24];
Point pts1[4];
Point pts2[4];
get_rotated_vertices(box1, pts1);
get_rotated_vertices(box2, pts2);
int num = get_intersection_points(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true);
return polygon_area(orderedPts, num_convex);
}
NMSRotatedKernel::NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info)
: ort_(api), info_(info) {
iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
score_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "score_threshold");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void NMSRotatedKernel::Compute(OrtKernelContext* context) {
const float iou_threshold = iou_threshold_;
const float score_threshold = score_threshold_;
const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0);
const float* boxes_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(boxes));
const OrtValue* scores = ort_.KernelContext_GetInput(context, 1);
const float* scores_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(scores));
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
// loop over batch
int64_t nbatch = boxes_dim[0];
int64_t nboxes = boxes_dim[1];
int64_t nclass = scores_dim[1];
assert(boxes_dim[2] == 5); //(cx,cy,w,h,theta)
// allocate tmp memory
float* tmp_boxes = (float*)allocator_.Alloc(sizeof(float) * nbatch * nboxes * 5);
float* sc = (float*)allocator_.Alloc(sizeof(float) * nbatch * nclass * nboxes);
bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes);
memcpy(tmp_boxes, boxes_data, sizeof(float) * nbatch * nboxes * 5);
memcpy(sc, scores_data, sizeof(float) * nbatch * nclass * nboxes);
// std::vector<std::vector<int64_t>> res_order;
std::vector<int64_t> res_order;
for (int64_t k = 0; k < nbatch; k++) {
for (int64_t g = 0; g < nclass; g++) {
for (int64_t i = 0; i < nboxes; i++) {
select[i] = true;
}
// sort scores
std::vector<float> tmp_sc;
for (int i = 0; i < nboxes; i++) {
tmp_sc.push_back(sc[k * nboxes * nclass + g * nboxes + i]);
}
std::vector<int64_t> order(tmp_sc.size());
std::iota(order.begin(), order.end(), 0);
std::sort(order.begin(), order.end(),
[&tmp_sc](int64_t id1, int64_t id2) { return tmp_sc[id1] > tmp_sc[id2]; });
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
RotatedBox box1, box2;
auto center_shift_x =
(tmp_boxes[k * nboxes * 5 + i * 5] + tmp_boxes[k * nboxes * 5 + j * 5]) / 2.0;
auto center_shift_y =
(tmp_boxes[k * nboxes * 5 + i * 5 + 1] + tmp_boxes[k * nboxes * 5 + j * 5 + 1]) / 2.0;
box1.x_ctr = tmp_boxes[k * nboxes * 5 + i * 5] - center_shift_x;
box1.y_ctr = tmp_boxes[k * nboxes * 5 + i * 5 + 1] - center_shift_y;
box1.w = tmp_boxes[k * nboxes * 5 + i * 5 + 2];
box1.h = tmp_boxes[k * nboxes * 5 + i * 5 + 3];
box1.a = tmp_boxes[k * nboxes * 5 + i * 5 + 4];
box2.x_ctr = tmp_boxes[k * nboxes * 5 + j * 5] - center_shift_x;
box2.y_ctr = tmp_boxes[k * nboxes * 5 + j * 5 + 1] - center_shift_y;
box2.w = tmp_boxes[k * nboxes * 5 + j * 5 + 2];
box2.h = tmp_boxes[k * nboxes * 5 + j * 5 + 3];
box2.a = tmp_boxes[k * nboxes * 5 + j * 5 + 4];
auto area1 = box1.w * box1.h;
auto area2 = box2.w * box2.h;
auto intersection = rotated_boxes_intersection(box1, box2);
float baseS = 1.0;
baseS = (area1 + area2 - intersection);
auto ovr = intersection / baseS;
if (ovr > iou_threshold) select[_j] = false;
}
}
for (int i = 0; i < nboxes; i++) {
if (select[i] && (tmp_sc[order[i]] > score_threshold)) {
res_order.push_back(k);
res_order.push_back(g);
res_order.push_back(order[i]);
}
}
} // class loop
} // batch loop
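// Note (illustrative): res_order now holds flattened (batch index, class index,
// box index) triples for the kept boxes, so the output tensor below is shaped (N, 3).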
std::vector<int64_t> inds_dims({(int64_t)res_order.size() / 3, 3});
OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size());
int64_t* res_data = ort_.GetTensorMutableData<int64_t>(res);
memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
allocator_.Free(tmp_boxes);
allocator_.Free(sc);
allocator_.Free(select);
}
REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSRotatedOp);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef ONNXRUNTIME_NMS_ROTATED_H
#define ONNXRUNTIME_NMS_ROTATED_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
namespace mmdeploy {
struct NMSRotatedKernel {
NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info);
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
const OrtKernelInfo* info_;
Ort::AllocatorWithDefaultOptions allocator_;
float iou_threshold_;
float score_threshold_;
};
struct NMSRotatedOp : Ort::CustomOpBase<NMSRotatedOp, NMSRotatedKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new NMSRotatedKernel(api, info);
}
const char* GetName() const { return "NMSRotated"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
// force cpu
const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
};
} // namespace mmdeploy
#endif // ONNXRUNTIME_NMS_ROTATED_H
// Copyright (c) OpenMMLab. All rights reserved.
#include "onnxruntime_register.h"
#include "ort_utils.h"
const char *c_MMDeployOpDomain = "mmdeploy";
OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, const OrtApiBase *api) {
const OrtApi *kOrtApi = api->GetApi(ORT_API_VERSION);
OrtStatus *status = nullptr;
for (auto &_op_list_pair : mmdeploy::get_mmdeploy_custom_ops()) {
OrtCustomOpDomain *domain = nullptr;
if (auto status = kOrtApi->CreateCustomOpDomain(_op_list_pair.first.c_str(), &domain)) {
return status;
}
auto &_op_list = _op_list_pair.second;
for (auto &_op : _op_list) {
if (auto status = kOrtApi->CustomOpDomain_Add(domain, _op)) {
return status;
}
}
// TODO: figure out what is returned if this fails.
status = kOrtApi->AddCustomOpDomain(options, domain);
}
return status;
}
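// Example (illustrative sketch): one way to load these custom ops into an ONNX Runtime
// session after building the library; the model path below is hypothetical.
//
//   Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "mmdeploy");
//   Ort::SessionOptions session_options;
//   Ort::ThrowOnError(RegisterCustomOps(session_options, OrtGetApiBase()));
//   Ort::Session session(env, "end2end.onnx", session_options);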
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
#include "roi_align_rotated.h"
#include "ort_utils.h"
namespace mmdeploy {
// implementation taken from Caffe2
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
float w1;
float w2;
float w3;
float w4;
};
void pre_calc_for_bilinear_interpolate(const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper,
const int ix_upper, float roi_start_h, float roi_start_w,
float bin_size_h, float bin_size_w, int roi_bin_grid_h,
int roi_bin_grid_w, float roi_center_h, float roi_center_w,
float cos_theta, float sin_theta,
std::vector<PreCalc> &pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const float yy = roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const float xx =
roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w / static_cast<float>(roi_bin_grid_w);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
float y = yy * cos_theta - xx * sin_theta + roi_center_h;
float x = yy * sin_theta + xx * cos_theta + roi_center_w;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (float)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (float)x_low;
} else {
x_high = x_low + 1;
}
float ly = y - y_low;
float lx = x - x_low;
float hy = 1. - ly, hx = 1. - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
void ROIAlignRotatedForwardCPU(const int nthreads, const float *input, const float *rois,
float *output, const float &spatial_scale, const int aligned,
const int clockwise, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int sampling_ratio) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const float *current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
float offset = aligned ? (float)0.5 : (float)0.0;
float roi_center_w = current_roi[1] * spatial_scale - offset;
float roi_center_h = current_roi[2] * spatial_scale - offset;
float roi_width = current_roi[3] * spatial_scale;
float roi_height = current_roi[4] * spatial_scale;
// float theta = current_roi[5] * M_PI / 180.0;
float theta = current_roi[5]; // Radian angle by default
if (clockwise) {
theta = -theta;
}
float cos_theta = cos(theta);
float sin_theta = sin(theta);
if (!aligned) { // for backward-compatibility only
roi_width = std::max(roi_width, (float)1.);
roi_height = std::max(roi_height, (float)1.);
}
float bin_size_h = static_cast<float>(roi_height) / static_cast<float>(pooled_height);
float bin_size_w = static_cast<float>(roi_width) / static_cast<float>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
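// Example (illustrative): with sampling_ratio = 2 every pooled bin is sampled on a
// 2 x 2 grid of bilinear lookups whose values are averaged below (count = 4).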
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
float roi_start_h = -roi_height / 2.0;
float roi_start_w = -roi_width / 2.0;
pre_calc_for_bilinear_interpolate(height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h,
bin_size_w, roi_bin_grid_h, roi_bin_grid_w, roi_center_h,
roi_center_w, cos_theta, sin_theta, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const float *offset_input = input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
float output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] + pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
const float *X_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
const float *rois = reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_rois));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_X);
OrtTensorDimensions roi_dimensions(ort_, input_rois);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
out_dimensions.data()[0] = roi_dimensions.data()[0];
out_dimensions.data()[2] = aligned_height_;
out_dimensions.data()[3] = aligned_width_;
OrtValue *output =
ort_.KernelContext_GetOutput(context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// TODO: forward here
int output_size = out_dimensions.data()[0];
for (auto i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_, aligned_, clockwise_,
input_channels, input_height, input_width, aligned_height_,
aligned_width_, sampling_ratio_);
}
REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVRoIAlignRotatedCustomOp);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#define ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
namespace mmdeploy {
struct MMCVRoIAlignRotatedKernel {
public:
MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info) : ort_(ort) {
aligned_height_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
sampling_ratio_ = ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
clockwise_ = ort_.KernelInfoGetAttribute<int64_t>(info, "clockwise");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int aligned_height_;
int aligned_width_;
float spatial_scale_;
int sampling_ratio_;
int aligned_;
int clockwise_;
};
struct MMCVRoIAlignRotatedCustomOp
: Ort::CustomOpBase<MMCVRoIAlignRotatedCustomOp, MMCVRoIAlignRotatedKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRoIAlignRotatedKernel(api, info);
}
const char* GetName() const { return "MMCVRoIAlignRotated"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
};
} // namespace mmdeploy
#endif // ONNXRUNTIME_ROI_ALIGN_ROTATED_H
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_tensorrt_ops)
include(${CMAKE_SOURCE_DIR}/cmake/tensorrt.cmake)
# cub
if (NOT DEFINED CUB_ROOT_DIR)
if (CUDA_VERSION VERSION_LESS 11.0)
set(CUB_ROOT_DIR "${CMAKE_SOURCE_DIR}/third_party/cub")
endif ()
endif ()
file(GLOB_RECURSE BACKEND_OPS_SRCS *.cpp *.cu)
add_library(${PROJECT_NAME}_obj OBJECT "${BACKEND_OPS_SRCS}")
set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE 1)
target_compile_definitions(${PROJECT_NAME}_obj
PRIVATE -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=1)
target_include_directories(${PROJECT_NAME}_obj
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common)
target_include_directories(${PROJECT_NAME}_obj
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common)
target_include_directories(${PROJECT_NAME}_obj
PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/include)
target_include_directories(${PROJECT_NAME}_obj PRIVATE ${TENSORRT_INCLUDE_DIR})
target_include_directories(${PROJECT_NAME}_obj PRIVATE ${CUDNN_DIR}/include)
target_include_directories(${PROJECT_NAME}_obj PRIVATE ${CUB_ROOT_DIR})
target_link_libraries(${PROJECT_NAME}_obj
PUBLIC ${TENSORRT_LIBS} cublas cudnn)
mmdeploy_export(${PROJECT_NAME}_obj)
# Build the module library, which is used to convert ONNX models to TensorRT engines.
mmdeploy_add_module(${PROJECT_NAME} MODULE EXCLUDE "")
target_link_libraries(${PROJECT_NAME} PRIVATE ${PROJECT_NAME}_obj)
add_library(mmdeploy::tensorrt_ops ALIAS ${PROJECT_NAME})
set(_TRT_OPS_DIR ${CMAKE_SOURCE_DIR}/mmdeploy/lib)
install(TARGETS ${PROJECT_NAME} DESTINATION ${_TRT_OPS_DIR})
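# Example configuration (illustrative sketch; the paths are hypothetical and TENSORRT_DIR /
# CUDNN_DIR are assumed to be the variables consumed by cmake/tensorrt.cmake above):
#   cmake -DTENSORRT_DIR=/path/to/TensorRT -DCUDNN_DIR=/path/to/cudnn ..
#   cmake --build . --target mmdeploy_tensorrt_ops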
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modified from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include "trt_batched_nms.hpp"
#include <cstring>
#include "nms/batched_nms_kernel.hpp"
#include "nms/kernel.h"
#include "trt_serialize.hpp"
namespace mmdeploy {
using namespace nvinfer1;
using nvinfer1::plugin::NMSParameters;
namespace {
static const char* NMS_PLUGIN_VERSION{"1"};
static const char* NMS_PLUGIN_NAME{"TRTBatchedNMS"};
} // namespace
TRTBatchedNMS::TRTBatchedNMS(const std::string& name, NMSParameters params, bool returnIndex)
: TRTPluginBase(name), param(params), mReturnIndex(returnIndex) {}
TRTBatchedNMS::TRTBatchedNMS(const std::string& name, const void* data, size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &param);
deserialize_value(&data, &length, &mClipBoxes);
deserialize_value(&data, &length, &mReturnIndex);
}
int TRTBatchedNMS::getNbOutputs() const TRT_NOEXCEPT {
int num = mReturnIndex ? 3 : 2;
return num;
}
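// Output layout (see getOutputDimensions below): output 0 holds the detections with
// shape (batch, keepTopK, 5), output 1 the labels with shape (batch, keepTopK), and,
// when return_index is enabled, output 2 the kept box indices with shape (batch, keepTopK).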
nvinfer1::DimsExprs TRTBatchedNMS::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
ASSERT(nbInputs == 2);
ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs());
ASSERT(inputs[0].nbDims == 4);
ASSERT(inputs[1].nbDims == 3);
nvinfer1::DimsExprs ret;
ret.d[0] = inputs[0].d[0];
ret.d[1] = exprBuilder.constant(param.keepTopK);
switch (outputIndex) {
case 0:
ret.nbDims = 3;
ret.d[2] = exprBuilder.constant(5);
break;
case 1:
ret.nbDims = 2;
break;
case 2:
ret.nbDims = 2;
break;
default:
break;
}
return ret;
}
size_t TRTBatchedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT {
size_t batch_size = inputs[0].dims.d[0];
size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3];
size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2];
size_t num_priors = inputs[0].dims.d[1];
bool shareLocation = (inputs[0].dims.d[2] == 1);
int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1];
return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size,
param.numClasses, num_priors, topk, DataType::kFLOAT,
DataType::kFLOAT);
}
int TRTBatchedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
void* const* outputs, void* workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
const void* const locData = inputs[0];
const void* const confData = inputs[1];
void* nmsedDets = outputs[0];
void* nmsedLabels = outputs[1];
void* nmsedIndex = mReturnIndex ? outputs[2] : nullptr;
size_t batch_size = inputDesc[0].dims.d[0];
size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3];
size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2];
size_t num_priors = inputDesc[0].dims.d[1];
bool shareLocation = (inputDesc[0].dims.d[2] == 1);
int topk =
param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1];
bool rotated = false;
pluginStatus_t status = nmsInference(
stream, batch_size, boxes_size, score_size, shareLocation, param.backgroundLabelId,
num_priors, param.numClasses, topk, param.keepTopK, param.scoreThreshold, param.iouThreshold,
DataType::kFLOAT, locData, DataType::kFLOAT, confData, nmsedDets, nmsedLabels, nmsedIndex,
workSpace, param.isNormalized, false, mClipBoxes, rotated);
ASSERT(status == STATUS_SUCCESS);
return 0;
}
size_t TRTBatchedNMS::getSerializationSize() const TRT_NOEXCEPT {
// NMSParameters + mClipBoxes + mReturnIndex
return sizeof(NMSParameters) + sizeof(mClipBoxes) + sizeof(mReturnIndex);
}
void TRTBatchedNMS::serialize(void* buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, param);
serialize_value(&buffer, mClipBoxes);
serialize_value(&buffer, mReturnIndex);
}
void TRTBatchedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT {
// Validate input arguments
}
bool TRTBatchedNMS::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 3 || pos == 4) {
return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
const char* TRTBatchedNMS::getPluginType() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }
const char* TRTBatchedNMS::getPluginVersion() const TRT_NOEXCEPT { return NMS_PLUGIN_VERSION; }
IPluginV2DynamicExt* TRTBatchedNMS::clone() const TRT_NOEXCEPT {
auto* plugin = new TRTBatchedNMS(mLayerName, param, mReturnIndex);
plugin->setPluginNamespace(mNamespace.c_str());
plugin->setClipParam(mClipBoxes);
return plugin;
}
nvinfer1::DataType TRTBatchedNMS::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT {
ASSERT(index >= 0 && index < this->getNbOutputs());
if (index == 1 || index == 2) {
return nvinfer1::DataType::kINT32;
}
return inputTypes[0];
}
void TRTBatchedNMS::setClipParam(bool clip) { mClipBoxes = clip; }
TRTBatchedNMSCreator::TRTBatchedNMSCreator() {
mPluginAttributes.emplace_back(
PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(
PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
mPluginAttributes.emplace_back(
PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("return_index", nullptr, PluginFieldType::kINT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char* TRTBatchedNMSCreator::getPluginName() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }
const char* TRTBatchedNMSCreator::getPluginVersion() const TRT_NOEXCEPT {
return NMS_PLUGIN_VERSION;
}
IPluginV2Ext* TRTBatchedNMSCreator::createPlugin(const char* name,
const PluginFieldCollection* fc) TRT_NOEXCEPT {
const PluginField* fields = fc->fields;
bool clipBoxes = true;
bool returnIndex = false;
nvinfer1::plugin::NMSParameters params{};
for (int i = 0; i < fc->nbFields; ++i) {
const char* attrName = fields[i].name;
if (!strcmp(attrName, "background_label_id")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.backgroundLabelId = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "num_classes")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.numClasses = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "topk")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.topK = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "keep_topk")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.keepTopK = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "score_threshold")) {
ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
params.scoreThreshold = *(static_cast<const float*>(fields[i].data));
} else if (!strcmp(attrName, "iou_threshold")) {
ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
params.iouThreshold = *(static_cast<const float*>(fields[i].data));
} else if (!strcmp(attrName, "is_normalized")) {
params.isNormalized = *(static_cast<const bool*>(fields[i].data));
} else if (!strcmp(attrName, "clip_boxes")) {
clipBoxes = *(static_cast<const bool*>(fields[i].data));
} else if (!strcmp(attrName, "return_index")) {
returnIndex = *(static_cast<const bool*>(fields[i].data));
}
}
TRTBatchedNMS* plugin = new TRTBatchedNMS(name, params, returnIndex);
plugin->setClipParam(clipBoxes);
plugin->setPluginNamespace(mNamespace.c_str());
return plugin;
}
IPluginV2Ext* TRTBatchedNMSCreator::deserializePlugin(const char* name, const void* serialData,
size_t serialLength) TRT_NOEXCEPT {
// This object will be deleted when the network is destroyed, which will
// call NMS::destroy()
TRTBatchedNMS* plugin = new TRTBatchedNMS(name, serialData, serialLength);
plugin->setPluginNamespace(mNamespace.c_str());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSCreator);
} // namespace mmdeploy
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modified from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#ifndef TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
#define TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
#include <string>
#include <vector>
#include "NvInferPluginUtils.h"
#include "trt_plugin_base.hpp"
namespace mmdeploy {
enum NMSReturnType { RETURN_DETS = 1, RETURN_INDEX = 1 << 1 };
class TRTBatchedNMS : public TRTPluginBase {
public:
TRTBatchedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param, bool returnIndex);
TRTBatchedNMS(const std::string& name, const void* data, size_t length);
~TRTBatchedNMS() TRT_NOEXCEPT override = default;
int getNbOutputs() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
void* const* outputs, void* workSpace, cudaStream_t stream) TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void* buffer) const TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
const char* getPluginType() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType,
int nbInputs) const TRT_NOEXCEPT override;
void setClipParam(bool clip);
private:
nvinfer1::plugin::NMSParameters param{};
bool mClipBoxes{};
bool mReturnIndex{};
};
class TRTBatchedNMSCreator : public TRTPluginCreatorBase {
public:
TRTBatchedNMSCreator();
~TRTBatchedNMSCreator() TRT_NOEXCEPT override = default;
const char* getPluginName() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2Ext* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_batched_rotated_nms.hpp"
#include <cstring>
#include "nms/batched_nms_kernel.hpp"
#include "nms/kernel.h"
#include "trt_serialize.hpp"
namespace mmdeploy {
using namespace nvinfer1;
using nvinfer1::plugin::NMSParameters;
namespace {
static const char* NMS_PLUGIN_VERSION{"1"};
static const char* NMS_PLUGIN_NAME{"TRTBatchedRotatedNMS"};
} // namespace
TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, NMSParameters params)
: TRTPluginBase(name), param(params) {}
TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &param);
deserialize_value(&data, &length, &mClipBoxes);
}
int TRTBatchedRotatedNMS::getNbOutputs() const TRT_NOEXCEPT { return 2; }
nvinfer1::DimsExprs TRTBatchedRotatedNMS::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
ASSERT(nbInputs == 2);
ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs());
ASSERT(inputs[0].nbDims == 4);
ASSERT(inputs[1].nbDims == 3);
nvinfer1::DimsExprs ret;
ret.d[0] = inputs[0].d[0];
ret.d[1] = exprBuilder.constant(param.keepTopK);
switch (outputIndex) {
case 0:
ret.nbDims = 3;
ret.d[2] = exprBuilder.constant(6);
break;
case 1:
ret.nbDims = 2;
break;
default:
break;
}
return ret;
}
size_t TRTBatchedRotatedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT {
size_t batch_size = inputs[0].dims.d[0];
size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3];
size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2];
size_t num_priors = inputs[0].dims.d[1];
bool shareLocation = (inputs[0].dims.d[2] == 1);
int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1];
return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size,
param.numClasses, num_priors, topk, DataType::kFLOAT,
DataType::kFLOAT);
}
int TRTBatchedRotatedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
const void* const locData = inputs[0];
const void* const confData = inputs[1];
void* nmsedDets = outputs[0];
void* nmsedLabels = outputs[1];
size_t batch_size = inputDesc[0].dims.d[0];
size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3];
size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2];
size_t num_priors = inputDesc[0].dims.d[1];
bool shareLocation = (inputDesc[0].dims.d[2] == 1);
int topk =
param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1];
bool rotated = true;
pluginStatus_t status = nmsInference(
stream, batch_size, boxes_size, score_size, shareLocation, param.backgroundLabelId,
num_priors, param.numClasses, topk, param.keepTopK, param.scoreThreshold, param.iouThreshold,
DataType::kFLOAT, locData, DataType::kFLOAT, confData, nmsedDets, nmsedLabels, nullptr,
workSpace, param.isNormalized, false, mClipBoxes, rotated);
ASSERT(status == STATUS_SUCCESS);
return 0;
}
size_t TRTBatchedRotatedNMS::getSerializationSize() const TRT_NOEXCEPT {
// NMSParameters + mClipBoxes
return sizeof(NMSParameters) + sizeof(bool);
}
void TRTBatchedRotatedNMS::serialize(void* buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, param);
serialize_value(&buffer, mClipBoxes);
}
void TRTBatchedRotatedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT {
// Validate input arguments
}
bool TRTBatchedRotatedNMS::supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 3) {
return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
const char* TRTBatchedRotatedNMS::getPluginType() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }
const char* TRTBatchedRotatedNMS::getPluginVersion() const TRT_NOEXCEPT {
return NMS_PLUGIN_VERSION;
}
IPluginV2DynamicExt* TRTBatchedRotatedNMS::clone() const TRT_NOEXCEPT {
auto* plugin = new TRTBatchedRotatedNMS(mLayerName, param);
plugin->setPluginNamespace(mNamespace.c_str());
plugin->setClipParam(mClipBoxes);
return plugin;
}
nvinfer1::DataType TRTBatchedRotatedNMS::getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT {
ASSERT(index >= 0 && index < this->getNbOutputs());
if (index == 1) {
return nvinfer1::DataType::kINT32;
}
return inputTypes[0];
}
void TRTBatchedRotatedNMS::setClipParam(bool clip) { mClipBoxes = clip; }
TRTBatchedRotatedNMSCreator::TRTBatchedRotatedNMSCreator() {
mPluginAttributes.emplace_back(
PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(
PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
mPluginAttributes.emplace_back(
PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char* TRTBatchedRotatedNMSCreator::getPluginName() const TRT_NOEXCEPT {
return NMS_PLUGIN_NAME;
}
const char* TRTBatchedRotatedNMSCreator::getPluginVersion() const TRT_NOEXCEPT {
return NMS_PLUGIN_VERSION;
}
IPluginV2Ext* TRTBatchedRotatedNMSCreator::createPlugin(
const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
const PluginField* fields = fc->fields;
bool clipBoxes = true;
nvinfer1::plugin::NMSParameters params{};
for (int i = 0; i < fc->nbFields; ++i) {
const char* attrName = fields[i].name;
if (!strcmp(attrName, "background_label_id")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.backgroundLabelId = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "num_classes")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.numClasses = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "topk")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.topK = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "keep_topk")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.keepTopK = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "score_threshold")) {
ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
params.scoreThreshold = *(static_cast<const float*>(fields[i].data));
} else if (!strcmp(attrName, "iou_threshold")) {
ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
params.iouThreshold = *(static_cast<const float*>(fields[i].data));
} else if (!strcmp(attrName, "is_normalized")) {
params.isNormalized = *(static_cast<const bool*>(fields[i].data));
} else if (!strcmp(attrName, "clip_boxes")) {
clipBoxes = *(static_cast<const bool*>(fields[i].data));
}
}
TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, params);
plugin->setClipParam(clipBoxes);
plugin->setPluginNamespace(mNamespace.c_str());
return plugin;
}
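// Illustrative sketch (not part of this file) of the field layout the parser
// above expects; every variable name below is hypothetical:
//
//   int numClasses = 15, topK = 1000, keepTopK = 100;
//   float scoreThr = 0.05f, iouThr = 0.5f;
//   std::vector<nvinfer1::PluginField> f{
//       {"num_classes", &numClasses, nvinfer1::PluginFieldType::kINT32, 1},
//       {"topk", &topK, nvinfer1::PluginFieldType::kINT32, 1},
//       {"keep_topk", &keepTopK, nvinfer1::PluginFieldType::kINT32, 1},
//       {"score_threshold", &scoreThr, nvinfer1::PluginFieldType::kFLOAT32, 1},
//       {"iou_threshold", &iouThr, nvinfer1::PluginFieldType::kFLOAT32, 1}};
//   nvinfer1::PluginFieldCollection fc{static_cast<int>(f.size()), f.data()};
//   IPluginV2Ext* nms = TRTBatchedRotatedNMSCreator().createPlugin("rotated_nms", &fc);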
IPluginV2Ext* TRTBatchedRotatedNMSCreator::deserializePlugin(const char* name,
const void* serialData,
size_t serialLength) TRT_NOEXCEPT {
// This object will be deleted when the network is destroyed, which will
// call NMS::destroy()
TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, serialData, serialLength);
plugin->setPluginNamespace(mNamespace.c_str());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(TRTBatchedRotatedNMSCreator);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_BATCHED_ROTATED_NMS_HPP
#define TRT_BATCHED_ROTATED_NMS_HPP
#include <string>
#include <vector>
#include "NvInferPluginUtils.h"
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class TRTBatchedRotatedNMS : public TRTPluginBase {
public:
TRTBatchedRotatedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param);
TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length);
~TRTBatchedRotatedNMS() TRT_NOEXCEPT override = default;
int getNbOutputs() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
void* const* outputs, void* workSpace, cudaStream_t stream) TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void* buffer) const TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
const char* getPluginType() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType,
int nbInputs) const TRT_NOEXCEPT override;
void setClipParam(bool clip);
private:
nvinfer1::plugin::NMSParameters param{};
bool mClipBoxes{};
};
class TRTBatchedRotatedNMSCreator : public TRTPluginCreatorBase {
public:
TRTBatchedRotatedNMSCreator();
~TRTBatchedRotatedNMSCreator() TRT_NOEXCEPT override = default;
const char* getPluginName() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2Ext* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_bicubic_interpolate.hpp"
#include <assert.h>
#include <chrono>
#include "trt_bicubic_interpolate_kernel.hpp"
#include "trt_plugin_helper.hpp"
#include "trt_serialize.hpp"
using namespace nvinfer1;
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"TRTBicubicInterpolate"};
} // namespace
TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string &name,
std::vector<float> scale_factor, bool align_corners)
: TRTPluginBase(name), mScaleFactor(scale_factor), mAlignCorners(align_corners) {}
TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string &name, const void *data,
                                             size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mScaleFactor);
deserialize_value(&data, &length, &mAlignCorners);
}
nvinfer1::IPluginV2DynamicExt *TRTBicubicInterpolate::clone() const TRT_NOEXCEPT {
TRTBicubicInterpolate *plugin =
new TRTBicubicInterpolate(mLayerName, mScaleFactor, mAlignCorners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs TRTBicubicInterpolate::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[0].d[1];
auto height = exprBuilder.constant(mScaleFactor[0]);
auto width = exprBuilder.constant(mScaleFactor[1]);
auto d2 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *height);
auto d3 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[3], *width);
ret.d[2] = d2;
ret.d[3] = d3;
return ret;
}
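// Example (illustrative): an input of shape [1, 3, 32, 48] with
// mScaleFactor = {2, 2} yields output dims [1, 3, 64, 96]. Note that
// IExprBuilder::constant() takes an int32_t, so fractional scale factors are
// truncated when the output expression is built.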
bool TRTBicubicInterpolate::supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc *ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 0) {
return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
}
}
void TRTBicubicInterpolate::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs,
int nbOutputs) TRT_NOEXCEPT {}
size_t TRTBicubicInterpolate::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT {
return 0;
}
int TRTBicubicInterpolate::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
int batch = inputDesc[0].dims.d[0];
int channels = inputDesc[0].dims.d[1];
int height = inputDesc[0].dims.d[2];
int width = inputDesc[0].dims.d[3];
int height_out = outputDesc[0].dims.d[2];
int width_out = outputDesc[0].dims.d[3];
const void *x = inputs[0];
void *output = outputs[0];
// TODO: add fp16 support
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
bicubic_interpolate<float>((float *)x, (float *)output, batch, channels, height, width,
height_out, width_out, mAlignCorners, stream);
break;
    default:
      // Only float32 is currently supported; see the TODO above for fp16.
      return 1;
}
return 0;
}
nvinfer1::DataType TRTBicubicInterpolate::getOutputDataType(int index,
const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *TRTBicubicInterpolate::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTBicubicInterpolate::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
int TRTBicubicInterpolate::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t TRTBicubicInterpolate::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mScaleFactor) + serialized_size(mAlignCorners);
}
void TRTBicubicInterpolate::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mScaleFactor);
serialize_value(&buffer, mAlignCorners);
}
////////////////////// creator /////////////////////////////
TRTBicubicInterpolateCreator::TRTBicubicInterpolateCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("scale_factor"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *TRTBicubicInterpolateCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTBicubicInterpolateCreator::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
nvinfer1::IPluginV2 *TRTBicubicInterpolateCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  std::vector<float> scale_factor;
  bool align_corners = true;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("scale_factor") == 0) {
      // The field length may be reported either as an element count or as a
      // byte count; normalize it to the number of float elements.
      int data_size = fc->fields[i].length;
      if (data_size != 2) {
        data_size = data_size / sizeof(float);
      }
      ASSERT(data_size == 2);
const float *data_start = static_cast<const float *>(fc->fields[i].data);
scale_factor = std::vector<float>(data_start, data_start + data_size);
}
if (field_name.compare("align_corners") == 0) {
align_corners = static_cast<const int *>(fc->fields[i].data)[0];
}
}
TRTBicubicInterpolate *plugin = new TRTBicubicInterpolate(name, scale_factor, align_corners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
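// Illustrative sketch of the fields parsed above (names are hypothetical):
//
//   float scale[2] = {2.f, 2.f};
//   int align = 0;
//   nvinfer1::PluginField f[] = {
//       {"scale_factor", scale, nvinfer1::PluginFieldType::kFLOAT32, 2},
//       {"align_corners", &align, nvinfer1::PluginFieldType::kINT32, 1}};
//   nvinfer1::PluginFieldCollection fc{2, f};
//   // createPlugin("bicubic_resize", &fc) then returns a plugin that doubles
//   // the spatial dimensions with align_corners disabled.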
nvinfer1::IPluginV2 *TRTBicubicInterpolateCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
auto plugin = new TRTBicubicInterpolate(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(TRTBicubicInterpolateCreator);
} // namespace mmdeploy
#ifndef TRT_BICUBIC_INTERPOLATE_HPP
#define TRT_BICUBIC_INTERPOLATE_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class TRTBicubicInterpolate : public TRTPluginBase {
public:
TRTBicubicInterpolate(const std::string &name, std::vector<float> scale_factor,
bool align_corners);
  TRTBicubicInterpolate(const std::string &name, const void *data, size_t length);
TRTBicubicInterpolate() = delete;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
std::vector<float> mScaleFactor;
bool mAlignCorners;
};
class TRTBicubicInterpolateCreator : public TRTPluginCreatorBase {
public:
TRTBicubicInterpolateCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_BICUBIC_INTERPOLATE_HPP
// Modified from
// https://github.com/pytorch/pytorch/blob/6adbe044e39c8e8db158d91e151aa6dead6e9aa4/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu
#include <cuda_fp16.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "trt_bicubic_interpolate_kernel.hpp"
// Based on
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
template <typename scalar_t>
__device__ __forceinline__ static scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
return ((A + 2) * x - (A + 3)) * x * x + 1;
}
template <typename scalar_t>
__device__ __forceinline__ static scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
}
template <typename scalar_t>
__device__ __forceinline__ static void get_cubic_upsample_coefficients(scalar_t coeffs[4],
scalar_t t) {
scalar_t A = -0.75;
scalar_t x1 = t;
coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
// opposite coefficients
scalar_t x2 = 1.0 - t;
coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
}
template <typename scalar_t>
__device__ __forceinline__ static scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2,
scalar_t x3, scalar_t t) {
scalar_t coeffs[4];
get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
}
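// The two helpers above implement the cubic convolution kernel referenced at
// the top of this file with A = -0.75. For any t in [0, 1] the four
// coefficients sum to 1; for example, at t = 0 they evaluate to {0, 1, 0, 0},
// so cubic_interp1d() simply returns x1, the sample lying exactly on the
// source grid.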
/* Used by UpSampleBicubic2d.cu */
template <typename scalar_t>
__device__ __forceinline__ static scalar_t upsample_get_value_bounded(const scalar_t *data,
int batch, int channel,
int batchsize, int channels,
int height, int width, int y,
int x) {
int access_y = max(min(y, height - 1), 0);
int access_x = max(min(x, width - 1), 0);
return data[batch * channels * height * width + channel * height * width + access_y * width +
access_x];
}
template <typename scalar_t>
__device__ __forceinline__ scalar_t
area_pixel_compute_source_index(scalar_t scale, int64_t dst_index, bool align_corners, bool cubic) {
if (align_corners) {
return scale * dst_index;
} else {
scalar_t src_idx = scale * (dst_index + 0.5) - 0.5;
// [Note] Follow Opencv resize logic:
// We allow negative src_idx here and later will use
// dx = src_idx - floorf(src_idx)
// to compute the "distance"(which affects weights).
// For linear modes, weight distribution doesn't matter
// for negative indices as they use 2 pixels to interpolate.
// For example, [-1, 0], they both use pixel 0 value so it
// doesn't affect if we bound the src_idx to 0 or not.
// TODO: Our current linear mode impls use unbound indices
// where we should and then remove this cubic flag.
// This matters in cubic mode, as we might need [-1, 0, 1, 2]
// to interpolate and the weights can be affected.
return (!cubic && src_idx < 0) ? scalar_t(0) : src_idx;
}
}
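// Worked example: upsampling width 4 -> 8 gives scale = 0.5 when
// align_corners is false, so dst_index 0 maps to 0.5 * 0.5 - 0.5 = -0.25
// (kept negative because cubic is true) and dst_index 7 maps to 3.25.
// With align_corners true the scale is (4 - 1) / (8 - 1) = 3/7, so
// dst_index 7 maps exactly onto the last source column, 3.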
// cubic interpolation pytorch
template <typename scalar_t>
__global__ void resize_cubic_kernel_torch(const int num_elements, const scalar_t *src,
const int batchsize, const int channels, int srcWidth,
int srcHeight, scalar_t *dst, int dstWidth, int dstHeight,
bool align_corners, float height_scale,
float width_scale) {
CUDA_1D_KERNEL_LOOP(index, num_elements) {
    const int output_x = index % dstWidth;
    const int output_y = index / dstWidth;
    // Special case: input and output are the same size, just copy
    if (srcHeight == dstHeight && srcWidth == dstWidth) {
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; c++) {
const scalar_t val = src[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth +
output_y * dstWidth + output_x];
dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + output_y * dstWidth +
output_x] = val;
}
}
return;
}
// Interpolation kernel
scalar_t real_x =
area_pixel_compute_source_index(width_scale, output_x, align_corners, /*cubic=*/true);
int in_x = floorf(real_x);
scalar_t t_x = real_x - in_x;
scalar_t real_y =
area_pixel_compute_source_index(height_scale, output_y, align_corners, /*cubic=*/true);
int in_y = floorf(real_y);
scalar_t t_y = real_y - in_y;
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; c++) {
scalar_t coefficients[4];
for (int k = 0; k < 4; k++) {
coefficients[k] = cubic_interp1d<scalar_t>(
upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
in_y - 1 + k, in_x - 1),
upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
in_y - 1 + k, in_x + 0),
upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
in_y - 1 + k, in_x + 1),
upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
in_y - 1 + k, in_x + 2),
t_x);
}
dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + output_y * dstWidth +
output_x] = scalar_t(cubic_interp1d(coefficients[0], coefficients[1], coefficients[2],
coefficients[3], t_y));
}
}
}
}
template <typename scalar_t>
void resizeGPU(const scalar_t *pIn_d, scalar_t *pOut_d, int batch, int channels, int srcWidth,
int srcHeight, int dstWidth, int dstHeight, bool align_corners,
cudaStream_t stream) {
float height_scale = float(srcHeight) / dstHeight;
float width_scale = float(srcWidth) / dstWidth;
if (align_corners && dstWidth > 1 && dstHeight > 1) {
height_scale = (float)(srcHeight - 1) / (dstHeight - 1);
width_scale = (float)(srcWidth - 1) / (dstWidth - 1);
}
  // Each thread handles one output (y, x) position and iterates over batch and
  // channels internally, so the kernel is given dstWidth * dstHeight work
  // items; the grid itself is sized from the full element count.
  int n = batch * dstWidth * dstHeight * channels;
  resize_cubic_kernel_torch<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(
      dstWidth * dstHeight, pIn_d, batch, channels, srcWidth, srcHeight, pOut_d, dstWidth,
      dstHeight, align_corners, height_scale, width_scale);
}
template <typename scalar_t>
void bicubic_interpolate(const scalar_t *input, scalar_t *output, int batch, int channels,
int in_height, int in_width, int out_height, int out_width,
bool align_corners, cudaStream_t stream) {
resizeGPU(input, output, batch, channels, in_width, in_height, out_width, out_height,
align_corners, stream);
}
template void bicubic_interpolate<float>(const float *input, float *output, int batch, int channels,
int in_height, int in_width, int out_height, int out_width,
bool align_corners, cudaStream_t stream);
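// Minimal host-side sketch (illustrative; assumes the buffers below are
// allocated and filled by the caller) of exercising the explicit float
// instantiation above:
//
//   const int n = 1, c = 3, h = 32, w = 32, oh = 64, ow = 64;
//   float *d_in = nullptr, *d_out = nullptr;
//   cudaMalloc(&d_in, sizeof(float) * n * c * h * w);
//   cudaMalloc(&d_out, sizeof(float) * n * c * oh * ow);
//   // ... copy the input image into d_in ...
//   bicubic_interpolate<float>(d_in, d_out, n, c, h, w, oh, ow,
//                              /*align_corners=*/false, /*stream=*/nullptr);
//   cudaStreamSynchronize(nullptr);
//   cudaFree(d_in);
//   cudaFree(d_out);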
#ifndef TRT_BICUBIC_INTERPOLATE_KERNEL_HPP
#define TRT_BICUBIC_INTERPOLATE_KERNEL_HPP
#include <cuda_runtime.h>
#include "common_cuda_helper.hpp"
template <typename scalar_t>
void bicubic_interpolate(const scalar_t *input, scalar_t *output, int batch, int channels,
int in_height, int in_width, int out_height, int out_width,
bool align_corners, cudaStream_t stream);
#endif // TRT_BICUBIC_INTERPOLATE_KERNEL_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef COMMON_CUDA_HELPER
#define COMMON_CUDA_HELPER
#include <cublas_v2.h>
#include <cuda.h>
#include <stdio.h>
#include <algorithm>
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define THREADS_PER_BLOCK 512
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
inline int GET_BLOCKS(const int N) {
int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK);
int max_block_num = 4096;
return std::min(optimal_block_num, max_block_num);
}
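// Example: N = 1000 with THREADS_PER_BLOCK = 512 gives DIVUP(1000, 512) = 2
// blocks; very large N is capped at 4096 blocks and the grid-stride loop in
// CUDA_1D_KERNEL_LOOP covers the remaining elements.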
#define cudaCheckError() \
{ \
cudaError_t e = cudaGetLastError(); \
if (e != cudaSuccess) { \
printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
exit(0); \
} \
}
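// Illustrative usage of the helpers above (the kernel name is hypothetical):
//
//   template <typename scalar_t>
//   __global__ void scale_kernel(int n, const scalar_t* in, scalar_t* out, scalar_t k) {
//     CUDA_1D_KERNEL_LOOP(i, n) { out[i] = k * in[i]; }  // grid-stride loop
//   }
//   ...
//   scale_kernel<float><<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(n, d_in, d_out, 2.f);
//   cudaCheckError();  // report any launch or asynchronous error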
/**
 * Copies the source tensor into dst with its dimensions permuted.
*
* @param[out] dst pointer to the destination tensor
* @param[in] src pointer to the source tensor
* @param[in] src_size shape of the src tensor
* @param[in] permute The desired ordering of dimensions
* @param[in] src_dim dim of src tensor
* @param[in] stream cuda stream handle
*/
template <class scalar_t>
void memcpyPermute(scalar_t* dst, const scalar_t* src, int* src_size, int* permute, int src_dim,
cudaStream_t stream = 0);
template <typename scalar_t>
cublasStatus_t cublasGemmWrap(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k, const scalar_t* alpha,
const scalar_t* A, int lda, const scalar_t* B, int ldb,
const scalar_t* beta, scalar_t* C, int ldc);
template <typename scalar_t>
__device__ __forceinline__ scalar_t bilinear_interpolate(const scalar_t* __restrict__ input,
const int height, const int width,
scalar_t y, scalar_t x) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
y = min(scalar_t(height - 1), max(scalar_t(0), y));
x = min(scalar_t(width - 1), max(scalar_t(0), x));
const int y_low = floor(y);
const int x_low = floor(x);
const int y_high = ceil(y);
const int x_high = ceil(x);
const scalar_t v1 = input[y_low * width + x_low];
const scalar_t v2 = input[y_low * width + x_high];
const scalar_t v3 = input[y_high * width + x_low];
const scalar_t v4 = input[y_high * width + x_high];
// lerp can be performed by fma
const scalar_t ly = y - y_low;
const scalar_t lx = x - x_low;
const scalar_t v_low = fma(v2 - v1, lx, v1);
const scalar_t v_high = fma(v4 - v3, lx, v3);
const scalar_t val = fma(v_high - v_low, ly, v_low);
return val;
}
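// Worked example: with corner values v1 = 0, v2 = 2, v3 = 4, v4 = 6 and a
// query point at ly = 0.25, lx = 0.5, the two horizontal lerps give
// v_low = 1 and v_high = 5, and the final value is 1 + (5 - 1) * 0.25 = 2.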
#endif // COMMON_CUDA_HELPER
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#ifndef TRT_BATCHED_NMS_KERNEL_HPP
#define TRT_BATCHED_NMS_KERNEL_HPP
#include "cuda_runtime_api.h"
#include "kernel.h"
pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int perBatchBoxesSize,
const int perBatchScoresSize, const bool shareLocation,
const int backgroundLabelId, const int numPredsPerClass,
const int numClasses, const int topK, const int keepTopK,
const float scoreThreshold, const float iouThreshold,
const DataType DT_BBOX, const void* locData, const DataType DT_SCORE,
const void* confData, void* nmsedDets, void* nmsedLabels,
void* nmsedIndex, void* workspace, bool isNormalized, bool confSigmoid,
bool clipBoxes, bool rotated = false);
#endif