Commit 546b4279 authored by limm

add csrc and mmdeploy module

parent 502f4fb9
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GATHER_TOPK_HPP
#define TRT_GATHER_TOPK_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class GatherTopk : public TRTPluginBase {
public:
GatherTopk(const std::string &name);
GatherTopk(const std::string name, const void *data, size_t length);
GatherTopk() = delete;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
};
class GatherTopkCreator : public TRTPluginCreatorBase {
public:
GatherTopkCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif  // TRT_GATHER_TOPK_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#include <functional>
#include <numeric>
#include <vector>
#include "common_cuda_helper.hpp"
#include "gather_topk_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
__global__ void gather_topk_kernel(const scalar_t* input, const int* indices, scalar_t* output,
int batch, int num_input, int num_indices, int channel) {
CUDA_1D_KERNEL_LOOP(index, batch * num_indices * channel) {
const int b_id = index / (num_indices * channel);
const int n_id = (index / channel) % num_indices;
const int c_id = index % channel;
const int input_n_id = indices[b_id * num_indices + n_id];
const scalar_t value = input[b_id * num_input * channel + input_n_id * channel + c_id];
output[b_id * num_indices * channel + n_id * channel + c_id] = value;
}
}
template <typename scalar_t>
void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims,
const int* indices_dims, int indice_nbDims, scalar_t* output,
cudaStream_t stream) {
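// Editor's note: `dims` is the input shape and `indices_dims` the indices
// shape. The leading indice_nbDims - 1 axes form the batch, axis
// indice_nbDims - 1 is the gathered axis (num_input -> num_indices), and any
// trailing input axes are flattened into `channel`.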
int batch = 1;
for (int i = 0; i < indice_nbDims - 1; ++i) batch *= dims[i];
int num_input = dims[indice_nbDims - 1];
int num_indices = indices_dims[indice_nbDims - 1];
int channel = 1;
for (int i = indice_nbDims; i < nbDims; ++i) channel *= dims[i];
const int col_block = DIVUP(batch * num_indices * channel, THREADS_PER_BLOCK);
gather_topk_kernel<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(input, indices, output, batch,
num_input, num_indices, channel);
}
template void gather_topk_impl<float>(const float* input, const int* indices, const int* dims,
int nbDims, const int* indices_dims, int indice_nbDims,
float* output, cudaStream_t stream);
template void gather_topk_impl<int32_t>(const int32_t* input, const int* indices, const int* dims,
int nbDims, const int* indices_dims, int indice_nbDims,
int32_t* output, cudaStream_t stream);
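// Editor's note: a minimal host-side usage sketch of gather_topk_impl, not
// part of the original commit; buffer names and sizes are illustrative and the
// device buffers are assumed to be allocated and filled elsewhere. Gathering
// input of shape [B, N, C] with indices of shape [B, K] yields output of
// shape [B, K, C].
static void gather_topk_example(const float* d_input, const int* d_indices,
                                float* d_output, cudaStream_t stream) {
  const int dims[3] = {2, 100, 4};      // input shape   [B = 2, N = 100, C = 4]
  const int indices_dims[2] = {2, 10};  // indices shape [B = 2, K = 10]
  gather_topk_impl<float>(d_input, d_indices, dims, /*nbDims=*/3, indices_dims,
                          /*indice_nbDims=*/2, d_output, stream);
}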
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GATHER_TOPK_KERNEL_HPP
#define TRT_GATHER_TOPK_KERNEL_HPP
#include <cuda_runtime.h>
template <typename scalar_t>
void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims,
const int* indices_dims, int indice_nbDims, scalar_t* output,
cudaStream_t stream);
#endif  // TRT_GATHER_TOPK_KERNEL_HPP
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_grid_priors.hpp"
#include <assert.h>
#include <chrono>
#include "trt_grid_priors_kernel.hpp"
#include "trt_serialize.hpp"
using namespace nvinfer1;
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"GridPriorsTRT"};
} // namespace
GridPriorsTRT::GridPriorsTRT(const std::string &name, const nvinfer1::Dims stride)
: TRTPluginBase(name), mStride(stride) {}
GridPriorsTRT::GridPriorsTRT(const std::string name, const void *data, size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mStride);
}
GridPriorsTRT::~GridPriorsTRT() {}
nvinfer1::IPluginV2DynamicExt *GridPriorsTRT::clone() const TRT_NOEXCEPT {
GridPriorsTRT *plugin = new GridPriorsTRT(mLayerName, mStride);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs GridPriorsTRT::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
// input[0] == base_anchor
// input[1] == empty_h
// input[2] == empty_w
nvinfer1::DimsExprs ret;
ret.nbDims = 2;
auto area =
exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs[2].d[0], *inputs[1].d[0]);
ret.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *area, *(inputs[0].d[0]));
ret.d[1] = exprBuilder.constant(4);
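// e.g. feat_h = 2, feat_w = 3, num_base_anchors = 3 -> output shape (18, 4)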
return ret;
}
bool GridPriorsTRT::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 0) {
return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else if (pos - nbInputs == 0) {
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
} else {
return true;
}
}
int GridPriorsTRT::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
int num_base_anchors = inputDesc[0].dims.d[0];
int feat_h = inputDesc[1].dims.d[0];
int feat_w = inputDesc[2].dims.d[0];
const void *base_anchor = inputs[0];
void *output = outputs[0];
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
trt_grid_priors_impl<float>((float *)base_anchor, (float *)output, num_base_anchors, feat_w,
feat_h, mStride.d[0], mStride.d[1], stream);
break;
default:
return 1;
}
return 0;
}
nvinfer1::DataType GridPriorsTRT::getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *GridPriorsTRT::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *GridPriorsTRT::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
int GridPriorsTRT::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t GridPriorsTRT::getSerializationSize() const TRT_NOEXCEPT { return serialized_size(mStride); }
void GridPriorsTRT::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mStride);
}
////////////////////// creator /////////////////////////////
GridPriorsTRTCreator::GridPriorsTRTCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_h"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_w"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *GridPriorsTRTCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *GridPriorsTRTCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
nvinfer1::IPluginV2 *GridPriorsTRTCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
int stride_w = 1;
int stride_h = 1;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("stride_w") == 0) {
stride_w = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("stride_h") == 0) {
stride_h = static_cast<const int *>(fc->fields[i].data)[0];
}
}
nvinfer1::Dims stride{2, {stride_w, stride_h}};
GridPriorsTRT *plugin = new GridPriorsTRT(name, stride);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *GridPriorsTRTCreator::deserializePlugin(const char *name,
const void *serialData,
size_t serialLength) TRT_NOEXCEPT {
auto plugin = new GridPriorsTRT(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(GridPriorsTRTCreator);
} // namespace mmdeploy
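// Editor's note: a minimal sketch of building this plugin through its creator,
// not part of the original commit. The field names must match those parsed in
// createPlugin above; the stride values are illustrative only.
static nvinfer1::IPluginV2 *grid_priors_plugin_example() {
  int stride_w = 16, stride_h = 16;
  const nvinfer1::PluginField fields[] = {
      {"stride_w", &stride_w, nvinfer1::PluginFieldType::kINT32, 1},
      {"stride_h", &stride_h, nvinfer1::PluginFieldType::kINT32, 1}};
  const nvinfer1::PluginFieldCollection fc{2, fields};
  mmdeploy::GridPriorsTRTCreator creator;
  return creator.createPlugin("grid_priors", &fc);
}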
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GRID_PRIORS_HPP
#define TRT_GRID_PRIORS_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class GridPriorsTRT : public TRTPluginBase {
public:
GridPriorsTRT(const std::string &name, const nvinfer1::Dims stride);
GridPriorsTRT(const std::string name, const void *data, size_t length);
GridPriorsTRT() = delete;
~GridPriorsTRT() TRT_NOEXCEPT override;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
nvinfer1::Dims mStride;
cublasHandle_t m_cublas_handle;
};
class GridPriorsTRTCreator : public TRTPluginCreatorBase {
public:
GridPriorsTRTCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_GRID_PRIORS_HPP
// Copyright (c) OpenMMLab. All rights reserved
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
#include "trt_grid_priors_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
__global__ void trt_grid_priors_kernel(const scalar_t* base_anchor, scalar_t* output,
int num_base_anchors, int feat_w, int feat_h, int stride_w,
int stride_h) {
// load base anchor into shared memory.
extern __shared__ scalar_t shared_base_anchor[];
for (int i = threadIdx.x; i < num_base_anchors * 4; i += blockDim.x) {
shared_base_anchor[i] = base_anchor[i];
}
__syncthreads();
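// Editor's note: `index` enumerates (h, w, anchor) with the anchor axis
// fastest: index = (h * feat_w + w) * num_base_anchors + a, so each thread
// shifts one base anchor by (w * stride_w, h * stride_h).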
CUDA_1D_KERNEL_LOOP(index, num_base_anchors * feat_w * feat_h) {
const int a_offset = (index % num_base_anchors) << 2;
const scalar_t w = scalar_t(((index / num_base_anchors) % feat_w) * stride_w);
const scalar_t h = scalar_t((index / (feat_w * num_base_anchors)) * stride_h);
auto out_start = output + index * 4;
out_start[0] = shared_base_anchor[a_offset] + w;
out_start[1] = shared_base_anchor[a_offset + 1] + h;
out_start[2] = shared_base_anchor[a_offset + 2] + w;
out_start[3] = shared_base_anchor[a_offset + 3] + h;
}
}
template <typename scalar_t>
void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors,
int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream) {
trt_grid_priors_kernel<<<GET_BLOCKS(num_base_anchors * feat_w * feat_h), THREADS_PER_BLOCK,
DIVUP(num_base_anchors * 4, 32) * 32 * sizeof(scalar_t), stream>>>(
base_anchor, output, (int)num_base_anchors, (int)feat_w, (int)feat_h, (int)stride_w,
(int)stride_h);
}
template void trt_grid_priors_impl<float>(const float* base_anchor, float* output,
int num_base_anchors, int feat_w, int feat_h,
int stride_w, int stride_h, cudaStream_t stream);
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_GRID_PRIORS_KERNEL_HPP
#define TRT_GRID_PRIORS_KERNEL_HPP
#include <cuda_runtime.h>
template <typename scalar_t>
void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors,
int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream);
#endif
// Copyright (c) OpenMMLab. All rights reserved.
#include "trt_grid_sampler.hpp"
#include <assert.h>
#include <chrono>
#include "trt_grid_sampler_kernel.hpp"
#include "trt_plugin_helper.hpp"
#include "trt_serialize.hpp"
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"grid_sampler"};
} // namespace
TRTGridSampler::TRTGridSampler(const std::string &name, int mode, int paddingMode,
bool alignCorners)
: TRTPluginBase(name), mMode(mode), mPaddingMode(paddingMode), mAlignCorners(alignCorners) {}
TRTGridSampler::TRTGridSampler(const std::string name, const void *data, size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mMode);
deserialize_value(&data, &length, &mPaddingMode);
deserialize_value(&data, &length, &mAlignCorners);
}
nvinfer1::IPluginV2DynamicExt *TRTGridSampler::clone() const TRT_NOEXCEPT {
TRTGridSampler *plugin = new TRTGridSampler(mLayerName, mMode, mPaddingMode, mAlignCorners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs TRTGridSampler::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
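// input[0] is the input tensor (N, C, H_in, W_in) or (N, C, D_in, H_in, W_in);
// input[1] is the sampling grid (N, H_out, W_out, 2) or
// (N, D_out, H_out, W_out, 3). The output keeps N and C from the input and
// takes its spatial dims from the grid.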
nvinfer1::DimsExprs ret;
ret.nbDims = inputs[0].nbDims;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[0].d[1];
for (int i = 2; i < ret.nbDims; ++i) {
ret.d[i] = inputs[1].d[i - 1];
}
return ret;
}
bool TRTGridSampler::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 0) {
return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
}
}
void TRTGridSampler::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs,
int nbOutputs) TRT_NOEXCEPT {
// Validate input arguments
}
size_t TRTGridSampler::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT {
return 0;
}
int TRTGridSampler::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
nvinfer1::Dims input_dims = inputDesc[0].dims;
nvinfer1::Dims grid_dims = inputDesc[1].dims;
nvinfer1::Dims output_dims = outputDesc[0].dims;
GridSamplerInterpolation interp_mode = GridSamplerInterpolation::Bilinear;
switch (mMode) {
case 0:
interp_mode = GridSamplerInterpolation::Bilinear;
break;
case 1:
interp_mode = GridSamplerInterpolation::Nearest;
break;
default:
break;
}
GridSamplerPadding padding_mode = GridSamplerPadding::Zeros;
switch (mPaddingMode) {
case 0:
padding_mode = GridSamplerPadding::Zeros;
break;
case 1:
padding_mode = GridSamplerPadding::Border;
break;
case 2:
padding_mode = GridSamplerPadding::Reflection;
break;
default:
break;
}
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
grid_sample<float>((float *)outputs[0], (float *)inputs[0], (float *)inputs[1],
&(output_dims.d[0]), &(input_dims.d[0]), &(grid_dims.d[0]),
input_dims.nbDims, interp_mode, padding_mode, mAlignCorners, stream);
break;
default:
return 1;
break;
}
return 0;
}
nvinfer1::DataType TRTGridSampler::getOutputDataType(int index,
const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *TRTGridSampler::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTGridSampler::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
int TRTGridSampler::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t TRTGridSampler::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mMode) + serialized_size(mPaddingMode) + serialized_size(mAlignCorners);
}
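// The write order below must match the read order in the deserializing
// constructor above: mMode, mPaddingMode, mAlignCorners.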
void TRTGridSampler::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mMode);
serialize_value(&buffer, mPaddingMode);
serialize_value(&buffer, mAlignCorners);
}
////////////////////// creator /////////////////////////////
TRTGridSamplerCreator::TRTGridSamplerCreator() {
mPluginAttributes = std::vector<nvinfer1::PluginField>(
{nvinfer1::PluginField("interpolation_mode"), nvinfer1::PluginField("padding_mode"),
nvinfer1::PluginField("align_corners")});
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *TRTGridSamplerCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTGridSamplerCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
nvinfer1::IPluginV2 *TRTGridSamplerCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
int mode = 0;
int paddingMode = 0;
bool alignCorners = false;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("interpolation_mode") == 0) {
mode = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("padding_mode") == 0) {
paddingMode = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("align_corners") == 0) {
alignCorners = (bool)(static_cast<const int *>(fc->fields[i].data)[0]);
}
}
TRTGridSampler *plugin = new TRTGridSampler(name, mode, paddingMode, alignCorners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *TRTGridSamplerCreator::deserializePlugin(const char *name,
const void *serialData,
size_t serialLength) TRT_NOEXCEPT {
// This object will be deleted when the network is destroyed, which will
// call TRTGridSampler::destroy()
auto plugin = new TRTGridSampler(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(TRTGridSamplerCreator);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GRID_SAMPLER_HPP
#define TRT_GRID_SAMPLER_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class TRTGridSampler : public TRTPluginBase {
public:
TRTGridSampler(const std::string &name, int mode, int paddingMode, bool alignCorners);
TRTGridSampler(const std::string name, const void *data, size_t length);
TRTGridSampler() = delete;
~TRTGridSampler() TRT_NOEXCEPT override = default;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
int mMode;
int mPaddingMode;
bool mAlignCorners;
};
class TRTGridSamplerCreator : public TRTPluginCreatorBase {
public:
TRTGridSamplerCreator();
~TRTGridSamplerCreator() TRT_NOEXCEPT override = default;
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_GRID_SAMPLER_HPP
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cuh
// and
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cu
#include <cuda_fp16.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "trt_grid_sampler_kernel.hpp"
#include "trt_plugin_helper.hpp"
using mmdeploy::TensorDesc;
// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
// if align_corners: -1 and +1 get sent to the centers of the corner pixels
// -1 --> 0
// +1 --> (size - 1)
// scale_factor = (size - 1) / 2
// if not align_corners: -1 and +1 get sent to the image edges
// -1 --> -0.5
// +1 --> (size - 1) + 0.5 == size - 0.5
// scale_factor = size / 2
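// Worked example (editor's note): with size = 4 and align_corners = true,
// coord -1 -> 0, 0 -> 1.5, +1 -> 3; with align_corners = false,
// coord -1 -> -0.5, 0 -> 1.5, +1 -> 3.5.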
template <typename scalar_t>
static __forceinline__ __device__ scalar_t grid_sampler_unnormalize(scalar_t coord, int size,
bool align_corners) {
if (align_corners) {
// unnormalize coord from [-1, 1] to [0, size - 1]
return ((coord + 1.f) / 2) * (size - 1);
} else {
// unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
return ((coord + 1.f) * size - 1) / 2;
}
}
// Clips coordinates to between 0 and clip_limit - 1
template <typename scalar_t>
static __forceinline__ __device__ scalar_t clip_coordinates(scalar_t in, int clip_limit) {
return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0)));
}
// Reflects coordinates until they fall between low and high (inclusive).
// The bounds are passed as twice their value so that half-integer values
// can be represented as ints.
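// Example (editor's note): for size = 5 with align_corners, twice_low = 0 and
// twice_high = 2 * (size - 1) = 8, so span = 4; an out-of-range coordinate 5.5
// gives extra = 1.5 with one flip, reflecting to 4 - 1.5 = 2.5.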
template <typename scalar_t>
static __forceinline__ __device__ scalar_t reflect_coordinates(scalar_t in, int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<scalar_t>(0);
}
scalar_t min = static_cast<scalar_t>(twice_low) / 2;
scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
in = ::fabs(in - min);
// `fmod` returns same sign as `in`, which is positive after the `fabs` above.
scalar_t extra = ::fmod(in, span);
int flips = static_cast<int>(::floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename scalar_t>
static __forceinline__ __device__ scalar_t safe_downgrade_to_int_range(scalar_t x) {
// -100.0 does not have special meaning. This is just to make sure
// it's not within_bounds_2d or within_bounds_3d, and does not cause
// undefined behavior. See #35506.
if (x > INT_MAX - 1 || x < INT_MIN || !::isfinite(static_cast<double>(x)))
return static_cast<scalar_t>(-100.0);
return x;
}
// Computes the pixel source index value for a grid coordinate
template <typename scalar_t>
static __forceinline__ __device__ scalar_t grid_sampler_compute_source_index(
scalar_t coord, int size, GridSamplerPadding padding_mode, bool align_corners) {
coord = grid_sampler_unnormalize(coord, size, align_corners);
if (padding_mode == GridSamplerPadding::Border) {
// clip coordinates to image borders
coord = clip_coordinates(coord, size);
} else if (padding_mode == GridSamplerPadding::Reflection) {
// reflect coordinates by image borders
if (align_corners) {
coord = reflect_coordinates(coord, 0, 2 * (size - 1));
} else {
coord = reflect_coordinates(coord, -1, 2 * size - 1);
}
// clip coordinates to image borders
coord = clip_coordinates(coord, size);
}
coord = safe_downgrade_to_int_range(coord);
return coord;
}
static __forceinline__ __device__ bool within_bounds_2d(int h, int w, int H, int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
static __forceinline__ __device__ bool within_bounds_3d(int d, int h, int w, int D, int H, int W) {
return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
}
template <typename scalar_t>
__global__ void grid_sampler_2d_kernel(const int nthreads, const scalar_t *input,
const scalar_t *grid, scalar_t *output,
TensorDesc input_desc, TensorDesc grid_desc,
TensorDesc output_desc,
const GridSamplerInterpolation interpolation_mode,
const GridSamplerPadding padding_mode, bool align_corners) {
int C = input_desc.shape[1];
int inp_H = input_desc.shape[2];
int inp_W = input_desc.shape[3];
int out_H = grid_desc.shape[1];
int out_W = grid_desc.shape[2];
int inp_sN = input_desc.stride[0];
int inp_sC = input_desc.stride[1];
int inp_sH = input_desc.stride[2];
int inp_sW = input_desc.stride[3];
int grid_sN = grid_desc.stride[0];
int grid_sH = grid_desc.stride[1];
int grid_sW = grid_desc.stride[2];
int grid_sCoor = grid_desc.stride[3];
int out_sN = output_desc.stride[0];
int out_sC = output_desc.stride[1];
int out_sH = output_desc.stride[2];
int out_sW = output_desc.stride[3];
CUDA_1D_KERNEL_LOOP(index, nthreads) {
const int w = index % out_W;
const int h = (index / out_W) % out_H;
const int n = index / (out_H * out_W);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y coordinates from grid
scalar_t ix = grid[grid_offset];
scalar_t iy = grid[grid_offset + grid_sCoor];
ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners);
iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get NE, NW, SE, SW pixel values from (x, y)
int ix_nw = static_cast<int>(::floor(ix));
int iy_nw = static_cast<int>(::floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
// get surfaces to each neighbor:
scalar_t nw = (ix_se - ix) * (iy_se - iy);
scalar_t ne = (ix - ix_sw) * (iy_sw - iy);
scalar_t sw = (ix_ne - ix) * (iy - iy_ne);
scalar_t se = (ix - ix_nw) * (iy - iy_nw);
// calculate bilinear weighted pixel value and set output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
*out_ptr_NCHW = static_cast<scalar_t>(0);
if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int ix_nearest = static_cast<int>(::round(ix));
int iy_nearest = static_cast<int>(::round(iy));
// assign nearest neighbor pixel value to output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
*out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<scalar_t>(0);
}
}
}
}
}
template <typename scalar_t>
__global__ void grid_sampler_3d_kernel(const int nthreads, const scalar_t *input,
const scalar_t *grid, scalar_t *output,
TensorDesc input_desc, TensorDesc grid_desc,
TensorDesc output_desc,
const GridSamplerInterpolation interpolation_mode,
const GridSamplerPadding padding_mode, bool align_corners) {
int C = input_desc.shape[1];
int inp_D = input_desc.shape[2];
int inp_H = input_desc.shape[3];
int inp_W = input_desc.shape[4];
int out_D = grid_desc.shape[1];
int out_H = grid_desc.shape[2];
int out_W = grid_desc.shape[3];
int inp_sN = input_desc.stride[0];
int inp_sC = input_desc.stride[1];
int inp_sD = input_desc.stride[2];
int inp_sH = input_desc.stride[3];
int inp_sW = input_desc.stride[4];
int grid_sN = grid_desc.stride[0];
int grid_sD = grid_desc.stride[1];
int grid_sH = grid_desc.stride[2];
int grid_sW = grid_desc.stride[3];
int grid_sCoor = grid_desc.stride[4];
int out_sN = output_desc.stride[0];
int out_sC = output_desc.stride[1];
int out_sD = output_desc.stride[2];
int out_sH = output_desc.stride[3];
int out_sW = output_desc.stride[4];
CUDA_1D_KERNEL_LOOP(index, nthreads) {
const int w = index % out_W;
const int h = (index / out_W) % out_H;
const int d = (index / (out_H * out_W)) % out_D;
const int n = index / (out_D * out_H * out_W);
const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z coordinates from grid
scalar_t ix = grid[grid_offset];
scalar_t iy = grid[grid_offset + grid_sCoor];
scalar_t iz = grid[grid_offset + 2 * grid_sCoor];
ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners);
iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners);
iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get corner pixel values from (x, y, z)
// for 4d, we use north-east-south-west
// for 5d, we add top-bottom
int ix_tnw = static_cast<int>(::floor(ix));
int iy_tnw = static_cast<int>(::floor(iy));
int iz_tnw = static_cast<int>(::floor(iz));
int ix_tne = ix_tnw + 1;
int iy_tne = iy_tnw;
int iz_tne = iz_tnw;
int ix_tsw = ix_tnw;
int iy_tsw = iy_tnw + 1;
int iz_tsw = iz_tnw;
int ix_tse = ix_tnw + 1;
int iy_tse = iy_tnw + 1;
int iz_tse = iz_tnw;
int ix_bnw = ix_tnw;
int iy_bnw = iy_tnw;
int iz_bnw = iz_tnw + 1;
int ix_bne = ix_tnw + 1;
int iy_bne = iy_tnw;
int iz_bne = iz_tnw + 1;
int ix_bsw = ix_tnw;
int iy_bsw = iy_tnw + 1;
int iz_bsw = iz_tnw + 1;
int ix_bse = ix_tnw + 1;
int iy_bse = iy_tnw + 1;
int iz_bse = iz_tnw + 1;
// get surfaces to each neighbor:
scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
// (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) *
// tne
// + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) *
// tse
// + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) *
// bne
// + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) *
// bse
*out_ptr_NCDHW = static_cast<scalar_t>(0);
if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw;
}
if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne;
}
if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw;
}
if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse;
}
if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw;
}
if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne;
}
if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw;
}
if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse;
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int ix_nearest = static_cast<int>(::round(ix));
int iy_nearest = static_cast<int>(::round(iy));
int iz_nearest = static_cast<int>(::round(iz));
// assign nearest neighbor pixel value to output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW =
inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCDHW = static_cast<scalar_t>(0);
}
}
}
}
}
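// Editor's note: fills `desc` with the given shape and its contiguous
// (row-major) strides, e.g. dims {2, 3, 4, 5} -> stride {60, 20, 5, 1}.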
void create_desc(const int *dims, int nb_dims, TensorDesc &desc) {
memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims);
desc.stride[nb_dims - 1] = 1;
for (int i = nb_dims - 2; i >= 0; --i) {
desc.stride[i] = desc.stride[i + 1] * desc.shape[i + 1];
}
}
template <typename T>
void grid_sample(T *output, const T *input, const T *grid, int *output_dims, int *input_dims,
int *grid_dims, int nb_dims, GridSamplerInterpolation interp,
GridSamplerPadding padding, bool align_corners, cudaStream_t stream) {
TensorDesc input_desc;
create_desc(input_dims, nb_dims, input_desc);
TensorDesc output_desc;
create_desc(output_dims, nb_dims, output_desc);
TensorDesc grid_desc;
create_desc(grid_dims, nb_dims, grid_desc);
int count = 1;
for (int i = 0; i < nb_dims; ++i) {
if (i == 1) {
continue;
}
count *= output_desc.shape[i];
}
if (nb_dims == 4) {
grid_sampler_2d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
count, input, grid, output, input_desc, grid_desc, output_desc, interp, padding,
align_corners);
} else if (nb_dims == 5) {
grid_sampler_3d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
count, input, grid, output, input_desc, grid_desc, output_desc, interp, padding,
align_corners);
} else {
printf("input and grid dims should be 4 or 5\n");
}
}
template void grid_sample<float>(float *output, const float *input, const float *grid,
int *output_dims, int *input_dims, int *grid_dims, int nb_dims,
GridSamplerInterpolation interp, GridSamplerPadding padding,
bool align_corners, cudaStream_t stream);
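// Editor's note: a minimal host-side usage sketch of grid_sample<float> for
// the 4-D (2-D sampling) case, not part of the original commit; sizes are
// illustrative and the device buffers are assumed to be filled elsewhere.
static void grid_sample_example(float *d_output, const float *d_input,
                                const float *d_grid, cudaStream_t stream) {
  int input_dims[4] = {1, 3, 8, 8};   // (N, C, H_in, W_in)
  int grid_dims[4] = {1, 4, 4, 2};    // (N, H_out, W_out, 2)
  int output_dims[4] = {1, 3, 4, 4};  // (N, C, H_out, W_out)
  grid_sample<float>(d_output, d_input, d_grid, output_dims, input_dims,
                     grid_dims, /*nb_dims=*/4,
                     GridSamplerInterpolation::Bilinear,
                     GridSamplerPadding::Zeros, /*align_corners=*/false,
                     stream);
}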
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GRID_SAMPLER_KERNEL_HPP
#define TRT_GRID_SAMPLER_KERNEL_HPP
#include <cuda_runtime.h>
enum class GridSamplerInterpolation { Bilinear, Nearest };
enum class GridSamplerPadding { Zeros, Border, Reflection };
template <typename T>
void grid_sample(T *output, const T *input, const T *grid, int *output_dims, int *input_dims,
int *grid_dims, int nb_dims, GridSamplerInterpolation interp,
GridSamplerPadding padding, bool align_corners, cudaStream_t stream);
#endif // TRT_GRID_SAMPLER_KERNEL_HPP
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp
#include "trt_instance_norm.hpp"
#include <cuda_fp16.h>
#include <stdexcept>
#include "trt_serialize.hpp"
using namespace nvinfer1;
namespace mmdeploy {
namespace {
constexpr const char* PLUGIN_VERSION{"1"};
constexpr const char* PLUGIN_NAME{"TRTInstanceNormalization"};
} // namespace
TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, float epsilon)
: TRTPluginBase(name), mEpsilon(epsilon) {}
TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, void const* serialData,
size_t serialLength)
: TRTPluginBase(name) {
deserialize_value(&serialData, &serialLength, &mEpsilon);
}
TRTInstanceNormalization::~TRTInstanceNormalization() {}
// TRTInstanceNormalization returns one output.
int TRTInstanceNormalization::getNbOutputs() const TRT_NOEXCEPT { return 1; }
DimsExprs TRTInstanceNormalization::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
nvinfer1::DimsExprs output(inputs[0]);
return output;
}
size_t TRTInstanceNormalization::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT {
int n = inputs[0].dims.d[0];
int c = inputs[0].dims.d[1];
int elem_size = sizeof(float);
return getAlignedSize(n * c * elem_size) * 2;
}
int TRTInstanceNormalization::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
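// Editor's note: instance norm is computed here as batch norm over the input
// viewed as (1, N*C, H, W): each (n, c) pair becomes its own "channel", so
// per-channel batch-norm statistics are exactly per-instance statistics. The
// workspace holds the scale and bias vectors tiled N times to length N*C.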
nvinfer1::Dims input_dims = inputDesc[0].dims;
int n = input_dims.d[0];
int c = input_dims.d[1];
int h = input_dims.d[2];
int w = input_dims.nbDims > 3 ? input_dims.d[3] : 1;
int elem_size = sizeof(float);
void* n_scales = (void*)workspace;
void* n_bias = (void*)((char*)workspace + getAlignedSize(n * c * elem_size));
const void* scales = (const void*)inputs[1];
const void* bias = (const void*)inputs[2];
for (int i = 0; i < n; ++i) {
cudaMemcpyAsync((char*)n_scales + i * c * elem_size, scales, c * elem_size,
cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync((char*)n_bias + i * c * elem_size, bias, c * elem_size,
cudaMemcpyDeviceToDevice, stream);
}
cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, n * c, 1, 1);
cudnnDataType_t cudnn_dtype{};
convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype);
cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w);
cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w);
float alpha = 1;
float beta = 0;
void const* x_ptr = inputs[0];
void* y_ptr = outputs[0];
cudnnSetStream(_cudnn_handle, stream);
// Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical
// overflows (NaNs) for fp32 data in some circumstances. The lower-
// performance CUDNN_BATCHNORM_SPATIAL should be used if this is not
// acceptable.
cudnnBatchNormalizationForwardTraining(_cudnn_handle, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, &alpha,
&beta, _x_desc, x_ptr, _y_desc, y_ptr, _b_desc, n_scales,
n_bias, 1., nullptr, nullptr, mEpsilon, nullptr, nullptr);
return 0;
}
size_t TRTInstanceNormalization::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mEpsilon);
}
void TRTInstanceNormalization::serialize(void* buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mEpsilon);
}
bool TRTInstanceNormalization::supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
switch (pos) {
case 0:
case 3:
return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT ||
ioDesc[pos].type == nvinfer1::DataType::kHALF) &&
ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR &&
ioDesc[pos].type == ioDesc[0].type);
case 1:
case 2:
return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR;
default:
return false;
}
return false;
}
const char* TRTInstanceNormalization::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char* TRTInstanceNormalization::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
IPluginV2DynamicExt* TRTInstanceNormalization::clone() const TRT_NOEXCEPT {
auto* plugin = new TRTInstanceNormalization{mLayerName, mEpsilon};
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DataType TRTInstanceNormalization::getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// Attach the plugin object to an execution context and grant the plugin
// access to some context resources.
void TRTInstanceNormalization::attachToContext(cudnnContext* cudnnContext,
cublasContext* cublasContext,
IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
_cudnn_handle = cudnnContext;
cudnnCreateTensorDescriptor(&_b_desc);
cudnnCreateTensorDescriptor(&_x_desc);
cudnnCreateTensorDescriptor(&_y_desc);
}
// Detach the plugin object from its execution context.
void TRTInstanceNormalization::detachFromContext() TRT_NOEXCEPT {
if (_y_desc) {
cudnnDestroyTensorDescriptor(_y_desc);
_y_desc = nullptr;
}
if (_x_desc) {
cudnnDestroyTensorDescriptor(_x_desc);
_x_desc = nullptr;
}
if (_b_desc) {
cudnnDestroyTensorDescriptor(_b_desc);
_b_desc = nullptr;
}
}
void TRTInstanceNormalization::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) TRT_NOEXCEPT {}
// TRTInstanceNormalizationCreator methods
TRTInstanceNormalizationCreator::TRTInstanceNormalizationCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char* TRTInstanceNormalizationCreator::getPluginName() const TRT_NOEXCEPT {
return PLUGIN_NAME;
}
const char* TRTInstanceNormalizationCreator::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
IPluginV2DynamicExt* TRTInstanceNormalizationCreator::createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT {
float epsilon = 1e-5;
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; ++i) {
const char* attrName = fields[i].name;
if (!strcmp(attrName, "epsilon")) {
epsilon = *(static_cast<const float*>(fields[i].data));
}
}
TRTInstanceNormalization* obj = new TRTInstanceNormalization(name, epsilon);
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
IPluginV2DynamicExt* TRTInstanceNormalizationCreator::deserializePlugin(
const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
TRTInstanceNormalization* obj = new TRTInstanceNormalization{name, serialData, serialLength};
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
REGISTER_TENSORRT_PLUGIN(TRTInstanceNormalizationCreator);
} // namespace mmdeploy
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.h
#ifndef TRT_INSTANCE_NORMALIZATION_HPP
#define TRT_INSTANCE_NORMALIZATION_HPP
#include <cudnn.h>
#include <iostream>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
typedef unsigned short half_type;
namespace mmdeploy {
class TRTInstanceNormalization final : public TRTPluginBase {
public:
TRTInstanceNormalization(const std::string& name, float epsilon);
TRTInstanceNormalization(const std::string& name, void const* serialData, size_t serialLength);
TRTInstanceNormalization() = delete;
~TRTInstanceNormalization() TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
// DynamicExt plugins return DimsExprs instead of Dims
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void* buffer) const TRT_NOEXCEPT override;
// DynamicExt plugin supportsFormat update.
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
const char* getPluginType() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
void attachToContext(cudnnContext* cudnn, cublasContext* cublas,
nvinfer1::IGpuAllocator* allocator) TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) TRT_NOEXCEPT override;
private:
float mEpsilon{};
cudnnHandle_t _cudnn_handle{};
cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _b_desc{};
};
class TRTInstanceNormalizationCreator : public TRTPluginCreatorBase {
public:
TRTInstanceNormalizationCreator();
~TRTInstanceNormalizationCreator() override = default;
const char* getPluginName() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2DynamicExt* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;
nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name, const void* serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_INSTANCE_NORMALIZATION_HPP
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_modulated_deform_conv.hpp"
#include <assert.h>
#include <chrono>
#include "trt_modulated_deform_conv_kernel.hpp"
#include "trt_serialize.hpp"
using namespace nvinfer1;
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVModulatedDeformConv2d"};
} // namespace
ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(
const std::string &name, const nvinfer1::Dims stride, const nvinfer1::Dims padding,
const nvinfer1::Dims dilation, const int deformableGroup, const int group)
: TRTPluginBase(name),
mStride(stride),
mPadding(padding),
mDilation(dilation),
mDeformableGroup(deformableGroup),
mGroup(group) {
mWithBias = false;
}
ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(const std::string name,
const void *data,
size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mStride);
deserialize_value(&data, &length, &mPadding);
deserialize_value(&data, &length, &mDilation);
deserialize_value(&data, &length, &mDeformableGroup);
deserialize_value(&data, &length, &mGroup);
mWithBias = false;
}
ModulatedDeformableConvPluginDynamic::~ModulatedDeformableConvPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *ModulatedDeformableConvPluginDynamic::clone() const TRT_NOEXCEPT {
ModulatedDeformableConvPluginDynamic *plugin = new ModulatedDeformableConvPluginDynamic(
mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
static const nvinfer1::IDimensionExpr *get_hw(const nvinfer1::IDimensionExpr *input,
const nvinfer1::IDimensionExpr *weight,
const nvinfer1::IDimensionExpr *stride,
const nvinfer1::IDimensionExpr *pad,
const nvinfer1::IDimensionExpr *dilation,
nvinfer1::IExprBuilder &exprBuilder) {
using DimOp = nvinfer1::DimensionOperation;
auto expr_1 = exprBuilder.constant(1);
// d*(w-1)+1
auto kernel_0 = exprBuilder.operation(DimOp::kSUB, *weight, *expr_1);
auto kernel_1 = exprBuilder.operation(DimOp::kPROD, *dilation, *kernel_0);
auto kernel = exprBuilder.operation(DimOp::kSUM, *kernel_1, *expr_1);
// out = (input + 2 * pad - kernel_extent) / stride + 1
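// e.g. input = 7, kernel = 3, stride = 2, pad = 1, dilation = 1:
// kernel extent = 3, out = (7 + 2 * 1 - 3) / 2 + 1 = 4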
auto out_0 = exprBuilder.operation(DimOp::kSUM, *pad, *pad);
auto out_1 = exprBuilder.operation(DimOp::kSUM, *input, *out_0);
auto out_2 = exprBuilder.operation(DimOp::kSUB, *out_1, *kernel);
auto out_3 = exprBuilder.operation(DimOp::kFLOOR_DIV, *out_2, *stride);
auto out = exprBuilder.operation(DimOp::kSUM, *out_3, *expr_1);
return out;
}
nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
auto weight_dim = inputs[3].d;
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[3].d[0];
auto input_h = inputs[0].d[2];
auto input_w = inputs[0].d[3];
auto weight_h = weight_dim[2];
auto weight_w = weight_dim[3];
auto dilation_w = exprBuilder.constant(mDilation.d[0]);
auto dilation_h = exprBuilder.constant(mDilation.d[1]);
auto pad_w = exprBuilder.constant(mPadding.d[0]);
auto pad_h = exprBuilder.constant(mPadding.d[1]);
auto stride_w = exprBuilder.constant(mStride.d[0]);
auto stride_h = exprBuilder.constant(mStride.d[1]);
ret.d[2] = get_hw(input_h, weight_h, stride_h, pad_h, dilation_h, exprBuilder);
ret.d[3] = get_hw(input_w, weight_w, stride_w, pad_w, dilation_w, exprBuilder);
return ret;
}
bool ModulatedDeformableConvPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 0) {
return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT ||
ioDesc[pos].type == nvinfer1::DataType::kHALF) &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
}
}
void ModulatedDeformableConvPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) TRT_NOEXCEPT {
if (nbInputs == 5) {
mWithBias = true;
}
}
size_t ModulatedDeformableConvPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const TRT_NOEXCEPT {
int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type);
int batch_size = inputs[0].dims.d[0];
int nInputPlane = inputs[0].dims.d[1];
int inputHeight = inputs[0].dims.d[2];
int inputWidth = inputs[0].dims.d[3];
int nOutputPlane = outputs[0].dims.d[1];
int outputHeight = outputs[0].dims.d[2];
int outputWidth = outputs[0].dims.d[3];
int kH = inputs[3].dims.d[2];
int kW = inputs[3].dims.d[3];
int im2col_step = std::min(32, batch_size);
size_t col_size =
mmdeploy::getAlignedSize(nInputPlane * kW * kH * outputHeight * outputWidth * sizeof_dtype);
return col_size;
}
int ModulatedDeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs,
void *workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
int batch = inputDesc[0].dims.d[0];
int channels = inputDesc[0].dims.d[1];
int height = inputDesc[0].dims.d[2];
int width = inputDesc[0].dims.d[3];
int channels_out = outputDesc[0].dims.d[1];
int kernel_h = inputDesc[3].dims.d[2];
int kernel_w = inputDesc[3].dims.d[3];
const void *x = inputs[0];
const void *offset = inputs[1];
const void *mask = inputs[2];
const void *weight = inputs[3];
const void *bias = mWithBias ? inputs[4] : nullptr;
void *output = outputs[0];
int im2col_step = std::min(batch, 32);
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
ModulatedDeformConvForwardCUDAKernelLauncher<float>(
(float *)x, (float *)weight, (float *)bias, (float *)offset, (float *)mask,
(float *)output, workSpace, batch, channels, height, width, channels_out, kernel_w,
kernel_h, mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0],
mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream);
break;
case nvinfer1::DataType::kHALF:
ModulatedDeformConvForwardCUDAKernelLauncher<half>(
(half *)x, (half *)weight, (half *)bias, (half *)offset, (half *)mask, (half *)output,
workSpace, batch, channels, height, width, channels_out, kernel_w, kernel_h, mStride.d[0],
mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup,
mDeformableGroup, im2col_step, m_cublas_handle, stream);
break;
default:
return 1;
break;
}
return 0;
}
nvinfer1::DataType ModulatedDeformableConvPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *ModulatedDeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT {
return PLUGIN_NAME;
}
const char *ModulatedDeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
int ModulatedDeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t ModulatedDeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) +
serialized_size(mDeformableGroup) + serialized_size(mGroup);
}
void ModulatedDeformableConvPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mStride);
serialize_value(&buffer, mPadding);
serialize_value(&buffer, mDilation);
serialize_value(&buffer, mDeformableGroup);
serialize_value(&buffer, mGroup);
}
void ModulatedDeformableConvPluginDynamic::attachToContext(
cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT {
m_cublas_handle = cublasContext;
}
void ModulatedDeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {}
////////////////////// creator /////////////////////////////
ModulatedDeformableConvPluginDynamicCreator::ModulatedDeformableConvPluginDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT {
return PLUGIN_NAME;
}
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
nvinfer1::Dims stride{2, {1, 1}};
nvinfer1::Dims padding{2, {0, 0}};
nvinfer1::Dims dilation{2, {1, 1}};
int deformableGroup = 1;
int group = 1;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("deform_groups") == 0) {
deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("groups") == 0) {
group = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("stride") == 0) {
stride.nbDims = 2;
stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
if (field_name.compare("padding") == 0) {
padding.nbDims = 2;
padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
if (field_name.compare("dilation") == 0) {
dilation.nbDims = 2;
dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
ModulatedDeformableConvPluginDynamic *plugin = new ModulatedDeformableConvPluginDynamic(
name, stride, padding, dilation, deformableGroup, group);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
auto plugin = new ModulatedDeformableConvPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(ModulatedDeformableConvPluginDynamicCreator);
} // namespace mmdeploy
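// --- Usage sketch (not part of the original sources) -----------------------
// A minimal, hedged example of driving the creator above directly. The field
// names match those registered in the creator's constructor; the int32 layout
// of each field is an assumption based on how createPlugin() reads the data,
// and the helper name make_mdcn_plugin is hypothetical.
static nvinfer1::IPluginV2 *make_mdcn_plugin() {
  static const int32_t stride[2] = {1, 1};
  static const int32_t padding[2] = {1, 1};
  static const int32_t dilation[2] = {1, 1};
  static const int32_t groups = 1;
  static const int32_t deform_groups = 1;
  std::vector<nvinfer1::PluginField> fields{
      {"stride", stride, nvinfer1::PluginFieldType::kINT32, 2},
      {"padding", padding, nvinfer1::PluginFieldType::kINT32, 2},
      {"dilation", dilation, nvinfer1::PluginFieldType::kINT32, 2},
      {"groups", &groups, nvinfer1::PluginFieldType::kINT32, 1},
      {"deform_groups", &deform_groups, nvinfer1::PluginFieldType::kINT32, 1}};
  const nvinfer1::PluginFieldCollection fc{static_cast<int32_t>(fields.size()), fields.data()};
  mmdeploy::ModulatedDeformableConvPluginDynamicCreator creator;
  return creator.createPlugin("mdcn", &fc);  // caller owns the returned plugin
}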
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_MODULATED_DEFORM_CONV_HPP
#define TRT_MODULATED_DEFORM_CONV_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class ModulatedDeformableConvPluginDynamic : public TRTPluginBase {
public:
ModulatedDeformableConvPluginDynamic(const std::string &name, const nvinfer1::Dims stride,
const nvinfer1::Dims padding, const nvinfer1::Dims dilation,
const int deformableGroup, const int group);
ModulatedDeformableConvPluginDynamic(const std::string name, const void *data, size_t length);
ModulatedDeformableConvPluginDynamic() = delete;
~ModulatedDeformableConvPluginDynamic() TRT_NOEXCEPT override;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
nvinfer1::Dims mStride;
nvinfer1::Dims mPadding;
nvinfer1::Dims mDilation;
int mDeformableGroup;
int mGroup;
bool mWithBias;
cublasHandle_t m_cublas_handle;
};
class ModulatedDeformableConvPluginDynamicCreator : public TRTPluginCreatorBase {
public:
ModulatedDeformableConvPluginDynamicCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_MODULATED_DEFORM_CONV_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#include <assert.h>
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
#include "modulated_deform_conv/modulated_deform_conv_cuda.cuh"
#include "trt_modulated_deform_conv_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename T>
void trt_modulated_deformable_im2col(const T* data_im_, const T* data_offset_, const T* data_mask_,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
                                     const int kernel_h, const int kernel_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group, T* data_col_,
cudaStream_t stream) {
  // launch one thread per (channel, sample, output location) column element
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
modulated_deformable_im2col_gpu_kernel<T>
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
          num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
batch_size, channels, deformable_group, height_col, width_col, data_col_);
cudaCheckError();
}
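// output_add_bias_kernel adds a per-channel bias to an NCHW tensor;
// (index % step_batch) / step_channel recovers the channel id of each element.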
template <typename scalar_t>
__global__ void output_add_bias_kernel(scalar_t* output, const scalar_t* bias, size_t step_batch,
size_t step_channel, size_t n) {
CUDA_1D_KERNEL_LOOP(index, n) { output[index] += bias[(index % step_batch) / step_channel]; }
}
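// half-precision __hadd is only available on sm_53 and newer; older
// architectures fall back to the float round-trip specialization below.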
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
template <>
__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias,
size_t step_batch, size_t step_channel, size_t n) {
CUDA_1D_KERNEL_LOOP(index, n) {
const __half b = bias[(index % step_batch) / step_channel];
const __half o = output[index];
output[index] = __hadd(o, b);
}
}
#else
template <>
__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias,
size_t step_batch, size_t step_channel, size_t n) {
CUDA_1D_KERNEL_LOOP(index, n) {
const __half b = bias[(index % step_batch) / step_channel];
const __half o = output[index];
output[index] = __float2half(__half2float(o) + __half2float(b));
}
}
#endif
template <typename scalar_t>
static void output_add_bias(scalar_t* output, const scalar_t* bias, size_t batch, size_t channel,
size_t height, size_t width, cudaStream_t stream) {
size_t step_channel = height * width;
size_t step_batch = step_channel * channel;
size_t n = step_batch * batch;
output_add_bias_kernel<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(output, bias, step_batch,
step_channel, n);
}
template <typename scalar_t>
void ModulatedDeformConvForwardCUDAKernelLauncher(
const scalar_t* input, const scalar_t* weight, const scalar_t* bias, const scalar_t* offset,
const scalar_t* mask, scalar_t* output, void* workspace, int batch, int channels, int height,
int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
cublasHandle_t cublas_handle, cudaStream_t stream) {
bool with_bias = (bias != nullptr);
im2col_step = std::min(int(batch), im2col_step);
assert(batch % im2col_step == 0);
const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
scalar_t* columns = (scalar_t*)workspace;
const size_t input_step = channels * height * width;
const size_t offset_step = deformable_group * kernel_h * kernel_w * 2 * height_out * width_out;
const size_t mask_step = deformable_group * kernel_h * kernel_w * height_out * width_out;
const size_t out_step = channels_out * height_out * width_out;
const size_t out_group_step = out_step / group;
const size_t col_g_step = channels * kernel_w * kernel_h / group * height_out * width_out;
const size_t weight_g_step = channels_out / group * channels / group * kernel_h * kernel_w;
const int m = channels_out / group;
const int n = height_out * width_out;
const int k = channels / group * kernel_h * kernel_w;
scalar_t alpha = 1.;
scalar_t beta = 0.;
for (int b = 0; b < batch; b++) {
const scalar_t* input_start = input + b * input_step;
const scalar_t* offset_start = offset + b * offset_step;
const scalar_t* mask_start = mask + b * mask_step;
trt_modulated_deformable_im2col<scalar_t>(
input_start, offset_start, mask_start, 1, channels, height, width, height_out, width_out,
kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
deformable_group, columns, stream);
for (int g = 0; g < group; g++) {
const scalar_t* weight_start = weight + g * weight_g_step;
scalar_t* col_start = columns + g * col_g_step;
scalar_t* out_buffer_start = output + b * out_step + g * out_group_step;
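      // per-group GEMM: out_buffer (m x n) = weight_g (m x k) * columns_g (k x n);
      // row-major buffers are handed to column-major cuBLAS with the operands swapped.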
cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start,
n, weight_start, k, &beta, out_buffer_start, n);
cudaCheckError();
}
}
if (with_bias) {
output_add_bias<scalar_t>(output, bias, batch, channels_out, height_out, width_out, stream);
}
}
template void ModulatedDeformConvForwardCUDAKernelLauncher<float>(
const float* input, const float* weight, const float* bias, const float* offset,
const float* mask, float* output, void* workspace, int batch, int channels, int height,
int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
cublasHandle_t cublas_handle, cudaStream_t stream);
template void ModulatedDeformConvForwardCUDAKernelLauncher<__half>(
const __half* input, const __half* weight, const __half* bias, const __half* offset,
const __half* mask, __half* output, void* workspace, int batch, int channels, int height,
int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
cublasHandle_t cublas_handle, cudaStream_t stream);
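// --- Workspace sizing sketch (an illustration, not part of the original
// sources; the helper name is hypothetical): the launcher above uses
// `workspace` only for the im2col "columns" buffer, i.e. col_g_step * group =
// channels * kernel_h * kernel_w * height_out * width_out elements of scalar_t.
static inline size_t mdcn_columns_bytes(int channels, int kernel_h, int kernel_w, int height,
                                        int width, int stride_h, int stride_w, int pad_h,
                                        int pad_w, int dilation_h, int dilation_w,
                                        size_t elem_size) {
  const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  return static_cast<size_t>(channels) * kernel_h * kernel_w * height_out * width_out * elem_size;
}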
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_MODULATED_DEFORM_CONV_KERNEL_HPP
#define TRT_MODULATED_DEFORM_CONV_KERNEL_HPP
#include <cublas_v2.h>
#include <cuda_runtime.h>
template <typename scalar_t>
void ModulatedDeformConvForwardCUDAKernelLauncher(
const scalar_t* input, const scalar_t* weight, const scalar_t* bias, const scalar_t* offset,
const scalar_t* mask, scalar_t* output, void* workspace, int batch, int channels, int height,
int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
cublasHandle_t cublas_handle, cudaStream_t stream);
#endif  // TRT_MODULATED_DEFORM_CONV_KERNEL_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#include "trt_multi_level_roi_align.hpp"
#include <assert.h>
#include <chrono>
#include "trt_multi_level_roi_align_kernel.hpp"
#include "trt_plugin_helper.hpp"
#include "trt_serialize.hpp"
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVMultiLevelRoiAlign"};
} // namespace
TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight,
int alignedWidth, int poolMode, int sampleNum,
const std::vector<float> &featmapStrides,
float roiScaleFactor, int finestScale, bool aligned)
: TRTPluginBase(name),
mAlignedHeight(alignedHeight),
mAlignedWidth(alignedWidth),
mPoolMode(poolMode),
mSampleNum(sampleNum),
mFeatmapStrides(featmapStrides),
mRoiScaleFactor(roiScaleFactor),
mFinestScale(finestScale),
mAligned(aligned) {}
TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string name, const void *data,
size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mAlignedHeight);
deserialize_value(&data, &length, &mAlignedWidth);
deserialize_value(&data, &length, &mPoolMode);
deserialize_value(&data, &length, &mSampleNum);
deserialize_value(&data, &length, &mRoiScaleFactor);
deserialize_value(&data, &length, &mFinestScale);
deserialize_value(&data, &length, &mAligned);
deserialize_value(&data, &length, &mFeatmapStrides);
}
nvinfer1::IPluginV2DynamicExt *TRTMultiLevelRoiAlign::clone() const TRT_NOEXCEPT {
TRTMultiLevelRoiAlign *plugin =
new TRTMultiLevelRoiAlign(mLayerName, mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum,
mFeatmapStrides, mRoiScaleFactor, mFinestScale, mAligned);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs TRTMultiLevelRoiAlign::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
  // note: nbInputs should equal mFeatmapStrides.size() + 1 (rois plus one feature map per level)
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[1].d[1];
ret.d[2] = exprBuilder.constant(mAlignedHeight);
ret.d[3] = exprBuilder.constant(mAlignedWidth);
return ret;
}
bool TRTMultiLevelRoiAlign::supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc *ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
void TRTMultiLevelRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs,
int nbOutputs) TRT_NOEXCEPT {
// Validate input arguments
ASSERT(nbOutputs == 1);
ASSERT(nbInputs >= 1);
mFeatmapStrides =
std::vector<float>(mFeatmapStrides.begin(), mFeatmapStrides.begin() + (nbInputs - 1));
}
size_t TRTMultiLevelRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT {
return 0;
}
int TRTMultiLevelRoiAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
int num_rois = inputDesc[0].dims.d[0];
int batch_size = inputDesc[1].dims.d[0];
int channels = inputDesc[1].dims.d[1];
const int kMaxFeatMap = 10;
int heights[kMaxFeatMap];
int widths[kMaxFeatMap];
float strides[kMaxFeatMap];
  int num_feats = mFeatmapStrides.size();
  ASSERT(num_feats <= kMaxFeatMap);  // guard the fixed-size arrays above
for (int i = 0; i < num_feats; ++i) {
heights[i] = inputDesc[i + 1].dims.d[2];
widths[i] = inputDesc[i + 1].dims.d[3];
strides[i] = mFeatmapStrides[i];
}
const void *rois = inputs[0];
const void *const *feats = inputs + 1;
multi_level_roi_align<float>((float *)outputs[0], (const float *)rois, num_rois, feats, num_feats,
batch_size, channels, &heights[0], &widths[0], &strides[0],
mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum,
mRoiScaleFactor, mFinestScale, mAligned, stream);
return 0;
}
nvinfer1::DataType TRTMultiLevelRoiAlign::getOutputDataType(int index,
const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return nvinfer1::DataType::kFLOAT;
}
// IPluginV2 Methods
const char *TRTMultiLevelRoiAlign::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTMultiLevelRoiAlign::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
int TRTMultiLevelRoiAlign::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t TRTMultiLevelRoiAlign::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) +
serialized_size(mAlignedWidth) + serialized_size(mPoolMode) + serialized_size(mSampleNum) +
serialized_size(mRoiScaleFactor) + serialized_size(mFinestScale) +
serialized_size(mAligned);
}
void TRTMultiLevelRoiAlign::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mAlignedHeight);
serialize_value(&buffer, mAlignedWidth);
serialize_value(&buffer, mPoolMode);
serialize_value(&buffer, mSampleNum);
serialize_value(&buffer, mRoiScaleFactor);
serialize_value(&buffer, mFinestScale);
serialize_value(&buffer, mAligned);
serialize_value(&buffer, mFeatmapStrides);
}
TRTMultiLevelRoiAlignCreator::TRTMultiLevelRoiAlignCreator() {
mPluginAttributes = std::vector<nvinfer1::PluginField>(
{nvinfer1::PluginField("output_height"), nvinfer1::PluginField("output_width"),
nvinfer1::PluginField("pool_mode"), nvinfer1::PluginField("sampling_ratio"),
nvinfer1::PluginField("featmap_strides"), nvinfer1::PluginField("roi_scale_factor"),
nvinfer1::PluginField("finest_scale"), nvinfer1::PluginField("aligned")});
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *TRTMultiLevelRoiAlignCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTMultiLevelRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
int alignedHeight = 7;
int alignedWidth = 7;
int poolMode = 0;
int sampleNum = 2;
std::vector<float> featmapStrides;
float roiScaleFactor = -1;
int finestScale = 56;
bool aligned = false;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("output_height") == 0) {
alignedHeight = static_cast<const int *>(fc->fields[i].data)[0];
} else if (field_name.compare("output_width") == 0) {
alignedWidth = static_cast<const int *>(fc->fields[i].data)[0];
} else if (field_name.compare("pool_mode") == 0) {
poolMode = static_cast<const int *>(fc->fields[i].data)[0];
} else if (field_name.compare("sampling_ratio") == 0) {
sampleNum = static_cast<const int *>(fc->fields[i].data)[0];
} else if (field_name.compare("roi_scale_factor") == 0) {
roiScaleFactor = static_cast<const float *>(fc->fields[i].data)[0];
} else if (field_name.compare("finest_scale") == 0) {
finestScale = static_cast<const int *>(fc->fields[i].data)[0];
} else if (field_name.compare("featmap_strides") == 0) {
int data_size = (fc->fields[i].length);
const float *data_start = static_cast<const float *>(fc->fields[i].data);
featmapStrides = std::vector<float>(data_start, data_start + data_size);
} else if (field_name.compare("aligned") == 0) {
int aligned_int = static_cast<const int *>(fc->fields[i].data)[0];
aligned = aligned_int != 0;
}
}
ASSERT(featmapStrides.size() != 0);
TRTMultiLevelRoiAlign *plugin =
new TRTMultiLevelRoiAlign(name, alignedHeight, alignedWidth, poolMode, sampleNum,
featmapStrides, roiScaleFactor, finestScale, aligned);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
auto plugin = new TRTMultiLevelRoiAlign(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRoiAlignCreator);
} // namespace mmdeploy
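// --- Usage sketch (not part of the original sources) -----------------------
// Once REGISTER_TENSORRT_PLUGIN has run, the creator can be fetched from the
// global registry under the name/version defined at the top of this file
// (assuming the default empty plugin namespace). The field types below are
// assumptions based on how createPlugin() reads them; make_roi_align_plugin
// is a hypothetical helper.
static nvinfer1::IPluginV2 *make_roi_align_plugin() {
  auto *creator = getPluginRegistry()->getPluginCreator("MMCVMultiLevelRoiAlign", "1");
  if (creator == nullptr) return nullptr;
  static const int32_t out_hw = 7;
  static const int32_t sampling_ratio = 2;
  static const float featmap_strides[4] = {4.f, 8.f, 16.f, 32.f};
  std::vector<nvinfer1::PluginField> fields{
      {"output_height", &out_hw, nvinfer1::PluginFieldType::kINT32, 1},
      {"output_width", &out_hw, nvinfer1::PluginFieldType::kINT32, 1},
      {"sampling_ratio", &sampling_ratio, nvinfer1::PluginFieldType::kINT32, 1},
      {"featmap_strides", featmap_strides, nvinfer1::PluginFieldType::kFLOAT32, 4}};
  const nvinfer1::PluginFieldCollection fc{static_cast<int32_t>(fields.size()), fields.data()};
  return creator->createPlugin("roi_align", &fc);
}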
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_MULTI_LEVEL_ROI_ALIGN_HPP
#define TRT_MULTI_LEVEL_ROI_ALIGN_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class TRTMultiLevelRoiAlign : public TRTPluginBase {
public:
TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight, int alignedWidth, int poolMode,
int sampleNum, const std::vector<float> &featmapStrides,
float roiScaleFactor = -1, int finestScale = 56, bool aligned = false);
TRTMultiLevelRoiAlign(const std::string name, const void *data, size_t length);
TRTMultiLevelRoiAlign() = delete;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
int mAlignedHeight;
int mAlignedWidth;
int mPoolMode;
int mSampleNum;
std::vector<float> mFeatmapStrides;
float mRoiScaleFactor;
int mFinestScale;
bool mAligned;
};
class TRTMultiLevelRoiAlignCreator : public TRTPluginCreatorBase {
public:
TRTMultiLevelRoiAlignCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif  // TRT_MULTI_LEVEL_ROI_ALIGN_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#include <float.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include "common_cuda_helper.hpp"
#include "trt_multi_level_roi_align_kernel.hpp"
#include "trt_plugin_helper.hpp"
const int kMAX_FEATMAP_SIZE = 10;
struct FeatData {
const void *data[kMAX_FEATMAP_SIZE];
int batch_size;
int channels;
int h[kMAX_FEATMAP_SIZE];
int w[kMAX_FEATMAP_SIZE];
float spatial_scale[kMAX_FEATMAP_SIZE];
int num_featmap;
};
template <typename scalar_t, bool aligned, int pool_mode>
__device__ scalar_t roi_align_single(const scalar_t *__restrict__ bottom_data,
const int roi_batch_ind, const scalar_t roi_start_w,
const scalar_t roi_start_h, const scalar_t roi_end_w,
const scalar_t roi_end_h, const scalar_t spatial_scale,
const int pw, const int ph, const int c, const int sample_num,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width) {
// Force malformed ROIs to be 1x1
scalar_t roi_width = max(roi_end_w - roi_start_w, (scalar_t)(aligned ? 0. : 1.));
scalar_t roi_height = max(roi_end_h - roi_start_h, (scalar_t)(aligned ? 0. : 1.));
const scalar_t bin_size_h = roi_height / pooled_height;
const scalar_t bin_size_w = roi_width / pooled_width;
const scalar_t *offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
const int sample_num_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height);
const int sample_num_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
scalar_t output_val = (pool_mode == 0) ? -FLT_MAX : 0;
const scalar_t y_offset = roi_start_h + ph * bin_size_h;
const scalar_t y_scale = bin_size_h / (scalar_t)(sample_num_h);
const scalar_t x_offset = roi_start_w + pw * bin_size_w;
const scalar_t x_scale = bin_size_w / (scalar_t)(sample_num_w);
for (int iy = 0; iy < sample_num_h; iy++) {
const scalar_t y = fma(scalar_t(iy) + scalar_t(.5f), y_scale, y_offset);
for (int ix = 0; ix < sample_num_w; ix++) {
const scalar_t x = fma(scalar_t(ix) + scalar_t(.5f), x_scale, x_offset);
scalar_t val = bilinear_interpolate<scalar_t>(offset_bottom_data, height, width, y, x);
if (pool_mode == 0) {
output_val = max(output_val, val);
} else {
output_val += val;
}
}
}
if (pool_mode != 0) {
output_val /= max(sample_num_h * sample_num_w, 1);
}
return output_val;
}
template <typename scalar_t, bool aligned>
__global__ void roi_extractor_kernel(scalar_t *__restrict__ output,
const scalar_t *__restrict__ bottom_rois, FeatData feat_data,
const int pool_mode, const int sample_num,
const float roi_scale_factor, const int finest_scale,
const int pooled_height, const int pooled_width,
int nThreads) {
CUDA_1D_KERNEL_LOOP(index, nThreads) {
const int channels = feat_data.channels;
int tmp_index = index;
const int pw = tmp_index % pooled_width;
tmp_index /= pooled_width;
const int ph = tmp_index % pooled_height;
tmp_index /= pooled_height;
const int c = tmp_index % channels;
const int n = tmp_index / channels;
const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
scalar_t roi_offset_x0 = offset_bottom_rois[1];
scalar_t roi_offset_y0 = offset_bottom_rois[2];
scalar_t roi_offset_x1 = offset_bottom_rois[3];
scalar_t roi_offset_y1 = offset_bottom_rois[4];
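    // FPN-style level assignment:
    // lvl = clamp(floor(log2(sqrt(roi_area) / finest_scale)), 0, num_featmap - 1)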
const scalar_t scale = sqrtf((roi_offset_y1 - roi_offset_y0) * (roi_offset_x1 - roi_offset_x0));
const int target_lvls =
min(feat_data.num_featmap - 1,
max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6)))));
if (roi_scale_factor > 0.) {
const scalar_t roi_off_cx = (roi_offset_x0 + roi_offset_x1) * 0.5;
const scalar_t roi_off_cy = (roi_offset_y0 + roi_offset_y1) * 0.5;
const scalar_t half_scale_factor = roi_scale_factor * 0.5;
const scalar_t half_roi_off_w =
fma(roi_offset_x1 - roi_offset_x0 + 1, half_scale_factor, scalar_t(-0.5));
const scalar_t half_roi_off_h =
fma(roi_offset_y1 - roi_offset_y0 + 1, half_scale_factor, scalar_t(-0.5));
roi_offset_x0 = roi_off_cx - half_roi_off_w;
roi_offset_x1 = roi_off_cx + half_roi_off_w;
roi_offset_y0 = roi_off_cy - half_roi_off_h;
roi_offset_y1 = roi_off_cy + half_roi_off_h;
}
const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls];
const int height = feat_data.h[target_lvls];
const int width = feat_data.w[target_lvls];
const scalar_t *bottom_data = (scalar_t *)feat_data.data[target_lvls];
const int roi_batch_ind = offset_bottom_rois[0];
const scalar_t offset = aligned ? (scalar_t)-0.5 : (scalar_t)0.0;
    // coord * spatial_scale + offset, where offset is -0.5 when aligned
    const scalar_t roi_start_w = fma(roi_offset_x0, spatial_scale, offset);
    const scalar_t roi_start_h = fma(roi_offset_y0, spatial_scale, offset);
    const scalar_t roi_end_w = fma(roi_offset_x1, spatial_scale, offset);
    const scalar_t roi_end_h = fma(roi_offset_y1, spatial_scale, offset);
if (pool_mode == 0) {
const scalar_t output_val = roi_align_single<scalar_t, aligned, 0>(
bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w, roi_end_h, spatial_scale,
pw, ph, c, sample_num, channels, height, width, pooled_height, pooled_width);
output[index] = output_val;
} else {
const scalar_t output_val = roi_align_single<scalar_t, aligned, 1>(
bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w, roi_end_h, spatial_scale,
pw, ph, c, sample_num, channels, height, width, pooled_height, pooled_width);
output[index] = output_val;
}
}
}
template <typename T>
void multi_level_roi_align(T *output, const T *rois, int num_rois, const void *const *feats,
int num_feats, int n, int c, int *h, int *w, float *strides,
int aligned_height, int aligned_width, int pool_mode, int sample_num,
float roi_scale_factor, int finest_scale, bool aligned,
cudaStream_t stream) {
FeatData feat_data;
feat_data.batch_size = n;
feat_data.channels = c;
feat_data.num_featmap = num_feats;
for (int i = 0; i < num_feats; ++i) {
feat_data.data[i] = feats[i];
feat_data.h[i] = h[i];
feat_data.w[i] = w[i];
feat_data.spatial_scale[i] = 1. / float(strides[i]);
}
int nThreads = num_rois * c * aligned_height * aligned_width;
if (aligned) {
roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
output, rois, feat_data, pool_mode, sample_num, roi_scale_factor, finest_scale,
aligned_height, aligned_width, nThreads);
} else {
roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
output, rois, feat_data, pool_mode, sample_num, roi_scale_factor, finest_scale,
aligned_height, aligned_width, nThreads);
}
}
template void multi_level_roi_align<float>(float *output, const float *rois, int num_rois,
const void *const *feats, int num_feats, int n, int c,
int *h, int *w, float *strides, int aligned_height,
int aligned_width, int pool_mode, int sample_num,
float roi_scale_factor, int finest_scale, bool aligned,
cudaStream_t stream);
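// --- Output sizing sketch (an illustration, not part of the original sources;
// the helper name is hypothetical): the kernel writes one value per launched
// thread, so `output` must hold num_rois * c * aligned_height * aligned_width
// elements.
static inline size_t roi_align_output_elems(int num_rois, int channels, int aligned_height,
                                            int aligned_width) {
  return static_cast<size_t>(num_rois) * channels * aligned_height * aligned_width;
}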