Unverified commit 2e5628b4 authored by q.yao, committed by GitHub

[Refactor]: Remove deployment for dev-2.x (#2225)

* remove deploy for 2.0

* update onnx ut
parent 961373ad
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ORT_MMCV_UTILS_H
#define ORT_MMCV_UTILS_H
#include <onnxruntime_cxx_api.h>
#include <vector>
struct OrtTensorDimensions : std::vector<int64_t> {
OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue* value) {
OrtTensorTypeAndShapeInfo* info = ort.GetTensorTypeAndShape(value);
std::vector<int64_t>::operator=(ort.GetTensorShape(info));
ort.ReleaseTensorTypeAndShapeInfo(info);
}
};
#endif // ORT_MMCV_UTILS_H
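// --- Usage sketch (illustrative, not part of the original header) ---------
// A minimal kernel (hypothetical name) showing how OrtTensorDimensions is
// typically consumed in a Compute() method: read the input shape, allocate
// an output of the same shape, and copy the data across.
struct IdentityKernelSketch {
  IdentityKernelSketch(Ort::CustomOpApi ort) : ort_(ort) {}
  void Compute(OrtKernelContext* context) {
    const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
    const float* input_data = ort_.GetTensorData<float>(input);
    // OrtTensorDimensions acts as a std::vector<int64_t> holding the shape.
    OrtTensorDimensions dims(ort_, input);
    OrtValue* output =
        ort_.KernelContext_GetOutput(context, 0, dims.data(), dims.size());
    float* output_data = ort_.GetTensorMutableData<float>(output);
    int64_t numel = 1;
    for (auto d : dims) numel *= d;
    for (int64_t i = 0; i < numel; ++i) output_data[i] = input_data[i];
  }
  Ort::CustomOpApi ort_;
};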
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_REDUCE_OPS_H
#define ONNXRUNTIME_REDUCE_OPS_H
#include <onnxruntime_cxx_api.h>
struct MMCVCumMaxKernel {
public:
MMCVCumMaxKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
dim_ = ort_.KernelInfoGetAttribute<int64_t>(info, "dim");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t dim_;
};
struct MMCVCumMinKernel {
public:
MMCVCumMinKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
dim_ = ort_.KernelInfoGetAttribute<int64_t>(info, "dim");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t dim_;
};
struct MMCVCumMaxCustomOp
: Ort::CustomOpBase<MMCVCumMaxCustomOp, MMCVCumMaxKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVCumMaxKernel(api, info);
}
const char* GetName() const { return "cummax"; }
size_t GetInputTypeCount() const { return 1; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 2; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
if (index == 1) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
struct MMCVCumMinCustomOp
: Ort::CustomOpBase<MMCVCumMinCustomOp, MMCVCumMinKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVCumMinKernel(api, info);
}
const char* GetName() const { return "cummin"; }
size_t GetInputTypeCount() const { return 1; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 2; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
if (index == 1) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif // ONNXRUNTIME_REDUCE_OPS_H
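// --- Registration sketch (illustrative, not part of the original header) --
// Before running a model that uses these ops, they must be attached to the
// session through a custom op domain. The domain name "mmcv" and the helper
// below are assumptions for the example; the project may register its ops
// elsewhere. Call it once on the Ort::SessionOptions used to build the
// session.
inline void RegisterCumOpsSketch(Ort::SessionOptions& session_options) {
  static MMCVCumMaxCustomOp cummax_op;
  static MMCVCumMinCustomOp cummin_op;
  // The domain must stay alive until the session has been created.
  static Ort::CustomOpDomain domain("mmcv");
  domain.Add(&cummax_op);
  domain.Add(&cummin_op);
  session_options.Add(domain);
}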
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_H
#define ONNXRUNTIME_ROI_ALIGN_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
struct MMCVRoiAlignKernel {
public:
MMCVRoiAlignKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
aligned_height_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
pool_mode_ = ort_.KernelInfoGetAttribute<std::string>(info, "mode");
sampling_ratio_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int aligned_height_;
int aligned_width_;
float spatial_scale_;
int sampling_ratio_;
std::string pool_mode_;
int aligned_;
};
struct MMCVRoiAlignCustomOp
: Ort::CustomOpBase<MMCVRoiAlignCustomOp, MMCVRoiAlignKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRoiAlignKernel(api, info);
}
const char* GetName() const { return "MMCVRoiAlign"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_ROI_ALIGN_H
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#define ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
struct MMCVRoIAlignRotatedKernel {
public:
MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
aligned_height_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
sampling_ratio_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
clockwise_ = ort_.KernelInfoGetAttribute<int64_t>(info, "clockwise");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int aligned_height_;
int aligned_width_;
float spatial_scale_;
int sampling_ratio_;
int aligned_;
int clockwise_;
};
struct MMCVRoIAlignRotatedCustomOp
: Ort::CustomOpBase<MMCVRoIAlignRotatedCustomOp,
MMCVRoIAlignRotatedKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRoIAlignRotatedKernel(api, info);
}
const char* GetName() const { return "MMCVRoIAlignRotated"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#ifndef ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
#define ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
#include <onnxruntime_cxx_api.h>
#include <cmath>
struct MMCVRotatedFeatureAlignKernel {
public:
MMCVRotatedFeatureAlignKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
points_ = ort_.KernelInfoGetAttribute<int64_t>(info, "points");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
float spatial_scale_;
int points_;
};
struct MMCVRotatedFeatureAlignCustomOp
: Ort::CustomOpBase<MMCVRotatedFeatureAlignCustomOp,
MMCVRotatedFeatureAlignKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRotatedFeatureAlignKernel(api, info);
}
const char* GetName() const { return "MMCVRotatedFeatureAlign"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_SOFT_NMS_H
#define ONNXRUNTIME_SOFT_NMS_H
#include <onnxruntime_cxx_api.h>
struct SoftNmsKernel {
SoftNmsKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
float iou_threshold_;
float sigma_;
float min_score_;
int64_t method_;
int64_t offset_;
};
struct SoftNmsOp : Ort::CustomOpBase<SoftNmsOp, SoftNmsKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new SoftNmsKernel(api, info);
};
const char *GetName() const { return "SoftNonMaxSuppression"; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 2; };
ONNXTensorElementDataType GetOutputType(size_t index) const {
if (index == 1) {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif // ONNXRUNTIME_SOFT_NMS_H
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_corner_pool.hpp"
#include <assert.h>
#include "trt_serialize.hpp"
void CornerPoolForwardLauncher_float(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width,
const int pool_type, cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *CORNER_POOL_PLUGIN_NAME{"MMCVCornerPool"};
} // namespace
CornerPoolPluginDynamic::CornerPoolPluginDynamic(const std::string &name,
TRT_CORNER_POOL_TYPE poolType)
: mLayerName(name), mPoolType(poolType) {}
CornerPoolPluginDynamic::CornerPoolPluginDynamic(const std::string name,
const void *data,
size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mPoolType);
}
CornerPoolPluginDynamic::~CornerPoolPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *CornerPoolPluginDynamic::clone() const {
CornerPoolPluginDynamic *plugin =
new CornerPoolPluginDynamic(mLayerName, mPoolType);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs CornerPoolPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
return inputs[0];
}
bool CornerPoolPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
switch (pos) {
// input[0]
case 0:
return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
// output[0]
case 1:
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
default:
return false;
}
}
void CornerPoolPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
size_t CornerPoolPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
// The corner pool kernel writes its result directly into the output tensor,
// so no extra workspace is required.
return 0;
}
int CornerPoolPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) {
const void *input = inputs[0];
void *output_value = outputs[0];
const int batch_size = inputDesc[0].dims.d[0];
const int channels = inputDesc[0].dims.d[1];
const int height = inputDesc[0].dims.d[2];
const int width = inputDesc[0].dims.d[3];
CornerPoolForwardLauncher_float((float *)input, (float *)output_value,
batch_size, channels, height, width,
int(mPoolType), stream);
return 0;
}
nvinfer1::DataType CornerPoolPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
return inputTypes[0];
}
// IPluginV2 Methods
const char *CornerPoolPluginDynamic::getPluginType() const {
switch (mPoolType) {
case TRT_CORNER_POOL_TYPE::TRT_TOP_POOL:
case TRT_CORNER_POOL_TYPE::TRT_BOTTOM_POOL:
case TRT_CORNER_POOL_TYPE::TRT_LEFT_POOL:
case TRT_CORNER_POOL_TYPE::TRT_RIGHT_POOL:
return CORNER_POOL_PLUGIN_NAME;
default:
return "UnknownPoolType";
}
}
const char *CornerPoolPluginDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int CornerPoolPluginDynamic::getNbOutputs() const { return 1; }
int CornerPoolPluginDynamic::initialize() { return 0; }
void CornerPoolPluginDynamic::terminate() {}
size_t CornerPoolPluginDynamic::getSerializationSize() const {
return sizeof(mPoolType);
}
void CornerPoolPluginDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mPoolType);
}
void CornerPoolPluginDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void CornerPoolPluginDynamic::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *CornerPoolPluginDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
CornerPoolPluginDynamicCreator::CornerPoolPluginDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("mode"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *CornerPoolPluginDynamicCreator::getPluginName() const {
return CORNER_POOL_PLUGIN_NAME;
}
const char *CornerPoolPluginDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
CornerPoolPluginDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *CornerPoolPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
TRT_CORNER_POOL_TYPE poolType;
int poolMode = -1;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("mode") == 0) {
poolMode = static_cast<const int *>(fc->fields[i].data)[0];
}
}
assert(poolMode >= 0 && poolMode <= 3);
switch (poolMode) {
case 0:
poolType = TRT_CORNER_POOL_TYPE::TRT_TOP_POOL;
break;
case 1:
poolType = TRT_CORNER_POOL_TYPE::TRT_BOTTOM_POOL;
break;
case 2:
poolType = TRT_CORNER_POOL_TYPE::TRT_LEFT_POOL;
break;
case 3:
poolType = TRT_CORNER_POOL_TYPE::TRT_RIGHT_POOL;
break;
default:
break;
}
CornerPoolPluginDynamic *plugin = new CornerPoolPluginDynamic(name, poolType);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *CornerPoolPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
// This object will be deleted when the network is destroyed, which will
// call CornerPoolPluginDynamic::destroy()
auto plugin = new CornerPoolPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void CornerPoolPluginDynamicCreator::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *CornerPoolPluginDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
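// --- Registration sketch (illustrative, not part of the original file) ----
// Before an engine containing "MMCVCornerPool" layers can be built or
// deserialized, the creator above has to be registered with TensorRT's
// global plugin registry. Assuming CornerPoolPluginDynamicCreator derives
// from nvinfer1::IPluginCreator, one way is the stock macro, typically
// placed in a single translation unit of the plugin library:
//
//   #include <NvInferRuntime.h>
//   REGISTER_TENSORRT_PLUGIN(CornerPoolPluginDynamicCreator);
//
// or, equivalently, an explicit call at library initialization time:
//
//   static CornerPoolPluginDynamicCreator corner_pool_creator;
//   getPluginRegistry()->registerCreator(corner_pool_creator, "");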
// Copyright (c) OpenMMLab. All rights reserved
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
__global__ void top_bottom_pool_kernel(const scalar_t *input, scalar_t *output,
const int batch_size, const int channels,
const int height, const int width,
const int pool_type) {
const int nthreads = batch_size * channels * width;
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n_idx = index / (channels * width); // batch
int w_idx = index % width; // width
int c_idx = (index / width) % channels; // channels
int offset_n = n_idx * channels * width * height;
int offset_n_c = offset_n + c_idx * width * height;
int direction = -1; // -1 or 1, default for TopPool
int index_start = height - 2; // default for TopPool
// pool_type in [0, 1]
if (pool_type == 0) {
// TopPool
// directly copy the most bottom value from input to output
output[offset_n_c + (height - 1) * width + w_idx] =
input[offset_n_c + (height - 1) * width + w_idx];
} else {
// BottomPool
// directly copy the most top value from input to output
output[offset_n_c + w_idx] = input[offset_n_c + w_idx];
index_start = 1;
direction = 1;
}
// do pool
for (int h = index_start; h >= 0 && h < height; h += direction) {
output[offset_n_c + h * width + w_idx] =
max(output[offset_n_c + (h - direction) * width + w_idx],
input[offset_n_c + h * width + w_idx]);
}
}
}
template <typename scalar_t>
__global__ void left_right_pool_kernel(const scalar_t *input, scalar_t *output,
const int batch_size, const int channels,
const int height, const int width,
const int pool_type) {
const int nthreads = batch_size * channels * height;
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n_idx = index / (channels * height); // batch
int h_idx = index % height; // height
int c_idx = (index / height) % channels; // channels
int offset_n = n_idx * channels * width * height;
int offset_n_c = offset_n + c_idx * width * height;
int offset_n_c_h = offset_n_c + h_idx * width;
int direction = -1; // -1 or 1, default for LeftPool
int index_start = width - 2; // default for LeftPool
// pool_type in [2, 3]
if (pool_type == 2) {
// LeftPool
// directly copy the most right value from input to output
output[offset_n_c_h + width - 1] = input[offset_n_c_h + width - 1];
} else {
// RightPool
// directly copy the most left value from input to output
output[offset_n_c_h] = input[offset_n_c_h];
index_start = 1;
direction = 1;
}
// do pool
for (int w = index_start; w >= 0 && w < width; w += direction) {
output[offset_n_c_h + w] =
max(output[offset_n_c_h + w - direction], input[offset_n_c_h + w]);
}
}
}
template <typename scalar_t>
void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
const int batch_size, const int channels,
const int height, const int width,
const int pool_type, cudaStream_t stream) {
int nthreads = -1, col_block = -1;
switch (pool_type) {
case 0:
case 1:
nthreads = batch_size * channels * width;
col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
top_bottom_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type);
break;
case 2:
case 3:
nthreads = batch_size * channels * height;
col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
left_right_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type);
break;
}
}
void CornerPoolForwardLauncher_float(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width,
const int pool_type, cudaStream_t stream) {
CornerPoolForwardLauncher<float>(input, output, batch_size, channels, height,
width, pool_type, stream);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <cublas_v2.h>
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
using mmcv::TensorDesc;
template <class scalar_t>
__global__ void copy_permute_kernel(scalar_t *dst, const scalar_t *src, int n,
TensorDesc ts_src_stride,
TensorDesc ts_dst_stride,
TensorDesc ts_permute) {
const int src_dim = ts_src_stride.dim;
int *src_stride = &(ts_src_stride.stride[0]);
int *dst_stride = &(ts_dst_stride.stride[0]);
int *permute = &(ts_permute.shape[0]);
CUDA_1D_KERNEL_LOOP(index, n) {
size_t dst_index = index;
size_t src_index = 0;
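// Decompose the destination linear index into per-dimension indices using
// the destination strides, and accumulate the corresponding source offset
// through the permuted source strides.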
for (int i = 0; i < src_dim; ++i) {
int dim_index = dst_index / dst_stride[i];
dst_index = dst_index % dst_stride[i];
src_index += dim_index * src_stride[permute[i]];
}
dst[index] = src[src_index];
}
}
template <class scalar_t>
void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size,
int *permute, int src_dim, cudaStream_t stream) {
size_t copy_size = 1;
TensorDesc ts_permute;
memcpy(&(ts_permute.shape[0]), permute, src_dim * sizeof(int));
TensorDesc ts_src_stride;
TensorDesc ts_dst_stride;
ts_src_stride.dim = src_dim;
ts_dst_stride.dim = src_dim;
int *src_stride = &(ts_src_stride.stride[0]);
int *dst_stride = &(ts_dst_stride.stride[0]);
int *dst_size = &(ts_dst_stride.shape[0]);
src_stride[src_dim - 1] = 1;
dst_stride[src_dim - 1] = 1;
for (int i = src_dim - 1; i >= 0; --i) {
dst_size[i] = src_size[permute[i]];
if (i < src_dim - 1) {
src_stride[i] = src_stride[i + 1] * src_size[i + 1];
}
}
for (int i = src_dim - 1; i >= 0; --i) {
copy_size *= dst_size[i];
if (i < src_dim - 1) {
dst_stride[i] = dst_stride[i + 1] * dst_size[i + 1];
}
}
copy_permute_kernel<scalar_t>
<<<GET_BLOCKS(copy_size), THREADS_PER_BLOCK, 0, stream>>>(
dst, src, copy_size, ts_src_stride, ts_dst_stride, ts_permute);
}
template void memcpyPermute<float>(float *dst, const float *src, int *src_size,
int *permute, int src_dim,
cudaStream_t stream);
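// --- Usage sketch (illustrative, not part of the original file) -----------
// memcpyPermute performs an arbitrary permutation of a dense tensor on the
// GPU. As a minimal example, the helper below (a hypothetical name) would
// transpose a row-major 2-D device buffer of shape (rows, cols) into
// (cols, rows); d_src and d_dst are assumed to be device pointers holding
// rows * cols floats each.
inline void transpose2d_sketch(float *d_dst, const float *d_src, int rows,
                               int cols, cudaStream_t stream) {
  int src_size[2] = {rows, cols};
  int permute[2] = {1, 0};  // output dim i is taken from input dim permute[i]
  memcpyPermute<float>(d_dst, d_src, src_size, permute, 2, stream);
}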
template <>
cublasStatus_t cublasGemmWrap<float>(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb, int m, int n,
int k, const float *alpha, const float *A,
int lda, const float *B, int ldb,
const float *beta, float *C, int ldc) {
return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
cublasStatus_t cublasGemmWrap<half>(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb, int m, int n,
int k, const half *alpha, const half *A,
int lda, const half *B, int ldb,
const half *beta, half *C, int ldc) {
return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_cummaxmin.hpp"
#include <assert.h>
#include "trt_serialize.hpp"
void CumMaxMinForwardLauncher_float(const float *input, float *output_value,
int *output_index, const int *dims,
int nbDims, int cum_dim, int cum_type,
cudaStream_t stream);
void CumMaxMinForwardLauncher_int32(const int *input, int *output_value,
int *output_index, const int *dims,
int nbDims, int cum_dim, int cum_type,
cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *CUMMAXMIN_PLUGIN_NAME{"cummaxmin"};
static const char *CUMMAX_PLUGIN_NAME{"cummax"};
static const char *CUMMIN_PLUGIN_NAME{"cummin"};
} // namespace
CumMaxMinPluginDynamic::CumMaxMinPluginDynamic(const std::string &name, int dim,
TRT_CUMCMPTYPE cumType)
: mLayerName(name), mDim(dim), mCumType(cumType) {}
CumMaxMinPluginDynamic::CumMaxMinPluginDynamic(const std::string name,
const void *data, size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mDim);
deserialize_value(&data, &length, &mCumType);
}
CumMaxMinPluginDynamic::~CumMaxMinPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *CumMaxMinPluginDynamic::clone() const {
CumMaxMinPluginDynamic *plugin =
new CumMaxMinPluginDynamic(mLayerName, mDim, mCumType);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs CumMaxMinPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
return inputs[0];
}
bool CumMaxMinPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
switch (pos) {
// input[0]
case 0:
return (inOut[pos].type == nvinfer1::DataType::kFLOAT ||
inOut[pos].type == nvinfer1::DataType::kINT32) &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
// output[0]
case 1:
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
// output[1]
case 2:
return inOut[pos].type == nvinfer1::DataType::kINT32 &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
default:
return false;
}
}
void CumMaxMinPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
size_t CumMaxMinPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
// The cummax/cummin kernels write directly into the output tensors, so no
// extra workspace is required.
return 0;
}
int CumMaxMinPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) {
const void *input = inputs[0];
void *output_value = outputs[0];
int *output_index = (int *)outputs[1];
const int *dims = &(inputDesc[0].dims.d[0]);
int nbDims = inputDesc[0].dims.nbDims;
switch (inputDesc[0].type) {
case nvinfer1::DataType::kFLOAT:
CumMaxMinForwardLauncher_float((float *)input, (float *)output_value,
output_index, dims, nbDims, mDim,
int(mCumType), stream);
break;
case nvinfer1::DataType::kINT32:
CumMaxMinForwardLauncher_int32((int *)input, (int *)output_value,
output_index, dims, nbDims, mDim,
int(mCumType), stream);
break;
default:
break;
}
return 0;
}
nvinfer1::DataType CumMaxMinPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
switch (index) {
case 0:
return inputTypes[0];
case 1:
return nvinfer1::DataType::kINT32;
default:
// index is only ever 0 (values) or 1 (indices); fall back to the input
// type so every control path returns a value.
return inputTypes[0];
}
}
// IPluginV2 Methods
const char *CumMaxMinPluginDynamic::getPluginType() const {
switch (mCumType) {
case TRT_CUMCMPTYPE::TRT_CUMMAX:
return CUMMAX_PLUGIN_NAME;
case TRT_CUMCMPTYPE::TRT_CUMMIN:
return CUMMIN_PLUGIN_NAME;
default:
return "UnknownCumType";
}
}
const char *CumMaxMinPluginDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int CumMaxMinPluginDynamic::getNbOutputs() const { return 2; }
int CumMaxMinPluginDynamic::initialize() { return 0; }
void CumMaxMinPluginDynamic::terminate() {}
size_t CumMaxMinPluginDynamic::getSerializationSize() const {
return sizeof(mDim) + sizeof(mCumType);
}
void CumMaxMinPluginDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mDim);
serialize_value(&buffer, mCumType);
}
void CumMaxMinPluginDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void CumMaxMinPluginDynamic::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *CumMaxMinPluginDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
CumMaxMinPluginDynamicCreator::CumMaxMinPluginDynamicCreator(
TRT_CUMCMPTYPE cumType)
: mCumType(cumType) {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("dim"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *CumMaxMinPluginDynamicCreator::getPluginName() const {
return CUMMAXMIN_PLUGIN_NAME;
}
const char *CumMaxMinPluginDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
CumMaxMinPluginDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *CumMaxMinPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
int dim = 0;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("dim") == 0) {
dim = static_cast<const int *>(fc->fields[i].data)[0];
}
}
CumMaxMinPluginDynamic *plugin =
new CumMaxMinPluginDynamic(name, dim, mCumType);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *CumMaxMinPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
// This object will be deleted when the network is destroyed, which will
// call CumMaxMinPluginDynamic::destroy()
auto plugin = new CumMaxMinPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void CumMaxMinPluginDynamicCreator::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *CumMaxMinPluginDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
CumMaxPluginDynamicCreator::CumMaxPluginDynamicCreator()
: CumMaxMinPluginDynamicCreator(TRT_CUMCMPTYPE::TRT_CUMMAX) {}
const char *CumMaxPluginDynamicCreator::getPluginName() const {
return CUMMAX_PLUGIN_NAME;
}
CumMinPluginDynamicCreator::CumMinPluginDynamicCreator()
: CumMaxMinPluginDynamicCreator(TRT_CUMCMPTYPE::TRT_CUMMIN) {}
const char *CumMinPluginDynamicCreator::getPluginName() const {
return CUMMIN_PLUGIN_NAME;
}
// Copyright (c) OpenMMLab. All rights reserved
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
using mmcv::TensorDesc;
template <typename scalar_t>
__global__ void cummaxmin_kernel(const scalar_t *input, scalar_t *output_value,
int *output_index, TensorDesc tensor_desc,
int cum_dim, int cum_type) {
const size_t cum_size = tensor_desc.shape[cum_dim];
const size_t cum_stride = tensor_desc.stride[cum_dim];
const size_t data_size =
tensor_desc.stride[0] * tensor_desc.shape[0] / cum_size;
CUDA_1D_KERNEL_LOOP(index, data_size) {
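// `index` enumerates every position of the tensor with the cum dimension
// collapsed; cum_offset is the linear offset of the first element along
// cum_dim at that position.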
size_t cum_offset =
index / cum_stride * (cum_size * cum_stride) + index % cum_stride;
int cum_index = 0;
auto cum_value = input[cum_offset];
output_value[cum_offset] = cum_value;
output_index[cum_offset] = cum_index;
for (size_t cum_index_current = 1; cum_index_current < cum_size;
++cum_index_current) {
cum_offset += cum_stride;
const auto cum_value_current = input[cum_offset];
switch (cum_type) {
case 0: // max
if (cum_value_current > cum_value) {
cum_value = cum_value_current;
cum_index = cum_index_current;
}
break;
case 1: // min
if (cum_value_current < cum_value) {
cum_value = cum_value_current;
cum_index = cum_index_current;
}
break;
}
output_value[cum_offset] = cum_value;
output_index[cum_offset] = cum_index;
}
}
}
template <typename scalar_t>
void CumMaxMinForwardLauncher(const scalar_t *input, scalar_t *output_value,
int *output_index, const int *dims, int nbDims,
int cum_dim, int cum_type, cudaStream_t stream) {
// fill the tensor descriptor: copy the shape and compute contiguous strides
TensorDesc tensor_desc;
memset((void *)&tensor_desc, 0, sizeof(TensorDesc));
tensor_desc.dim = nbDims;
tensor_desc.shape[nbDims - 1] = dims[nbDims - 1];
tensor_desc.stride[nbDims - 1] = 1;
for (int i = nbDims - 2; i >= 0; --i) {
tensor_desc.shape[i] = dims[i];
tensor_desc.stride[i] = dims[i + 1] * tensor_desc.stride[i + 1];
}
// convert a negative cum_dim into its equivalent non-negative axis
cum_dim = cum_dim >= 0 ? cum_dim : (nbDims + cum_dim);
const int data_size =
tensor_desc.stride[0] * tensor_desc.shape[0] / tensor_desc.shape[cum_dim];
const int col_block = GET_BLOCKS(data_size, THREADS_PER_BLOCK);
cummaxmin_kernel<scalar_t><<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output_value, output_index, tensor_desc, cum_dim, cum_type);
}
void CumMaxMinForwardLauncher_float(const float *input, float *output_value,
int *output_index, const int *dims,
int nbDims, int cum_dim, int cum_type,
cudaStream_t stream) {
CumMaxMinForwardLauncher<float>(input, output_value, output_index, dims,
nbDims, cum_dim, cum_type, stream);
}
void CumMaxMinForwardLauncher_int32(const int *input, int *output_value,
int *output_index, const int *dims,
int nbDims, int cum_dim, int cum_type,
cudaStream_t stream) {
CumMaxMinForwardLauncher<int>(input, output_value, output_index, dims, nbDims,
cum_dim, cum_type, stream);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_deform_conv.hpp"
#include <assert.h>
#include <algorithm>
#include <chrono>
#include "trt_serialize.hpp"
void DeformConvForwardCUDAKernelLauncher_float(
const float *input, const float *weight, const float *offset, float *output,
void *workspace, int batchSize, int nInputPlane, int inputHeight,
int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVDeformConv2d"};
} // namespace
nvinfer1::PluginFieldCollection DeformableConvPluginDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
DeformableConvPluginDynamicCreator::mPluginAttributes;
DeformableConvPluginDynamic::DeformableConvPluginDynamic(
const std::string &name, const nvinfer1::Dims &stride,
const nvinfer1::Dims &padding, const nvinfer1::Dims &dilation,
const int deformableGroup, const int group, int im2colStep)
: mLayerName(name),
mStride(stride),
mPadding(padding),
mDilation(dilation),
mDeformableGroup(deformableGroup),
mGroup(group),
mIm2colStep(im2colStep) {}
DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string name,
const void *data,
size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mStride);
deserialize_value(&data, &length, &mPadding);
deserialize_value(&data, &length, &mDilation);
deserialize_value(&data, &length, &mDeformableGroup);
deserialize_value(&data, &length, &mGroup);
deserialize_value(&data, &length, &mIm2colStep);
}
DeformableConvPluginDynamic::~DeformableConvPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *DeformableConvPluginDynamic::clone() const {
DeformableConvPluginDynamic *plugin =
new DeformableConvPluginDynamic(mLayerName, mStride, mPadding, mDilation,
mDeformableGroup, mGroup, mIm2colStep);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[2].d[0];
ret.d[2] = inputs[1].d[2];
ret.d[3] = inputs[1].d[3];
return ret;
}
bool DeformableConvPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
if (pos == 0) {
return (inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
}
}
void DeformableConvPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
size_t DeformableConvPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
int sizeof_dtype = mmcv::getElementSize(outputs[0].type);
int batch_size = inputs[0].dims.d[0];
int nInputPlane = inputs[0].dims.d[1];
int inputHeight = inputs[0].dims.d[2];
int inputWidth = inputs[0].dims.d[3];
int nOutputPlane = outputs[0].dims.d[1];
int outputHeight = outputs[0].dims.d[2];
int outputWidth = outputs[0].dims.d[3];
int kW = inputs[2].dims.d[2];
int kH = inputs[2].dims.d[3];
int im2col_step = std::min(batch_size, mIm2colStep);
size_t col_size =
mmcv::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight *
outputWidth * sizeof_dtype);
size_t out_size = 0;
if (im2col_step != 1)
out_size = mmcv::getAlignedSize(batch_size * nOutputPlane * outputHeight *
outputWidth * sizeof_dtype);
return col_size + out_size;
}
int DeformableConvPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) {
int batch_size = inputDesc[0].dims.d[0];
int inputChannel = inputDesc[0].dims.d[1];
int inputHeight = inputDesc[0].dims.d[2];
int inputWidth = inputDesc[0].dims.d[3];
int outputChannel = outputDesc[0].dims.d[1];
int kernelHeight = inputDesc[2].dims.d[2];
int kernelWidth = inputDesc[2].dims.d[3];
const void *x = inputs[0];
const void *offset = inputs[1];
const void *weight = inputs[2];
void *output = outputs[0];
int im2col_step = std::min(batch_size, mIm2colStep);
// TODO: add fp16 support
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
DeformConvForwardCUDAKernelLauncher_float(
(float *)x, (float *)weight, (float *)offset, (float *)output,
workSpace, batch_size, inputChannel, inputHeight, inputWidth,
outputChannel, kernelWidth, kernelHeight, mStride.d[0], mStride.d[1],
mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup,
mDeformableGroup, im2col_step, m_cublas_handle, stream);
break;
default:
return 1;
break;
}
return 0;
}
nvinfer1::DataType DeformableConvPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
return inputTypes[0];
}
// IPluginV2 Methods
const char *DeformableConvPluginDynamic::getPluginType() const {
return PLUGIN_NAME;
}
const char *DeformableConvPluginDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int DeformableConvPluginDynamic::getNbOutputs() const { return 1; }
int DeformableConvPluginDynamic::initialize() { return 0; }
void DeformableConvPluginDynamic::terminate() {}
size_t DeformableConvPluginDynamic::getSerializationSize() const {
return sizeof(mStride) + sizeof(mPadding) + sizeof(mDilation) +
sizeof(mDeformableGroup) + sizeof(mGroup) + sizeof(mIm2colStep);
}
void DeformableConvPluginDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mStride);
serialize_value(&buffer, mPadding);
serialize_value(&buffer, mDilation);
serialize_value(&buffer, mDeformableGroup);
serialize_value(&buffer, mGroup);
serialize_value(&buffer, mIm2colStep);
}
void DeformableConvPluginDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void DeformableConvPluginDynamic::attachToContext(
cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) {
m_cublas_handle = cublasContext;
}
void DeformableConvPluginDynamic::detachFromContext() {}
void DeformableConvPluginDynamic::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *DeformableConvPluginDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
////////////////////// creator /////////////////////////////
DeformableConvPluginDynamicCreator::DeformableConvPluginDynamicCreator() {
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("bias"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("im2col_step"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *DeformableConvPluginDynamicCreator::getPluginName() const {
return PLUGIN_NAME;
}
const char *DeformableConvPluginDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
DeformableConvPluginDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
nvinfer1::Dims stride{2, {1, 1}};
nvinfer1::Dims padding{2, {0, 0}};
nvinfer1::Dims dilation{2, {1, 1}};
int deformableGroup = 1;
int group = 1;
int im2col_step = 32;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("stride") == 0) {
stride.nbDims = 2;
stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
if (fc->fields[i].length == 1) {
stride.d[1] = stride.d[0];
} else {
stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
if (field_name.compare("padding") == 0) {
padding.nbDims = 2;
padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
if (fc->fields[i].length == 1) {
padding.d[1] = padding.d[0];
} else {
padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
if (field_name.compare("dilation") == 0) {
dilation.nbDims = 2;
dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
if (fc->fields[i].length == 1) {
dilation.d[1] = dilation.d[0];
} else {
dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
if (field_name.compare("deform_groups") == 0) {
deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("group") == 0) {
group = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("im2col_step") == 0) {
im2col_step = static_cast<const int *>(fc->fields[i].data)[0];
}
}
DeformableConvPluginDynamic *plugin = new DeformableConvPluginDynamic(
name, stride, padding, dilation, deformableGroup, group, im2col_step);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
auto plugin = new DeformableConvPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void DeformableConvPluginDynamicCreator::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *DeformableConvPluginDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
// Copyright (c) OpenMMLab. All rights reserved
#include <cuda_fp16.h>
#include <algorithm>
#include "common_cuda_helper.hpp"
#include "deform_conv_cuda_kernel.cuh"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
template <typename T>
void trt_deformable_im2col(const T* data_input, const T* data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
T* data_col, cudaStream_t stream) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
deformable_im2col_gpu_kernel<T>
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, data_input, data_offset, height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col, data_col);
cudaCheckError();
}
template <typename scalar_t>
void DeformConvForwardCUDAKernelLauncher(
const scalar_t* input, const scalar_t* weight, const scalar_t* offset,
scalar_t* output, void* workspace, int batchSize, int nInputPlane,
int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW,
int dH, int padW, int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
cudaStream_t stream) {
size_t word_size = sizeof(scalar_t);
im2col_step = std::min(int(batchSize), im2col_step);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
long long columns_size =
mmcv::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight *
outputWidth * word_size);
// column buffer for im2col
scalar_t* columns = (scalar_t*)workspace;
workspace = static_cast<char*>(workspace) + columns_size;
scalar_t* output_buffer;
long long output_buffer_size = 0;
if (im2col_step == 1) {
output_buffer = output;
} else {
// the output needs a permute step when im2col_step != 1
output_buffer = (scalar_t*)workspace;
output_buffer_size = batchSize * nOutputPlane * outputWidth * outputHeight;
}
long long input_elt_step =
im2col_step * nInputPlane * inputHeight * inputWidth;
long long offset_elt_step =
im2col_step * deformable_group * 2 * kH * kW * outputHeight * outputWidth;
long long out_buffer_step =
nOutputPlane * im2col_step * outputHeight * outputWidth;
long long col_g_step =
nInputPlane * kW * kH / group * im2col_step * outputHeight * outputWidth;
long long weight_g_step =
nOutputPlane / group * nInputPlane / group * kH * kW;
long long out_buffer_g_step =
nOutputPlane / group * im2col_step * outputHeight * outputWidth;
int m = nOutputPlane / group;
int n = im2col_step * outputHeight * outputWidth;
int k = nInputPlane / group * kH * kW;
scalar_t alpha = 1.;
scalar_t beta = 0.;
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
const scalar_t* input_start = input + elt * input_elt_step;
const scalar_t* offset_start = offset + elt * offset_elt_step;
trt_deformable_im2col<scalar_t>(input_start, offset_start, nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW,
dH, dW, dilationH, dilationW, im2col_step,
deformable_group, columns, stream);
for (int g = 0; g < group; ++g) {
const scalar_t* weight_start = weight + g * weight_g_step;
scalar_t* col_start = columns + g * col_g_step;
scalar_t* out_buffer_start =
output_buffer + elt * out_buffer_step + g * out_buffer_g_step;
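// cuBLAS assumes column-major storage, so this computes
//   out^T (n x m) = columns^T (n x k) * weight^T (k x m),
// which is exactly the row-major product
//   out (m x n) = weight (m x k) * columns (k x n)
// without any explicit transpose.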
cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
&alpha, col_start, n, weight_start, k, &beta,
out_buffer_start, n);
cudaCheckError();
}
}
if (im2col_step != 1) {
int output_buffer_shape[5] = {batchSize / im2col_step, nOutputPlane,
im2col_step, static_cast<int>(outputHeight),
static_cast<int>(outputWidth)};
int output_buffer_permute[5] = {0, 2, 1, 3, 4};
memcpyPermute<scalar_t>(output, output_buffer, &output_buffer_shape[0],
&output_buffer_permute[0], 5, stream);
}
}
void DeformConvForwardCUDAKernelLauncher_float(
const float* input, const float* weight, const float* offset, float* output,
void* workspace, int batchSize, int nInputPlane, int inputHeight,
int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream) {
DeformConvForwardCUDAKernelLauncher<float>(
input, weight, offset, output, workspace, batchSize, nInputPlane,
inputHeight, inputWidth, nOutputPlane, kW, kH, dW, dH, padW, padH,
dilationW, dilationH, group, deformable_group, im2col_step, cublas_handle,
stream);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_grid_sampler.hpp"
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include "trt_serialize.hpp"
using mmcv::GridSamplerInterpolation;
using mmcv::GridSamplerPadding;
void grid_sample_float(float *output, const float *input, const float *grid,
int *output_dims, int *input_dims, int *grid_dims,
int nb_dims, GridSamplerInterpolation interp,
GridSamplerPadding padding, bool align_corners,
cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"grid_sampler"};
} // namespace
nvinfer1::PluginFieldCollection GridSamplerDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField> GridSamplerDynamicCreator::mPluginAttributes;
GridSamplerDynamic::GridSamplerDynamic(const std::string &name, int mode,
int paddingMode, bool alignCorners)
: mLayerName(name),
mMode(mode),
mPaddingMode(paddingMode),
mAlignCorners(alignCorners) {}
GridSamplerDynamic::GridSamplerDynamic(const std::string name, const void *data,
size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mMode);
deserialize_value(&data, &length, &mPaddingMode);
deserialize_value(&data, &length, &mAlignCorners);
}
nvinfer1::IPluginV2DynamicExt *GridSamplerDynamic::clone() const {
GridSamplerDynamic *plugin =
new GridSamplerDynamic(mLayerName, mMode, mPaddingMode, mAlignCorners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs GridSamplerDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
nvinfer1::DimsExprs ret;
ret.nbDims = inputs[0].nbDims;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[0].d[1];
for (int i = 2; i < ret.nbDims; ++i) {
ret.d[i] = inputs[1].d[i - 1];
}
return ret;
}
bool GridSamplerDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
if (pos == 0) {
return (inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
}
}
void GridSamplerDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {
// Validate input arguments
}
size_t GridSamplerDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
return 0;
}
int GridSamplerDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs,
void *workSpace, cudaStream_t stream) {
nvinfer1::Dims input_dims = inputDesc[0].dims;
nvinfer1::Dims grid_dims = inputDesc[1].dims;
nvinfer1::Dims output_dims = outputDesc[0].dims;
using mmcv::GridSamplerInterpolation;
using mmcv::GridSamplerPadding;
GridSamplerInterpolation interp_mode = GridSamplerInterpolation::Bilinear;
switch (mMode) {
case 0:
interp_mode = GridSamplerInterpolation::Bilinear;
break;
case 1:
interp_mode = GridSamplerInterpolation::Nearest;
break;
default:
break;
}
GridSamplerPadding padding_mode = GridSamplerPadding::Zeros;
switch (mPaddingMode) {
case 0:
padding_mode = GridSamplerPadding::Zeros;
break;
case 1:
padding_mode = GridSamplerPadding::Border;
break;
case 2:
padding_mode = GridSamplerPadding::Reflection;
break;
default:
break;
}
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
grid_sample_float(
(float *)outputs[0], (float *)inputs[0], (float *)inputs[1],
&(output_dims.d[0]), &(input_dims.d[0]), &(grid_dims.d[0]),
input_dims.nbDims, interp_mode, padding_mode, mAlignCorners, stream);
break;
default:
return 1;
break;
}
return 0;
}
nvinfer1::DataType GridSamplerDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
return inputTypes[0];
}
// IPluginV2 Methods
const char *GridSamplerDynamic::getPluginType() const { return PLUGIN_NAME; }
const char *GridSamplerDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int GridSamplerDynamic::getNbOutputs() const { return 1; }
int GridSamplerDynamic::initialize() { return 0; }
void GridSamplerDynamic::terminate() {}
size_t GridSamplerDynamic::getSerializationSize() const {
return sizeof(mMode) + sizeof(mPaddingMode) + sizeof(mAlignCorners);
}
void GridSamplerDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mMode);
serialize_value(&buffer, mPaddingMode);
serialize_value(&buffer, mAlignCorners);
}
void GridSamplerDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void GridSamplerDynamic::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *GridSamplerDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
////////////////////// creator /////////////////////////////
GridSamplerDynamicCreator::GridSamplerDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("interpolation_mode"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("padding_mode"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *GridSamplerDynamicCreator::getPluginName() const {
return PLUGIN_NAME;
}
const char *GridSamplerDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
GridSamplerDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *GridSamplerDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
int mode = 0;
int paddingMode = 0;
bool alignCorners = false;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("interpolation_mode") == 0) {
mode = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("padding_mode") == 0) {
paddingMode = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("align_corners") == 0) {
alignCorners = (bool)(static_cast<const int *>(fc->fields[i].data)[0]);
}
}
GridSamplerDynamic *plugin =
new GridSamplerDynamic(name, mode, paddingMode, alignCorners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *GridSamplerDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
// This object will be deleted when the network is destroyed, which will
// call GridSamplerDynamic::destroy()
auto plugin = new GridSamplerDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void GridSamplerDynamicCreator::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *GridSamplerDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cuh
// and
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cu
#include <cuda_fp16.h>
#include <stdio.h>
#include <algorithm>
#include <climits>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_grid_sampler.hpp"
#include "trt_plugin_helper.hpp"
using mmcv::GridSamplerInterpolation;
using mmcv::GridSamplerPadding;
using mmcv::TensorDesc;
// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
// if align_corners: -1 and +1 get sent to the centers of the corner pixels
// -1 --> 0
// +1 --> (size - 1)
// scale_factor = (size - 1) / 2
// if not align_corners: -1 and +1 get sent to the image edges
// -1 --> -0.5
// +1 --> (size - 1) + 0.5 == size - 0.5
// scale_factor = size / 2
template <typename scalar_t>
static __forceinline__ __device__ scalar_t
grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) {
if (align_corners) {
// unnormalize coord from [-1, 1] to [0, size - 1]
return ((coord + 1.f) / 2) * (size - 1);
} else {
// unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
return ((coord + 1.f) * size - 1) / 2;
}
}
// Clips coordinates to between 0 and clip_limit - 1
template <typename scalar_t>
static __forceinline__ __device__ scalar_t clip_coordinates(scalar_t in,
int clip_limit) {
return ::min(static_cast<scalar_t>(clip_limit - 1),
::max(in, static_cast<scalar_t>(0)));
}
// Reflects coordinates until they fall between low and high (inclusive).
// The bounds are passed as twice their value so that half-integer values
// can be represented as ints.
template <typename scalar_t>
static __forceinline__ __device__ scalar_t reflect_coordinates(scalar_t in,
int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<scalar_t>(0);
}
scalar_t min = static_cast<scalar_t>(twice_low) / 2;
scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
in = ::fabs(in - min);
// `fmod` returns same sign as `in`, which is positive after the `fabs` above.
scalar_t extra = ::fmod(in, span);
int flips = static_cast<int>(::floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename scalar_t>
static __forceinline__ __device__ scalar_t
safe_downgrade_to_int_range(scalar_t x) {
// -100.0 does not have special meaning. This is just to make sure
// it's not within_bounds_2d or within_bounds_3d, and does not cause
// undefined behavior. See #35506.
if (x > INT_MAX - 1 || x < INT_MIN || !::isfinite(static_cast<double>(x)))
return static_cast<scalar_t>(-100.0);
return x;
}
// Computes the pixel source index value for a grid coordinate
template <typename scalar_t>
static __forceinline__ __device__ scalar_t grid_sampler_compute_source_index(
scalar_t coord, int size, GridSamplerPadding padding_mode,
bool align_corners) {
coord = grid_sampler_unnormalize(coord, size, align_corners);
if (padding_mode == GridSamplerPadding::Border) {
// clip coordinates to image borders
coord = clip_coordinates(coord, size);
} else if (padding_mode == GridSamplerPadding::Reflection) {
// reflect coordinates by image borders
if (align_corners) {
coord = reflect_coordinates(coord, 0, 2 * (size - 1));
} else {
coord = reflect_coordinates(coord, -1, 2 * size - 1);
}
// clip coordinates to image borders
coord = clip_coordinates(coord, size);
}
coord = safe_downgrade_to_int_range(coord);
return coord;
}
static __forceinline__ __device__ bool within_bounds_2d(int h, int w, int H,
int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
static __forceinline__ __device__ bool within_bounds_3d(int d, int h, int w,
int D, int H, int W) {
return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
}
template <typename scalar_t>
__global__ void grid_sampler_2d_kernel(
const int nthreads, const scalar_t *input, const scalar_t *grid,
scalar_t *output, TensorDesc input_desc, TensorDesc grid_desc,
TensorDesc output_desc, const GridSamplerInterpolation interpolation_mode,
const GridSamplerPadding padding_mode, bool align_corners) {
int C = input_desc.shape[1];
int inp_H = input_desc.shape[2];
int inp_W = input_desc.shape[3];
int out_H = grid_desc.shape[1];
int out_W = grid_desc.shape[2];
int inp_sN = input_desc.stride[0];
int inp_sC = input_desc.stride[1];
int inp_sH = input_desc.stride[2];
int inp_sW = input_desc.stride[3];
int grid_sN = grid_desc.stride[0];
int grid_sH = grid_desc.stride[1];
int grid_sW = grid_desc.stride[2];
int grid_sCoor = grid_desc.stride[3];
int out_sN = output_desc.stride[0];
int out_sC = output_desc.stride[1];
int out_sH = output_desc.stride[2];
int out_sW = output_desc.stride[3];
CUDA_1D_KERNEL_LOOP(index, nthreads) {
const int w = index % out_W;
const int h = (index / out_W) % out_H;
const int n = index / (out_H * out_W);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y coordinates from grid
scalar_t ix = grid[grid_offset];
scalar_t iy = grid[grid_offset + grid_sCoor];
ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode,
align_corners);
iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode,
align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get NE, NW, SE, SW pixel values from (x, y)
int ix_nw = static_cast<int>(::floor(ix));
int iy_nw = static_cast<int>(::floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
// get surfaces to each neighbor:
scalar_t nw = (ix_se - ix) * (iy_se - iy);
scalar_t ne = (ix - ix_sw) * (iy_sw - iy);
scalar_t sw = (ix_ne - ix) * (iy - iy_ne);
scalar_t se = (ix - ix_nw) * (iy - iy_nw);
// calculate bilinear weighted pixel value and set output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < C;
++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
*out_ptr_NCHW = static_cast<scalar_t>(0);
if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int ix_nearest = static_cast<int>(::round(ix));
int iy_nearest = static_cast<int>(::round(iy));
// assign nearest neighbor pixel value to output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < C;
++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
*out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<scalar_t>(0);
}
}
}
}
}
template <typename scalar_t>
__global__ void grid_sampler_3d_kernel(
const int nthreads, const scalar_t *input, const scalar_t *grid,
scalar_t *output, TensorDesc input_desc, TensorDesc grid_desc,
TensorDesc output_desc, const GridSamplerInterpolation interpolation_mode,
const GridSamplerPadding padding_mode, bool align_corners) {
int C = input_desc.shape[1];
int inp_D = input_desc.shape[2];
int inp_H = input_desc.shape[3];
int inp_W = input_desc.shape[4];
int out_D = grid_desc.shape[1];
int out_H = grid_desc.shape[2];
int out_W = grid_desc.shape[3];
int inp_sN = input_desc.stride[0];
int inp_sC = input_desc.stride[1];
int inp_sD = input_desc.stride[2];
int inp_sH = input_desc.stride[3];
int inp_sW = input_desc.stride[4];
int grid_sN = grid_desc.stride[0];
int grid_sD = grid_desc.stride[1];
int grid_sH = grid_desc.stride[2];
int grid_sW = grid_desc.stride[3];
int grid_sCoor = grid_desc.stride[4];
int out_sN = output_desc.stride[0];
int out_sC = output_desc.stride[1];
int out_sD = output_desc.stride[2];
int out_sH = output_desc.stride[3];
int out_sW = output_desc.stride[4];
CUDA_1D_KERNEL_LOOP(index, nthreads) {
const int w = index % out_W;
const int h = (index / out_W) % out_H;
const int d = (index / (out_H * out_W)) % out_D;
const int n = index / (out_D * out_H * out_W);
const int grid_offset =
n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z coordinates from grid
scalar_t ix = grid[grid_offset];
scalar_t iy = grid[grid_offset + grid_sCoor];
scalar_t iz = grid[grid_offset + 2 * grid_sCoor];
ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode,
align_corners);
iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode,
align_corners);
iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode,
align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get corner pixel values from (x, y, z)
// for 4d, we used north-east-south-west
// for 5d, we add top-bottom
int ix_tnw = static_cast<int>(::floor(ix));
int iy_tnw = static_cast<int>(::floor(iy));
int iz_tnw = static_cast<int>(::floor(iz));
int ix_tne = ix_tnw + 1;
int iy_tne = iy_tnw;
int iz_tne = iz_tnw;
int ix_tsw = ix_tnw;
int iy_tsw = iy_tnw + 1;
int iz_tsw = iz_tnw;
int ix_tse = ix_tnw + 1;
int iy_tse = iy_tnw + 1;
int iz_tse = iz_tnw;
int ix_bnw = ix_tnw;
int iy_bnw = iy_tnw;
int iz_bnw = iz_tnw + 1;
int ix_bne = ix_tnw + 1;
int iy_bne = iy_tnw;
int iz_bne = iz_tnw + 1;
int ix_bsw = ix_tnw;
int iy_bsw = iy_tnw + 1;
int iz_bsw = iz_tnw + 1;
int ix_bse = ix_tnw + 1;
int iy_bse = iy_tnw + 1;
int iz_bse = iz_tnw + 1;
// get surfaces to each neighbor:
scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCDHW =
output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
for (int c = 0; c < C;
++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
// (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) *
// tne
// + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) *
// tse
// + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) *
// bne
// + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) *
// bse
*out_ptr_NCDHW = static_cast<scalar_t>(0);
if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] *
tnw;
}
if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] *
tne;
}
if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] *
tsw;
}
if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] *
tse;
}
if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] *
bnw;
}
if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] *
bne;
}
if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] *
bsw;
}
if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] *
bse;
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int ix_nearest = static_cast<int>(::round(ix));
int iy_nearest = static_cast<int>(::round(iy));
int iz_nearest = static_cast<int>(::round(iz));
// assign nearest neighbor pixel value to output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCDHW =
output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
for (int c = 0; c < C;
++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H,
inp_W)) {
*out_ptr_NCDHW =
inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH +
ix_nearest * inp_sW];
} else {
*out_ptr_NCDHW = static_cast<scalar_t>(0);
}
}
}
}
}
void create_desc(const int *dims, int nb_dims, TensorDesc &desc) {
memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims);
desc.stride[nb_dims - 1] = 1;
for (int i = nb_dims - 2; i >= 0; --i) {
desc.stride[i] = desc.stride[i + 1] * desc.shape[i + 1];
}
}
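// For example, dims = {2, 3, 4, 5} (NCHW) yields strides = {60, 20, 5, 1}.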
template <typename T>
void grid_sample(T *output, const T *input, const T *grid, int *output_dims,
int *input_dims, int *grid_dims, int nb_dims,
GridSamplerInterpolation interp, GridSamplerPadding padding,
bool align_corners, cudaStream_t stream) {
TensorDesc input_desc;
create_desc(input_dims, nb_dims, input_desc);
TensorDesc output_desc;
create_desc(output_dims, nb_dims, output_desc);
TensorDesc grid_desc;
create_desc(grid_dims, nb_dims, grid_desc);
int count = 1;
for (int i = 0; i < nb_dims; ++i) {
if (i == 1) {
continue;
}
count *= output_desc.shape[i];
}
if (nb_dims == 4) {
grid_sampler_2d_kernel<T>
<<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
count, input, grid, output, input_desc, grid_desc, output_desc,
interp, padding, align_corners);
} else if (nb_dims == 5) {
grid_sampler_3d_kernel<T>
<<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
count, input, grid, output, input_desc, grid_desc, output_desc,
interp, padding, align_corners);
} else {
printf("input and grid dims should be 4 or 5\n");
}
}
void grid_sample_float(float *output, const float *input, const float *grid,
int *output_dims, int *input_dims, int *grid_dims,
int nb_dims, GridSamplerInterpolation interp,
GridSamplerPadding padding, bool align_corners,
cudaStream_t stream) {
grid_sample<float>(output, input, grid, output_dims, input_dims, grid_dims,
nb_dims, interp, padding, align_corners, stream);
}
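// Usage sketch (illustrative only; the shapes below are hypothetical):
// for a 4-D sampling with input [1, 3, 8, 8] and grid [1, 4, 4, 2], the
// output is [1, 3, 4, 4] and the call looks like
//   int input_dims[4] = {1, 3, 8, 8};
//   int grid_dims[4] = {1, 4, 4, 2};
//   int output_dims[4] = {1, 3, 4, 4};
//   grid_sample_float(output, input, grid, output_dims, input_dims, grid_dims,
//                     4, GridSamplerInterpolation::Bilinear,
//                     GridSamplerPadding::Border, /*align_corners=*/false,
//                     stream);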
// Copyright (c) OpenMMLab. All rights reserved
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp
#include "trt_instance_norm.hpp"
#include <cuda_fp16.h>
#include <stdexcept>
#include "trt_serialize.hpp"
using namespace nvinfer1;
cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype,
cudnnDataType_t* cudnn_dtype) {
switch (trt_dtype) {
case nvinfer1::DataType::kFLOAT:
*cudnn_dtype = CUDNN_DATA_FLOAT;
break;
case nvinfer1::DataType::kHALF:
*cudnn_dtype = CUDNN_DATA_HALF;
break;
default:
return CUDNN_STATUS_BAD_PARAM;
}
return CUDNN_STATUS_SUCCESS;
}
namespace {
constexpr const char* PLUGIN_VERSION{"1"};
constexpr const char* PLUGIN_NAME{"MMCVInstanceNormalization"};
} // namespace
PluginFieldCollection InstanceNormalizationDynamicCreator::mFC{};
std::vector<PluginField> InstanceNormalizationDynamicCreator::mPluginAttributes;
InstanceNormalizationDynamic::InstanceNormalizationDynamic(
const std::string& name, float epsilon)
: mLayerName(name), mEpsilon(epsilon) {}
InstanceNormalizationDynamic::InstanceNormalizationDynamic(
const std::string& name, void const* serialData, size_t serialLength)
: mLayerName(name) {
deserialize_value(&serialData, &serialLength, &mEpsilon);
}
InstanceNormalizationDynamic::~InstanceNormalizationDynamic() {}
// InstanceNormalizationDynamic returns one output.
int InstanceNormalizationDynamic::getNbOutputs() const { return 1; }
DimsExprs InstanceNormalizationDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) {
nvinfer1::DimsExprs output(inputs[0]);
return output;
}
int InstanceNormalizationDynamic::initialize() { return 0; }
void InstanceNormalizationDynamic::terminate() {}
size_t InstanceNormalizationDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const {
int n = inputs[0].dims.d[0];
int c = inputs[0].dims.d[1];
int elem_size = mmcv::getElementSize(inputs[1].type);
return mmcv::getAlignedSize(n * c * elem_size) * 2;
}
int InstanceNormalizationDynamic::enqueue(
const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
void* const* outputs, void* workspace, cudaStream_t stream) {
nvinfer1::Dims input_dims = inputDesc[0].dims;
int n = input_dims.d[0];
int c = input_dims.d[1];
int h = input_dims.d[2];
int w = input_dims.nbDims > 3 ? input_dims.d[3] : 1;
int elem_size = mmcv::getElementSize(inputDesc[1].type);
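  // Instance normalization is implemented as batch normalization over a
  // tensor reshaped to [1, N*C, H, W]; the per-channel scale and bias are
  // therefore tiled N times into the workspace below.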
  char* n_scales = static_cast<char*>(workspace);
  char* n_bias =
      static_cast<char*>(workspace) + mmcv::getAlignedSize(n * c * elem_size);
  const void* scales = (const void*)inputs[1];
  const void* bias = (const void*)inputs[2];
  for (int i = 0; i < n; ++i) {
    cudaMemcpyAsync(n_scales + i * c * elem_size, scales, c * elem_size,
                    cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync(n_bias + i * c * elem_size, bias, c * elem_size,
                    cudaMemcpyDeviceToDevice, stream);
  }
cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1,
n * c, 1, 1);
cudnnDataType_t cudnn_dtype{};
convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype);
cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c,
h, w);
cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c,
h, w);
float alpha = 1;
float beta = 0;
void const* x_ptr = inputs[0];
void* y_ptr = outputs[0];
cudnnSetStream(_cudnn_handle, stream);
// Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical
// overflows (NaNs) for fp32 data in some circumstances. The lower-
// performance CUDNN_BATCHNORM_SPATIAL should be used if this is not
// acceptable.
cudnnBatchNormalizationForwardTraining(
_cudnn_handle, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, &alpha, &beta, _x_desc,
x_ptr, _y_desc, y_ptr, _b_desc, n_scales, n_bias, 1., nullptr, nullptr,
mEpsilon, nullptr, nullptr);
return 0;
}
size_t InstanceNormalizationDynamic::getSerializationSize() const {
return serialized_size(mEpsilon);
}
void InstanceNormalizationDynamic::serialize(void* buffer) const {
serialize_value(&buffer, mEpsilon);
}
bool InstanceNormalizationDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
int nbOutputs) {
return ((inOut[pos].type == nvinfer1::DataType::kFLOAT ||
inOut[pos].type == nvinfer1::DataType::kHALF) &&
inOut[pos].format == nvinfer1::PluginFormat::kLINEAR &&
inOut[pos].type == inOut[0].type);
}
const char* InstanceNormalizationDynamic::getPluginType() const {
return PLUGIN_NAME;
}
const char* InstanceNormalizationDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
void InstanceNormalizationDynamic::destroy() { delete this; }
IPluginV2DynamicExt* InstanceNormalizationDynamic::clone() const {
auto* plugin = new InstanceNormalizationDynamic{mLayerName, mEpsilon};
plugin->setPluginNamespace(mPluginNamespace.c_str());
return plugin;
}
// Set plugin namespace
void InstanceNormalizationDynamic::setPluginNamespace(
const char* pluginNamespace) {
mPluginNamespace = pluginNamespace;
}
const char* InstanceNormalizationDynamic::getPluginNamespace() const {
return mPluginNamespace.c_str();
}
nvinfer1::DataType InstanceNormalizationDynamic::getOutputDataType(
int index, const nvinfer1::DataType* inputTypes, int nbInputs) const {
return inputTypes[0];
}
// Attach the plugin object to an execution context and grant the plugin the
// access to some context resource.
void InstanceNormalizationDynamic::attachToContext(
cudnnContext* cudnnContext, cublasContext* cublasContext,
IGpuAllocator* gpuAllocator) {
_cudnn_handle = cudnnContext;
cudnnCreateTensorDescriptor(&_b_desc);
cudnnCreateTensorDescriptor(&_x_desc);
cudnnCreateTensorDescriptor(&_y_desc);
}
// Detach the plugin object from its execution context.
void InstanceNormalizationDynamic::detachFromContext() {
cudnnDestroyTensorDescriptor(_y_desc);
cudnnDestroyTensorDescriptor(_x_desc);
cudnnDestroyTensorDescriptor(_b_desc);
}
void InstanceNormalizationDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {}
// InstanceNormalizationDynamicCreator methods
InstanceNormalizationDynamicCreator::InstanceNormalizationDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(
PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char* InstanceNormalizationDynamicCreator::getPluginName() const {
return PLUGIN_NAME;
}
const char* InstanceNormalizationDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const PluginFieldCollection*
InstanceNormalizationDynamicCreator::getFieldNames() {
return &mFC;
}
IPluginV2DynamicExt* InstanceNormalizationDynamicCreator::createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) {
float epsilon = 1e-5;
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; ++i) {
const char* attrName = fields[i].name;
if (!strcmp(attrName, "epsilon")) {
epsilon = *(static_cast<const float*>(fields[i].data));
}
}
InstanceNormalizationDynamic* obj =
new InstanceNormalizationDynamic(name, epsilon);
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
IPluginV2DynamicExt* InstanceNormalizationDynamicCreator::deserializePlugin(
const char* name, const void* serialData, size_t serialLength) {
InstanceNormalizationDynamic* obj =
new InstanceNormalizationDynamic{name, serialData, serialLength};
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
void InstanceNormalizationDynamicCreator::setPluginNamespace(
const char* libNamespace) {
mNamespace = libNamespace;
}
const char* InstanceNormalizationDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_modulated_deform_conv.hpp"
#include <assert.h>
#include <chrono>
#include "trt_serialize.hpp"
void ModulatedDeformConvForwardCUDAKernelLauncher_float(
const float *input, const float *weight, const float *bias,
const float *offset, const float *mask, float *output, void *workspace,
int batch, int channels, int height, int width, int channels_out,
int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group,
int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVModulatedDeformConv2d"};
} // namespace
nvinfer1::PluginFieldCollection
ModulatedDeformableConvPluginDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
ModulatedDeformableConvPluginDynamicCreator::mPluginAttributes;
ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(
const std::string &name, const nvinfer1::Dims stride,
const nvinfer1::Dims padding, const nvinfer1::Dims dilation,
const int deformableGroup, const int group)
: mLayerName(name),
mStride(stride),
mPadding(padding),
mDilation(dilation),
mDeformableGroup(deformableGroup),
mGroup(group) {
mWithBias = false;
}
ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(
const std::string name, const void *data, size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mStride);
deserialize_value(&data, &length, &mPadding);
deserialize_value(&data, &length, &mDilation);
deserialize_value(&data, &length, &mDeformableGroup);
deserialize_value(&data, &length, &mGroup);
mWithBias = false;
}
ModulatedDeformableConvPluginDynamic::~ModulatedDeformableConvPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *ModulatedDeformableConvPluginDynamic::clone()
const {
ModulatedDeformableConvPluginDynamic *plugin =
new ModulatedDeformableConvPluginDynamic(
mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
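  // Output shape is [N, C_out, H_out, W_out]: N from the input (inputs[0]),
  // C_out from the weight (inputs[3]), and the spatial size from the offset
  // tensor (inputs[1]).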
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[3].d[0];
ret.d[2] = inputs[1].d[2];
ret.d[3] = inputs[1].d[3];
return ret;
}
bool ModulatedDeformableConvPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
if (pos == 0) {
return (inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
}
}
void ModulatedDeformableConvPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {
if (nbInputs == 5) {
mWithBias = true;
}
}
size_t ModulatedDeformableConvPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
int sizeof_dtype = mmcv::getElementSize(outputs[0].type);
int batch_size = inputs[0].dims.d[0];
int nInputPlane = inputs[0].dims.d[1];
int inputHeight = inputs[0].dims.d[2];
int inputWidth = inputs[0].dims.d[3];
int nOutputPlane = outputs[0].dims.d[1];
int outputHeight = outputs[0].dims.d[2];
int outputWidth = outputs[0].dims.d[3];
int kW = inputs[3].dims.d[2];
int kH = inputs[3].dims.d[3];
int im2col_step = std::min(32, batch_size);
size_t col_size = mmcv::getAlignedSize(nInputPlane * kW * kH * outputHeight *
outputWidth * sizeof_dtype);
return col_size;
}
int ModulatedDeformableConvPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) {
int batch = inputDesc[0].dims.d[0];
int channels = inputDesc[0].dims.d[1];
int height = inputDesc[0].dims.d[2];
int width = inputDesc[0].dims.d[3];
int channels_out = outputDesc[0].dims.d[1];
int kernel_h = inputDesc[3].dims.d[2];
int kernel_w = inputDesc[3].dims.d[3];
const void *x = inputs[0];
const void *offset = inputs[1];
const void *mask = inputs[2];
const void *weight = inputs[3];
const void *bias = mWithBias ? inputs[4] : nullptr;
void *output = outputs[0];
int im2col_step = std::min(batch, 32);
// TODO: add fp16 support
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
ModulatedDeformConvForwardCUDAKernelLauncher_float(
(float *)x, (float *)weight, (float *)bias, (float *)offset,
(float *)mask, (float *)output, workSpace, batch, channels, height,
width, channels_out, kernel_w, kernel_h, mStride.d[0], mStride.d[1],
mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup,
mDeformableGroup, im2col_step, m_cublas_handle, stream);
break;
default:
return 1;
break;
}
return 0;
}
nvinfer1::DataType ModulatedDeformableConvPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
return inputTypes[0];
}
// IPluginV2 Methods
const char *ModulatedDeformableConvPluginDynamic::getPluginType() const {
return PLUGIN_NAME;
}
const char *ModulatedDeformableConvPluginDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int ModulatedDeformableConvPluginDynamic::getNbOutputs() const { return 1; }
int ModulatedDeformableConvPluginDynamic::initialize() { return 0; }
void ModulatedDeformableConvPluginDynamic::terminate() {}
size_t ModulatedDeformableConvPluginDynamic::getSerializationSize() const {
return sizeof(mStride) + sizeof(mPadding) + sizeof(mDilation) +
sizeof(mDeformableGroup) + sizeof(mGroup);
}
void ModulatedDeformableConvPluginDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mStride);
serialize_value(&buffer, mPadding);
serialize_value(&buffer, mDilation);
serialize_value(&buffer, mDeformableGroup);
serialize_value(&buffer, mGroup);
}
void ModulatedDeformableConvPluginDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void ModulatedDeformableConvPluginDynamic::attachToContext(
cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) {
m_cublas_handle = cublasContext;
}
void ModulatedDeformableConvPluginDynamic::detachFromContext() {}
void ModulatedDeformableConvPluginDynamic::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *ModulatedDeformableConvPluginDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
////////////////////// creator /////////////////////////////
ModulatedDeformableConvPluginDynamicCreator::
ModulatedDeformableConvPluginDynamicCreator() {
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginName() const {
return PLUGIN_NAME;
}
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginVersion()
const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
ModulatedDeformableConvPluginDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
nvinfer1::Dims stride{2, {1, 1}};
nvinfer1::Dims padding{2, {0, 0}};
nvinfer1::Dims dilation{2, {1, 1}};
int deformableGroup = 1;
int group = 1;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("deform_groups") == 0) {
deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("group") == 0) {
group = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("stride") == 0) {
stride.nbDims = 2;
stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
if (field_name.compare("padding") == 0) {
padding.nbDims = 2;
padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
if (field_name.compare("dilation") == 0) {
dilation.nbDims = 2;
dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
ModulatedDeformableConvPluginDynamic *plugin =
new ModulatedDeformableConvPluginDynamic(name, stride, padding, dilation,
deformableGroup, group);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *
ModulatedDeformableConvPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
auto plugin =
new ModulatedDeformableConvPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void ModulatedDeformableConvPluginDynamicCreator::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginNamespace()
const {
return mNamespace.c_str();
}
// Copyright (c) OpenMMLab. All rights reserved
#include <assert.h>
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
#include "modulated_deform_conv_cuda_kernel.cuh"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
template <typename T>
void trt_modulated_deformable_im2col(
const T* data_im_, const T* data_offset_, const T* data_mask_,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, T* data_col_,
cudaStream_t stream) {
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
modulated_deformable_im2col_gpu_kernel<T>
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im,
          kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, channel_per_deformable_group, batch_size, channels,
deformable_group, height_col, width_col, data_col_);
cudaCheckError();
}
template <typename scalar_t>
__global__ void output_add_bias_kernel(scalar_t* output, const scalar_t* bias,
size_t step_batch, size_t step_channel,
size_t n) {
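  // Output is laid out as [batch, channel, height, width]; step_batch = C*H*W
  // and step_channel = H*W, so (index % step_batch) / step_channel recovers
  // the channel index used to look up the bias.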
CUDA_1D_KERNEL_LOOP(index, n) {
output[index] += bias[(index % step_batch) / step_channel];
}
}
template <typename scalar_t>
static void output_add_bias(scalar_t* output, const scalar_t* bias,
size_t batch, size_t channel, size_t height,
size_t width, cudaStream_t stream) {
size_t step_channel = height * width;
size_t step_batch = step_channel * channel;
size_t n = step_batch * batch;
output_add_bias_kernel<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(
output, bias, step_batch, step_channel, n);
}
template <typename scalar_t>
void ModulatedDeformConvForwardCUDAKernelLauncher(
const scalar_t* input, const scalar_t* weight, const scalar_t* bias,
const scalar_t* offset, const scalar_t* mask, scalar_t* output,
void* workspace, int batch, int channels, int height, int width,
int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h,
int pad_w, int pad_h, int dilation_w, int dilation_h, int group,
int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
cudaStream_t stream) {
size_t sizeof_dtype = sizeof(scalar_t);
bool with_bias = (bias != nullptr);
im2col_step = std::min(int(batch), im2col_step);
assert(batch % im2col_step == 0);
const int channels_kernel = channels / group;
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
scalar_t* columns = (scalar_t*)workspace;
const size_t input_step = channels * height * width;
const size_t offset_step =
deformable_group * kernel_h * kernel_w * 2 * height * width;
const size_t mask_step =
deformable_group * kernel_h * kernel_w * height * width;
const size_t out_step = channels_out * height_out * width_out;
const size_t out_group_step = out_step / group;
const size_t col_g_step =
channels * kernel_w * kernel_h / group * height_out * width_out;
const size_t weight_g_step =
channels_out / group * channels / group * kernel_h * kernel_w;
const int m = channels_out / group;
const int n = height_out * width_out;
const int k = channels / group * kernel_h * kernel_w;
scalar_t alpha = 1.;
scalar_t beta = 0.;
for (int b = 0; b < batch; b++) {
const scalar_t* input_start = input + b * input_step;
const scalar_t* offset_start = offset + b * offset_step;
const scalar_t* mask_start = mask + b * mask_step;
trt_modulated_deformable_im2col<scalar_t>(
input_start, offset_start, mask_start, 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, columns, stream);
for (int g = 0; g < group; g++) {
const scalar_t* weight_start = weight + g * weight_g_step;
scalar_t* col_start = columns + g * col_g_step;
scalar_t* out_buffer_start = output + b * out_step + g * out_group_step;
// cudaMemsetAsync(out_buffer_start, 0, 1, stream);
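      // GEMM computes out_buffer (m x n, row-major) = weight (m x k) *
      // columns (k x n). cuBLAS works in column-major order, so the operands
      // are passed swapped (assuming cublasGemmWrap follows the standard
      // cublas<t>gemm argument order).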
cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
&alpha, col_start, n, weight_start, k, &beta,
out_buffer_start, n);
cudaCheckError();
}
}
if (with_bias) {
output_add_bias<scalar_t>(output, bias, batch, channels_out, height_out,
width_out, stream);
}
}
void ModulatedDeformConvForwardCUDAKernelLauncher_float(
const float* input, const float* weight, const float* bias,
const float* offset, const float* mask, float* output, void* workspace,
int batch, int channels, int height, int width, int channels_out,
int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group,
int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream) {
ModulatedDeformConvForwardCUDAKernelLauncher<float>(
input, weight, bias, offset, mask, output, workspace, batch, channels,
height, width, channels_out, kernel_w, kernel_h, stride_w, stride_h,
pad_w, pad_h, dilation_w, dilation_h, group, deformable_group,
im2col_step, cublas_handle, stream);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_nms.hpp"
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include "trt_serialize.hpp"
extern size_t get_onnxnms_workspace_size(
size_t num_batches, size_t spatial_dimension, size_t num_classes,
size_t boxes_word_size, int center_point_box, size_t output_length);
extern void TRTNMSCUDAKernelLauncher_float(
const float *boxes, const float *scores,
const int max_output_boxes_per_class, const float iou_threshold,
const float score_threshold, const int offset, int *output,
int center_point_box, int num_batches, int spatial_dimension,
int num_classes, size_t output_length, void *workspace,
cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"NonMaxSuppression"};
} // namespace
nvinfer1::PluginFieldCollection NonMaxSuppressionDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
NonMaxSuppressionDynamicCreator::mPluginAttributes;
NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(
const std::string &name, int centerPointBox, int maxOutputBoxesPerClass,
float iouThreshold, float scoreThreshold, int offset)
: mLayerName(name),
mCenterPointBox(centerPointBox),
mMaxOutputBoxesPerClass(maxOutputBoxesPerClass),
mIouThreshold(iouThreshold),
mScoreThreshold(scoreThreshold),
mOffset(offset) {}
NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(const std::string name,
const void *data,
size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mCenterPointBox);
deserialize_value(&data, &length, &mMaxOutputBoxesPerClass);
deserialize_value(&data, &length, &mIouThreshold);
deserialize_value(&data, &length, &mScoreThreshold);
deserialize_value(&data, &length, &mOffset);
}
nvinfer1::IPluginV2DynamicExt *NonMaxSuppressionDynamic::clone() const {
NonMaxSuppressionDynamic *plugin = new NonMaxSuppressionDynamic(
mLayerName, mCenterPointBox, mMaxOutputBoxesPerClass, mIouThreshold,
mScoreThreshold, mOffset);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs NonMaxSuppressionDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
nvinfer1::DimsExprs ret;
ret.nbDims = 2;
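  // selected_indices has shape
  // [num_batches * num_classes * min(spatial_dimension,
  //  max_output_boxes_per_class), 3].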
auto num_batches = inputs[0].d[0];
auto spatial_dimension = inputs[0].d[1];
if (mMaxOutputBoxesPerClass > 0) {
spatial_dimension = exprBuilder.operation(
nvinfer1::DimensionOperation::kMIN, *spatial_dimension,
*exprBuilder.constant(mMaxOutputBoxesPerClass));
}
auto num_classes = inputs[1].d[1];
ret.d[0] = exprBuilder.operation(
nvinfer1::DimensionOperation::kPROD, *num_batches,
*exprBuilder.operation(nvinfer1::DimensionOperation::kPROD,
*spatial_dimension, *num_classes));
ret.d[1] = exprBuilder.constant(3);
return ret;
}
bool NonMaxSuppressionDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
if (pos < nbInputs) {
switch (pos) {
case 0:
// boxes
return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
case 1:
// scores
return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
default:
return true;
}
} else {
switch (pos - nbInputs) {
case 0:
// selected_indices
return inOut[pos].type == nvinfer1::DataType::kINT32 &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
default:
return true;
}
}
return true;
}
void NonMaxSuppressionDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
size_t NonMaxSuppressionDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
size_t boxes_word_size = mmcv::getElementSize(inputs[0].type);
size_t num_batches = inputs[0].dims.d[0];
size_t spatial_dimension = inputs[0].dims.d[1];
size_t num_classes = inputs[1].dims.d[1];
size_t output_length = outputs[0].dims.d[0];
return get_onnxnms_workspace_size(num_batches, spatial_dimension, num_classes,
boxes_word_size, mCenterPointBox,
output_length);
}
int NonMaxSuppressionDynamic::enqueue(
const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) {
int num_batches = inputDesc[0].dims.d[0];
int spatial_dimension = inputDesc[0].dims.d[1];
int num_classes = inputDesc[1].dims.d[1];
int output_length = outputDesc[0].dims.d[0];
const float *boxes = (const float *)inputs[0];
const float *scores = (const float *)inputs[1];
int *output = (int *)outputs[0];
TRTNMSCUDAKernelLauncher_float(
boxes, scores, mMaxOutputBoxesPerClass, mIouThreshold, mScoreThreshold,
mOffset, output, mCenterPointBox, num_batches, spatial_dimension,
num_classes, output_length, workSpace, stream);
return 0;
}
nvinfer1::DataType NonMaxSuppressionDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
return nvinfer1::DataType::kINT32;
}
// IPluginV2 Methods
const char *NonMaxSuppressionDynamic::getPluginType() const {
return PLUGIN_NAME;
}
const char *NonMaxSuppressionDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int NonMaxSuppressionDynamic::getNbOutputs() const { return 1; }
int NonMaxSuppressionDynamic::initialize() { return 0; }
void NonMaxSuppressionDynamic::terminate() {}
size_t NonMaxSuppressionDynamic::getSerializationSize() const {
return sizeof(mCenterPointBox) + sizeof(mMaxOutputBoxesPerClass) +
sizeof(mIouThreshold) + sizeof(mScoreThreshold) + sizeof(mOffset);
}
void NonMaxSuppressionDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mCenterPointBox);
serialize_value(&buffer, mMaxOutputBoxesPerClass);
serialize_value(&buffer, mIouThreshold);
serialize_value(&buffer, mScoreThreshold);
serialize_value(&buffer, mOffset);
}
void NonMaxSuppressionDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void NonMaxSuppressionDynamic::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *NonMaxSuppressionDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
////////////////////// creator /////////////////////////////
NonMaxSuppressionDynamicCreator::NonMaxSuppressionDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("center_point_box"));
mPluginAttributes.emplace_back(
nvinfer1::PluginField("max_output_boxes_per_class"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("iou_threshold"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("score_threshold"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("offset"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *NonMaxSuppressionDynamicCreator::getPluginName() const {
return PLUGIN_NAME;
}
const char *NonMaxSuppressionDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
NonMaxSuppressionDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
int centerPointBox = 0;
int maxOutputBoxesPerClass = 0;
float iouThreshold = 0.0f;
float scoreThreshold = 0.0f;
int offset = 0;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("center_point_box") == 0) {
centerPointBox = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("max_output_boxes_per_class") == 0) {
maxOutputBoxesPerClass = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("iou_threshold") == 0) {
iouThreshold = static_cast<const float *>(fc->fields[i].data)[0];
}
if (field_name.compare("score_threshold") == 0) {
scoreThreshold = static_cast<const float *>(fc->fields[i].data)[0];
}
if (field_name.compare("offset") == 0) {
offset = static_cast<const int *>(fc->fields[i].data)[0];
}
}
NonMaxSuppressionDynamic *plugin =
new NonMaxSuppressionDynamic(name, centerPointBox, maxOutputBoxesPerClass,
iouThreshold, scoreThreshold, offset);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
auto plugin = new NonMaxSuppressionDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void NonMaxSuppressionDynamicCreator::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *NonMaxSuppressionDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
// Copyright (c) OpenMMLab. All rights reserved
#include <stdio.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <chrono>
#include <thread>
#include <vector>
#include "common_cuda_helper.hpp"
#include "nms_cuda_kernel.cuh"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
struct NMSBox {
float box[4];
};
struct nms_centerwh2xyxy {
__host__ __device__ NMSBox operator()(const NMSBox box) {
NMSBox out;
out.box[0] = box.box[0] - box.box[2] / 2.0f;
out.box[1] = box.box[1] - box.box[3] / 2.0f;
out.box[2] = box.box[0] + box.box[2] / 2.0f;
out.box[3] = box.box[1] + box.box[3] / 2.0f;
return out;
}
};
struct nms_sbox_idle {
const float* idle_box_;
__host__ __device__ nms_sbox_idle(const float* idle_box) {
idle_box_ = idle_box;
}
__host__ __device__ NMSBox operator()(const NMSBox box) {
return {idle_box_[0], idle_box_[1], idle_box_[2], idle_box_[3]};
}
};
struct nms_score_threshold {
float score_threshold_;
__host__ __device__ nms_score_threshold(const float score_threshold) {
score_threshold_ = score_threshold;
}
__host__ __device__ bool operator()(const float score) {
return score < score_threshold_;
}
};
__global__ void nms_reindex_kernel(int n, int* output, int* index_cache) {
CUDA_1D_KERNEL_LOOP(index, n) {
const int old_index = output[index * 3 + 2];
output[index * 3 + 2] = index_cache[old_index];
}
}
__global__ void mask_to_output_kernel(const unsigned long long* dev_mask,
const int* index, int* output,
int* output_count, int batch_id,
int cls_id, int spatial_dimension,
int col_blocks,
int max_output_boxes_per_class) {
extern __shared__ unsigned long long remv[];
// fill remv with 0
CUDA_1D_KERNEL_LOOP(i, col_blocks) { remv[i] = 0; }
__syncthreads();
int start = *output_count;
int out_per_class_count = 0;
for (int i = 0; i < spatial_dimension; i++) {
const int nblock = i / threadsPerBlock;
const int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
if (threadIdx.x == 0) {
output[start * 3 + 0] = batch_id;
output[start * 3 + 1] = cls_id;
output[start * 3 + 2] = index[i];
start += 1;
}
out_per_class_count += 1;
if (out_per_class_count >= max_output_boxes_per_class) {
break;
}
__syncthreads();
// set every overlap box with bit 1 in remv
const unsigned long long* p = dev_mask + i * col_blocks;
CUDA_1D_KERNEL_LOOP(j, col_blocks) {
if (j >= nblock) {
remv[j] |= p[j];
}
} // j
__syncthreads();
}
} // i
if (threadIdx.x == 0) {
*output_count = start;
}
}
size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension,
size_t num_classes, size_t boxes_word_size,
int center_point_box, size_t output_length) {
size_t boxes_xyxy_workspace = 0;
if (center_point_box == 1) {
boxes_xyxy_workspace = mmcv::getAlignedSize(
num_batches * spatial_dimension * 4 * boxes_word_size);
}
size_t scores_workspace =
mmcv::getAlignedSize(spatial_dimension * boxes_word_size);
size_t boxes_workspace =
mmcv::getAlignedSize(spatial_dimension * 4 * boxes_word_size);
const int col_blocks =
(spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
size_t mask_workspace = mmcv::getAlignedSize(spatial_dimension * col_blocks *
sizeof(unsigned long long));
size_t index_template_workspace =
mmcv::getAlignedSize(spatial_dimension * sizeof(int));
size_t index_workspace =
mmcv::getAlignedSize(spatial_dimension * sizeof(int));
size_t count_workspace = mmcv::getAlignedSize(sizeof(int));
return scores_workspace + boxes_xyxy_workspace + boxes_workspace +
mask_workspace + index_template_workspace + index_workspace +
count_workspace;
}
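// The launcher below carves the workspace up in this order: boxes_sorted,
// boxes_xyxy (only when center_point_box == 1), scores_sorted, dev_mask,
// index_cache, index_template and output_count, each aligned with
// mmcv::getAlignedSize.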
/**
* Launch the NonMaxSuppression kernel
*
 * NMS is performed independently for each batch/class pair, sharing the
 * kernel implementation `nms_cuda`. For each batch/class, `boxes_sorted` and
 * `index_cache` are sorted by score, and `boxes_sorted` is fed to the
 * `nms_cuda` kernel. The output is then generated by `mask_to_output_kernel`
 * from `dev_mask` and `index_cache`.
*
 * @param[in] boxes input boxes with shape [num_batches, spatial_dimension, 4]
 * @param[in] scores input scores with shape
 * [num_batches, num_classes, spatial_dimension]
 * @param[in] max_output_boxes_per_class maximum number of output boxes per
 * class
 * @param[in] iou_threshold IoU threshold
 * @param[in] score_threshold score threshold
 * @param[in] offset box offset, only 0 or 1 is valid
 * @param[out] output output with shape [output_length, 3]; each row contains
 * (batch_id, class_id, box_id), and unused rows are filled with -1
 * @param[in] center_point_box 0 if boxes are [left, top, right, bottom],
 * 1 if boxes are [center_x, center_y, width, height]
 * @param[in] num_batches batch size of boxes and scores
 * @param[in] spatial_dimension number of boxes per batch
 * @param[in] num_classes number of classes
 * @param[in] output_length maximum number of output rows
 * @param[in] workspace memory for all temporary variables
 * @param[in] stream cuda stream
*/
void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
const int max_output_boxes_per_class,
const float iou_threshold,
const float score_threshold,
const int offset, int* output,
int center_point_box, int num_batches,
int spatial_dimension, int num_classes,
size_t output_length, void* workspace,
cudaStream_t stream) {
const int col_blocks =
(spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
float* boxes_sorted = (float*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * 4 * sizeof(float));
float* boxes_xyxy = nullptr;
if (center_point_box == 1) {
boxes_xyxy = (float*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(num_batches * spatial_dimension * 4 *
sizeof(float));
thrust::transform(thrust::cuda::par.on(stream), (NMSBox*)boxes,
(NMSBox*)(boxes + num_batches * spatial_dimension * 4),
(NMSBox*)boxes_xyxy, nms_centerwh2xyxy());
cudaCheckError();
}
float* scores_sorted = (float*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * sizeof(float));
unsigned long long* dev_mask = (unsigned long long*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * col_blocks *
sizeof(unsigned long long));
int* index_cache = (int*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * sizeof(int));
// generate sequence [0,1,2,3,4 ....]
int* index_template = (int*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * sizeof(int));
thrust::sequence(thrust::cuda::par.on(stream), index_template,
index_template + spatial_dimension, 0);
int max_output_boxes_per_class_cpu = max_output_boxes_per_class;
if (max_output_boxes_per_class_cpu <= 0) {
max_output_boxes_per_class_cpu = spatial_dimension;
}
int* output_count = (int*)workspace;
workspace = static_cast<char*>(workspace) + mmcv::getAlignedSize(sizeof(int));
cudaMemsetAsync(output_count, 0, sizeof(int), stream);
// fill output with -1
thrust::fill(thrust::cuda::par.on(stream), output, output + output_length * 3,
-1);
cudaCheckError();
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
for (int cls_id = 0; cls_id < num_classes; ++cls_id) {
const int batch_cls_id = batch_id * num_classes + cls_id;
// sort boxes by score
cudaMemcpyAsync(scores_sorted, scores + batch_cls_id * spatial_dimension,
spatial_dimension * sizeof(float),
cudaMemcpyDeviceToDevice, stream);
cudaCheckError();
cudaMemcpyAsync(index_cache, index_template,
spatial_dimension * sizeof(int), cudaMemcpyDeviceToDevice,
stream);
cudaCheckError();
thrust::sort_by_key(thrust::cuda::par.on(stream), scores_sorted,
scores_sorted + spatial_dimension, index_cache,
thrust::greater<float>());
if (center_point_box == 1) {
thrust::gather(thrust::cuda::par.on(stream), index_cache,
index_cache + spatial_dimension,
(NMSBox*)(boxes_xyxy + batch_id * spatial_dimension * 4),
(NMSBox*)boxes_sorted);
} else {
thrust::gather(thrust::cuda::par.on(stream), index_cache,
index_cache + spatial_dimension,
(NMSBox*)(boxes + batch_id * spatial_dimension * 4),
(NMSBox*)boxes_sorted);
}
cudaCheckError();
if (score_threshold > 0.0f) {
thrust::transform_if(
thrust::cuda::par.on(stream), (NMSBox*)boxes_sorted,
(NMSBox*)(boxes_sorted + spatial_dimension * 4), scores_sorted,
(NMSBox*)boxes_sorted, nms_sbox_idle(boxes_sorted),
nms_score_threshold(score_threshold));
}
nms_cuda<<<blocks, threads, 0, stream>>>(spatial_dimension, iou_threshold,
offset, boxes_sorted, dev_mask);
      // convert the IoU mask into output indices once nms_cuda has filled dev_mask.
mask_to_output_kernel<<<1, threadsPerBlock,
col_blocks * sizeof(unsigned long long),
stream>>>(
dev_mask, index_cache, output, output_count, batch_id, cls_id,
spatial_dimension, col_blocks, max_output_boxes_per_class_cpu);
} // cls_id
} // batch_id
}