Unverified Commit 2e5628b4 authored by q.yao, committed by GitHub

[Refactor]: Remove deployment for dev-2.x (#2225)

* remove deploy for 2.0

* update onnx ut
parent 961373ad
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import Tensor, nn
from torch.autograd import Function

_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
...@@ -35,58 +34,6 @@ def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor:
    return output
class TopPoolFunction(Function):

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        return _corner_pool(input, 2, True)


class BottomPoolFunction(Function):

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        return _corner_pool(input, 2, False)


class LeftPoolFunction(Function):

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        return _corner_pool(input, 3, True)


class RightPoolFunction(Function):

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        return _corner_pool(input, 3, False)
class CornerPool(nn.Module):
    """Corner Pooling.
...@@ -110,13 +57,6 @@ class CornerPool(nn.Module):
        Feature map after pooling.
    """
    pool_functions = {
        'bottom': BottomPoolFunction,
        'left': LeftPoolFunction,
        'right': RightPoolFunction,
        'top': TopPoolFunction,
    }
    cummax_dim_flip = {
        'bottom': (2, False),
        'left': (3, True),
...@@ -126,21 +66,11 @@ class CornerPool(nn.Module):
    def __init__(self, mode: str):
        super().__init__()
        assert mode in self.pool_functions  # replaced by: assert mode in self.cummax_dim_flip
        self.mode = mode
        self.corner_pool: Function = self.pool_functions[mode]

    def forward(self, x: Tensor) -> Tensor:
        if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
            if torch.onnx.is_in_onnx_export():
                assert torch.__version__ >= '1.7.0', \
                    'When `cummax` serves as an intermediate component whose '\
                    'outputs is used as inputs for another modules, it\'s '\
                    'expected that pytorch version must be >= 1.7.0, '\
                    'otherwise Error appears like: `RuntimeError: tuple '\
                    'appears in op that does not forward tuples, unsupported '\
                    'kind: prim::PythonOp`.'
            dim, flip = self.cummax_dim_flip[self.mode]
            if flip:
                x = x.flip(dim)
...@@ -149,8 +79,5 @@ class CornerPool(nn.Module):
                pool_tensor = pool_tensor.flip(dim)
            return pool_tensor
        else:
            if torch.onnx.is_in_onnx_export():
                return self.corner_pool.apply(x)
            else:
                dim, flip = self.cummax_dim_flip[self.mode]
                return _corner_pool(x, dim, flip)
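
For reference, a minimal usage sketch of the surviving pure-PyTorch path. The `CornerPool` constructor and its mode strings come from the code above; the input tensor and shapes are illustrative only.

```python
# Minimal usage sketch: CornerPool now always runs the cummax/_corner_pool path.
import torch
from mmcv.ops import CornerPool

pool = CornerPool('top')          # one of 'top', 'bottom', 'left', 'right'
x = torch.rand(2, 128, 32, 32)    # (N, C, H, W)
y = pool(x)                       # same shape as x; each pixel holds the running max
assert y.shape == x.shape
```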
...@@ -26,51 +26,28 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│   │ └── ...
│   └── utils
│   │ └── ...
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
│   ├── ort_mmcv_utils.h
│   ├── ...
│   ├── onnx_ops.h
│   └── cpu
│      ├── onnxruntime_register.cpp
│      ├── ...
│      └── onnx_ops_impl.cpp
├── parrots
│   ├── ...
│   ├── ops.cpp
│   ├── ops_parrots.cpp
│   └── ops_pytorch.h
├── pytorch
│   ├── info.cpp
│   ├── pybind.cpp
│   ├── ...
│   ├── ops.cpp
│   ├── cuda
│   │   ├── ...
│   │   └── ops_cuda.cu
│   ├── cpu
│   │   ├── ...
│   │   └── ops.cpp
│   ├── mps
│   │   ├── ...
│   │   └── op_mps.mm
│   └── mlu
│      ├── ...
│      └── op_mlu.cpp
└── tensorrt
    ├── trt_cuda_helper.cuh
    ├── trt_plugin_helper.hpp
    ├── trt_plugin.hpp
    ├── trt_serialize.hpp
    ├── ...
    ├── trt_ops.hpp
    └── plugins
       ├── trt_cuda_helper.cu
       ├── trt_plugin.cpp
       ├── ...
       ├── trt_ops.cpp
       └── trt_ops_kernel.cu
```
## Components
...@@ -80,16 +57,12 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
- `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
- `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) devices.
- `utils`: The kernels and utils of spconv.
- `onnxruntime`: **ONNX Runtime** support for custom ops. It has been deprecated; please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
  - `cpu`: CPU implementation of supported ops.
- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding code are placed in this directory (see the sketch after this list).
  - `cuda`: This directory contains CUDA kernel launchers, which feed tensor memory pointers to the CUDA kernels in `common/cuda`. The launchers provide the C++ interface to the CUDA implementation of the corresponding custom ops.
  - `cpu`: This directory contains CPU implementations of the corresponding custom ops.
  - `mlu`: This directory contains the launchers of the MLU kernels.
  - `mps`: MPS ops implementation and launchers.
- `tensorrt`: **TensorRT** support for custom ops. It has been deprecated; please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
  - `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use the shared CUDA kernels in `common/cuda`.
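
For the `pytorch` entry above, a rough sketch of how Python code reaches the pybind11-bound ops. The `ext_loader.load_ext` call and the `_ext` module name follow mmcv's convention; the `nms` example and its argument list are simplified assumptions, not part of this diff.

```python
# Rough sketch of how Python code reaches a pybind11-bound custom op.
from mmcv.utils import ext_loader

# Imports the compiled mmcv._ext module and checks that the listed symbols exist.
ext_module = ext_loader.load_ext('_ext', ['nms'])
# The returned module exposes the C++ functions registered in pytorch/pybind.cpp,
# e.g. ext_module.nms(boxes, scores, iou_threshold=0.5, offset=0)
```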
## How to add new PyTorch ops?
...
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_CORNER_POOL_H
#define ONNXRUNTIME_CORNER_POOL_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
struct MMCVCornerPoolKernel {
public:
MMCVCornerPoolKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "mode");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int64_t mode_;
};
struct MMCVCornerPoolCustomOp
: Ort::CustomOpBase<MMCVCornerPoolCustomOp, MMCVCornerPoolKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVCornerPoolKernel(api, info);
}
const char* GetName() const { return "MMCVCornerPool"; }
size_t GetInputTypeCount() const { return 1; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_CORNER_POOL_H
// Copyright (c) OpenMMLab. All rights reserved
#include "corner_pool.h"
#include "../ort_mmcv_utils.h"
void TopPoolForwardCPU(const float *input, float *output, const int batch_size,
const int channels, const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int w = 0; w < width; w++) {
// directly copy the bottommost value from input to output
output[index_n_c + (height - 1) * width + w] =
input[index_n_c + (height - 1) * width + w];
// do top_pool
for (int h = height - 2; h >= 0; h--) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + (h + 1) * width + w],
input[index_n_c + h * width + w]);
} // for h
} // for w
} // for c
} // for n
}
void BottomPoolForwardCPU(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int w = 0; w < width; w++) {
// directly copy the topmost value from input to output
output[index_n_c + w] = input[index_n_c + w];
// do bottom_pool
for (int h = 1; h < height; h++) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + (h - 1) * width + w],
input[index_n_c + h * width + w]);
} // for h
} // for w
} // for c
} // for n
}
void LeftPoolForwardCPU(const float *input, float *output, const int batch_size,
const int channels, const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int h = 0; h < height; h++) {
// directly copy the rightmost value from input to output
output[index_n_c + h * width + width - 1] =
input[index_n_c + h * width + width - 1];
// do left_pool
for (int w = width - 2; w >= 0; w--) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + h * width + w + 1],
input[index_n_c + h * width + w]);
} // for w
} // for h
} // for c
} // for n
}
void RightPoolForwardCPU(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int h = 0; h < height; h++) {
// directly copy the leftmost value from input to output
output[index_n_c + h * width] = input[index_n_c + h * width];
// do right_pool
for (int w = 1; w < width; w++) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + h * width + w - 1],
input[index_n_c + h * width + w]);
} // for w
} // for h
} // for c
} // for n
}
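
The four CPU kernels above are direction-flipped variants of the same running maximum. A NumPy reference sketch (illustrative only, not part of the removed file):

```python
# NumPy reference for the four corner-pool directions above.
# 'top' propagates maxima upward from the bottom row, 'bottom' downward,
# 'left' propagates from the right edge, 'right' from the left edge.
import numpy as np

def corner_pool_ref(x, mode):
    # x: (N, C, H, W) float array
    axis, flip = {'top': (2, True), 'bottom': (2, False),
                  'left': (3, True), 'right': (3, False)}[mode]
    if flip:
        x = np.flip(x, axis)
    out = np.maximum.accumulate(x, axis=axis)   # running max along the axis
    if flip:
        out = np.flip(out, axis)
    return out
```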
void MMCVCornerPoolKernel::Compute(OrtKernelContext *context) {
const int mode = int(mode_);
typedef float T;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const T *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<T>(input));
// get output memory
OrtTensorDimensions out_dimensions(ort_, input);
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
T *output_data = ort_.GetTensorMutableData<T>(output);
// 'top': 0, 'bottom': 1, 'left': 2, 'right':3
assert(mode == 0 || mode == 1 || mode == 2 || mode == 3);
// do corner_pool
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
if (mode == 0)
TopPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else if (mode == 1)
BottomPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else if (mode == 2)
LeftPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else
RightPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "deform_conv.h"
#include <cmath>
#include <vector>
#include "../ort_mmcv_utils.h"
void gemm_ref_fp32_deform(const float *A, const float *B, const float *V,
const float *H, const int32_t trans_A,
const int32_t trans_B, const int32_t M,
const int32_t N, const int32_t K, const float alpha,
const float beta, float *Y) {
if (!trans_A && !trans_B) { // MK, KN; NN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && !trans_B) { // KM, KN; TN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && trans_B) { // KM, NK; TT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (!trans_A && trans_B) { // MK, NK; NT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
}
float bilinear_interpolate(const float *src, const int64_t src_h,
const int64_t src_w, const float h, const float w) {
if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) {
return 0;
}
int64_t h_low = floor(h);
int64_t w_low = floor(w);
int64_t h_high = h_low + 1;
int64_t w_high = w_low + 1;
float lh = h - h_low;
float lw = w - w_low;
float hh = 1 - lh;
float hw = 1 - lw;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low];
float v2 = 0;
if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high];
float v3 = 0;
if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low];
float v4 = 0;
if (h_high <= src_h - 1 && w_high <= src_w - 1)
v4 = src[h_high * src_w + w_high];
float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
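
A NumPy sketch of the same zero-padded bilinear lookup, mirroring the boundary handling above (illustrative only):

```python
# NumPy reference for bilinear_interpolate above: samples src at fractional (h, w),
# returning 0 outside (-1, src_h) x (-1, src_w) and zeroing out-of-range taps.
import numpy as np

def bilinear_interpolate_ref(src, h, w):
    src_h, src_w = src.shape
    if h <= -1 or h >= src_h or w <= -1 or w >= src_w:
        return 0.0
    h_low, w_low = int(np.floor(h)), int(np.floor(w))
    h_high, w_high = h_low + 1, w_low + 1
    lh, lw = h - h_low, w - w_low
    hh, hw = 1 - lh, 1 - lw

    def at(y, x):
        # out-of-range neighbours contribute zero, as in the C++ kernel
        return src[y, x] if 0 <= y < src_h and 0 <= x < src_w else 0.0

    return (hh * hw * at(h_low, w_low) + hh * lw * at(h_low, w_high) +
            lh * hw * at(h_high, w_low) + lh * lw * at(h_high, w_high))
```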
void deformable_im2col(const float *input, const float *offset,
const int64_t src_h, const int64_t src_w,
const int64_t kernel_h, const int64_t kernel_w,
const int64_t pad_h, const int64_t pad_w,
const int64_t stride_h, const int64_t stride_w,
const int64_t dilation_h, const int64_t dilation_w,
const int64_t channels, const int64_t offset_groups,
const int64_t dst_h, const int64_t dst_w,
float *columns) {
const int64_t indices = channels * dst_h * dst_w;
for (int64_t index = 0; index != indices; ++index) {
const int64_t w_col = index % dst_w;
const int64_t h_col = (index / dst_w) % dst_h;
const int64_t c_im = index / (dst_w * dst_h);
const int64_t c_col = c_im * kernel_h * kernel_w;
int64_t c_per_offset_grp = channels / offset_groups;
const int64_t grp_idx = c_im / c_per_offset_grp;
auto columns_ptr =
columns + (c_col * (dst_h * dst_w) + h_col * dst_w + w_col);
auto input_ptr = input + c_im * (src_h * src_w);
auto offset_ptr =
offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w;
for (int64_t kh = 0; kh < kernel_h; ++kh) {
for (int64_t kw = 0; kw < kernel_w; ++kw) {
const int data_offset_h_ptr =
((2 * (kh * kernel_w + kw)) * dst_h + h_col) * dst_w + w_col;
const int data_offset_w_ptr =
((2 * (kh * kernel_w + kw) + 1) * dst_h + h_col) * dst_w + w_col;
const float offset_h = offset_ptr[data_offset_h_ptr];
const float offset_w = offset_ptr[data_offset_w_ptr];
const float ih =
(h_col * stride_h - pad_h) + kh * dilation_h + offset_h;
const float iw =
(w_col * stride_w - pad_w) + kw * dilation_w + offset_w;
*columns_ptr = bilinear_interpolate(input_ptr, src_h, src_w, ih, iw);
columns_ptr += dst_h * dst_w;
}
}
}
}
void deformable_conv_forward(
const float *src, const float *offset, const float *filter,
const int64_t batch, const int64_t src_c, const int64_t src_h,
const int64_t src_w, const int64_t dst_c, const int64_t dst_h,
const int64_t dst_w, const int64_t group, const int64_t offset_group,
const int64_t channels, const int64_t num_output, const int64_t kernel_h,
const int64_t kernel_w, const int64_t stride_h, const int64_t stride_w,
const int64_t pad_h, const int64_t pad_w, const int64_t dilation_h,
const int64_t dilation_w, float *columns, float *dst) {
const int64_t ic_per_gp = channels / group;
const int64_t oc_per_gp = num_output / group;
for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
src_h, src_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, ic_per_gp, offset_group, dst_h, dst_w,
columns);
float *dst_ptr =
dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
memset(dst_ptr, 0.0f, sizeof(float) * oc_per_gp * dst_h * dst_w);
gemm_ref_fp32_deform(
filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns,
nullptr, dst_ptr, 0, 0, oc_per_gp, dst_h * dst_w,
ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr);
}
}
}
MMCVDeformConvKernel::MMCVDeformConvKernel(OrtApi api,
const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
std::vector<int64_t> stride =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
stride_height_ = stride[0];
stride_width_ = stride[1];
std::vector<int64_t> padding =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
padding_height_ = padding[0];
padding_width_ = padding[1];
std::vector<int64_t> dilation =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
dilation_height_ = dilation[0];
dilation_width_ = dilation[1];
deformable_group_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void MMCVDeformConvKernel::Compute(OrtKernelContext *context) {
const int64_t stride_height = stride_height_;
const int64_t stride_width = stride_width_;
const int64_t padding_height = padding_height_;
const int64_t padding_width = padding_width_;
const int64_t dilation_height = dilation_height_;
const int64_t dilation_width = dilation_width_;
const int64_t deformable_group = deformable_group_;
const int64_t group = group_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
const float *offset_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
const OrtValue *filter = ort_.KernelContext_GetInput(context, 2);
const float *filter_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions filter_dims(ort_, filter);
int64_t batch_size = input_dims[0];
int64_t in_channels = input_dims[1];
int64_t in_height = input_dims[2];
int64_t in_width = input_dims[3];
int64_t out_channels = filter_dims[0];
int64_t kernel_height = filter_dims[2];
int64_t kernel_width = filter_dims[3];
// get output memory
int64_t out_height = floor((in_height + 2 * padding_height -
dilation_height * (kernel_height - 1) - 1) /
stride_height +
1);
int64_t out_width = floor(
(in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) /
stride_width +
1);
std::vector<int64_t> output_dims = {batch_size, out_channels, out_height,
out_width};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
// allocate tmp memory
int64_t column_len = (in_channels / group) * kernel_height * kernel_width *
out_height * out_width;
float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
deformable_conv_forward(
input_data, offset_data, filter_data, batch_size, in_channels, in_height,
in_width, out_channels, out_height, out_width, group, deformable_group,
in_channels, out_channels, kernel_height, kernel_width, stride_height,
stride_width, padding_height, padding_width, dilation_height,
dilation_width, columns, out_ptr);
}
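
The output spatial size computed above follows the standard convolution arithmetic; a quick check in Python with illustrative numbers:

```python
# Output-size formula used above:
#   out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1)
def conv_out_size(in_size, kernel, stride, pad, dilation):
    return (in_size + 2 * pad - dilation * (kernel - 1) - 1) // stride + 1

# e.g. a 3x3 kernel with stride 1 and padding 1 keeps the spatial size:
assert conv_out_size(64, kernel=3, stride=1, pad=1, dilation=1) == 64
```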
// Copyright (c) OpenMMLab. All rights reserved
#include <cmath>
#include "../ort_mmcv_utils.h"
#include "grid_sample.h"
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) < (b)) ? (b) : (a))
#define CLIP_COORDINATES(in, out, clip_limit) \
out = MIN((clip_limit - 1), MAX(in, 0))
// modified from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.cpp
GridSampleKernel::GridSampleKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
interpolation_mode_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
allocator_ = Ort::AllocatorWithDefaultOptions();
}
enum GridSamplerInterpolation { Bilinear = 0, Nearest = 1, Bicubic = 2 };
enum GridSamplerPadding { Zeros = 0, Border = 1, Reflection = 2 };
template <typename scalar_t>
static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size,
bool align_corners) {
if (align_corners) {
return ((coord + 1) / 2) * (size - 1);
} else {
return ((coord + 1) * size - 1) / 2;
}
}
// Clips coordinates to between 0 and clip_limit - 1
template <typename scalar_t>
static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
return std::min(static_cast<scalar_t>(clip_limit - 1),
std::max(in, static_cast<scalar_t>(0)));
}
// Reflects coordinates until they fall between low and high (inclusive).
// The bounds are passed as twice their value so that half-integer values
// can be represented as ints.
template <typename scalar_t>
static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low,
int64_t twice_high) {
if (twice_low == twice_high) {
return static_cast<scalar_t>(0);
}
scalar_t min = static_cast<scalar_t>(twice_low) / 2;
scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
in = std::fabs(in - min);
// `fmod` returns same sign as `in`, which is positive after the `fabs` above.
scalar_t extra = std::fmod(in, span);
int flips = static_cast<int>(std::floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename scalar_t>
static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
int64_t padding_mode,
bool align_corners) {
if (padding_mode == GridSamplerPadding::Border) {
coord = clip_coordinates(coord, size);
} else if (padding_mode == GridSamplerPadding::Reflection) {
if (align_corners) {
coord = reflect_coordinates(coord, 0, 2 * (size - 1));
} else {
coord = reflect_coordinates(coord, -1, 2 * size - 1);
}
coord = clip_coordinates(coord, size);
}
return coord;
}
// Computes the pixel source index value for a grid coordinate
template <typename scalar_t>
static inline scalar_t grid_sampler_compute_source_index(scalar_t coord,
int64_t size,
int64_t padding_mode,
bool align_corners) {
coord = grid_sampler_unnormalize(coord, size, align_corners);
coord = compute_coordinates(coord, size, padding_mode, align_corners);
return coord;
}
static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H,
int64_t W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
template <typename scalar_t>
static inline scalar_t get_value_bounded(const scalar_t *data, scalar_t x,
scalar_t y, int64_t W, int64_t H,
int64_t sW, int64_t sH,
int64_t padding_mode,
bool align_corners) {
x = compute_coordinates(x, W, padding_mode, align_corners);
y = compute_coordinates(y, H, padding_mode, align_corners);
int64_t ix = static_cast<int64_t>(x);
int64_t iy = static_cast<int64_t>(y);
if (within_bounds_2d(iy, ix, H, W)) {
return data[iy * sH + ix * sW];
}
return static_cast<scalar_t>(0);
}
template <typename scalar_t>
static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
return ((A + 2) * x - (A + 3)) * x * x + 1;
}
template <typename scalar_t>
static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
}
template <typename scalar_t>
static inline void get_cubic_upsample_coefficients(scalar_t coeffs[4],
scalar_t t) {
scalar_t A = -0.75;
scalar_t x1 = t;
coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
// opposite coefficients
scalar_t x2 = 1.0 - t;
coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
}
template <typename scalar_t>
static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2,
scalar_t x3, scalar_t t) {
scalar_t coeffs[4];
get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
}
void GridSampleKernel::Compute(OrtKernelContext *context) {
const bool align_corners = align_corners_;
const int64_t padding_mode = padding_mode_;
const int64_t interpolation_mode = interpolation_mode_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *grid = ort_.KernelContext_GetInput(context, 1);
const float *grid_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(grid));
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions grid_dims(ort_, grid);
int64_t N = input_dims[0];
int64_t C = input_dims[1];
int64_t inp_H = input_dims[2];
int64_t inp_W = input_dims[3];
int64_t out_H = grid_dims[1];
int64_t out_W = grid_dims[2];
std::vector<int64_t> output_dims = {N, C, out_H, out_W};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
int64_t inp_sC = input_dims[2] * input_dims[3];
int64_t inp_sH = input_dims[3];
int64_t inp_sW = 1;
int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3];
int64_t grid_sH = grid_dims[2] * grid_dims[3];
int64_t grid_sW = grid_dims[3];
int64_t grid_sCoor = 1;
int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3];
int64_t out_sC = output_dims[2] * output_dims[3];
int64_t out_sH = output_dims[3];
int64_t out_sW = 1;
// loop over each output pixel
for (int64_t n = 0; n < N; ++n) {
const float *grid_ptr_N = grid_data + n * grid_sN;
const float *inp_ptr_N = input_data + n * inp_sN;
for (int64_t h = 0; h < out_H; ++h) {
for (int64_t w = 0; w < out_W; ++w) {
const float *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
float x = *grid_ptr_NHW;
float y = grid_ptr_NHW[grid_sCoor];
float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode,
align_corners);
float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode,
align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get corner pixel values from (x, y)
// for 4d, we use north-east-south-west
int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
int64_t ix_ne = ix_nw + 1;
int64_t iy_ne = iy_nw;
int64_t ix_sw = ix_nw;
int64_t iy_sw = iy_nw + 1;
int64_t ix_se = ix_nw + 1;
int64_t iy_se = iy_nw + 1;
// get surfaces to each neighbor:
float nw = (ix_se - ix) * (iy_se - iy);
float ne = (ix - ix_sw) * (iy_sw - iy);
float sw = (ix_ne - ix) * (iy - iy_ne);
float se = (ix - ix_nw) * (iy - iy_nw);
// calculate bilinear weighted pixel value and set output pixel
const float *inp_ptr_NC = inp_ptr_N;
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
auto res = static_cast<float>(0);
if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
}
*out_ptr_NCHW = res;
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
// assign nearest neighbor pixel value to output pixel
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
const float *inp_ptr_NC = inp_ptr_N;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
*out_ptr_NCHW =
inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<float>(0);
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Bicubic) {
// grid_sampler_compute_source_index would clip the index depending on the
// padding mode, which would make the bicubic calculation wrong:
// for example x = -0.1 -> ix = 0 for zero padding, but bicubic needs
// ix = floor(x) = -1.
// Reflection padding is even more problematic, since the -1 and +1
// direction is not fixed at the boundary.
ix = grid_sampler_unnormalize(x, inp_W, align_corners);
iy = grid_sampler_unnormalize(y, inp_H, align_corners);
float ix_nw = std::floor(ix);
float iy_nw = std::floor(iy);
const float tx = ix - ix_nw;
const float ty = iy - iy_nw;
const float *inp_ptr_NC = inp_ptr_N;
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
float coefficients[4];
// Interpolate 4 values in the x direction
for (int64_t i = 0; i < 4; ++i) {
coefficients[i] = cubic_interp1d<float>(
get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
tx);
}
// Interpolate in the y direction
*out_ptr_NCHW =
cubic_interp1d<float>(coefficients[0], coefficients[1],
coefficients[2], coefficients[3], ty);
}
}
}
}
}
}
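
Since this kernel is adapted from PyTorch's CPU GridSampler, `torch.nn.functional.grid_sample` is the natural reference; a hedged comparison sketch (the string/integer attribute mapping is read off the enums above, the tensors are illustrative):

```python
# Hedged comparison sketch for the grid-sample kernel above.
# interpolation_mode 0/1/2 -> 'bilinear'/'nearest'/'bicubic',
# padding_mode 0/1/2 -> 'zeros'/'border'/'reflection'.
import torch
import torch.nn.functional as F

inp = torch.rand(1, 3, 10, 10)
grid = torch.rand(1, 6, 8, 2) * 2 - 1          # normalized coords in [-1, 1]
ref = F.grid_sample(inp, grid, mode='bilinear',
                    padding_mode='zeros', align_corners=False)
print(ref.shape)                               # torch.Size([1, 3, 6, 8])
```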
// Copyright (c) OpenMMLab. All rights reserved
#include "modulated_deform_conv.h"
#include <cmath>
#include <vector>
#include "../ort_mmcv_utils.h"
float bilinear_interpolate_2d(const float *src, const int64_t src_h,
const int64_t src_w, const float h,
const float w) {
if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) {
return 0;
}
int64_t h_low = floor(h);
int64_t w_low = floor(w);
int64_t h_high = h_low + 1;
int64_t w_high = w_low + 1;
float lh = h - h_low;
float lw = w - w_low;
float hh = 1 - lh;
float hw = 1 - lw;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low];
float v2 = 0;
if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high];
float v3 = 0;
if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low];
float v4 = 0;
if (h_high <= src_h - 1 && w_high <= src_w - 1)
v4 = src[h_high * src_w + w_high];
float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
// output: (channels * kernel_h * kernel_w, dst_h * dst_w)
void deformable_im2col_2d(const float *input, const float *offset,
const float *mask, const int64_t src_h,
const int64_t src_w, const int64_t kernel_h,
const int64_t kernel_w, const int64_t pad_h,
const int64_t pad_w, const int64_t stride_h,
const int64_t stride_w, const int64_t dilation_h,
const int64_t dilation_w, const int64_t channels,
const int64_t offset_groups, const int64_t dst_h,
const int64_t dst_w, const bool use_mask,
float *columns) {
const int64_t workload = channels * dst_h * dst_w;
for (int64_t index = 0; index != workload; ++index) {
const int64_t ow = index % dst_w;
const int64_t oh = (index / dst_w) % dst_h;
const int64_t ic = index / (dst_w * dst_h);
const int64_t oc = ic * kernel_h * kernel_w;
int64_t c_per_offset_grp = channels / offset_groups;
const int64_t grp_idx = ic / c_per_offset_grp;
auto columns_ptr = columns + (oc * (dst_h * dst_w) + oh * dst_w + ow);
auto input_ptr = input + ic * (src_h * src_w);
auto offset_ptr =
offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w;
auto mask_ptr = mask;
if (use_mask) {
mask_ptr += grp_idx * kernel_h * kernel_w * dst_h * dst_w;
}
for (int64_t kh = 0; kh < kernel_h; ++kh) {
for (int64_t kw = 0; kw < kernel_w; ++kw) {
const int64_t mask_idx = kh * kernel_w + kw;
const int64_t offset_idx = 2 * mask_idx;
float mask_value = 1;
if (use_mask) {
mask_value = mask_ptr[mask_idx * (dst_h * dst_w) + oh * dst_w + ow];
}
const float offset_h =
offset_ptr[offset_idx * (dst_h * dst_w) + oh * dst_w + ow];
const float offset_w =
offset_ptr[(offset_idx + 1) * (dst_h * dst_w) + oh * dst_w + ow];
const float ih = (oh * stride_h - pad_h) + kh * dilation_h + offset_h;
const float iw = (ow * stride_w - pad_w) + kw * dilation_w + offset_w;
*columns_ptr = mask_value *
bilinear_interpolate_2d(input_ptr, src_h, src_w, ih, iw);
columns_ptr += dst_h * dst_w;
}
}
}
}
void gemm_ref_fp32(const float *A, const float *B, const float *V,
const float *H, const int32_t trans_A, const int32_t trans_B,
const int32_t M, const int32_t N, const int32_t K,
const float alpha, const float beta, float *Y) {
if (!trans_A && !trans_B) { // MK, KN; NN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && !trans_B) { // KM, KN; TN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && trans_B) { // KM, NK; TT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (!trans_A && trans_B) { // MK, NK; NT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
}
void deformable_conv2d_ref_fp32(
const float *src, const float *offset, const float *mask,
const float *filter, const float *bias, const int64_t batch,
const int64_t src_c, const int64_t src_h, const int64_t src_w,
const int64_t dst_c, const int64_t dst_h, const int64_t dst_w,
const int64_t group, const int64_t offset_group, const int64_t channels,
const int64_t num_output, const int64_t kernel_h, const int64_t kernel_w,
const int64_t stride_h, const int64_t stride_w, const int64_t pad_h,
const int64_t pad_w, const int64_t dilation_h, const int64_t dilation_w,
float *columns, float *dst) {
const int64_t ic_per_gp = channels / group;
const int64_t oc_per_gp = num_output / group;
for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col_2d(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, src_h,
src_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, ic_per_gp, offset_group, dst_h, dst_w,
mask != nullptr, columns);
float *dst_ptr =
dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
if (bias != nullptr) {
const float *bias_ptr = bias + g * oc_per_gp;
for (int64_t oc = 0; oc < oc_per_gp; ++oc) {
for (int64_t hw = 0; hw < dst_h * dst_w; ++hw) {
dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc];
}
}
} else {
memset(dst_ptr, 0.0f, sizeof(float) * oc_per_gp * dst_h * dst_w);
}
gemm_ref_fp32(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w,
columns, nullptr, dst_ptr, 0, 0, oc_per_gp, dst_h * dst_w,
ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr);
}
}
}
MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(
OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
std::vector<int64_t> stride =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
stride_height_ = stride[0];
stride_width_ = stride[1];
std::vector<int64_t> padding =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
padding_height_ = padding[0];
padding_width_ = padding[1];
std::vector<int64_t> dilation =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
dilation_height_ = dilation[0];
dilation_width_ = dilation[1];
deformable_group_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext *context) {
const int64_t stride_height = stride_height_;
const int64_t stride_width = stride_width_;
const int64_t padding_height = padding_height_;
const int64_t padding_width = padding_width_;
const int64_t dilation_height = dilation_height_;
const int64_t dilation_width = dilation_width_;
const int64_t deformable_group = deformable_group_;
const int64_t group = group_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
const float *offset_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
const OrtValue *mask = ort_.KernelContext_GetInput(context, 2);
const float *mask_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(mask));
const OrtValue *filter = ort_.KernelContext_GetInput(context, 3);
const float *filter_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
const OrtValue *bias = ort_.KernelContext_GetInput(context, 4);
const float *bias_data =
(bias != nullptr)
? reinterpret_cast<const float *>(ort_.GetTensorData<float>(bias))
: nullptr;
// const float *bias_data = nullptr;
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions filter_dims(ort_, filter);
int64_t batch = input_dims[0];
int64_t channels = input_dims[1];
int64_t in_height = input_dims[2];
int64_t in_width = input_dims[3];
int64_t num_output = filter_dims[0];
int64_t kernel_height = filter_dims[2];
int64_t kernel_width = filter_dims[3];
// get output memory
int64_t out_height = floor((in_height + 2 * padding_height -
dilation_height * (kernel_height - 1) - 1) /
stride_height +
1);
int64_t out_width = floor(
(in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) /
stride_width +
1);
std::vector<int64_t> output_dims = {batch, num_output, out_height, out_width};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
// allocate tmp memory
int64_t column_len = (channels / group) * kernel_height * kernel_width *
out_height * out_width;
float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
deformable_conv2d_ref_fp32(
input_data, offset_data, mask_data, filter_data, bias_data, batch,
channels, in_height, in_width, num_output, out_height, out_width, group,
deformable_group, channels, num_output, kernel_height, kernel_width,
stride_height, stride_width, padding_height, padding_width,
dilation_height, dilation_width, columns, out_ptr);
}
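
The column buffer and the per-group GEMM above follow the usual im2col shapes; a short Python sketch of the bookkeeping (the concrete numbers are illustrative assumptions):

```python
# Shape bookkeeping for the per-group GEMM above.
# columns: ((channels // group) * kh * kw, out_h * out_w)
# filter (per group): (num_output // group, (channels // group) * kh * kw)
# each GEMM therefore produces one group's (oc_per_gp, out_h * out_w) output slice.
channels, num_output, group = 64, 128, 2
kh = kw = 3
out_h = out_w = 32

ic_per_gp, oc_per_gp = channels // group, num_output // group
columns_shape = (ic_per_gp * kh * kw, out_h * out_w)      # (288, 1024)
filter_shape = (oc_per_gp, ic_per_gp * kh * kw)           # (64, 288)
out_slice_shape = (oc_per_gp, out_h * out_w)              # (64, 1024)
print(columns_shape, filter_shape, out_slice_shape)
```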
// Copyright (c) OpenMMLab. All rights reserved
#include "nms.h"
#include <assert.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <iterator>
#include <numeric> // std::iota
#include <vector>
#include "../ort_mmcv_utils.h"
NmsKernel::NmsKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
offset_ = ort_.KernelInfoGetAttribute<int64_t>(info, "offset");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void NmsKernel::Compute(OrtKernelContext *context) {
const float iou_threshold = iou_threshold_;
const int64_t offset = offset_;
const OrtValue *boxes = ort_.KernelContext_GetInput(context, 0);
const float *boxes_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(boxes));
const OrtValue *scores = ort_.KernelContext_GetInput(context, 1);
const float *scores_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(scores));
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
int64_t nboxes = boxes_dim[0];
assert(boxes_dim[1] == 4);
// allocate tmp memory
float *tmp_boxes = (float *)allocator_.Alloc(sizeof(float) * nboxes * 4);
float *sc = (float *)allocator_.Alloc(sizeof(float) * nboxes);
float *areas = (float *)allocator_.Alloc(sizeof(float) * nboxes);
bool *select = (bool *)allocator_.Alloc(sizeof(bool) * nboxes);
for (int64_t i = 0; i < nboxes; i++) {
select[i] = true;
}
memcpy(tmp_boxes, boxes_data, sizeof(float) * nboxes * 4);
memcpy(sc, scores_data, sizeof(float) * nboxes);
// sort scores
std::vector<float> tmp_sc;
for (int i = 0; i < nboxes; i++) {
tmp_sc.push_back(sc[i]);
}
std::vector<int64_t> order(tmp_sc.size());
std::iota(order.begin(), order.end(), 0);
std::sort(order.begin(), order.end(), [&tmp_sc](int64_t id1, int64_t id2) {
return tmp_sc[id1] > tmp_sc[id2];
});
// area = (x2 - x1 + offset) * (y2 - y1 + offset)
for (int64_t i = 0; i < nboxes; i++) {
areas[i] = (tmp_boxes[i * 4 + 2] - tmp_boxes[i * 4 + 0] + offset) *
(tmp_boxes[i * 4 + 3] - tmp_boxes[i * 4 + 1] + offset);
}
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = tmp_boxes[i * 4 + 0];
auto iy1 = tmp_boxes[i * 4 + 1];
auto ix2 = tmp_boxes[i * 4 + 2];
auto iy2 = tmp_boxes[i * 4 + 3];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, tmp_boxes[j * 4 + 0]);
auto yy1 = std::max(iy1, tmp_boxes[j * 4 + 1]);
auto xx2 = std::min(ix2, tmp_boxes[j * 4 + 2]);
auto yy2 = std::min(iy2, tmp_boxes[j * 4 + 3]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold) select[_j] = false;
}
}
std::vector<int64_t> res_order;
for (int i = 0; i < nboxes; i++) {
if (select[i]) {
res_order.push_back(order[i]);
}
}
std::vector<int64_t> inds_dims({res_order.size()});
OrtValue *res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(),
inds_dims.size());
int64_t *res_data = ort_.GetTensorMutableData<int64_t>(res);
memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
}
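
A compact NumPy reference of the same greedy NMS loop, with the `offset` term entering the width and height exactly as above (illustrative only):

```python
# NumPy reference for the greedy NMS above: boxes are (x1, y1, x2, y2) rows,
# and the returned indices are kept boxes in descending-score order.
import numpy as np

def nms_ref(boxes, scores, iou_threshold, offset=0):
    order = scores.argsort()[::-1]
    areas = (boxes[:, 2] - boxes[:, 0] + offset) * (boxes[:, 3] - boxes[:, 1] + offset)
    suppressed = np.zeros(len(boxes), dtype=bool)   # indexed by sorted position
    keep = []
    for _i, i in enumerate(order):
        if suppressed[_i]:
            continue
        keep.append(i)
        for _j in range(_i + 1, len(order)):
            if suppressed[_j]:
                continue
            j = order[_j]
            xx1 = max(boxes[i, 0], boxes[j, 0])
            yy1 = max(boxes[i, 1], boxes[j, 1])
            xx2 = min(boxes[i, 2], boxes[j, 2])
            yy2 = min(boxes[i, 3], boxes[j, 3])
            inter = max(0.0, xx2 - xx1 + offset) * max(0.0, yy2 - yy1 + offset)
            if inter / (areas[i] + areas[j] - inter) > iou_threshold:
                suppressed[_j] = True
    return np.array(keep, dtype=np.int64)
```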
// Copyright (c) OpenMMLab. All rights reserved
#include "onnxruntime_register.h"
#include "corner_pool.h"
#include "deform_conv.h"
#include "grid_sample.h"
#include "modulated_deform_conv.h"
#include "nms.h"
#include "ort_mmcv_utils.h"
#include "reduce_ops.h"
#include "roi_align.h"
#include "roi_align_rotated.h"
#include "rotated_feature_align.h"
#include "soft_nms.h"
const char *c_MMCVOpDomain = "mmcv";
SoftNmsOp c_SoftNmsOp;
NmsOp c_NmsOp;
MMCVRoiAlignCustomOp c_MMCVRoiAlignCustomOp;
MMCVRoIAlignRotatedCustomOp c_MMCVRoIAlignRotatedCustomOp;
MMCVRotatedFeatureAlignCustomOp c_MMCVRotatedFeatureAlignCustomOp;
GridSampleOp c_GridSampleOp;
MMCVCumMaxCustomOp c_MMCVCumMaxCustomOp;
MMCVCumMinCustomOp c_MMCVCumMinCustomOp;
MMCVCornerPoolCustomOp c_MMCVCornerPoolCustomOp;
MMCVModulatedDeformConvOp c_MMCVModulatedDeformConvOp;
MMCVDeformConvOp c_MMCVDeformConvOp;
OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
const OrtApiBase *api) {
OrtCustomOpDomain *domain = nullptr;
const OrtApi *ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain(c_MMCVOpDomain, &domain)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_NmsOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVRoiAlignCustomOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVRoIAlignRotatedCustomOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_GridSampleOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVCornerPoolCustomOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVCumMaxCustomOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVCumMinCustomOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVModulatedDeformConvOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVDeformConvOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(
domain, &c_MMCVRotatedFeatureAlignCustomOp)) {
return status;
}
return ortApi->AddCustomOpDomain(options, domain);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "reduce_ops.h"
#include <assert.h>
#include <vector>
#include "../ort_mmcv_utils.h"
// modified from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp
static inline int64_t maybe_wrap_dim(int64_t dim, int64_t ndims) {
int64_t min = -ndims;
int64_t max = ndims - 1;
assert(dim >= min && dim <= max);
if (dim < 0) dim += ndims;
return dim;
}
static inline int64_t get_dim_stride(const int64_t dim, const int64_t ndims,
const int64_t *reversed_dim_cumprod) {
return dim == ndims - 1 ? 1 : reversed_dim_cumprod[dim + 1];
}
static inline int64_t get_dim_size(const int64_t dim, const int64_t ndims,
const int64_t *reversed_dim_cumprod) {
return dim == ndims - 1
? reversed_dim_cumprod[dim]
: reversed_dim_cumprod[dim] / reversed_dim_cumprod[dim + 1];
}
template <typename T1, typename T2, typename Operation>
void cummax_cummin_helper(const T1 *input, T1 *output, T2 *indices,
const int64_t input_dim_size, const int64_t stride) {
Operation op;
T1 out = input[0];
int64_t idx = 0;
for (int64_t i = 0; i < input_dim_size; i++) {
T1 curr_elem = input[i * stride];
if (op(curr_elem, out)) {
out = curr_elem;
idx = i;
}
output[i * stride] = out;
indices[i * stride] = idx;
}
}
// modified `tensor_dim_apply3` from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorDimApply.h.
// the difference is that: (1) use `reversed_dim_cumprod` for fast computing of
// tensor `size` and `stride`. (2) the same `stride` is used for input, output,
// and indices, since it's unnecessary to use separate values. currently
// `tensor_dim_apply3` is only used for `cummax` and `cummin`, according to the
// official pytorch projects: https://github.com/pytorch/pytorch.
template <typename T1, typename T2, typename Function>
void tensor_dim_apply3(const T1 *input, T1 *output, T2 *indices,
const int64_t dim, const int64_t ndims,
const int64_t *reversed_dim_cumprod, Function func) {
int dim_apply_finished = 0;
int64_t input_dim_size = get_dim_size(dim, ndims, reversed_dim_cumprod);
// the same stride is used for input, output and indices
int64_t stride = get_dim_stride(dim, ndims, reversed_dim_cumprod);
std::vector<int64_t> counter(ndims, 0);
while (!dim_apply_finished) {
// call `func` once to update output and indices
func(input, output, indices, input_dim_size, stride);
if (ndims == 1) break;
for (int64_t dim_i = 0; dim_i < ndims; dim_i++) {
if (dim_i == dim) {
if (dim_i == (ndims - 1)) {
dim_apply_finished = 1;
break;
}
continue;
}
counter[dim_i]++;
// the same stride is used for input, output, and indices
int64_t stride_dim_i = get_dim_stride(dim_i, ndims, reversed_dim_cumprod);
input += stride_dim_i;
output += stride_dim_i;
indices += stride_dim_i;
if (counter[dim_i] == get_dim_size(dim_i, ndims, reversed_dim_cumprod)) {
if (dim_i == ndims - 1) {
dim_apply_finished = 1;
break;
} else {
input -= counter[dim_i] * stride_dim_i;
output -= counter[dim_i] * stride_dim_i;
indices -= counter[dim_i] * stride_dim_i;
counter[dim_i] = 0;
}
} else {
break;
} // if
} // for
} // while
}
template <typename T1, typename T2, typename Operation>
void CumMax_CumMin_CPU(const T1 *input, T1 *output, T2 *indices,
int64_t *reversed_dim_cumprod, const int64_t dim,
const OrtTensorDimensions &out_dimensions) {
// calculate numel
const int64_t ndims = out_dimensions.size();
int64_t numel = 1;
for (int64_t dim_i = 0; dim_i < ndims; dim_i++) {
numel *= out_dimensions.data()[dim_i];
}
// cummax is only applied to input which is non-zero dim and non-empty
if (numel) {
// compute the cumulative production on dimension size,
// which is then used for computing the stride or size of a specific `dim`.
reversed_dim_cumprod[ndims - 1] = out_dimensions.data()[ndims - 1];
for (int64_t dim_i = ndims - 2; dim_i >= 0; dim_i--) {
reversed_dim_cumprod[dim_i] =
reversed_dim_cumprod[dim_i + 1] * out_dimensions.data()[dim_i];
}
// do cummax or cummin based on `Operation` type
tensor_dim_apply3<float, int64_t>(
input, output, indices, dim, ndims, reversed_dim_cumprod,
cummax_cummin_helper<float, int64_t, Operation>);
}
}
void MMCVCumMaxKernel::Compute(OrtKernelContext *context) {
// get input
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
// get output
OrtTensorDimensions out_dimensions(ort_, input);
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *output_data = ort_.GetTensorMutableData<float>(output);
OrtValue *indices = ort_.KernelContext_GetOutput(
context, 1, out_dimensions.data(), out_dimensions.size());
int64_t *indices_data = ort_.GetTensorMutableData<int64_t>(indices);
// allocate tmp memory for computing the cumulative production on dimension
// size
const int64_t ndims = out_dimensions.size();
assert(ndims > 0);
int64_t *reversed_dim_cumprod =
(int64_t *)allocator_.Alloc(sizeof(int64_t) * ndims);
// dim should be wrapped if it's negative (e.g. -1)
const int64_t dim = maybe_wrap_dim(dim_, ndims);
CumMax_CumMin_CPU<float, int64_t, std::greater_equal<float>>(
input_data, output_data, indices_data, reversed_dim_cumprod, dim,
out_dimensions);
}
void MMCVCumMinKernel::Compute(OrtKernelContext *context) {
// get input
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
// get output
OrtTensorDimensions out_dimensions(ort_, input);
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *output_data = ort_.GetTensorMutableData<float>(output);
OrtValue *indices = ort_.KernelContext_GetOutput(
context, 1, out_dimensions.data(), out_dimensions.size());
int64_t *indices_data = ort_.GetTensorMutableData<int64_t>(indices);
// allocate tmp memory for computing the cumulative production on dimension
// size
const int64_t ndims = out_dimensions.size();
assert(ndims > 0);
int64_t *reversed_dim_cumprod =
(int64_t *)allocator_.Alloc(sizeof(int64_t) * ndims);
// dim should be wrapped if it's negative (e.g. -1)
const int64_t dim = maybe_wrap_dim(dim_, ndims);
CumMax_CumMin_CPU<float, int64_t, std::less_equal<float>>(
input_data, output_data, indices_data, reversed_dim_cumprod, dim,
out_dimensions);
}
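
These two kernels mirror `torch.cummax` / `torch.cummin`, which return the running extreme and its index along a dimension; a quick reference (note that the C++ comparators are `>=`/`<=`, so on ties the reported index may differ from PyTorch's):

```python
# Reference behaviour for the CumMax/CumMin kernels above.
import torch

x = torch.tensor([[1., 3., 2., 5., 4.]])
values, indices = torch.cummax(x, dim=1)
print(values)   # tensor([[1., 3., 3., 5., 5.]])
print(indices)  # tensor([[0, 1, 1, 3, 3]])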
// Copyright (c) OpenMMLab. All rights reserved
#include "roi_align.h"
#include "../ort_mmcv_utils.h"
// implementation taken from Caffe2
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
float w1;
float w2;
float w3;
float w4;
};
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, std::vector<PreCalc> &pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const float yy =
roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const float xx = roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
float x = xx;
float y = yy;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (float)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (float)x_low;
} else {
x_high = x_low + 1;
}
float ly = y - y_low;
float lx = x - x_low;
float hy = 1. - ly, hx = 1. - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
void ROIAlignForwardCPU(const int nthreads, const float *input,
const float *rois, float *output, float *argmax_y,
float *argmax_x, const int pooled_height,
const int pooled_width, const float spatial_scale,
const int sampling_ratio,
const int pool_mode, // 0 - max pool, 1 - avg pool
const bool aligned, const int channels,
const int height, const int width) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const float *offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
float offset = aligned ? (float)0.5 : (float)0.0;
float roi_start_w = offset_rois[1] * spatial_scale - offset;
float roi_start_h = offset_rois[2] * spatial_scale - offset;
float roi_end_w = offset_rois[3] * spatial_scale - offset;
float roi_end_h = offset_rois[4] * spatial_scale - offset;
float roi_width = roi_end_w - roi_start_w;
float roi_height = roi_end_h - roi_start_h;
if (aligned) {
// RoIs must not have negative size when `aligned` is true
assert(roi_width >= 0 && roi_height >= 0);
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (float)1.);
roi_height = std::max(roi_height, (float)1.);
}
float bin_size_h =
static_cast<float>(roi_height) / static_cast<float>(pooled_height);
float bin_size_w =
static_cast<float>(roi_width) / static_cast<float>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// when the sampling grid is empty, clamp count to 1 so the average is 0 instead of NaN
const float count =
std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const float *offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
float output_val = 0.;
float maxval = -10000;
float maxidx_y = -1.f, maxidx_x = -1.f;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const float y = roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const float x = roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
PreCalc pc = pre_calc[pre_calc_index];
float val = pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] +
pc.w4 * offset_input[pc.pos4];
if (val > maxval) {
maxval = val;
maxidx_y = y;
maxidx_x = x;
}
output_val += val;
pre_calc_index += 1;
}
}
if (pool_mode == 0) {
// We do max pooling inside a bin
output[index] = maxval;
argmax_y[index] = maxidx_y;
argmax_x[index] = maxidx_x;
} else if (pool_mode == 1) {
// We do average (integral) pooling inside a bin
output[index] = output_val / count;
} // if
} // for pw
} // for ph
} // for c
} // for n
}
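// ONNX Runtime kernel: read the feature map and RoIs, shape the output as
// [num_rois, channels, aligned_height, aligned_width] and run the CPU
// forward defined above.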
void MMCVRoiAlignKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
const float *X_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
const float *rois = reinterpret_cast<const float *>(
ort_.GetTensorData<const float *>(input_rois));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_X);
OrtTensorDimensions roi_dimensions(ort_, input_rois);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
out_dimensions.data()[0] = roi_dimensions.data()[0];
out_dimensions.data()[2] = aligned_height_;
out_dimensions.data()[3] = aligned_width_;
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// compute the total number of output elements
int output_size = out_dimensions.data()[0];
for (auto i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
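// map the string attribute onto the integer convention of ROIAlignForwardCPU:
// 0 - max pool, 1 - avg pool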
int poolMod = 1;
if (pool_mode_ == "max") poolMod = 0;
float *argmax_x = nullptr, *argmax_y = nullptr;
if (poolMod == 0) {
argmax_y = new float[output_size];
argmax_x = new float[output_size];
}
ROIAlignForwardCPU(output_size, X_data, rois, out, argmax_y, argmax_x,
aligned_height_, aligned_width_, spatial_scale_,
sampling_ratio_, poolMod, aligned_, input_channels,
input_height, input_width);
if (argmax_x) delete[] argmax_x;
if (argmax_y) delete[] argmax_y;
}
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "roi_align_rotated.h"
#include "../ort_mmcv_utils.h"
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
float w1;
float w2;
float w3;
float w4;
};
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, float roi_center_h,
float roi_center_w, float cos_theta, float sin_theta,
std::vector<PreCalc> &pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const float yy =
roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const float xx = roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
float y = yy * cos_theta - xx * sin_theta + roi_center_h;
float x = yy * sin_theta + xx * cos_theta + roi_center_w;
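// (x, y) is now the sampling location in feature-map coordinates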
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (float)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (float)x_low;
} else {
x_high = x_low + 1;
}
float ly = y - y_low;
float lx = x - x_low;
float hy = 1. - ly, hx = 1. - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
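// Each rotated RoI occupies 6 values:
// [batch_index, center_x, center_y, width, height, theta]; theta is in
// radians and is negated when `clockwise` is set.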
void ROIAlignRotatedForwardCPU(const int nthreads, const float *input,
const float *rois, float *output,
const float &spatial_scale, const int aligned,
const int clockwise, const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const int sampling_ratio) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const float *current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
float offset = aligned ? (float)0.5 : (float)0.0;
float roi_center_w = current_roi[1] * spatial_scale - offset;
float roi_center_h = current_roi[2] * spatial_scale - offset;
float roi_width = current_roi[3] * spatial_scale;
float roi_height = current_roi[4] * spatial_scale;
// float theta = current_roi[5] * M_PI / 180.0;
float theta = current_roi[5]; // Radian angle by default
if (clockwise) {
theta = -theta;
}
float cos_theta = cos(theta);
float sin_theta = sin(theta);
if (!aligned) { // for backward-compatibility only
roi_width = std::max(roi_width, (float)1.);
roi_height = std::max(roi_height, (float)1.);
}
float bin_size_h =
static_cast<float>(roi_height) / static_cast<float>(pooled_height);
float bin_size_w =
static_cast<float>(roi_width) / static_cast<float>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const float count =
std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
float roi_start_h = -roi_height / 2.0;
float roi_start_w = -roi_width / 2.0;
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,
sin_theta, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const float *offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
float output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] +
pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
const float *X_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
const float *rois = reinterpret_cast<const float *>(
ort_.GetTensorData<const float *>(input_rois));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_X);
OrtTensorDimensions roi_dimensions(ort_, input_rois);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
out_dimensions.data()[0] = roi_dimensions.data()[0];
out_dimensions.data()[2] = aligned_height_;
out_dimensions.data()[3] = aligned_width_;
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// compute the total number of output elements
int output_size = out_dimensions.data()[0];
for (auto i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_,
aligned_, clockwise_, input_channels, input_height,
input_width, aligned_height_, aligned_width_,
sampling_ratio_);
}
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "rotated_feature_align.h"
#include "../ort_mmcv_utils.h"
template <typename T>
T bilinear_interpolate(const T *input, const int height, const int width, T y,
T x, const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = input[int(fma(y_low, width, x_low))];
T v2 = input[int(fma(y_low, width, x_high))];
T v3 = input[int(fma(y_high, width, x_low))];
T v4 = input[int(fma(y_high, width, x_high))];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
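// For every output element, sample the input at `points` locations of the
// best rotated box assigned to that spatial position (its center when
// points == 1, plus the four corners when points == 5) and add the
// interpolated values to the original feature value.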
template <typename scalar_t>
void rotated_feature_align_forward_cpu_kernel(
const int nthreads, const int points, const scalar_t *bottom_data,
const scalar_t *best_bboxes, const scalar_t spatial_scale,
const int channels, const int height, const int width, scalar_t *top_data) {
for (int index = 0; index < nthreads; index++) {
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
const scalar_t *bbox_offset =
best_bboxes + ((n * height + h) * width + w) * 5;
scalar_t roi_y = bbox_offset[0] * spatial_scale;
scalar_t roi_x = bbox_offset[1] * spatial_scale;
scalar_t px[5] = {roi_x, 0, 0, 0, 0};
scalar_t py[5] = {roi_y, 0, 0, 0, 0};
if (points > 1) {
scalar_t roi_w = bbox_offset[2] * spatial_scale;
scalar_t roi_h = bbox_offset[3] * spatial_scale;
scalar_t roi_a = bbox_offset[4];
scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
scalar_t wx = cosa * w_2, wy = sina * w_2;
scalar_t hx = -sina * h_2, hy = cosa * h_2;
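// the four corners are center +/- the width-axis vector (wx, wy)
// +/- the height-axis vector (hx, hy)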
px[1] = roi_x + wx + hx;
py[1] = roi_y + wy + hy;
px[2] = roi_x - wx + hx;
py[2] = roi_y - wy + hy;
px[3] = roi_x - wx - hx;
py[3] = roi_y - wy - hy;
px[4] = roi_x + wx - hx;
py[4] = roi_y + wy - hy;
}
const scalar_t *offset_bottom_data =
bottom_data + (n * channels + c) * height * width;
scalar_t output_val = bottom_data[index];
for (int i = 0; i < points; i++) {
output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
width, py[i], px[i], i);
}
top_data[index] = output_val;
}
}
void MMCVRotatedFeatureAlignKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_features = ort_.KernelContext_GetInput(context, 0);
const float *features_data = reinterpret_cast<const float *>(
ort_.GetTensorData<float>(input_features));
const OrtValue *input_best_rbboxes = ort_.KernelContext_GetInput(context, 1);
const float *best_rbboxes = reinterpret_cast<const float *>(
ort_.GetTensorData<const float *>(input_best_rbboxes));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_features);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// compute the total number of output elements
int output_size = out_dimensions.data()[0];
for (auto i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
rotated_feature_align_forward_cpu_kernel<float>(
output_size, points_, features_data, best_rbboxes, spatial_scale_,
input_channels, input_height, input_width, out);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "soft_nms.h"
#include <assert.h>
#include <algorithm>
#include <cmath>
#include "../ort_mmcv_utils.h"
SoftNmsKernel::SoftNmsKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
sigma_ = ort_.KernelInfoGetAttribute<float>(info, "sigma");
min_score_ = ort_.KernelInfoGetAttribute<float>(info, "min_score");
method_ = ort_.KernelInfoGetAttribute<int64_t>(info, "method");
offset_ = ort_.KernelInfoGetAttribute<int64_t>(info, "offset");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void SoftNmsKernel::Compute(OrtKernelContext *context) {
typedef float T;
const T iou_threshold = T(iou_threshold_);
const T sigma = T(sigma_);
const T min_score = T(min_score_);
const int method = int(method_);
const T offset = T(offset_);
const OrtValue *boxes = ort_.KernelContext_GetInput(context, 0);
const T *boxes_data =
reinterpret_cast<const float *>(ort_.GetTensorData<T>(boxes));
const OrtValue *scores = ort_.KernelContext_GetInput(context, 1);
const T *scores_data =
reinterpret_cast<const float *>(ort_.GetTensorData<T>(scores));
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
int64_t nboxes = boxes_dim[0];
assert(boxes_dim[1] == 4);
// allocate tmp memory
T *tmp_boxes = (T *)allocator_.Alloc(sizeof(T) * nboxes * 4);
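// tmp_boxes holds interleaved [x1, y1, x2, y2] rows; the x1..y2 pointers
// below alias it and are indexed with a stride of 4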
T *x1 = tmp_boxes;
T *y1 = tmp_boxes + 1;
T *x2 = tmp_boxes + 2;
T *y2 = tmp_boxes + 3;
T *sc = (T *)allocator_.Alloc(sizeof(T) * nboxes);
T *areas = (T *)allocator_.Alloc(sizeof(T) * nboxes);
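// de collects the kept detections as [x1, y1, x2, y2, score] rows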
T *de = (T *)allocator_.Alloc(sizeof(T) * nboxes * 5);
int64_t *inds = (int64_t *)allocator_.Alloc(sizeof(int64_t) * nboxes);
memcpy(tmp_boxes, boxes_data, sizeof(T) * nboxes * 4);
memcpy(sc, scores_data, sizeof(T) * nboxes);
// init inds as arange(nboxes)
std::generate(inds, inds + nboxes, [n = 0]() mutable { return n++; });
// area = (x2-x1+offset)*(y2-y1+offset)
for (int64_t i = 0; i < nboxes; i++) {
areas[i] =
(x2[i * 4] - x1[i * 4] + offset) * (y2[i * 4] - y1[i * 4] + offset);
}
int64_t pos = 0;
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < sc[pos]) {
max_score = sc[pos];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = x1[max_pos * 4];
auto iy1 = de[i * 5 + 1] = y1[max_pos * 4];
auto ix2 = de[i * 5 + 2] = x2[max_pos * 4];
auto iy2 = de[i * 5 + 3] = y2[max_pos * 4];
auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = areas[max_pos];
auto iind = inds[max_pos];
x1[max_pos * 4] = x1[i * 4];
y1[max_pos * 4] = y1[i * 4];
x2[max_pos * 4] = x2[i * 4];
y2[max_pos * 4] = y2[i * 4];
sc[max_pos] = sc[i];
areas[max_pos] = areas[i];
inds[max_pos] = inds[i];
x1[i * 4] = ix1;
y1[i * 4] = iy1;
x2[i * 4] = ix2;
y2[i * 4] = iy2;
sc[i] = iscore;
areas[i] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = std::max(ix1, x1[pos * 4]);
auto yy1 = std::max(iy1, y1[pos * 4]);
auto xx2 = std::min(ix2, x2[pos * 4]);
auto yy2 = std::min(iy2, y2[pos * 4]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[pos] - inter);
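// decay the score of box `pos` depending on the method:
// 0 - naive (hard) NMS, 1 - linear, 2 - gaussian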
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = std::exp(-(ovr * ovr) / sigma);
}
sc[pos] *= weight;
// if box score falls below threshold, discard the box by
// swapping with last box update N
if (sc[pos] < min_score) {
x1[pos * 4] = x1[(nboxes - 1) * 4];
y1[pos * 4] = y1[(nboxes - 1) * 4];
x2[pos * 4] = x2[(nboxes - 1) * 4];
y2[pos * 4] = y2[(nboxes - 1) * 4];
sc[pos] = sc[nboxes - 1];
areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
std::vector<int64_t> dets_dim({nboxes, 5});
OrtValue *dets = ort_.KernelContext_GetOutput(context, 0, dets_dim.data(),
dets_dim.size());
T *dets_data = ort_.GetTensorMutableData<T>(dets);
std::vector<int64_t> inds_dim({nboxes});
OrtValue *inds_ov = ort_.KernelContext_GetOutput(context, 1, inds_dim.data(),
inds_dim.size());
int64_t *inds_data = ort_.GetTensorMutableData<int64_t>(inds_ov);
memcpy(dets_data, de, sizeof(T) * nboxes * 5);
memcpy(inds_data, inds, sizeof(int64_t) * nboxes);
}
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_DEFORM_CONV_H
#define ONNXRUNTIME_DEFORM_CONV_H
#include <onnxruntime_cxx_api.h>
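// Each custom op pairs a *Kernel struct (attributes + Compute) with a
// CustomOpBase registration struct that declares its name, input/output
// types and execution provider.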
struct MMCVDeformConvKernel {
MMCVDeformConvKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t stride_height_;
int64_t stride_width_;
int64_t padding_height_;
int64_t padding_width_;
int64_t dilation_height_;
int64_t dilation_width_;
int64_t deformable_group_;
int64_t group_;
int64_t im2col_step_;
};
struct MMCVDeformConvOp
: Ort::CustomOpBase<MMCVDeformConvOp, MMCVDeformConvKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new MMCVDeformConvKernel(api, info);
}
const char *GetName() const { return "MMCVDeformConv2d"; };
size_t GetInputTypeCount() const { return 3; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(
size_t index) const {
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
}
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_GRIDSAMPLE_H
#define ONNXRUNTIME_GRIDSAMPLE_H
#include <onnxruntime_cxx_api.h>
struct GridSampleKernel {
GridSampleKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t align_corners_;
int64_t interpolation_mode_;
int64_t padding_mode_;
};
struct GridSampleOp : Ort::CustomOpBase<GridSampleOp, GridSampleKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new GridSampleKernel(api, info);
};
const char *GetName() const { return "grid_sampler"; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#define ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#include <onnxruntime_cxx_api.h>
struct MMCVModulatedDeformConvKernel {
MMCVModulatedDeformConvKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t stride_height_;
int64_t stride_width_;
int64_t padding_height_;
int64_t padding_width_;
int64_t dilation_height_;
int64_t dilation_width_;
int64_t deformable_group_;
int64_t group_;
};
struct MMCVModulatedDeformConvOp
: Ort::CustomOpBase<MMCVModulatedDeformConvOp,
MMCVModulatedDeformConvKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new MMCVModulatedDeformConvKernel(api, info);
}
const char *GetName() const { return "MMCVModulatedDeformConv2d"; };
size_t GetInputTypeCount() const { return 5; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(
size_t index) const {
// The last input (index == 4) is optional, which is bias
if (index == 4)
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL;
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
}
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_NMS_H
#define ONNXRUNTIME_NMS_H
#include <onnxruntime_cxx_api.h>
struct NmsKernel {
NmsKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
float iou_threshold_;
int64_t offset_;
};
struct NmsOp : Ort::CustomOpBase<NmsOp, NmsKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new NmsKernel(api, info);
};
const char *GetName() const { return "NonMaxSuppression"; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t index) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_REGISTER_H
#define ONNXRUNTIME_REGISTER_H
#include <onnxruntime_c_api.h>
#ifdef __cplusplus
extern "C" {
#endif
OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
const OrtApiBase *api);
#ifdef __cplusplus
}
#endif
#endif // ONNXRUNTIME_REGISTER_H
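// A minimal usage sketch, not part of the original sources: it assumes the
// files above are built into the same binary (or library) so that
// RegisterCustomOps is linkable, that the include path
// "onnxruntime_register.h" matches the header above, and that
// "model_with_mmcv_ops.onnx" is a placeholder model exported with these
// custom ops (e.g. grid_sampler, NonMaxSuppression).
#include <onnxruntime_cxx_api.h>
#include "onnxruntime_register.h"
int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "mmcv_custom_ops_demo");
  Ort::SessionOptions session_options;
  // RegisterCustomOps is declared above; OrtGetApiBase() comes from the
  // ONNX Runtime C API, and Ort::SessionOptions converts to OrtSessionOptions*.
  OrtStatus *status = RegisterCustomOps(session_options, OrtGetApiBase());
  if (status != nullptr) {
    Ort::GetApi().ReleaseStatus(status);
    return 1;
  }
  // Creating the session resolves the custom ops against the kernels
  // registered above.
  Ort::Session session(env, "model_with_mmcv_ops.onnx", session_options);
  return 0;
}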