OpenDAS / MMCV / Commits / 6f3c5f1c

Commit 6f3c5f1c, authored Jul 11, 2024 by limm
support v1.4.0
parent 6f674c7e
Changes: 339 files in total. This page shows 20 changed files with 1545 additions and 119 deletions (+1545, -119).
Files changed on this page:

mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp               +81   -0
mmcv/ops/csrc/onnxruntime/cpu/reduce_ops.cpp                         +188  -0
mmcv/ops/csrc/onnxruntime/cpu/roi_align.cpp                          +265  -0
mmcv/ops/csrc/onnxruntime/cpu/roi_align_rotated.cpp                  +247  -0
mmcv/ops/csrc/onnxruntime/cpu/soft_nms.cpp                           +156  -0
mmcv/ops/csrc/onnxruntime/deform_conv.h                              +57   -0
mmcv/ops/csrc/onnxruntime/grid_sample.h                              +44   -0
mmcv/ops/csrc/onnxruntime/modulated_deform_conv.h                    +61   -0
mmcv/ops/csrc/onnxruntime/nms.h                                      +45   -0
mmcv/ops/csrc/onnxruntime/onnxruntime_register.h                     +16   -0
mmcv/ops/csrc/onnxruntime/onnxruntime_session_options_config_keys.h  +44   -0
mmcv/ops/csrc/onnxruntime/ort_mmcv_utils.h                           +15   -0
mmcv/ops/csrc/onnxruntime/reduce_ops.h                               +95   -0
mmcv/ops/csrc/onnxruntime/roi_align.h                                +62   -0
mmcv/ops/csrc/onnxruntime/roi_align_rotated.h                        +62   -0
mmcv/ops/csrc/onnxruntime/soft_nms.h                                 +49   -0
mmcv/ops/csrc/parrots/active_rotated_filter.cpp                      +0    -28
mmcv/ops/csrc/parrots/active_rotated_filter_parrots.cpp              +0    -63
mmcv/ops/csrc/parrots/active_rotated_filter_pytorch.h                +0    -13
mmcv/ops/csrc/parrots/assign_score_withk.cpp                         +58   -15
mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#include "onnxruntime_register.h"

#include "corner_pool.h"
#include "deform_conv.h"
#include "grid_sample.h"
#include "modulated_deform_conv.h"
#include "nms.h"
#include "ort_mmcv_utils.h"
#include "reduce_ops.h"
#include "roi_align.h"
#include "roi_align_rotated.h"
#include "soft_nms.h"

const char *c_MMCVOpDomain = "mmcv";
SoftNmsOp c_SoftNmsOp;
NmsOp c_NmsOp;
MMCVRoiAlignCustomOp c_MMCVRoiAlignCustomOp;
MMCVRoIAlignRotatedCustomOp c_MMCVRoIAlignRotatedCustomOp;
GridSampleOp c_GridSampleOp;
MMCVCumMaxCustomOp c_MMCVCumMaxCustomOp;
MMCVCumMinCustomOp c_MMCVCumMinCustomOp;
MMCVCornerPoolCustomOp c_MMCVCornerPoolCustomOp;
MMCVModulatedDeformConvOp c_MMCVModulatedDeformConvOp;
MMCVDeformConvOp c_MMCVDeformConvOp;

OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                          const OrtApiBase *api) {
  OrtCustomOpDomain *domain = nullptr;
  const OrtApi *ortApi = api->GetApi(ORT_API_VERSION);

  if (auto status = ortApi->CreateCustomOpDomain(c_MMCVOpDomain, &domain)) {
    return status;
  }

  if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) {
    return status;
  }

  if (auto status = ortApi->CustomOpDomain_Add(domain, &c_NmsOp)) {
    return status;
  }

  if (auto status =
          ortApi->CustomOpDomain_Add(domain, &c_MMCVRoiAlignCustomOp)) {
    return status;
  }

  if (auto status =
          ortApi->CustomOpDomain_Add(domain, &c_MMCVRoIAlignRotatedCustomOp)) {
    return status;
  }

  if (auto status = ortApi->CustomOpDomain_Add(domain, &c_GridSampleOp)) {
    return status;
  }

  if (auto status =
          ortApi->CustomOpDomain_Add(domain, &c_MMCVCornerPoolCustomOp)) {
    return status;
  }

  if (auto status =
          ortApi->CustomOpDomain_Add(domain, &c_MMCVCumMaxCustomOp)) {
    return status;
  }

  if (auto status =
          ortApi->CustomOpDomain_Add(domain, &c_MMCVCumMinCustomOp)) {
    return status;
  }

  if (auto status =
          ortApi->CustomOpDomain_Add(domain, &c_MMCVModulatedDeformConvOp)) {
    return status;
  }

  if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVDeformConvOp)) {
    return status;
  }

  return ortApi->AddCustomOpDomain(options, domain);
}
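RegisterCustomOps above is the C entry point a client calls to make the "mmcv" op domain visible to a session. A minimal usage sketch, assuming the ONNX Runtime C++ API and a hypothetical model file model.onnx containing mmcv-domain nodes (Ort::SessionOptions converts implicitly to OrtSessionOptions*):

#include <onnxruntime_cxx_api.h>

#include "onnxruntime_register.h"

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "mmcv-demo");
  Ort::SessionOptions session_options;
  // Register the "mmcv" custom-op domain before creating the session, so that
  // nodes such as mmcv::SoftNonMaxSuppression or mmcv::MMCVRoiAlign resolve.
  Ort::ThrowOnError(RegisterCustomOps(session_options, OrtGetApiBase()));
  Ort::Session session(env, "model.onnx", session_options);  // hypothetical model
  return 0;
}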
mmcv/ops/csrc/onnxruntime/cpu/reduce_ops.cpp (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#include "reduce_ops.h"

#include <assert.h>

#include <vector>

#include "../ort_mmcv_utils.h"

// modified from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp
static inline int64_t maybe_wrap_dim(int64_t dim, int64_t ndims) {
  int64_t min = -ndims;
  int64_t max = ndims - 1;
  assert(dim >= min && dim <= max);
  if (dim < 0) dim += ndims;
  return dim;
}

static inline int64_t get_dim_stride(const int64_t dim, const int64_t ndims,
                                     const int64_t *reversed_dim_cumprod) {
  return dim == ndims - 1 ? 1 : reversed_dim_cumprod[dim + 1];
}

static inline int64_t get_dim_size(const int64_t dim, const int64_t ndims,
                                   const int64_t *reversed_dim_cumprod) {
  return dim == ndims - 1
             ? reversed_dim_cumprod[dim]
             : reversed_dim_cumprod[dim] / reversed_dim_cumprod[dim + 1];
}

template <typename T1, typename T2, typename Operation>
void cummax_cummin_helper(const T1 *input, T1 *output, T2 *indices,
                          const int64_t input_dim_size, const int64_t stride) {
  Operation op;
  T1 out = input[0];
  int64_t idx = 0;
  for (int64_t i = 0; i < input_dim_size; i++) {
    T1 curr_elem = input[i * stride];
    if (op(curr_elem, out)) {
      out = curr_elem;
      idx = i;
    }
    output[i * stride] = out;
    indices[i * stride] = idx;
  }
}

// modified `tensor_dim_apply3` from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorDimApply.h.
// the difference is that: (1) use `reversed_dim_cumprod` for fast computing of
// tensor `size` and `stride`. (2) the same `stride` is used for input, output,
// and indices, since it's unnecessary to use separate values. currently
// `tensor_dim_apply3` is only used for `cummax` and `cummin`, according to the
// official pytorch projects: https://github.com/pytorch/pytorch.
template <typename T1, typename T2, typename Function>
void tensor_dim_apply3(const T1 *input, T1 *output, T2 *indices,
                       const int64_t dim, const int64_t ndims,
                       const int64_t *reversed_dim_cumprod, Function func) {
  int dim_apply_finished = 0;
  int64_t input_dim_size = get_dim_size(dim, ndims, reversed_dim_cumprod);
  // the same stride is used for input, output and indices
  int64_t stride = get_dim_stride(dim, ndims, reversed_dim_cumprod);
  std::vector<int64_t> counter(ndims, 0);

  while (!dim_apply_finished) {
    // call `func` once to update output and indices
    func(input, output, indices, input_dim_size, stride);

    if (ndims == 1) break;

    // advance the counters over every dimension except `dim`
    for (int64_t dim_i = 0; dim_i < ndims; dim_i++) {
      if (dim_i == dim) {
        if (dim_i == (ndims - 1)) {
          dim_apply_finished = 1;
          break;
        }
        continue;
      }
      counter[dim_i]++;

      // the same stride is used for input, output, and indices
      int64_t stride_dim_i = get_dim_stride(dim_i, ndims, reversed_dim_cumprod);
      input += stride_dim_i;
      output += stride_dim_i;
      indices += stride_dim_i;

      if (counter[dim_i] == get_dim_size(dim_i, ndims, reversed_dim_cumprod)) {
        if (dim_i == ndims - 1) {
          dim_apply_finished = 1;
          break;
        } else {
          input -= counter[dim_i] * stride_dim_i;
          output -= counter[dim_i] * stride_dim_i;
          indices -= counter[dim_i] * stride_dim_i;
          counter[dim_i] = 0;
        }
      } else {
        break;
      }  // if
    }    // for
  }      // while
}

template <typename T1, typename T2, typename Operation>
void CumMax_CumMin_CPU(const T1 *input, T1 *output, T2 *indices,
                       int64_t *reversed_dim_cumprod, const int64_t dim,
                       const OrtTensorDimensions &out_dimensions) {
  // calculate numel
  const int64_t ndims = out_dimensions.size();
  int64_t numel = 1;
  for (int64_t dim_i = 0; dim_i < ndims; dim_i++) {
    numel *= out_dimensions.data()[dim_i];
  }

  // cummax is only applied to input which is non-zero dim and non-empty
  if (numel) {
    // compute the cumulative production on dimension size,
    // which is then used for computing the stride or size of a specific `dim`.
    reversed_dim_cumprod[ndims - 1] = out_dimensions.data()[ndims - 1];
    for (int64_t dim_i = ndims - 2; dim_i >= 0; dim_i--) {
      reversed_dim_cumprod[dim_i] =
          reversed_dim_cumprod[dim_i + 1] * out_dimensions.data()[dim_i];
    }

    // do cummax or cummin based on `Operation` type
    tensor_dim_apply3<float, int64_t>(
        input, output, indices, dim, ndims, reversed_dim_cumprod,
        cummax_cummin_helper<float, int64_t, Operation>);
  }
}

void MMCVCumMaxKernel::Compute(OrtKernelContext *context) {
  // get input
  const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
  const float *input_data =
      reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));

  // get output
  OrtTensorDimensions out_dimensions(ort_, input);
  OrtValue *output = ort_.KernelContext_GetOutput(
      context, 0, out_dimensions.data(), out_dimensions.size());
  float *output_data = ort_.GetTensorMutableData<float>(output);
  OrtValue *indices = ort_.KernelContext_GetOutput(
      context, 1, out_dimensions.data(), out_dimensions.size());
  int64_t *indices_data = ort_.GetTensorMutableData<int64_t>(indices);

  // allocate tmp memory for computing the cumulative production on dimension
  // size
  const int64_t ndims = out_dimensions.size();
  assert(ndims > 0);
  int64_t *reversed_dim_cumprod =
      (int64_t *)allocator_.Alloc(sizeof(int64_t) * ndims);

  // dim should be wrapped if it's negative (e.g. -1)
  const int64_t dim = maybe_wrap_dim(dim_, ndims);
  CumMax_CumMin_CPU<float, int64_t, std::greater_equal<float>>(
      input_data, output_data, indices_data, reversed_dim_cumprod, dim,
      out_dimensions);
}

void MMCVCumMinKernel::Compute(OrtKernelContext *context) {
  // get input
  const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
  const float *input_data =
      reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));

  // get output
  OrtTensorDimensions out_dimensions(ort_, input);
  OrtValue *output = ort_.KernelContext_GetOutput(
      context, 0, out_dimensions.data(), out_dimensions.size());
  float *output_data = ort_.GetTensorMutableData<float>(output);
  OrtValue *indices = ort_.KernelContext_GetOutput(
      context, 1, out_dimensions.data(), out_dimensions.size());
  int64_t *indices_data = ort_.GetTensorMutableData<int64_t>(indices);

  // allocate tmp memory for computing the cumulative production on dimension
  // size
  const int64_t ndims = out_dimensions.size();
  assert(ndims > 0);
  int64_t *reversed_dim_cumprod =
      (int64_t *)allocator_.Alloc(sizeof(int64_t) * ndims);

  // dim should be wrapped if it's negative (e.g. -1)
  const int64_t dim = maybe_wrap_dim(dim_, ndims);
  CumMax_CumMin_CPU<float, int64_t, std::less_equal<float>>(
      input_data, output_data, indices_data, reversed_dim_cumprod, dim,
      out_dimensions);
}
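The reversed cumulative product is what lets get_dim_stride and get_dim_size answer in O(1): for a contiguous tensor, reversed_dim_cumprod[d] counts the elements spanned by dimensions d..ndims-1. A small standalone check, using a hypothetical shape {2, 3, 4}:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t ndims = 3;
  const int64_t shape[3] = {2, 3, 4};
  int64_t reversed_dim_cumprod[3];
  reversed_dim_cumprod[ndims - 1] = shape[ndims - 1];  // {_, _, 4}
  for (int64_t i = ndims - 2; i >= 0; i--)
    reversed_dim_cumprod[i] = reversed_dim_cumprod[i + 1] * shape[i];  // {24, 12, 4}
  // stride of dim 1 in a contiguous layout is the product of trailing dims:
  assert(reversed_dim_cumprod[2] == 4);  // get_dim_stride(1, 3, ...)
  // size of dim 1 is recovered by dividing adjacent cumprod entries:
  assert(reversed_dim_cumprod[1] / reversed_dim_cumprod[2] == 3);
  return 0;
}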
mmcv/ops/csrc/onnxruntime/cpu/roi_align.cpp (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#include "roi_align.h"

#include "../ort_mmcv_utils.h"

// implementation taken from Caffe2
struct PreCalc {
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  float w1;
  float w2;
  float w3;
  float w4;
};

void pre_calc_for_bilinear_interpolate(
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int iy_upper, const int ix_upper,
    float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w,
    int roi_bin_grid_h, int roi_bin_grid_w, std::vector<PreCalc> &pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        const float yy =
            roi_start_h + ph * bin_size_h +
            static_cast<float>(iy + .5f) * bin_size_h /
                static_cast<float>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const float xx = roi_start_w + pw * bin_size_w +
                           static_cast<float>(ix + .5f) * bin_size_w /
                               static_cast<float>(roi_bin_grid_w);

          float x = xx;
          float y = yy;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty
            PreCalc pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (float)y_low;
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (float)x_low;
          } else {
            x_high = x_low + 1;
          }

          float ly = y - y_low;
          float lx = x - x_low;
          float hy = 1. - ly, hx = 1. - lx;
          float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

void ROIAlignForwardCPU(const int nthreads, const float *input,
                        const float *rois, float *output, float *argmax_y,
                        float *argmax_x, const int pooled_height,
                        const int pooled_width, const float spatial_scale,
                        const int sampling_ratio,
                        const int pool_mode,  // 0 - max pool, 1 - avg pool
                        const bool aligned, const int channels,
                        const int height, const int width) {
  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;

    const float *offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];

    // Do not use rounding; this implementation detail is critical
    float offset = aligned ? (float)0.5 : (float)0.0;
    float roi_start_w = offset_rois[1] * spatial_scale - offset;
    float roi_start_h = offset_rois[2] * spatial_scale - offset;
    float roi_end_w = offset_rois[3] * spatial_scale - offset;
    float roi_end_h = offset_rois[4] * spatial_scale - offset;

    float roi_width = roi_end_w - roi_start_w;
    float roi_height = roi_end_h - roi_start_h;
    if (aligned) {
      /*AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "ROIs in ROIAlign cannot have non-negative size!");*/
      assert(roi_width >= 0 && roi_height >= 0);
    } else {
      // for backward-compatibility only
      roi_width = std::max(roi_width, (float)1.);
      roi_height = std::max(roi_height, (float)1.);
    }

    float bin_size_h =
        static_cast<float>(roi_height) / static_cast<float>(pooled_height);
    float bin_size_w =
        static_cast<float>(roi_width) / static_cast<float>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // When the grid is empty, output zeros == 0/1, instead of NaN.
    const float count =
        std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    // we want to precalculate indices and weights shared by all channels,
    // this is the key point of optimization
    std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
                                  pooled_width * pooled_height);
    pre_calc_for_bilinear_interpolate(
        height, width, pooled_height, pooled_width, roi_bin_grid_h,
        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
        roi_bin_grid_h, roi_bin_grid_w, pre_calc);

    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      const float *offset_input =
          input + (roi_batch_ind * channels + c) * height * width;
      int pre_calc_index = 0;

      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;

          float output_val = 0.;
          float maxval = -10000;
          float maxidx_y = -1.f, maxidx_x = -1.f;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            const float y = roi_start_h + ph * bin_size_h +
                            static_cast<float>(iy + .5f) * bin_size_h /
                                static_cast<float>(roi_bin_grid_h);
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              const float x = roi_start_w + pw * bin_size_w +
                              static_cast<float>(ix + .5f) * bin_size_w /
                                  static_cast<float>(roi_bin_grid_w);
              PreCalc pc = pre_calc[pre_calc_index];
              float val = pc.w1 * offset_input[pc.pos1] +
                          pc.w2 * offset_input[pc.pos2] +
                          pc.w3 * offset_input[pc.pos3] +
                          pc.w4 * offset_input[pc.pos4];
              if (val > maxval) {
                maxval = val;
                maxidx_y = y;
                maxidx_x = x;
              }
              output_val += val;
              pre_calc_index += 1;
            }
          }
          if (pool_mode == 0) {
            // We do max pooling inside a bin
            output[index] = maxval;
            argmax_y[index] = maxidx_y;
            argmax_x[index] = maxidx_x;
          } else if (pool_mode == 1) {
            // We do average (integral) pooling inside a bin
            output[index] = output_val / count;
          }  // if
        }    // for pw
      }      // for ph
    }        // for c
  }          // for n
}

void MMCVRoiAlignKernel::Compute(OrtKernelContext *context) {
  // Setup inputs
  const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
  const float *X_data =
      reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
  const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
  const float *rois = reinterpret_cast<const float *>(
      ort_.GetTensorData<const float *>(input_rois));

  // Setup output
  OrtTensorDimensions out_dimensions(ort_, input_X);
  OrtTensorDimensions roi_dimensions(ort_, input_rois);

  int batch_size = out_dimensions.data()[0];
  int input_channels = out_dimensions.data()[1];
  int input_height = out_dimensions.data()[2];
  int input_width = out_dimensions.data()[3];

  out_dimensions.data()[0] = roi_dimensions.data()[0];
  out_dimensions.data()[2] = aligned_height_;
  out_dimensions.data()[3] = aligned_width_;

  OrtValue *output = ort_.KernelContext_GetOutput(
      context, 0, out_dimensions.data(), out_dimensions.size());
  float *out = ort_.GetTensorMutableData<float>(output);
  OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
  ort_.ReleaseTensorTypeAndShapeInfo(output_info);

  // TODO: forward here
  int output_size = out_dimensions.data()[0];
  for (auto i = 1; i < out_dimensions.size(); ++i) {
    output_size *= out_dimensions.data()[i];
  }

  int poolMod = 1;
  if (pool_mode_ == "max") poolMod = 0;

  float *argmax_x = nullptr, *argmax_y = nullptr;
  if (poolMod == 0) {
    argmax_y = new float[output_size];
    argmax_x = new float[output_size];
  }

  ROIAlignForwardCPU(output_size, X_data, rois, out, argmax_y, argmax_x,
                     aligned_height_, aligned_width_, spatial_scale_,
                     sampling_ratio_, poolMod, aligned_, input_channels,
                     input_height, input_width);
  // the argmax buffers were allocated with new[], so release them with delete[]
  if (argmax_x) delete[] argmax_x;
  if (argmax_y) delete[] argmax_y;
}
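The four PreCalc weights are ordinary bilinear-interpolation coefficients; for any in-bounds sample point they sum to 1. A standalone worked example with a hypothetical sample point (y, x) = (1.3, 2.7) on an 8-pixel-wide feature map:

#include <cassert>
#include <cmath>

int main() {
  const int width = 8;  // feature-map width, used only for flat indexing
  float y = 1.3f, x = 2.7f;
  int y_low = (int)y, x_low = (int)x;          // (1, 2)
  int y_high = y_low + 1, x_high = x_low + 1;  // (2, 3)
  float ly = y - y_low, lx = x - x_low;        // (0.3, 0.7)
  float hy = 1.f - ly, hx = 1.f - lx;          // (0.7, 0.3)
  float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
  // weights: 0.21, 0.49, 0.09, 0.21 -- a convex combination of four corners
  assert(std::fabs(w1 + w2 + w3 + w4 - 1.f) < 1e-6f);
  int pos1 = y_low * width + x_low;    // flat offsets, as cached in PreCalc
  int pos4 = y_high * width + x_high;
  (void)pos1;
  (void)pos4;
  return 0;
}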
mmcv/ops/csrc/onnxruntime/cpu/roi_align_rotated.cpp (new file, mode 100644)

// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "roi_align_rotated.h"

#include "../ort_mmcv_utils.h"

struct PreCalc {
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  float w1;
  float w2;
  float w3;
  float w4;
};

void pre_calc_for_bilinear_interpolate(
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int iy_upper, const int ix_upper,
    float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w,
    int roi_bin_grid_h, int roi_bin_grid_w, float roi_center_h,
    float roi_center_w, float cos_theta, float sin_theta,
    std::vector<PreCalc> &pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        const float yy =
            roi_start_h + ph * bin_size_h +
            static_cast<float>(iy + .5f) * bin_size_h /
                static_cast<float>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const float xx = roi_start_w + pw * bin_size_w +
                           static_cast<float>(ix + .5f) * bin_size_w /
                               static_cast<float>(roi_bin_grid_w);

          // Rotate by theta around the center and translate
          // In image space, (y, x) is the order for Right Handed System,
          // and this is essentially multiplying the point by a rotation matrix
          // to rotate it counterclockwise through angle theta.
          float y = yy * cos_theta - xx * sin_theta + roi_center_h;
          float x = yy * sin_theta + xx * cos_theta + roi_center_w;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty
            PreCalc pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          if (y < 0) {
            y = 0;
          }
          if (x < 0) {
            x = 0;
          }

          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (float)y_low;
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (float)x_low;
          } else {
            x_high = x_low + 1;
          }

          float ly = y - y_low;
          float lx = x - x_low;
          float hy = 1. - ly, hx = 1. - lx;
          float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

void ROIAlignRotatedForwardCPU(const int nthreads, const float *input,
                               const float *rois, float *output,
                               const float &spatial_scale, const int aligned,
                               const int clockwise, const int channels,
                               const int height, const int width,
                               const int pooled_height, const int pooled_width,
                               const int sampling_ratio) {
  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;

    const float *current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];

    // Do not use rounding; this implementation detail is critical
    float offset = aligned ? (float)0.5 : (float)0.0;
    float roi_center_w = current_roi[1] * spatial_scale - offset;
    float roi_center_h = current_roi[2] * spatial_scale - offset;
    float roi_width = current_roi[3] * spatial_scale;
    float roi_height = current_roi[4] * spatial_scale;
    // float theta = current_roi[5] * M_PI / 180.0;
    float theta = current_roi[5];  // Radian angle by default
    if (clockwise) {
      theta = -theta;
    }
    float cos_theta = cos(theta);
    float sin_theta = sin(theta);

    if (!aligned) {
      // for backward-compatibility only
      roi_width = std::max(roi_width, (float)1.);
      roi_height = std::max(roi_height, (float)1.);
    }

    float bin_size_h =
        static_cast<float>(roi_height) / static_cast<float>(pooled_height);
    float bin_size_w =
        static_cast<float>(roi_width) / static_cast<float>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    const float count =
        std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    // we want to precalculate indices and weights shared by all channels,
    // this is the key point of optimization
    std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
                                  pooled_width * pooled_height);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    float roi_start_h = -roi_height / 2.0;
    float roi_start_w = -roi_width / 2.0;

    pre_calc_for_bilinear_interpolate(
        height, width, pooled_height, pooled_width, roi_bin_grid_h,
        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
        roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,
        sin_theta, pre_calc);

    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      const float *offset_input =
          input + (roi_batch_ind * channels + c) * height * width;
      int pre_calc_index = 0;

      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;

          float output_val = 0.;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc pc = pre_calc[pre_calc_index];
              output_val += pc.w1 * offset_input[pc.pos1] +
                            pc.w2 * offset_input[pc.pos2] +
                            pc.w3 * offset_input[pc.pos3] +
                            pc.w4 * offset_input[pc.pos4];
              pre_calc_index += 1;
            }
          }
          output_val /= count;
          output[index] = output_val;
        }  // for pw
      }    // for ph
    }      // for c
  }        // for n
}

void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext *context) {
  // Setup inputs
  const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
  const float *X_data =
      reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
  const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
  const float *rois = reinterpret_cast<const float *>(
      ort_.GetTensorData<const float *>(input_rois));

  // Setup output
  OrtTensorDimensions out_dimensions(ort_, input_X);
  OrtTensorDimensions roi_dimensions(ort_, input_rois);

  int batch_size = out_dimensions.data()[0];
  int input_channels = out_dimensions.data()[1];
  int input_height = out_dimensions.data()[2];
  int input_width = out_dimensions.data()[3];

  out_dimensions.data()[0] = roi_dimensions.data()[0];
  out_dimensions.data()[2] = aligned_height_;
  out_dimensions.data()[3] = aligned_width_;

  OrtValue *output = ort_.KernelContext_GetOutput(
      context, 0, out_dimensions.data(), out_dimensions.size());
  float *out = ort_.GetTensorMutableData<float>(output);
  OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
  ort_.ReleaseTensorTypeAndShapeInfo(output_info);

  // TODO: forward here
  int output_size = out_dimensions.data()[0];
  for (auto i = 1; i < out_dimensions.size(); ++i) {
    output_size *= out_dimensions.data()[i];
  }

  ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_,
                            aligned_, clockwise_, input_channels, input_height,
                            input_width, aligned_height_, aligned_width_,
                            sampling_ratio_);
}
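A small standalone check (hypothetical values) of the rotation used above: the offset (yy, xx) from the box center is rotated by theta and then translated by the center. With theta = 90 degrees, the offset (yy, xx) = (0, 1) maps to (-1, 0) before translation:

#include <cassert>
#include <cmath>

int main() {
  const float kPi = 3.14159265f;
  const float theta = kPi / 2.0f;  // 90 degrees
  const float cos_theta = std::cos(theta), sin_theta = std::sin(theta);
  const float roi_center_h = 10.f, roi_center_w = 20.f;  // hypothetical center
  const float yy = 0.f, xx = 1.f;                        // offset from center
  // same formulas as in pre_calc_for_bilinear_interpolate:
  float y = yy * cos_theta - xx * sin_theta + roi_center_h;  // 10 - 1 = 9
  float x = yy * sin_theta + xx * cos_theta + roi_center_w;  // 20 + 0 = 20
  assert(std::fabs(y - 9.f) < 1e-5f && std::fabs(x - 20.f) < 1e-5f);
  return 0;
}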
mmcv/ops/csrc/onnxruntime/cpu/soft_nms.cpp (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#include "soft_nms.h"

#include <assert.h>

#include <algorithm>
#include <cmath>

#include "../ort_mmcv_utils.h"

SoftNmsKernel::SoftNmsKernel(OrtApi api, const OrtKernelInfo *info)
    : api_(api), ort_(api_), info_(info) {
  iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
  sigma_ = ort_.KernelInfoGetAttribute<float>(info, "sigma");
  min_score_ = ort_.KernelInfoGetAttribute<float>(info, "min_score");
  method_ = ort_.KernelInfoGetAttribute<int64_t>(info, "method");
  offset_ = ort_.KernelInfoGetAttribute<int64_t>(info, "offset");

  // create allocator
  allocator_ = Ort::AllocatorWithDefaultOptions();
}

void SoftNmsKernel::Compute(OrtKernelContext *context) {
  typedef float T;

  const T iou_threshold = T(iou_threshold_);
  const T sigma = T(sigma_);
  const T min_score = T(min_score_);
  const int method = int(method_);
  const T offset = T(offset_);

  const OrtValue *boxes = ort_.KernelContext_GetInput(context, 0);
  const T *boxes_data =
      reinterpret_cast<const float *>(ort_.GetTensorData<T>(boxes));
  const OrtValue *scores = ort_.KernelContext_GetInput(context, 1);
  const T *scores_data =
      reinterpret_cast<const float *>(ort_.GetTensorData<T>(scores));

  OrtTensorDimensions boxes_dim(ort_, boxes);
  OrtTensorDimensions scores_dim(ort_, scores);

  int64_t nboxes = boxes_dim[0];
  assert(boxes_dim[1] == 4);

  // allocate tmp memory
  T *tmp_boxes = (T *)allocator_.Alloc(sizeof(T) * nboxes * 4);
  T *x1 = tmp_boxes;
  T *y1 = tmp_boxes + 1;
  T *x2 = tmp_boxes + 2;
  T *y2 = tmp_boxes + 3;
  T *sc = (T *)allocator_.Alloc(sizeof(T) * nboxes);
  T *areas = (T *)allocator_.Alloc(sizeof(T) * nboxes);
  T *de = (T *)allocator_.Alloc(sizeof(T) * nboxes * 5);
  int64_t *inds = (int64_t *)allocator_.Alloc(sizeof(int64_t) * nboxes);

  memcpy(tmp_boxes, boxes_data, sizeof(T) * nboxes * 4);
  memcpy(sc, scores_data, sizeof(T) * nboxes);

  // init inds as arange(nboxes)
  std::generate(inds, inds + nboxes, [n = 0]() mutable { return n++; });

  // area = (x2-x1+offset)*(y2-y1+offset)
  for (int64_t i = 0; i < nboxes; i++) {
    areas[i] =
        (x2[i * 4] - x1[i * 4] + offset) * (y2[i * 4] - y1[i * 4] + offset);
  }

  int64_t pos = 0;

  for (int64_t i = 0; i < nboxes; i++) {
    auto max_score = sc[i];
    auto max_pos = i;

    pos = i + 1;
    // get max box
    while (pos < nboxes) {
      if (max_score < sc[pos]) {
        max_score = sc[pos];
        max_pos = pos;
      }
      pos = pos + 1;
    }
    // swap
    auto ix1 = de[i * 5 + 0] = x1[max_pos * 4];
    auto iy1 = de[i * 5 + 1] = y1[max_pos * 4];
    auto ix2 = de[i * 5 + 2] = x2[max_pos * 4];
    auto iy2 = de[i * 5 + 3] = y2[max_pos * 4];
    auto iscore = de[i * 5 + 4] = sc[max_pos];
    auto iarea = areas[max_pos];
    auto iind = inds[max_pos];
    x1[max_pos * 4] = x1[i * 4];
    y1[max_pos * 4] = y1[i * 4];
    x2[max_pos * 4] = x2[i * 4];
    y2[max_pos * 4] = y2[i * 4];
    sc[max_pos] = sc[i];
    areas[max_pos] = areas[i];
    inds[max_pos] = inds[i];
    x1[i * 4] = ix1;
    y1[i * 4] = iy1;
    x2[i * 4] = ix2;
    y2[i * 4] = iy2;
    sc[i] = iscore;
    areas[i] = iarea;
    inds[i] = iind;

    pos = i + 1;
    while (pos < nboxes) {
      auto xx1 = std::max(ix1, x1[pos * 4]);
      auto yy1 = std::max(iy1, y1[pos * 4]);
      auto xx2 = std::min(ix2, x2[pos * 4]);
      auto yy2 = std::min(iy2, y2[pos * 4]);

      auto w = std::max(0.f, xx2 - xx1 + offset);
      auto h = std::max(0.f, yy2 - yy1 + offset);
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[pos] - inter);

      float weight = 1.;
      if (method == 0) {
        if (ovr >= iou_threshold) weight = 0;
      } else if (method == 1) {
        if (ovr >= iou_threshold) weight = 1 - ovr;
      } else if (method == 2) {
        weight = std::exp(-(ovr * ovr) / sigma);
      }
      sc[pos] *= weight;

      // if box score falls below threshold, discard the box by
      // swapping with last box update N
      if (sc[pos] < min_score) {
        x1[pos * 4] = x1[(nboxes - 1) * 4];
        y1[pos * 4] = y1[(nboxes - 1) * 4];
        x2[pos * 4] = x2[(nboxes - 1) * 4];
        y2[pos * 4] = y2[(nboxes - 1) * 4];
        sc[pos] = sc[nboxes - 1];
        areas[pos] = areas[nboxes - 1];
        inds[pos] = inds[nboxes - 1];
        nboxes = nboxes - 1;
        pos = pos - 1;
      }

      pos = pos + 1;
    }
  }

  std::vector<int64_t> dets_dim({nboxes, 5});
  OrtValue *dets = ort_.KernelContext_GetOutput(context, 0, dets_dim.data(),
                                                dets_dim.size());
  T *dets_data = ort_.GetTensorMutableData<T>(dets);

  std::vector<int64_t> inds_dim({nboxes});
  OrtValue *inds_ov = ort_.KernelContext_GetOutput(context, 1, inds_dim.data(),
                                                   inds_dim.size());
  int64_t *inds_data = ort_.GetTensorMutableData<int64_t>(inds_ov);

  memcpy(dets_data, de, sizeof(T) * nboxes * 5);
  memcpy(inds_data, inds, sizeof(int64_t) * nboxes);
}
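The method attribute selects among three score-decay rules: naive NMS (method == 0) zeroes any score whose IoU with the current best box reaches iou_threshold, linear soft-NMS (method == 1) scales it by 1 - IoU, and gaussian soft-NMS (method == 2) scales it by exp(-IoU^2 / sigma). A tiny worked example with hypothetical values IoU = 0.6, iou_threshold = 0.3, sigma = 0.5:

#include <cmath>
#include <cstdio>

int main() {
  const float ovr = 0.6f, iou_threshold = 0.3f, sigma = 0.5f;
  float hard = (ovr >= iou_threshold) ? 0.f : 1.f;          // method == 0
  float linear = (ovr >= iou_threshold) ? 1.f - ovr : 1.f;  // method == 1
  float gaussian = std::exp(-(ovr * ovr) / sigma);          // method == 2
  // prints: hard=0.000 linear=0.400 gaussian=0.487
  std::printf("hard=%.3f linear=%.3f gaussian=%.3f\n", hard, linear, gaussian);
  return 0;
}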
mmcv/ops/csrc/onnxruntime/deform_conv.h (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_DEFORM_CONV_H
#define ONNXRUNTIME_DEFORM_CONV_H

#include <onnxruntime_cxx_api.h>

struct MMCVDeformConvKernel {
  MMCVDeformConvKernel(OrtApi api, const OrtKernelInfo *info);

  void Compute(OrtKernelContext *context);

 protected:
  OrtApi api_;
  Ort::CustomOpApi ort_;
  const OrtKernelInfo *info_;
  Ort::AllocatorWithDefaultOptions allocator_;

  int64_t stride_height_;
  int64_t stride_width_;
  int64_t padding_height_;
  int64_t padding_width_;
  int64_t dilation_height_;
  int64_t dilation_width_;
  int64_t deformable_group_;
  int64_t group_;
  int64_t im2col_step_;
};

struct MMCVDeformConvOp
    : Ort::CustomOpBase<MMCVDeformConvOp, MMCVDeformConvKernel> {
  void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
    return new MMCVDeformConvKernel(api, info);
  }

  const char *GetName() const { return "MMCVDeformConv2d"; };

  size_t GetInputTypeCount() const { return 3; };
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(
      size_t index) const {
    return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
  }

  size_t GetOutputTypeCount() const { return 1; };
  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  };
};
#endif
mmcv/ops/csrc/onnxruntime/grid_sample.h (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_GRIDSAMPLE_H
#define ONNXRUNTIME_GRIDSAMPLE_H

#include <onnxruntime_cxx_api.h>

struct GridSampleKernel {
  GridSampleKernel(OrtApi api, const OrtKernelInfo *info);

  void Compute(OrtKernelContext *context);

 protected:
  OrtApi api_;
  Ort::CustomOpApi ort_;
  const OrtKernelInfo *info_;
  Ort::AllocatorWithDefaultOptions allocator_;

  int64_t align_corners_;
  int64_t interpolation_mode_;
  int64_t padding_mode_;
};

struct GridSampleOp : Ort::CustomOpBase<GridSampleOp, GridSampleKernel> {
  void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
    return new GridSampleKernel(api, info);
  };

  const char *GetName() const { return "grid_sampler"; };

  size_t GetInputTypeCount() const { return 2; };
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  size_t GetOutputTypeCount() const { return 1; };
  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  };
};
#endif
mmcv/ops/csrc/onnxruntime/modulated_deform_conv.h (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#define ONNXRUNTIME_MODULATED_DEFORM_CONV_H

#include <onnxruntime_cxx_api.h>

struct MMCVModulatedDeformConvKernel {
  MMCVModulatedDeformConvKernel(OrtApi api, const OrtKernelInfo *info);

  void Compute(OrtKernelContext *context);

 protected:
  OrtApi api_;
  Ort::CustomOpApi ort_;
  const OrtKernelInfo *info_;
  Ort::AllocatorWithDefaultOptions allocator_;

  int64_t stride_height_;
  int64_t stride_width_;
  int64_t padding_height_;
  int64_t padding_width_;
  int64_t dilation_height_;
  int64_t dilation_width_;
  int64_t deformable_group_;
  int64_t group_;
};

struct MMCVModulatedDeformConvOp
    : Ort::CustomOpBase<MMCVModulatedDeformConvOp,
                        MMCVModulatedDeformConvKernel> {
  void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
    return new MMCVModulatedDeformConvKernel(api, info);
  }

  const char *GetName() const { return "MMCVModulatedDeformConv2d"; };

  size_t GetInputTypeCount() const { return 5; };
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(
      size_t index) const {
    // The last input (index == 4) is optional, which is bias
    if (index == 4)
      return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL;

    return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
  }

  size_t GetOutputTypeCount() const { return 1; };
  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  };
};
#endif
mmcv/ops/csrc/onnxruntime/nms.h (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_NMS_H
#define ONNXRUNTIME_NMS_H

#include <onnxruntime_cxx_api.h>

struct NmsKernel {
  NmsKernel(OrtApi api, const OrtKernelInfo *info);

  void Compute(OrtKernelContext *context);

 protected:
  OrtApi api_;
  Ort::CustomOpApi ort_;
  const OrtKernelInfo *info_;
  Ort::AllocatorWithDefaultOptions allocator_;

  float iou_threshold_;
  int64_t offset_;
};

struct NmsOp : Ort::CustomOpBase<NmsOp, NmsKernel> {
  void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
    return new NmsKernel(api, info);
  };

  const char *GetName() const { return "NonMaxSuppression"; };

  size_t GetInputTypeCount() const { return 2; };
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  size_t GetOutputTypeCount() const { return 1; };
  ONNXTensorElementDataType GetOutputType(size_t index) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
  }

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  }
};
#endif
mmcv/ops/csrc/onnxruntime/onnxruntime_register.h (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_REGISTER_H
#define ONNXRUNTIME_REGISTER_H
#include <onnxruntime_c_api.h>

#ifdef __cplusplus
extern "C" {
#endif

OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                          const OrtApiBase *api);

#ifdef __cplusplus
}
#endif
#endif  // ONNXRUNTIME_REGISTER_H
mmcv/ops/csrc/onnxruntime/onnxruntime_session_options_config_keys.h (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifndef ONNXRUNTIME_SESSION_OPTIONS_CONFIG_KEYS_H
#define ONNXRUNTIME_SESSION_OPTIONS_CONFIG_KEYS_H

/*
 * This file defines SessionOptions Config Keys and format of the Config Values.
 *
 * The Naming Convention for a SessionOptions Config Key,
 * "[Area][.[SubArea1].[SubArea2]...].[Keyname]"
 * Such as "ep.cuda.use_arena"
 * The Config Key cannot be empty
 * The maximum length of the Config Key is 128
 *
 * The string format of a SessionOptions Config Value is defined individually
 * for each Config. The maximum length of the Config Value is 1024
 */

// Key for disable PrePacking,
// If the config value is set to "1" then the prepacking is disabled, otherwise
// prepacking is enabled (default value)
static const char *const kOrtSessionOptionsConfigDisablePrepacking =
    "session.disable_prepacking";

// A value of "1" means allocators registered in the env will be used. "0" means
// the allocators created in the session will be used. Use this to override the
// usage of env allocators on a per session level.
static const char *const kOrtSessionOptionsConfigUseEnvAllocators =
    "session.use_env_allocators";

// Set to 'ORT' (case sensitive) to load an ORT format model.
// If unset, model type will default to ONNX unless inferred from filename
// ('.ort' == ORT format) or bytes to be ORT
static const char *const kOrtSessionOptionsConfigLoadModelFormat =
    "session.load_model_format";

// Set to 'ORT' (case sensitive) to save optimized model in ORT format when
// SessionOptions.optimized_model_path is set. If unset, format will default to
// ONNX unless optimized_model_filepath ends in '.ort'.
static const char *const kOrtSessionOptionsConfigSaveModelFormat =
    "session.save_model_format";

#endif  // ONNXRUNTIME_SESSION_OPTIONS_CONFIG_KEYS_H
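These keys are consumed through the session-options config API. A minimal usage sketch, assuming an ONNX Runtime build recent enough to expose SessionOptions::AddConfigEntry (the C++ wrapper over the C-API AddSessionConfigEntry):

#include <onnxruntime_cxx_api.h>

int main() {
  Ort::SessionOptions session_options;
  // Key from kOrtSessionOptionsConfigDisablePrepacking above; the value "1"
  // disables weight prepacking for sessions created with these options.
  session_options.AddConfigEntry("session.disable_prepacking", "1");
  return 0;
}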
mmcv/ops/csrc/onnxruntime/ort_mmcv_utils.h (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ORT_MMCV_UTILS_H
#define ORT_MMCV_UTILS_H
#include <onnxruntime_cxx_api.h>

#include <vector>

struct OrtTensorDimensions : std::vector<int64_t> {
  OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue *value) {
    OrtTensorTypeAndShapeInfo *info = ort.GetTensorTypeAndShape(value);
    std::vector<int64_t>::operator=(ort.GetTensorShape(info));
    ort.ReleaseTensorTypeAndShapeInfo(info);
  }
};
#endif  // ORT_MMCV_UTILS_H
mmcv/ops/csrc/onnxruntime/reduce_ops.h (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_REDUCE_OPS_H
#define ONNXRUNTIME_REDUCE_OPS_H

#include <onnxruntime_cxx_api.h>

struct MMCVCumMaxKernel {
 public:
  MMCVCumMaxKernel(Ort::CustomOpApi ort, const OrtKernelInfo *info)
      : ort_(ort) {
    dim_ = ort_.KernelInfoGetAttribute<int64_t>(info, "dim");

    // create allocator
    allocator_ = Ort::AllocatorWithDefaultOptions();
  }

  void Compute(OrtKernelContext *context);

 private:
  Ort::CustomOpApi ort_;
  Ort::AllocatorWithDefaultOptions allocator_;

  int64_t dim_;
};

struct MMCVCumMinKernel {
 public:
  MMCVCumMinKernel(Ort::CustomOpApi ort, const OrtKernelInfo *info)
      : ort_(ort) {
    dim_ = ort_.KernelInfoGetAttribute<int64_t>(info, "dim");

    // create allocator
    allocator_ = Ort::AllocatorWithDefaultOptions();
  }

  void Compute(OrtKernelContext *context);

 private:
  Ort::CustomOpApi ort_;
  Ort::AllocatorWithDefaultOptions allocator_;

  int64_t dim_;
};

struct MMCVCumMaxCustomOp
    : Ort::CustomOpBase<MMCVCumMaxCustomOp, MMCVCumMaxKernel> {
  void *CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo *info) const {
    return new MMCVCumMaxKernel(api, info);
  }

  const char *GetName() const { return "cummax"; }

  size_t GetInputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetInputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  size_t GetOutputTypeCount() const { return 2; }
  ONNXTensorElementDataType GetOutputType(size_t index) const {
    if (index == 1) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  };
};

struct MMCVCumMinCustomOp
    : Ort::CustomOpBase<MMCVCumMinCustomOp, MMCVCumMinKernel> {
  void *CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo *info) const {
    return new MMCVCumMinKernel(api, info);
  }

  const char *GetName() const { return "cummin"; }

  size_t GetInputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetInputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  size_t GetOutputTypeCount() const { return 2; }
  ONNXTensorElementDataType GetOutputType(size_t index) const {
    if (index == 1) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  };
};

#endif  // ONNXRUNTIME_REDUCE_OPS_H
mmcv/ops/csrc/onnxruntime/roi_align.h (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_H
#define ONNXRUNTIME_ROI_ALIGN_H

#include <assert.h>
#include <onnxruntime_cxx_api.h>

#include <cmath>
#include <mutex>
#include <string>
#include <vector>

struct MMCVRoiAlignKernel {
 public:
  MMCVRoiAlignKernel(Ort::CustomOpApi ort, const OrtKernelInfo *info)
      : ort_(ort) {
    aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
    aligned_height_ =
        ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
    aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
    pool_mode_ = ort_.KernelInfoGetAttribute<std::string>(info, "mode");
    sampling_ratio_ =
        ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
    spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
  }

  void Compute(OrtKernelContext *context);

 private:
  Ort::CustomOpApi ort_;

  int aligned_height_;
  int aligned_width_;
  float spatial_scale_;
  int sampling_ratio_;
  std::string pool_mode_;
  int aligned_;
};

struct MMCVRoiAlignCustomOp
    : Ort::CustomOpBase<MMCVRoiAlignCustomOp, MMCVRoiAlignKernel> {
  void *CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo *info) const {
    return new MMCVRoiAlignKernel(api, info);
  }

  const char *GetName() const { return "MMCVRoiAlign"; }

  size_t GetInputTypeCount() const { return 2; }
  ONNXTensorElementDataType GetInputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  size_t GetOutputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetOutputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  }
};
#endif  // ONNXRUNTIME_ROI_ALIGN_H
mmcv/ops/csrc/onnxruntime/roi_align_rotated.h (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#define ONNXRUNTIME_ROI_ALIGN_ROTATED_H

#include <assert.h>
#include <onnxruntime_cxx_api.h>

#include <cmath>
#include <mutex>
#include <string>
#include <vector>

struct MMCVRoIAlignRotatedKernel {
 public:
  MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo *info)
      : ort_(ort) {
    aligned_height_ =
        ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
    aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
    sampling_ratio_ =
        ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
    spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
    aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
    clockwise_ = ort_.KernelInfoGetAttribute<int64_t>(info, "clockwise");
  }

  void Compute(OrtKernelContext *context);

 private:
  Ort::CustomOpApi ort_;

  int aligned_height_;
  int aligned_width_;
  float spatial_scale_;
  int sampling_ratio_;
  int aligned_;
  int clockwise_;
};

struct MMCVRoIAlignRotatedCustomOp
    : Ort::CustomOpBase<MMCVRoIAlignRotatedCustomOp, MMCVRoIAlignRotatedKernel> {
  void *CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo *info) const {
    return new MMCVRoIAlignRotatedKernel(api, info);
  }

  const char *GetName() const { return "MMCVRoIAlignRotated"; }

  size_t GetInputTypeCount() const { return 2; }
  ONNXTensorElementDataType GetInputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  size_t GetOutputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetOutputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  }
};
#endif  // ONNXRUNTIME_ROI_ALIGN_ROTATED_H
mmcv/ops/csrc/onnxruntime/soft_nms.h (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_SOFT_NMS_H
#define ONNXRUNTIME_SOFT_NMS_H

#include <onnxruntime_cxx_api.h>

struct SoftNmsKernel {
  SoftNmsKernel(OrtApi api, const OrtKernelInfo *info);

  void Compute(OrtKernelContext *context);

 protected:
  OrtApi api_;
  Ort::CustomOpApi ort_;
  const OrtKernelInfo *info_;
  Ort::AllocatorWithDefaultOptions allocator_;

  float iou_threshold_;
  float sigma_;
  float min_score_;
  int64_t method_;
  int64_t offset_;
};

struct SoftNmsOp : Ort::CustomOpBase<SoftNmsOp, SoftNmsKernel> {
  void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
    return new SoftNmsKernel(api, info);
  };

  const char *GetName() const { return "SoftNonMaxSuppression"; };

  size_t GetInputTypeCount() const { return 2; };
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  size_t GetOutputTypeCount() const { return 2; };
  ONNXTensorElementDataType GetOutputType(size_t index) const {
    if (index == 1) {
      return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
    }
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  };
};
#endif  // ONNXRUNTIME_SOFT_NMS_H
mmcv/ops/csrc/parrots/active_rotated_filter.cpp (deleted, mode 100644 → 0)

// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/ActiveRotatingFilter.h
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void active_rotated_filter_forward_impl(const Tensor input,
                                        const Tensor indices, Tensor output) {
  DISPATCH_DEVICE_IMPL(active_rotated_filter_forward_impl, input, indices,
                       output);
}

void active_rotated_filter_backward_impl(const Tensor grad_out,
                                         const Tensor indices,
                                         Tensor grad_in) {
  DISPATCH_DEVICE_IMPL(active_rotated_filter_backward_impl, grad_out, indices,
                       grad_in);
}

void active_rotated_filter_forward(const Tensor input, const Tensor indices,
                                   Tensor output) {
  active_rotated_filter_forward_impl(input, indices, output);
}

void active_rotated_filter_backward(const Tensor grad_out,
                                    const Tensor indices, Tensor grad_in) {
  active_rotated_filter_backward_impl(grad_out, indices, grad_in);
}
mmcv/ops/csrc/parrots/active_rotated_filter_parrots.cpp (deleted, mode 100644 → 0)

// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "active_rotated_filter_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
void active_rotated_filter_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr,
    const OperatorBase::in_list_t& ins, OperatorBase::out_list_t& outs) {
  auto input = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto output = buildATensor(ctx, outs[0]);
  active_rotated_filter_forward(input, indices, output);
}

void active_rotated_filter_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr,
    const OperatorBase::in_list_t& ins, OperatorBase::out_list_t& outs) {
  auto grad_out = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto grad_in = buildATensor(ctx, outs[0]);
  active_rotated_filter_backward(grad_out, indices, grad_in);
}
#endif

void active_rotated_filter_forward_cpu_parrots(
    HostContext& ctx, const SSElement& attr,
    const OperatorBase::in_list_t& ins, OperatorBase::out_list_t& outs) {
  auto input = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto output = buildATensor(ctx, outs[0]);
  active_rotated_filter_forward(input, indices, output);
}

void active_rotated_filter_backward_cpu_parrots(
    HostContext& ctx, const SSElement& attr,
    const OperatorBase::in_list_t& ins, OperatorBase::out_list_t& outs) {
  auto grad_out = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto grad_in = buildATensor(ctx, outs[0]);
  active_rotated_filter_backward(grad_out, indices, grad_in);
}

PARROTS_EXTENSION_REGISTER(active_rotated_filter_forward)
    .input(2)
    .output(1)
    .apply(active_rotated_filter_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(active_rotated_filter_forward_cuda_parrots)
#endif
    .done();

PARROTS_EXTENSION_REGISTER(active_rotated_filter_backward)
    .input(2)
    .output(1)
    .apply(active_rotated_filter_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(active_rotated_filter_backward_cuda_parrots)
#endif
    .done();
mmcv/ops/csrc/parrots/active_rotated_filter_pytorch.h (deleted, mode 100644 → 0)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef ACTIVE_ROTATED_FILTER_PYTORCH_H
#define ACTIVE_ROTATED_FILTER_PYTORCH_H
#include <torch/extension.h>
using namespace at;

void active_rotated_filter_forward(const Tensor input, const Tensor indices,
                                   Tensor output);

void active_rotated_filter_backward(const Tensor grad_out,
                                    const Tensor indices, Tensor grad_in);

#endif  // ACTIVE_ROTATED_FILTER_PYTORCH_H
mmcv/ops/csrc/parrots/assign_score_withk.cpp (modified: +58 −15; reconstructed inline diff, removed lines prefixed "-", added lines "+")

 // Modified from
 // https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
 #include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"
-
-void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
-                                     int aggregate, const Tensor& points,
-                                     const Tensor& centers, const Tensor& scores,
-                                     const Tensor& knn_idx, Tensor& output) {
-  DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O,
-                       aggregate, points, centers, scores, knn_idx, output);
-}
+
+#ifdef MMCV_WITH_CUDA
+void AssignScoreWithKForwardCUDAKernelLauncher(
+    int B, int N0, int N1, int M, int K, int O, int aggregate,
+    const Tensor& points, const Tensor& centers, const Tensor& scores,
+    const Tensor& knn_idx, Tensor& output);
+
+void assign_score_withk_forward_cuda(
+    int B, int N0, int N1, int M, int K, int O, int aggregate,
+    const Tensor& points, const Tensor& centers, const Tensor& scores,
+    const Tensor& knn_idx, Tensor& output) {
+  AssignScoreWithKForwardCUDAKernelLauncher(
+      B, N0, N1, M, K, O, aggregate, points, centers, scores, knn_idx, output);
+};
+
+void AssignScoreWithKBackwardCUDAKernelLauncher(
+    int B, int N0, int N1, int M, int K, int O, int aggregate,
+    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
+    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
+    Tensor& grad_centers, Tensor& grad_scores);
+
-void assign_score_withk_backward_impl(
+void assign_score_withk_backward_cuda(
     int B, int N0, int N1, int M, int K, int O, int aggregate,
     const Tensor& grad_out, const Tensor& points, const Tensor& centers,
     const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
     Tensor& grad_centers, Tensor& grad_scores) {
-  DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O,
-                       aggregate, grad_out, points, centers, scores, knn_idx,
-                       grad_points, grad_centers, grad_scores);
-}
+  AssignScoreWithKBackwardCUDAKernelLauncher(
+      B, N0, N1, M, K, O, aggregate, grad_out, points, centers, scores,
+      knn_idx, grad_points, grad_centers, grad_scores);
+};
+#endif

 void assign_score_withk_forward(const Tensor& points, const Tensor& centers,
                                 const Tensor& scores, const Tensor& knn_idx,
                                 Tensor& output, int B, int N0, int N1, int M,
                                 int K, int O, int aggregate) {
-  assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points,
-                                  centers, scores, knn_idx, output);
+  if (points.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CONTIGUOUS(points);
+    CHECK_CONTIGUOUS(centers);
+    CHECK_CONTIGUOUS(scores);
+    CHECK_CONTIGUOUS(knn_idx);
+    CHECK_CONTIGUOUS(output);
+
+    assign_score_withk_forward_cuda(B, N0, N1, M, K, O, aggregate, points,
+                                    centers, scores, knn_idx, output);
+#else
+    AT_ERROR("assign_score_withk is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("assign_score_withk is not implemented on CPU");
+  }
 }

 void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
 ...
@@ -36,7 +62,24 @@ void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
                                  Tensor& grad_centers, Tensor& grad_scores,
                                  int B, int N0, int N1, int M, int K, int O,
                                  int aggregate) {
-  assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out,
-                                   points, centers, scores, knn_idx,
-                                   grad_points, grad_centers, grad_scores);
+  if (grad_points.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CONTIGUOUS(grad_out);
+    CHECK_CONTIGUOUS(scores);
+    CHECK_CONTIGUOUS(points);
+    CHECK_CONTIGUOUS(centers);
+    CHECK_CONTIGUOUS(knn_idx);
+    CHECK_CONTIGUOUS(grad_scores);
+    CHECK_CONTIGUOUS(grad_points);
+    CHECK_CONTIGUOUS(grad_centers);
+
+    assign_score_withk_backward_cuda(B, N0, N1, M, K, O, aggregate, grad_out,
+                                     points, centers, scores, knn_idx,
+                                     grad_points, grad_centers, grad_scores);
+#else
+    AT_ERROR("assign_score_withk is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("assign_score_withk is not implemented on CPU");
+  }
 }
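The updated wrappers route explicitly by device instead of going through the removed DISPATCH_DEVICE_IMPL registry: check the tensor's device, forward CUDA tensors to the CUDA entry point, and fail loudly otherwise. A minimal standalone sketch of that routing pattern (the FakeTensor type and exception-based errors are toy stand-ins for MMCV's Tensor and AT_ERROR, not the real helpers):

#include <stdexcept>

struct FakeTensor {  // stand-in for at::Tensor
  bool on_cuda;
  bool is_cuda() const { return on_cuda; }
};

void op_cuda(const FakeTensor &) { /* the CUDA kernel launch would go here */ }

void op(const FakeTensor &t) {
  if (t.is_cuda()) {
#ifdef MMCV_WITH_CUDA
    op_cuda(t);
#else
    throw std::runtime_error("op is not compiled with GPU support");
#endif
  } else {
    throw std::runtime_error("op is not implemented on CPU");
  }
}

int main() {
  FakeTensor t{true};
  try {
    op(t);  // routes to op_cuda only when built with -DMMCV_WITH_CUDA
  } catch (const std::runtime_error &) {
    // expected in a CPU-only build
  }
  return 0;
}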