Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void modulated_deformable_im2col_impl(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, data_col);
}
void modulated_deformable_col2im_impl(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,
data_mask, batch_size, channels, height_im, width_im,
height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
deformable_group, grad_im);
}
void modulated_deformable_col2im_coord_impl(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,
data_im, data_offset, data_mask, batch_size, channels,
height_im, width_im, height_col, width_col, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, deformable_group, grad_offset, grad_mask);
}
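// The *_impl functions above are thin dispatchers: DISPATCH_DEVICE_IMPL (from
// pytorch_device_registry.hpp) invokes whichever kernel is registered for the
// device of the tensor arguments, so the operators below no longer branch on
// CPU vs. CUDA themselves.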
void modulated_deform_conv_forward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
@@ -61,31 +51,6 @@ void modulated_deform_conv_forward(
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(columns);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(weight);
CHECK_CPU_INPUT(bias);
CHECK_CPU_INPUT(ones);
CHECK_CPU_INPUT(offset);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(columns);
}
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
@@ -127,19 +92,10 @@ void modulated_deform_conv_forward(
output.size(2), output.size(3)});
for (int b = 0; b < batch; b++) {
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);
// divide into group
weight = weight.view({group, weight.size(0) / group, weight.size(1),
@@ -174,41 +130,6 @@ void modulated_deform_conv_backward(
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(grad_offset);
CHECK_CUDA_INPUT(grad_mask);
CHECK_CUDA_INPUT(grad_output);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(weight);
CHECK_CPU_INPUT(bias);
CHECK_CPU_INPUT(ones);
CHECK_CPU_INPUT(offset);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(columns);
CHECK_CPU_INPUT(grad_input);
CHECK_CPU_INPUT(grad_weight);
CHECK_CPU_INPUT(grad_bias);
CHECK_CPU_INPUT(grad_offset);
CHECK_CPU_INPUT(grad_mask);
CHECK_CPU_INPUT(grad_output);
}
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
@@ -261,46 +182,24 @@ void modulated_deform_conv_backward(
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
    // gradient w.r.t. input coordinate data
    modulated_deformable_col2im_coord_impl(
        columns, input[b], offset[b], mask[b], 1, channels, height, width,
        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
        grad_mask[b]);
    // gradient w.r.t. input data
    modulated_deformable_col2im_impl(
        columns, offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, grad_input[b]);
    // gradient w.r.t. weight, dWeight should accumulate across the batch and
    // group
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
template <typename T>
T dmcn_im2col_bilinear_cpu(const T *input, const int data_width,
const int height, const int width, T h, T w) {
int h_low = floorf(h);
int w_low = floorf(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
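// Weight with which the integer pixel (h, w) contributes to the bilinear
// sample taken at (argmax_h, argmax_w); used to scatter gradients back onto
// the input image. Returns zero when the sampling point lies outside the
// image.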
template <typename T>
T dmcn_get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,
const int height, const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floorf(argmax_h);
int argmax_w_low = floorf(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
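// Derivative of the bilinear sample at (argmax_h, argmax_w) with respect to
// the sampling coordinate itself (bp_dir == 0: d/dh, bp_dir == 1: d/dw),
// accumulated from the four neighbouring pixels; used for the offset gradient.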
template <typename T>
T dmcn_get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,
const int width, const T *im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floorf(argmax_h);
int argmax_w_low = floorf(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
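// im2col for modulated deformable convolution: for every output location and
// input channel, sample the input at the kernel taps displaced by the learned
// offsets, scale each sample by the modulation mask, and write the result into
// the column buffer data_col.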
template <typename T>
void modulated_deformable_im2col_cpu_kernel(
const int n, const T *data_im, const T *data_offset, const T *data_mask,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
for (int index = 0; index < n; index++) {
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b_col * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width,
h_im, w_im);
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
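// col2im for modulated deformable convolution: scatter each column-buffer
// gradient back to grad_im, distributing it over the bilinear neighbourhood of
// the offset-displaced sampling location and scaling by the modulation mask.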
template <typename T>
void modulated_deformable_col2im_cpu_kernel(
const int n, const T *data_col, const T *data_offset, const T *data_mask,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
for (int index = 0; index < n; index++) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index] * mask;
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data,
cur_inv_w_data, cur_h + dy,
cur_w + dx, height, width);
*(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad;
}
}
}
}
}
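// Gradient of the column buffer with respect to the sampling offsets and the
// modulation mask: accumulates the coordinate gradient for every offset
// channel and, alongside each h-offset (even) channel, the corresponding mask
// gradient.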
template <typename T>
void modulated_deformable_col2im_coord_cpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const T *data_mask, const int channels, const int height, const int width,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col, T *grad_offset, T *grad_mask) {
for (int index = 0; index < n; index++) {
T val = 0, mval = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
else
mval += data_col_ptr[col_pos] *
dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width,
width, height, width, inv_h, inv_w);
const T weight = dmcn_get_coordinate_weight_cpu(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
grad_offset[index] = val;
if (offset_c % 2 == 0)
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
// deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
// height_col + h) * width_col + w], mask_req, mval);
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
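// The wrappers below launch the scalar kernels above for every floating-point
// dtype (including half) via AT_DISPATCH_FLOATING_TYPES_AND_HALF and run them
// single-threaded on the CPU.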
void modulated_deformable_im2col_cpu(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col) {
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "modulated_deformable_im2col_cpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
modulated_deformable_im2col_cpu_kernel(
num_kernels, data_im_, data_offset_, data_mask_, height_im,
              width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, batch_size,
channels, deformable_group, height_col, width_col, data_col_);
}));
}
void modulated_deformable_col2im_cpu(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im) {
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels =
channels * kernel_h * kernel_w * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
modulated_deformable_col2im_cpu_kernel(
num_kernels, data_col_, data_offset_, data_mask_, channels,
height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, channel_per_deformable_group,
batch_size, deformable_group, height_col, width_col, grad_im_);
}));
}
void modulated_deformable_col2im_coord_cpu(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask) {
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
kernel_w * deformable_group;
const int channel_per_deformable_group =
channels * kernel_h * kernel_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_coord_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
modulated_deformable_col2im_coord_cpu_kernel(
num_kernels, data_col_, data_im_, data_offset_, data_mask_,
channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, batch_size,
2 * kernel_h * kernel_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_, grad_mask_);
}));
}
@@ -10,42 +10,39 @@
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
Tensor ms_deform_attn_impl_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step) {
  return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,
                              spatial_shapes, level_start_index, sampling_loc,
                              attn_weight, im2col_step);
}
void ms_deform_attn_impl_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
    const int im2col_step) {
  DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,
                       level_start_index, sampling_loc, attn_weight,
                       grad_output, grad_value, grad_sampling_loc,
                       grad_attn_weight, im2col_step);
}
Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
const Tensor &level_start_index,
const Tensor &sampling_loc,
const Tensor &attn_weight,
const int im2col_step) {
  at::DeviceGuard guard(value.device());
  return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index,
                                     sampling_loc, attn_weight, im2col_step);
}
void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
@@ -55,25 +52,9 @@ void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc,
Tensor &grad_attn_weight, const int im2col_step) {
  at::DeviceGuard guard(value.device());
  ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,
                               sampling_loc, attn_weight, grad_output,
                               grad_value, grad_sampling_loc, grad_attn_weight,
                               im2col_step);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);
}
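// Greedy NMS on the CPU: visit boxes in descending score order, keep the
// current box and suppress every remaining box whose IoU with it exceeds
// iou_threshold. `offset` (typically 0 or 1) is added to box widths and
// heights to match the chosen coordinate convention. Returns the indices of
// the kept boxes.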
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto nboxes = boxes.size(0);
Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));
auto select = select_t.data_ptr<bool>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold) select[_j] = false;
}
}
  return order_t.masked_select(select_t);
}

Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset) {
return DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold,
sigma, min_score, method, offset);
}
std::vector<std::vector<int> > nms_match_impl(Tensor dets,
                                              float iou_threshold) {
  return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);
}
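// Soft-NMS on the CPU: instead of discarding overlapping boxes outright, decay
// their scores. method 0 zeroes the score when IoU >= iou_threshold (classic
// hard NMS), method 1 scales the score by (1 - IoU) when IoU >= iou_threshold,
// and method 2 applies a Gaussian decay exp(-IoU^2 / sigma). Boxes whose
// decayed score drops below min_score are removed. The reordered
// [x1, y1, x2, y2, score] rows are written into `dets` and the surviving
// original indices are returned.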
Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
auto scores_t = scores.clone();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto nboxes = boxes.size(0);
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto sc = scores_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
auto de = dets.data_ptr<float>();
int64_t pos = 0;
Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
auto inds = inds_t.data_ptr<int64_t>();
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < sc[pos]) {
max_score = sc[pos];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = x1[max_pos];
auto iy1 = de[i * 5 + 1] = y1[max_pos];
auto ix2 = de[i * 5 + 2] = x2[max_pos];
auto iy2 = de[i * 5 + 3] = y2[max_pos];
auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = areas[max_pos];
auto iind = inds[max_pos];
x1[max_pos] = x1[i];
y1[max_pos] = y1[i];
x2[max_pos] = x2[i];
y2[max_pos] = y2[i];
sc[max_pos] = sc[i];
areas[max_pos] = areas[i];
inds[max_pos] = inds[i];
x1[i] = ix1;
y1[i] = iy1;
x2[i] = ix2;
y2[i] = iy2;
sc[i] = iscore;
areas[i] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = std::max(ix1, x1[pos]);
auto yy1 = std::max(iy1, y1[pos]);
auto xx2 = std::min(ix2, x2[pos]);
auto yy2 = std::min(iy2, y2[pos]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[pos] - inter);
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = std::exp(-(ovr * ovr) / sigma);
}
sc[pos] *= weight;
// if box score falls below threshold, discard the box by
// swapping with last box update N
if (sc[pos] < min_score) {
x1[pos] = x1[nboxes - 1];
y1[pos] = y1[nboxes - 1];
x2[pos] = x2[nboxes - 1];
y2[pos] = y2[nboxes - 1];
sc[pos] = sc[nboxes - 1];
areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
  return inds_t.slice(0, 0, nboxes);
}

Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return nms_impl(boxes, scores, iou_threshold, offset);
}
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
float sigma, float min_score, int method, int offset) {
  return softnms_impl(boxes, scores, dets, iou_threshold, sigma, min_score,
                      method, offset);
}
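// Like greedy NMS, but instead of only returning the survivors it records, for
// every kept box, the group of boxes it suppressed. Each returned vector
// starts with the kept index followed by the indices matched to it.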
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
std::vector<int> keep;
std::vector<std::vector<int> > matched;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
keep.push_back(i);
std::vector<int> v_i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
v_i.push_back(j);
}
}
matched.push_back(v_i);
}
for (int i = 0; i < keep.size(); i++)
matched[i].insert(matched[i].begin(), keep[i]);
  return matched;
}
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
  return nms_match_impl(dets, iou_threshold);
}
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
template <typename scalar_t>
Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
const float iou_threshold) {
// nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
// however, the code in this function is much shorter because
// we delegate the IoU computation for rotated boxes to
// the single_box_iou_rotated function in box_iou_rotated_utils.h
AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.type() == scores.type(),
"dets should have the same type as scores");
if (dets.numel() == 0) {
return at::empty({0}, dets.options().dtype(at::kLong));
}
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto keep = keep_t.data_ptr<int64_t>();
auto order = order_t.data_ptr<int64_t>();
int64_t num_to_keep = 0;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) {
continue;
}
keep[num_to_keep++] = i;
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) {
continue;
}
auto ovr = single_box_iou_rotated<scalar_t>(
dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>(), 0);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
}
}
}
return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
const float iou_threshold) {
auto result = at::empty({0}, dets.options());
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms_rotated", [&] {
result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
});
return result;
}
@@ -2,120 +2,14 @@
// It is modified from https://github.com/WenmuZhou/PAN.pytorch
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
std::vector<std::vector<float>> estimate_confidence(int32_t* label,
float* score, int label_num,
int height, int width) {
std::vector<std::vector<float>> point_vector;
for (int i = 0; i < label_num; i++) {
std::vector<float> point;
point.push_back(0);
point.push_back(0);
point_vector.push_back(point);
}
for (int y = 0; y < height; y++) {
auto label_tmp = label + y * width;
auto score_tmp = score + y * width;
for (int x = 0; x < width; x++) {
auto l = label_tmp[x];
if (l > 0) {
float confidence = score_tmp[x];
point_vector[l].push_back(x);
point_vector[l].push_back(y);
point_vector[l][0] += confidence;
point_vector[l][1] += 1;
}
}
}
for (int l = 0; l < point_vector.size(); l++)
if (point_vector[l][1] > 0) {
point_vector[l][0] /= point_vector[l][1];
}
return point_vector;
}
std::vector<std::vector<float>> pixel_group_cpu(
Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
assert(score.dim() == 2);
assert(mask.dim() == 2);
  assert(embedding.dim() == 3);
int height = score.size(0);
int width = score.size(1);
  assert(height == mask.size(0) && mask.size(0) == embedding.size(1) &&
         embedding.size(1) == kernel_label.size(1));
  assert(width == mask.size(1) && mask.size(1) == embedding.size(2) &&
         embedding.size(2) == kernel_label.size(2));
auto threshold_square = dis_threshold * dis_threshold;
auto ptr_score = score.data_ptr<float>();
auto ptr_mask = mask.data_ptr<bool>();
auto ptr_kernel_contour = kernel_contour.data_ptr<uint8_t>();
auto ptr_embedding = embedding.data_ptr<float>();
auto ptr_kernel_label = kernel_label.data_ptr<int32_t>();
std::queue<std::tuple<int, int, int32_t>> contour_pixels;
auto embedding_dim = embedding.size(2);
std::vector<std::vector<float>> kernel_vector(
kernel_region_num, std::vector<float>(embedding_dim + 1, 0));
Tensor text_label;
text_label = kernel_label.clone();
auto ptr_text_label = text_label.data_ptr<int32_t>();
for (int i = 0; i < height; i++) {
auto ptr_embedding_tmp = ptr_embedding + i * width * embedding_dim;
auto ptr_kernel_label_tmp = ptr_kernel_label + i * width;
auto ptr_kernel_contour_tmp = ptr_kernel_contour + i * width;
for (int j = 0, k = 0; j < width && k < width * embedding_dim;
j++, k += embedding_dim) {
int32_t label = ptr_kernel_label_tmp[j];
if (label > 0) {
for (int d = 0; d < embedding_dim; d++)
kernel_vector[label][d] += ptr_embedding_tmp[k + d];
kernel_vector[label][embedding_dim] += 1;
// kernel pixel number
if (ptr_kernel_contour_tmp[j]) {
contour_pixels.push(std::make_tuple(i, j, label));
}
}
}
}
for (int i = 0; i < kernel_region_num; i++) {
for (int j = 0; j < embedding_dim; j++) {
kernel_vector[i][j] /= kernel_vector[i][embedding_dim];
}
}
int dx[4] = {-1, 1, 0, 0};
int dy[4] = {0, 0, -1, 1};
while (!contour_pixels.empty()) {
auto query_pixel = contour_pixels.front();
contour_pixels.pop();
int y = std::get<0>(query_pixel);
int x = std::get<1>(query_pixel);
int32_t l = std::get<2>(query_pixel);
auto kernel_cv = kernel_vector[l];
for (int idx = 0; idx < 4; idx++) {
int tmpy = y + dy[idx];
int tmpx = x + dx[idx];
auto ptr_text_label_tmp = ptr_text_label + tmpy * width;
if (tmpy < 0 || tmpy >= height || tmpx < 0 || tmpx >= width) continue;
if (!ptr_mask[tmpy * width + tmpx] || ptr_text_label_tmp[tmpx] > 0)
continue;
float dis = 0;
auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;
for (size_t i = 0; i < embedding_dim; i++) {
dis +=
pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);
// ignore further computing if dis is big enough
if (dis >= threshold_square) break;
}
if (dis >= threshold_square) continue;
contour_pixels.push(std::make_tuple(tmpy, tmpx, l));
ptr_text_label_tmp[tmpx] = l;
}
}
return estimate_confidence(ptr_text_label, ptr_score, kernel_region_num,
height, width);
}

std::vector<std::vector<float>> pixel_group_impl(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
  return DISPATCH_DEVICE_IMPL(pixel_group_impl, score, mask, embedding,
                              kernel_label, kernel_contour, kernel_region_num,
                              dis_threshold);
}
std::vector<std::vector<float>> pixel_group(
@@ -127,11 +21,6 @@ std::vector<std::vector<float>> pixel_group(
kernel_label = kernel_label.contiguous();
kernel_contour = kernel_contour.contiguous();
  return pixel_group_impl(score, mask, embedding, kernel_label, kernel_contour,
                          kernel_region_num, distance_threshold);
}
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
                                       int pts_num, const Tensor boxes,
                                       const Tensor pts,
                                       Tensor box_idx_of_points) {
  DISPATCH_DEVICE_IMPL(points_in_boxes_part_forward_impl, batch_size, boxes_num,
                       pts_num, boxes, pts, box_idx_of_points);
}
void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
                                      int pts_num, const Tensor boxes,
                                      const Tensor pts,
                                      Tensor box_idx_of_points) {
  DISPATCH_DEVICE_IMPL(points_in_boxes_all_forward_impl, batch_size, boxes_num,
                       pts_num, boxes, pts, box_idx_of_points);
}
void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
Tensor box_idx_of_points_tensor) {
@@ -34,30 +23,12 @@ void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
// coordinate, z is the bottom center, each box params pts: (B, npoints, 3)
// [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),
// default -1
  int batch_size = boxes_tensor.size(0);
  int boxes_num = boxes_tensor.size(1);
  int pts_num = pts_tensor.size(1);
  points_in_boxes_part_forward_impl(batch_size, boxes_num, pts_num,
                                    boxes_tensor, pts_tensor,
                                    box_idx_of_points_tensor);
}
void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
@@ -65,28 +36,9 @@ void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
// params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
// coordinate, z is the bottom center. params pts: (B, npoints, 3) [x, y, z]
// in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1
  int batch_size = boxes_tensor.size(0);
  int boxes_num = boxes_tensor.size(1);
  int pts_num = pts_tensor.size(1);
  points_in_boxes_all_forward_impl(batch_size, boxes_num, pts_num, boxes_tensor,
                                   pts_tensor, box_idx_of_points_tensor);
}
#include "pytorch_cpp_helper.hpp"
inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
float &local_x, float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
// cz in the bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size /
2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
void points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor,
Tensor pts_indices_tensor) {
// params boxes: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
// coordinate, z is the bottom center, each box DO NOT overlaps params pts:
// (npoints, 3) [x, y, z] in LiDAR coordinate params pts_indices: (N, npoints)
CHECK_CONTIGUOUS(boxes_tensor);
CHECK_CONTIGUOUS(pts_tensor);
CHECK_CONTIGUOUS(pts_indices_tensor);
int boxes_num = boxes_tensor.size(0);
int pts_num = pts_tensor.size(0);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *pts_indices = pts_indices_tensor.data_ptr<int>();
float local_x = 0, local_y = 0;
for (int i = 0; i < boxes_num; i++) {
for (int j = 0; j < pts_num; j++) {
int cur_in_flag =
check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
pts_indices[i * pts_num + j] = cur_in_flag;
}
}
}
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
Tensor output, const int rows,
const int cols) {
DISPATCH_DEVICE_IMPL(points_in_polygons_forward_impl, points, polygons,
output, rows, cols);
}
void points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output) {
int rows = points.size(0);
int cols = polygons.size(0);
points_in_polygons_forward_impl(points, polygons, output, rows, cols);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "points_in_polygons_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void points_in_polygons_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto points = buildATensor(ctx, ins[0]);
auto polygons = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
points_in_polygons_forward(points, polygons, output);
}
PARROTS_EXTENSION_REGISTER(points_in_polygons_forward)
.input(2)
.output(1)
.apply(points_in_polygons_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef POINTS_IN_POLYGONS_PYTORCH_H
#define POINTS_IN_POLYGONS_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output);
#endif // POINTS_IN_POLYGONS_PYTORCH_H
@@ -2,255 +2,40 @@
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
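// PSAMask rearranges a per-position attention mask of size h_mask x w_mask
// into a (h_feature * w_feature) x (h_feature * w_feature) buffer, clipping
// the mask at the feature-map borders. "collect" and "distribute" differ only
// in whether the mask value is written at [other_position, current_position]
// or at [current_position, other_position] in that buffer.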
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor buffer_diff,
Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w];
}
}
}
}
}
}
void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const Tensor buffer_diff, Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
}
}
}
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
}
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (psa_type == 0)
psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output, grad_input);
else
psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output,
grad_input);
}
#ifdef MMCV_WITH_CUDA
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
Tensor output, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask);
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const Tensor grad_output, Tensor grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask);
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask) {
  PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,
                                   w_feature, h_mask, w_mask, half_h_mask,
                                   half_w_mask);
}
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask) {
  PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_,
                                    h_feature, w_feature, h_mask, w_mask,
                                    half_h_mask, half_w_mask);
}
#endif
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask) {
  DISPATCH_DEVICE_IMPL(psamask_forward_impl, psa_type, input, output, num_,
                       h_feature, w_feature, h_mask, w_mask, half_h_mask,
                       half_w_mask);
}
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask) {
  DISPATCH_DEVICE_IMPL(psamask_backward_impl, psa_type, grad_output, grad_input,
                       num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
                       half_w_mask);
}
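// Editorial note: DISPATCH_DEVICE_IMPL resolves psamask_forward_impl /
// psamask_backward_impl to whichever backend has registered itself for the
// tensor's device. The registrations live in the per-device source files, not
// in this binding file; a minimal sketch, assuming the CPU wrappers above are
// the functions being registered through the device registry macro, would
// look like:
//
//   REGISTER_DEVICE_IMPL(psamask_forward_impl, CPU, psamask_forward_cpu);
//   REGISTER_DEVICE_IMPL(psamask_backward_impl, CPU, psamask_backward_cpu);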
void psamask_forward(const Tensor input, Tensor output, const int psa_type,
const int num_, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
psamask_forward_impl(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
void psamask_backward(Tensor grad_output, const Tensor grad_input,
const int psa_type, const int num_, const int h_feature,
const int w_feature, const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
psamask_backward_impl(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void riroi_align_rotated_forward_impl(Tensor features, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise) {
DISPATCH_DEVICE_IMPL(riroi_align_rotated_forward_impl, features, rois, output,
pooled_height, pooled_width, spatial_scale, num_samples,
num_orientations, clockwise);
}
void riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise) {
DISPATCH_DEVICE_IMPL(riroi_align_rotated_backward_impl, top_grad, rois,
bottom_grad, pooled_height, pooled_width, spatial_scale,
num_samples, num_orientations, clockwise);
}
void riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale, int num_samples,
int num_orientations, bool clockwise) {
riroi_align_rotated_forward_impl(features, rois, output, pooled_height,
pooled_width, spatial_scale, num_samples,
num_orientations, clockwise);
}
void riroi_align_rotated_backward(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise) {
riroi_align_rotated_backward_impl(top_grad, rois, bottom_grad, pooled_height,
pooled_width, spatial_scale, num_samples,
num_orientations, clockwise);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "riroi_align_rotated_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void riroi_align_rotated_forward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sample_num;
int num_orientations;
bool clockwise;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("num_samples", sample_num)
.get<int>("num_orientations", num_orientations)
.get<bool>("clockwise", clockwise)
.done();
auto input = buildATensor(ctx, ins[0]);
auto rois = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
riroi_align_rotated_forward(input, rois, output, pooled_height, pooled_width,
spatial_scale, sample_num, num_orientations,
clockwise);
}
void riroi_align_rotated_backward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sample_num;
int num_orientations;
bool clockwise;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("num_samples", sample_num)
.get<int>("num_orientations", num_orientations)
.get<bool>("clockwise", clockwise)
.done();
auto grad_output = buildATensor(ctx, ins[0]);
auto rois = buildATensor(ctx, ins[1]);
auto grad_input = buildATensor(ctx, outs[0]);
riroi_align_rotated_backward(grad_output, rois, grad_input, pooled_height,
pooled_width, spatial_scale, sample_num,
num_orientations, clockwise);
}
PARROTS_EXTENSION_REGISTER(riroi_align_rotated_forward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.attr("num_samples")
.attr("num_orientations")
.attr("clockwise")
.input(2)
.output(1)
.apply(riroi_align_rotated_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(riroi_align_rotated_backward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.attr("num_samples")
.attr("num_orientations")
.attr("clockwise")
.input(2)
.output(1)
.apply(riroi_align_rotated_backward_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef RIROI_ALIGN_ROTATED_PYTORCH_H
#define RIROI_ALIGN_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale, int num_samples,
int num_orientations, bool clockwise);
void riroi_align_rotated_backward(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise);
#endif // RIROI_ALIGN_ROTATED_PYTORCH_H
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  ROIAlignForwardCUDAKernelLauncher(
      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
      spatial_scale, sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned) {
  ROIAlignBackwardCUDAKernelLauncher(
      grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
      aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}
#endif
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
                       argmax_x, aligned_height, aligned_width, spatial_scale,
                       sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned) {
  DISPATCH_DEVICE_IMPL(roi_align_backward_impl, grad_output, rois, argmax_y,
                       argmax_x, grad_input, aligned_height, aligned_width,
                       spatial_scale, sampling_ratio, pool_mode, aligned);
}
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_forward(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "../pytorch_cpp_helper.hpp"
// implementation taken from Caffe2
template <typename T>
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, std::vector<PreCalc<T>>& pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T x = xx;
T y = yy;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
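// Editorial note: for each sampling point (y, x) clamped into the feature map,
// the cached weights implement standard bilinear interpolation,
//   val = w1 * v(y_low, x_low) + w2 * v(y_low, x_high)
//       + w3 * v(y_high, x_low) + w4 * v(y_high, x_high),
// with ly = y - y_low, lx = x - x_low and
//   w1 = (1 - ly) * (1 - lx), w2 = (1 - ly) * lx,
//   w3 = ly * (1 - lx),       w4 = ly * lx.
// Precomputing them once per (ph, pw, iy, ix) lets every channel reuse the
// same weights in ROIAlignForward below.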
template <typename T>
void ROIAlignForward(const int nthreads, const T* input, const T* rois,
T* output, T* argmax_y, T* argmax_x,
const int pooled_height, const int pooled_width,
const T spatial_scale, const int sampling_ratio,
const int pool_mode, // 0 - max pool, 1 - avg pool
const bool aligned, const int channels, const int height,
const int width) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_rois[1] * spatial_scale - offset;
T roi_start_h = offset_rois[2] * spatial_scale - offset;
T roi_end_w = offset_rois[3] * spatial_scale - offset;
T roi_end_h = offset_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (aligned) {
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign cannot have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// When the grid is empty, output zeros == 0/1, instead of NaN.
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
T maxval = -10000;
T maxidx_y = -1.f, maxidx_x = -1.f;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
PreCalc<T> pc = pre_calc[pre_calc_index];
T val = pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] +
pc.w4 * offset_input[pc.pos4];
if (val > maxval) {
maxval = val;
maxidx_y = y;
maxidx_x = x;
}
output_val += val;
pre_calc_index += 1;
}
}
if (pool_mode == 0) {
// We do max pooling inside a bin
output[index] = maxval;
argmax_y[index] = maxidx_y;
argmax_x[index] = maxidx_x;
} else if (pool_mode == 1) {
// We do average (integral) pooling inside a bin
output[index] = output_val / count;
} // if
} // for pw
} // for ph
} // for c
} // for n
}
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
T& w1, T& w2, T& w3, T& w4, int& x_low,
int& x_high, int& y_low, int& y_high,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template <class T>
inline void add(T* address, const T& val) {
*address += val;
}
template <typename T>
void ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois,
const T* argmax_y, const T* argmax_x, T* grad_input,
const int pooled_height, const int pooled_width,
const T spatial_scale, const int sampling_ratio,
const int pool_mode, // 0 - max pool, 1 - avg pool
const bool aligned, const int channels, const int height,
const int width, const int n_stride, const int c_stride,
const int h_stride, const int w_stride) {
for (int index = 0; index < nthreads; index++) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_rois[1] * spatial_scale - offset;
T roi_start_h = offset_rois[2] * spatial_scale - offset;
T roi_end_w = offset_rois[3] * spatial_scale - offset;
T roi_end_h = offset_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (aligned) {
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign do not have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
int output_offset = n * n_stride + c * c_stride;
const T* offset_grad_output = grad_output + output_offset;
const T grad_output_this_bin =
offset_grad_output[ph * h_stride + pw * w_stride];
if (pool_mode == 0) {
// We do max pooling inside a bin
T y = argmax_y[index], x = argmax_x[index];
if (y != -1.f) {
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high, index);
T g1 = grad_output_this_bin * w1;
T g2 = grad_output_this_bin * w2;
T g3 = grad_output_this_bin * w3;
T g4 = grad_output_this_bin * w4;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
} // if
} // mode
} else if (pool_mode == 1) {
// We do average (integral) pooling inside a bin
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_width / pooled_width);
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high, index);
T g1 = grad_output_this_bin * w1 / count;
T g2 = grad_output_this_bin * w2 / count;
T g3 = grad_output_this_bin * w3 / count;
T g4 = grad_output_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
add(offset_grad_input + y_high * width + x_high,
static_cast<T>(g4));
} // if
} // ix
} // iy
} // mode
} // for
} // ROIAlignBackward
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlign_forward", [&] {
ROIAlignForward<scalar_t>(
output_size, input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
argmax_x.data_ptr<scalar_t>(), aligned_height, aligned_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
aligned, channels, height, width);
});
}
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad_output.stride(0);
int c_stride = grad_output.stride(1);
int h_stride = grad_output.stride(2);
int w_stride = grad_output.stride(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "ROIAlign_backward", [&] {
ROIAlignBackward<scalar_t>(
output_size, grad_output.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
aligned_height, aligned_width, static_cast<scalar_t>(spatial_scale),
sampling_ratio, pool_mode, aligned, channels, height, width,
n_stride, c_stride, h_stride, w_stride);
});
}
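// Editorial usage sketch (assumes the extension is built against libtorch;
// each ROI row is (batch_idx, x1, y1, x2, y2) as read by ROIAlignForward
// above, and pool_mode is 0 for max pooling or 1 for average pooling):
//   auto input = torch::rand({1, 3, 16, 16});
//   auto rois = torch::tensor({0.f, 2.f, 2.f, 10.f, 10.f}).reshape({1, 5});
//   auto output = torch::zeros({1, 3, 7, 7});
//   auto argmax_y = torch::zeros({1, 3, 7, 7});
//   auto argmax_x = torch::zeros({1, 3, 7, 7});
//   roi_align_forward(input, rois, output, argmax_y, argmax_x,
//                     /*aligned_height=*/7, /*aligned_width=*/7,
//                     /*spatial_scale=*/1.f, /*sampling_ratio=*/2,
//                     /*pool_mode=*/1, /*aligned=*/true);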
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void ROIAlignRotatedForwardCUDAKernelLauncher(
const at::Tensor features, const at::Tensor rois, const float spatial_scale,
const int sample_num, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, at::Tensor output);
void ROIAlignRotatedBackwardCUDAKernelLauncher(
const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
const int sample_num, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, at::Tensor bottom_grad);
void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
                                    int pooled_height, int pooled_width,
                                    float spatial_scale, int sample_num,
                                    bool aligned, bool clockwise) {
  // Number of ROIs
  int num_rois = rois.size(0);
  int size_rois = rois.size(1);
  if (size_rois != 6) {
    AT_ERROR("wrong roi size");
  }
  int num_channels = features.size(1);
  int data_height = features.size(2);
  int data_width = features.size(3);
  ROIAlignRotatedForwardCUDAKernelLauncher(
      features, rois, spatial_scale, sample_num, aligned, clockwise,
      num_channels, data_height, data_width, num_rois, pooled_height,
      pooled_width, output);
}
void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int pooled_height,
                                     int pooled_width, float spatial_scale,
                                     int sample_num, bool aligned,
                                     bool clockwise) {
  // Number of ROIs
  int num_rois = rois.size(0);
  int size_rois = rois.size(1);
  if (size_rois != 6) {
    AT_ERROR("wrong roi size");
  }
  int num_channels = bottom_grad.size(1);
  int data_height = bottom_grad.size(2);
  int data_width = bottom_grad.size(3);
  ROIAlignRotatedBackwardCUDAKernelLauncher(
      top_grad, rois, spatial_scale, sample_num, aligned, clockwise,
      num_channels, data_height, data_width, num_rois, pooled_height,
      pooled_width, bottom_grad);
}
#endif
void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sample_ratio,
                                    bool aligned, bool clockwise) {
  DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, features, rois, output,
                       aligned_height, aligned_width, spatial_scale,
                       sample_ratio, aligned, clockwise);
}
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sample_ratio, bool aligned,
                                     bool clockwise) {
  DISPATCH_DEVICE_IMPL(roi_align_rotated_backward_impl, top_grad, rois,
                       bottom_grad, aligned_height, aligned_width,
                       spatial_scale, sample_ratio, aligned, clockwise);
}
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise);
void roi_align_rotated_forward_cpu(Tensor features, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
bool aligned, bool clockwise) {
ROIAlignRotatedForwardCPULauncher(features, rois, output, pooled_height,
pooled_width, spatial_scale, sample_num,
aligned, clockwise);
}
void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
                                    Tensor bottom_grad, int pooled_height,
                                    int pooled_width, float spatial_scale,
                                    int sample_num, bool aligned,
                                    bool clockwise) {
  ROIAlignRotatedBackwardCPULauncher(top_grad, rois, bottom_grad, pooled_height,
                                     pooled_width, spatial_scale, sample_num,
                                     aligned, clockwise);
}
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
                               int aligned_height, int aligned_width,
                               float spatial_scale, int sampling_ratio,
                               bool aligned, bool clockwise) {
roi_align_rotated_forward_impl(input, rois, output, aligned_height,
aligned_width, spatial_scale, sampling_ratio,
aligned, clockwise);
}
void roi_align_rotated_backward(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
roi_align_rotated_backward_impl(top_grad, rois, bottom_grad, aligned_height,
aligned_width, spatial_scale, sampling_ratio,
aligned, clockwise);
}
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "../pytorch_cpp_helper.hpp"
// implementation taken from Caffe2
template <typename T>
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, T roi_center_h, T roi_center_w,
T cos_theta, T sin_theta, std::vector<PreCalc<T>>& pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
template <typename T>
void ROIAlignRotatedForward(const int nthreads, const T* input,
const T& spatial_scale, const bool aligned,
const bool clockwise, const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const int sampling_ratio, const T* rois,
T* output) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5];
if (clockwise) {
theta = -theta; // If clockwise, the angle needs to be reversed.
}
T cos_theta = cos(theta);
T sin_theta = sin(theta);
if (aligned) {
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlignRotated do not have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,
sin_theta, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] +
pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
T& w1, T& w2, T& w3, T& w4, int& x_low,
int& x_high, int& y_low, int& y_high) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template <class T>
inline void add(T* address, const T& val) {
*address += val;
}
template <typename T>
void ROIAlignRotatedBackward(
const int nthreads,
// may not be contiguous. should index using n_stride, etc
const T* grad_output, const T& spatial_scale, const bool aligned,
const bool clockwise, const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const int sampling_ratio,
T* grad_input, const T* rois, const int n_stride, const int c_stride,
const int h_stride, const int w_stride) {
for (int index = 0; index < nthreads; index++) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5];
if (clockwise) {
theta = -theta; // If clockwise, the angle needs to be reversed.
}
T cos_theta = cos(theta);
T sin_theta = sin(theta);
if (aligned) {
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlignRotated do not have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
int output_offset = n * n_stride + c * c_stride;
const T* offset_grad_output = grad_output + output_offset;
const T grad_output_this_bin =
offset_grad_output[ph * h_stride + pw * w_stride];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high);
T g1 = grad_output_this_bin * w1 / count;
T g2 = grad_output_this_bin * w2 / count;
T g3 = grad_output_this_bin * w3 / count;
T g4 = grad_output_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
} // if
} // ix
} // iy
} // for
} // ROIAlignRotatedBackward
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlignRotated_forward", [&] {
ROIAlignRotatedForward<scalar_t>(
output_size, input.data_ptr<scalar_t>(),
static_cast<scalar_t>(spatial_scale), aligned, clockwise, channels,
height, width, aligned_height, aligned_width, sampling_ratio,
rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());
});
}
void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad_output.stride(0);
int c_stride = grad_output.stride(1);
int h_stride = grad_output.stride(2);
int w_stride = grad_output.stride(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "ROIAlignRotated_backward", [&] {
ROIAlignRotatedBackward<scalar_t>(
grad_output.numel(), grad_output.data_ptr<scalar_t>(),
static_cast<scalar_t>(spatial_scale), aligned, clockwise, channels,
height, width, aligned_height, aligned_width, sampling_ratio,
grad_input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
n_stride, c_stride, h_stride, w_stride);
});
}
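// Editorial usage sketch (assumes the extension is built against libtorch;
// each ROI row is (batch_idx, center_x, center_y, width, height, theta) as
// read by the rotated kernels above):
//   auto feats = torch::rand({1, 4, 32, 32});
//   auto rois = torch::tensor({0.f, 16.f, 16.f, 12.f, 8.f, 0.3f}).reshape({1, 6});
//   auto out = torch::zeros({1, 4, 7, 7});
//   roi_align_rotated_forward(feats, rois, out, /*aligned_height=*/7,
//                             /*aligned_width=*/7, /*spatial_scale=*/1.f,
//                             /*sampling_ratio=*/2, /*aligned=*/true,
//                             /*clockwise=*/false);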
@@ -14,14 +14,14 @@ void roi_align_rotated_forward_cuda_parrots(CudaContext& ctx,
   int pooled_height;
   int pooled_width;
   float spatial_scale;
-  int sample_num;
+  int sampling_ratio;
   bool aligned;
   bool clockwise;
   SSAttrs(attr)
       .get<int>("pooled_height", pooled_height)
       .get<int>("pooled_width", pooled_width)
       .get<float>("spatial_scale", spatial_scale)
-      .get<int>("sample_num", sample_num)
+      .get<int>("sampling_ratio", sampling_ratio)
       .get<bool>("aligned", aligned)
       .get<bool>("clockwise", clockwise)
       .done();
@@ -30,7 +30,7 @@ void roi_align_rotated_forward_cuda_parrots(CudaContext& ctx,
   const auto& rois = buildATensor(ctx, ins[1]);
   auto output = buildATensor(ctx, outs[0]);
   roi_align_rotated_forward_cuda(input, rois, output, pooled_height,
-                                 pooled_width, spatial_scale, sample_num,
+                                 pooled_width, spatial_scale, sampling_ratio,
                                  aligned, clockwise);
 }
@@ -41,14 +41,14 @@ void roi_align_rotated_backward_cuda_parrots(CudaContext& ctx,
   int pooled_height;
   int pooled_width;
   float spatial_scale;
-  int sample_num;
+  int sampling_ratio;
   bool aligned;
   bool clockwise;
   SSAttrs(attr)
       .get<int>("pooled_height", pooled_height)
       .get<int>("pooled_width", pooled_width)
       .get<float>("spatial_scale", spatial_scale)
-      .get<int>("sample_num", sample_num)
+      .get<int>("sampling_ratio", sampling_ratio)
       .get<bool>("aligned", aligned)
       .get<bool>("clockwise", clockwise)
       .done();
@@ -57,7 +57,7 @@ void roi_align_rotated_backward_cuda_parrots(CudaContext& ctx,
   const auto& rois = buildATensor(ctx, ins[1]);
   auto grad_input = buildATensor(ctx, outs[0]);
   roi_align_rotated_backward_cuda(grad_output, rois, grad_input, pooled_height,
-                                  pooled_width, spatial_scale, sample_num,
+                                  pooled_width, spatial_scale, sampling_ratio,
                                   aligned, clockwise);
 }
 #endif
@@ -69,14 +69,14 @@ void roi_align_rotated_forward_cpu_parrots(HostContext& ctx,
   int pooled_height;
   int pooled_width;
   float spatial_scale;
-  int sample_num;
+  int sampling_ratio;
   bool aligned;
   bool clockwise;
   SSAttrs(attr)
       .get<int>("pooled_height", pooled_height)
       .get<int>("pooled_width", pooled_width)
       .get<float>("spatial_scale", spatial_scale)
-      .get<int>("sample_num", sample_num)
+      .get<int>("sampling_ratio", sampling_ratio)
       .get<bool>("aligned", aligned)
       .get<bool>("clockwise", clockwise)
       .done();
@@ -85,7 +85,7 @@ void roi_align_rotated_forward_cpu_parrots(HostContext& ctx,
   const auto& rois = buildATensor(ctx, ins[1]);
   auto output = buildATensor(ctx, outs[0]);
   roi_align_rotated_forward_cpu(input, rois, output, pooled_height,
-                                pooled_width, spatial_scale, sample_num,
+                                pooled_width, spatial_scale, sampling_ratio,
                                 aligned, clockwise);
 }
@@ -96,14 +96,14 @@ void roi_align_rotated_backward_cpu_parrots(HostContext& ctx,
   int pooled_height;
   int pooled_width;
   float spatial_scale;
-  int sample_num;
+  int sampling_ratio;
   bool aligned;
   bool clockwise;
   SSAttrs(attr)
       .get<int>("pooled_height", pooled_height)
       .get<int>("pooled_width", pooled_width)
       .get<float>("spatial_scale", spatial_scale)
-      .get<int>("sample_num", sample_num)
+      .get<int>("sampling_ratio", sampling_ratio)
       .get<bool>("aligned", aligned)
       .get<bool>("clockwise", clockwise)
       .done();
@@ -112,7 +112,7 @@ void roi_align_rotated_backward_cpu_parrots(HostContext& ctx,
   const auto& rois = buildATensor(ctx, ins[1]);
   auto grad_input = buildATensor(ctx, outs[0]);
   roi_align_rotated_backward_cpu(grad_output, rois, grad_input, pooled_height,
-                                 pooled_width, spatial_scale, sample_num,
+                                 pooled_width, spatial_scale, sampling_ratio,
                                  aligned, clockwise);
 }
@@ -120,7 +120,7 @@ PARROTS_EXTENSION_REGISTER(roi_align_rotated_forward)
     .attr("pooled_height")
     .attr("pooled_width")
     .attr("spatial_scale")
-    .attr("sample_num")
+    .attr("sampling_ratio")
     .attr("aligned")
     .attr("clockwise")
     .input(2)
@@ -135,7 +135,7 @@ PARROTS_EXTENSION_REGISTER(roi_align_rotated_backward)
     .attr("pooled_height")
     .attr("pooled_width")
     .attr("spatial_scale")
-    .attr("sample_num")
+    .attr("sampling_ratio")
    .attr("aligned")
     .attr("clockwise")
     .input(2)