Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col) {
DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels,
height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, parallel_imgs,
deformable_group, data_col);
}
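// NOTE: minimal sketch of how a backend hooks into the dispatcher above,
// assuming the REGISTER_DEVICE_IMPL macro from pytorch_device_registry.hpp;
// the backend function shown is illustrative only.
//
//   void deformable_im2col_cuda(Tensor data_im, Tensor data_offset,
//                               const int channels, const int height,
//                               const int width, const int ksize_h,
//                               const int ksize_w, const int pad_h,
//                               const int pad_w, const int stride_h,
//                               const int stride_w, const int dilation_h,
//                               const int dilation_w, const int parallel_imgs,
//                               const int deformable_group, Tensor data_col);
//   REGISTER_DEVICE_IMPL(deformable_im2col_impl, CUDA, deformable_im2col_cuda);
//
// DISPATCH_DEVICE_IMPL then forwards the call to whichever implementation was
// registered for the device of the tensor arguments at run time.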
#ifdef MMCV_WITH_CUDA
void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col);
void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im);
void deformable_col2im_coord(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset);
#endif
void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im) {
DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels,
height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, parallel_imgs,
deformable_group, grad_im);
}
void deformable_im2col_cpu(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col);
void deformable_col2im_cpu(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im);
void deformable_col2im_coord_cpu(
void deformable_col2im_coord_impl(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset);
const int deformable_group, Tensor grad_offset) {
DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im,
data_offset, channels, height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
parallel_imgs, deformable_group, grad_offset);
}
void deform_conv_shape_check(at::Tensor input, at::Tensor offset,
at::Tensor *gradOutput, at::Tensor weight, int kH,
@@ -227,17 +216,9 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
output_buffer.size(2), output_buffer.size(3)});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
#endif
} else {
deformable_im2col_cpu(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
}
deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1),
@@ -373,29 +354,15 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
{gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW, dH,
dW, dilationH, dilationW, im2col_step,
deformable_group, gradOffset[elt]);
deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group,
gradInput[elt]);
#endif
} else {
deformable_col2im_coord_cpu(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW,
dH, dW, dilationH, dilationW, im2col_step,
deformable_group, gradOffset[elt]);
deformable_col2im_cpu(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group,
gradInput[elt]);
}
deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW,
dH, dW, dilationH, dilationW, im2col_step,
deformable_group, gradOffset[elt]);
deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group,
gradInput[elt]);
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
@@ -508,17 +475,9 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
#endif
} else {
deformable_im2col_cpu(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
}
deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
// divide into group
gradOutputBuffer = gradOutputBuffer.view(
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
template <typename T>
T deformable_im2col_bilinear_cpu(const T *input, const int data_width,
const int height, const int width, T h, T w) {
if (h <= -1 || height <= h || w <= -1 || width <= w) {
return 0;
}
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
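// NOTE: worked example for the bilinear sampler above, assuming a 2x2 patch:
// with h = w = 0.5 the corner weights are w1 = w2 = w3 = w4 = 0.25, so the
// result is the mean of the four pixels; with integer h, w it degenerates to
// a direct lookup (lh = lw = 0, w1 = 1). Out-of-bounds corners contribute 0.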
template <typename T>
T get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,
const int height, const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
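// NOTE: get_gradient_weight_cpu returns the bilinear coefficient with which
// the integer pixel (h, w) contributed to the forward sample taken at the
// fractional location (argmax_h, argmax_w); only the four surrounding pixels
// receive a non-zero weight. It is used below to scatter the upstream
// gradient back onto grad_im.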
template <typename T>
T get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,
const int width, const T *im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
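// NOTE: get_coordinate_weight_cpu is the partial derivative of the bilinear
// sample at (argmax_h, argmax_w) with respect to the sampling coordinate:
// bp_dir == 0 differentiates along h (offset_h), bp_dir == 1 along w
// (offset_w). deformable_col2im_coord_cpu_kernel uses it to form the offset
// gradients.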
template <typename T>
void deformable_im2col_cpu_kernel(
const int n, const T *data_im, const T *data_offset, const int height,
const int width, const int kernel_h, const int kernel_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
for (int index = 0; index < n; index++) {
// index: linear index into the output (column) matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = deformable_im2col_bilinear_cpu(data_im_ptr, width, height,
width, h_im, w_im);
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
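// NOTE: in the kernel above, each linear index decomposes into
// (c_im, b_col, h_col, w_col); for every kernel tap (i, j) the input is
// sampled bilinearly at (h_in + i * dilation_h + offset_h,
// w_in + j * dilation_w + offset_w) and stored in the column buffer at
// channel c_im * kernel_h * kernel_w + i * kernel_w + j.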
template <typename T>
void deformable_col2im_cpu_kernel(
const int n, const T *data_col, const T *data_offset, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
for (int index = 0; index < n; index++) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data,
cur_h + dy, cur_w + dx, height, width);
*(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad;
}
}
}
}
}
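// NOTE: the dy/dx window of +/-2 combined with the |difference| < 1 test
// selects exactly the (at most four) integer pixels adjacent to the
// fractional sampling location, so each column-gradient entry is scattered
// back to the same pixels that produced it in the forward pass, weighted by
// get_gradient_weight_cpu.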
template <typename T>
void deformable_col2im_coord_cpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int offset_channels, const int deformable_group, const int height_col,
const int width_col, T *grad_offset) {
for (int index = 0; index < n; index++) {
T val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
const T weight = get_coordinate_weight_cpu(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
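// NOTE: grad_offset holds 2 * kernel_h * kernel_w channels per deformable
// group (an h and a w offset for every kernel tap). For each offset entry the
// loop above accumulates d(sampled value)/d(offset) * d(loss)/d(column entry)
// over all input channels belonging to that deformable group.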
void deformable_im2col_cpu(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "deformable_im2col_cpu", [&] {
deformable_im2col_cpu_kernel<scalar_t>(
num_kernels, data_im.data_ptr<scalar_t>(),
data_offset.data_ptr<scalar_t>(), height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col,
data_col.data_ptr<scalar_t>());
});
}
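// NOTE: worked example for the output-size formula above: height = 7,
// pad_h = 1, dilation_h = 1, ksize_h = 3, stride_h = 2 gives
// height_col = (7 + 2 * 1 - (1 * (3 - 1) + 1)) / 2 + 1 = 6 / 2 + 1 = 4,
// the usual convolution output size.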
void deformable_col2im_cpu(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im) {
// todo: make sure parallel_imgs is passed in correctly
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels =
channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_cpu_kernel<scalar_t>(
num_kernels, data_col_, data_offset_, channels, height, width,
ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, channel_per_deformable_group, parallel_imgs,
deformable_group, height_col, width_col, grad_im_);
}));
}
void deformable_col2im_coord_cpu(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
deformable_group * parallel_imgs;
int channel_per_deformable_group =
channels * ksize_h * ksize_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_coord_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
deformable_col2im_coord_cpu_kernel<scalar_t>(
num_kernels, data_col_, data_im_, data_offset_, channels, height,
width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,
2 * ksize_h * ksize_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_);
}));
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor offset, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma);
void DeformRoIPoolBackwardCUDAKernelLauncher(
Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float gamma);
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) {
DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output,
pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,
output, pooled_height, pooled_width, spatial_scale,
sampling_ratio, gamma);
}
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset,
int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
DeformRoIPoolBackwardCUDAKernelLauncher(
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma);
DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,
offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma);
}
#endif
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(output);
deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
pooled_width, spatial_scale, sampling_ratio,
gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,
pooled_width, spatial_scale, sampling_ratio,
gamma);
}
void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
@@ -61,22 +36,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
Tensor grad_offset, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) {
if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_offset);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
Tensor num_valid) {
return DISPATCH_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl,
vertices, mask, num_valid);
}
Tensor diff_iou_rotated_sort_vertices_forward(Tensor vertices, Tensor mask,
Tensor num_valid) {
return diff_iou_rotated_sort_vertices_forward_impl(vertices, mask, num_valid);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "diff_iou_rotated_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void diff_iou_rotated_sort_vertices_forward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor boxes, scores, dets;
auto vertices = buildATensor(ctx, ins[0]);
auto mask = buildATensor(ctx, ins[1]);
auto num_valid = buildATensor(ctx, ins[2]);
auto out =
diff_iou_rotated_sort_vertices_forward_cuda(vertices, mask, num_valid);
updateDArray(ctx, out, outs[0]);
}
PARROTS_EXTENSION_REGISTER(diff_iou_rotated_sort_vertices_forward)
.input(3)
.output(1)
.apply(diff_iou_rotated_sort_vertices_forward_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef DIFF_IOU_ROTATED_PYTORCH_H
#define DIFF_IOU_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;
Tensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices, Tensor mask,
Tensor num_valid);
#endif // DIFF_IOU_ROTATED_PYTORCH_H
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha);
void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight,
Tensor grad_input,
const float gamma,
const float alpha);
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input,
const float gamma,
const float alpha);
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha);
DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,
output, gamma, alpha);
}
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor grad_input,
float gamma, float alpha) {
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha);
DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,
grad_input, gamma, alpha);
}
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha);
DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,
output, gamma, alpha);
}
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input, float gamma,
float alpha) {
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
grad_input, gamma, alpha);
DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,
buff, grad_input, gamma, alpha);
}
#endif
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}
void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_input);
sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
alpha);
}
void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
softmax_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}
void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input, float gamma,
float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(buff);
CHECK_CUDA_INPUT(grad_input);
softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
gamma, alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,
gamma, alpha);
}
@@ -2,61 +2,33 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
const float *dataset,
float *temp, int *idxs);
void furthest_point_sampling_forward_cuda(int b, int n, int m,
const float *dataset, float *temp,
int *idxs) {
FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, dataset, temp, idxs);
void furthest_point_sampling_forward_impl(Tensor points_tensor,
Tensor temp_tensor, Tensor idx_tensor,
int b, int n, int m) {
DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,
temp_tensor, idx_tensor, b, n, m);
}
void FurthestPointSamplingWithDistForwardCUDAKernelLauncher(
int b, int n, int m, const float *dataset, float *temp, int *idxs);
void furthest_point_sampling_with_dist_forward_cuda(int b, int n, int m,
const float *dataset,
float *temp, int *idxs) {
FurthestPointSamplingWithDistForwardCUDAKernelLauncher(b, n, m, dataset, temp,
idxs);
void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
Tensor temp_tensor,
Tensor idx_tensor, int b,
int n, int m) {
DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,
points_tensor, temp_tensor, idx_tensor, b, n, m);
}
#endif
void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
Tensor idx_tensor, int b, int n, int m) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
const float *points = points_tensor.data_ptr<float>();
float *temp = temp_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
furthest_point_sampling_forward_cuda(b, n, m, points, temp, idx);
#else
AT_ERROR("furthest_point_sampling is not compiled with GPU support");
#endif
} else {
AT_ERROR("furthest_point_sampling is not implemented on CPU");
}
furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,
b, n, m);
}
void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
Tensor temp_tensor,
Tensor idx_tensor, int b, int n,
int m) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
const float *points = points_tensor.data<float>();
float *temp = temp_tensor.data<float>();
int *idx = idx_tensor.data<int>();
furthest_point_sampling_with_dist_forward_cuda(b, n, m, points, temp, idx);
#else
AT_ERROR(
"furthest_point_sampling_with_dist is not compiled with GPU support");
#endif
} else {
AT_ERROR("furthest_point_sampling_with_dist is not implemented on CPU");
}
furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,
idx_tensor, b, n, m);
}
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act.cpp
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor &input,
const torch::Tensor &bias,
const torch::Tensor &refer, int act,
int grad, float alpha, float scale);
/*
Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
Augmentation (ADA)
=======================================================================
1. Definitions
"Licensor" means any person or entity that distributes its Work.
"Software" means the original work of authorship made available under
this License.
"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.
The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.
Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.
2. License Grants
2.1 Copyright Grant. Subject to the terms and conditions of this
License, each Licensor grants to you a perpetual, worldwide,
non-exclusive, royalty-free, copyright license to reproduce,
prepare derivative works of, publicly display, publicly perform,
sublicense and distribute its Work and any resulting derivative
works in any form.
3. Limitations
3.1 Redistribution. You may reproduce or distribute the Work only
if (a) you do so under this License, (b) you include a complete
copy of this License with your distribution, and (c) you retain
without modification any copyright, patent, trademark, or
attribution notices that are present in the Work.
3.2 Derivative Works. You may specify that additional or different
terms apply to the use, reproduction, and distribution of your
derivative works of the Work ("Your Terms") only if (a) Your Terms
provide that the use limitation in Section 3.3 applies to your
derivative works, and (b) you identify the specific derivative
works that are subject to Your Terms. Notwithstanding Your Terms,
this License (including the redistribution requirements in Section
3.1) will continue to apply to the Work itself.
#endif
3.3 Use Limitation. The Work and any derivative works thereof only
may be used or intended for use non-commercially. Notwithstanding
the foregoing, NVIDIA and its affiliates may use the Work and any
derivative works commercially. As used herein, "non-commercially"
means for research or evaluation purposes only.
3.4 Patent Claims. If you bring or threaten to bring a patent claim
against any Licensor (including any claim, cross-claim or
counterclaim in a lawsuit) to enforce any patents that you allege
are infringed by any Work, then your rights under this License from
such Licensor (including the grant in Section 2.1) will terminate
immediately.
3.5 Trademarks. This License does not grant any rights to use any
Licensor’s or its affiliates’ names, logos, or trademarks, except
as necessary to reproduce the notices described in this License.
3.6 Termination. If you violate any term of this License, then your
rights under this License (including the grant in Section 2.1) will
terminate immediately.
4. Disclaimer of Warranty.
THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.
5. Limitation of Liability.
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.
=======================================================================
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
const torch::Tensor& bias,
const torch::Tensor& refer, int act,
int grad, float alpha, float scale) {
return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
act, grad, alpha, scale);
}
torch::Tensor fused_bias_leakyrelu(const torch::Tensor &input,
const torch::Tensor &bias,
const torch::Tensor &refer, int act,
torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
const torch::Tensor& bias,
const torch::Tensor& refer, int act,
int grad, float alpha, float scale) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA(input);
CHECK_CUDA(bias);
return fused_bias_leakyrelu_op(input, bias, refer, act, grad, alpha, scale);
#else
AT_ERROR("Fused bias leakyrelu is not compiled with GPU support");
#endif
return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
scale);
}
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor points,
const Tensor idx, Tensor out);
void gather_points_forward_cuda(int b, int c, int n, int npoints,
void gather_points_forward_impl(int b, int c, int n, int npoints,
const Tensor points, const Tensor idx,
Tensor out) {
GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out);
};
void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
idx, out);
}
void gather_points_backward_cuda(int b, int c, int n, int npoints,
void gather_points_backward_impl(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx,
grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
idx, grad_points);
}
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor out_tensor, int b, int c, int n,
int npoints) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
gather_points_forward_cuda(b, c, n, npoints, points_tensor, idx_tensor,
out_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
out_tensor);
}
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_points_tensor, int b, int c, int n,
int npoints) {
if (grad_out_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
gather_points_backward_cuda(b, c, n, npoints, grad_out_tensor, idx_tensor,
grad_points_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
grad_points_tensor);
}
@@ -3,56 +3,32 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
int nsample, const Tensor points,
const Tensor idx, Tensor out);
void group_points_forward_cuda(int b, int c, int n, int npoints, int nsample,
void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
const Tensor points, const Tensor idx,
Tensor out) {
GroupPointsForwardCUDAKernelLauncher(b, c, n, npoints, nsample, points, idx,
out);
};
DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
points, idx, out);
}
void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
int nsample, const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
void group_points_backward_cuda(int b, int c, int n, int npoints, int nsample,
void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
GroupPointsBackwardCUDAKernelLauncher(b, c, n, npoints, nsample, grad_out,
idx, grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
grad_out, idx, grad_points);
}
void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor out_tensor, int b, int c, int n, int npoints,
int nsample) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
group_points_forward_cuda(b, c, n, npoints, nsample, points_tensor,
idx_tensor, out_tensor);
#else
AT_ERROR("group_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("group_points is not implemented on CPU");
}
DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
points_tensor, idx_tensor, out_tensor);
}
void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_points_tensor, int b, int c, int n,
int npoints, int nsample) {
if (grad_out_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
group_points_backward_cuda(b, c, n, npoints, nsample, grad_out_tensor,
idx_tensor, grad_points_tensor);
#else
AT_ERROR("group_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("group_points is not implemented on CPU");
}
group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
idx_tensor, grad_points_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
#ifndef HIP_DIFF
#include <cuda_runtime_api.h>
int get_cudart_version() { return CUDART_VERSION; }
#endif
#endif
std::string get_compiling_cuda_version() {
#ifdef MMCV_WITH_CUDA
#ifndef HIP_DIFF
std::ostringstream oss;
// copied from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
auto printCudaStyleVersion = [&](int v) {
oss << (v / 1000) << "." << (v / 10 % 100);
if (v % 10 != 0) {
oss << "." << (v % 10);
}
};
printCudaStyleVersion(get_cudart_version());
return oss.str();
#else
return std::string("rocm not available");
#endif
#else
return std::string("not available");
#endif
}
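// NOTE: printCudaStyleVersion formats the integer CUDART_VERSION, e.g.
// 11030 -> "11.3" (major = v / 1000, minor = v / 10 % 100, with a third
// ".patch" component only when v % 10 != 0).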
// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
std::string get_compiler_version() {
std::ostringstream ss;
#if defined(__GNUC__)
#ifndef __clang__
{ ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
#endif
#endif
#if defined(__clang_major__)
{
ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
<< __clang_patchlevel__;
}
#endif
#if defined(_MSC_VER)
{ ss << "MSVC " << _MSC_FULL_VER; }
#endif
return ss.str();
}
@@ -8,225 +8,128 @@ All Rights Reserved 2019-2020.
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
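// NOTE: sizeof(unsigned long long) * 8 == 64 on the supported platforms, so
// each 64-bit word of the NMS mask encodes one box's overlap decisions
// against a block of 64 boxes; col_blocks below is the number of such words
// per box.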
#ifdef MMCV_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#define CHECK_ERROR(state) \
{ gpuAssert((state), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort) exit(code);
}
}
void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_overlap);
void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_overlap) {
IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
};
void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_iou);
void iou3d_boxes_iou_bev_forward_cuda(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_iou) {
IoU3DBoxesIoUBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_iou);
};
void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask, int boxes_num,
float nms_overlap_thresh);
void iou3d_nms_forward_cuda(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSForwardCUDAKernelLauncher(boxes, mask, boxes_num, nms_overlap_thresh);
};
void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask,
int boxes_num,
float nms_overlap_thresh);
void iou3d_nms_normal_forward_cuda(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
nms_overlap_thresh);
};
#endif
DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
num_b, boxes_b, ans_overlap);
}
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap) {
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
if (boxes_a.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes_a);
CHECK_CUDA_INPUT(boxes_b);
CHECK_CUDA_INPUT(ans_overlap);
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
iou3d_boxes_overlap_bev_forward_cuda(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
#else
AT_ERROR("iou3d_boxes_overlap_bev is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_boxes_overlap_bev is not implemented on CPU");
}
void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
unsigned long long *mask, int boxes_num,
float nms_overlap_thresh) {
DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_iou) {
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap) {
// params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
if (boxes_a.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes_a);
CHECK_CUDA_INPUT(boxes_b);
CHECK_CUDA_INPUT(ans_iou);
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
iou3d_boxes_iou_bev_forward_cuda(num_a, boxes_a, num_b, boxes_b, ans_iou);
#else
AT_ERROR("iou3d_boxes_iou_bev is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_boxes_iou_bev is not implemented on CPU");
}
iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
}
void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh) {
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh) {
// params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params keep: (N)
CHECK_CONTIGUOUS(boxes);
CHECK_CONTIGUOUS(keep);
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CONTIGUOUS(keep);
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_forward_cuda(boxes, mask_data, boxes_num, nms_overlap_thresh);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms3d_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
*keep_num_data = num_to_keep;
#else
AT_ERROR("iou3d_nms is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_nms is not implemented on CPU");
}
}
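// NOTE: the host-side loop above is the greedy suppression step: boxes are
// visited in input order (the caller is expected to pass them sorted by
// score), a box is kept only if no previously kept box has set its bit in the
// overlap mask, and each kept box ORs its mask row into remv_cpu so that the
// boxes it overlaps are skipped later. The same pattern is repeated in
// iou3d_nms3d_normal_forward below.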
void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh) {
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh) {
// params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params keep: (N)
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CONTIGUOUS(keep);
CHECK_CONTIGUOUS(boxes);
CHECK_CONTIGUOUS(keep);
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_normal_forward_cuda(boxes, mask_data, boxes_num,
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms3d_normal_forward_impl(boxes, mask_data, boxes_num,
nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
*keep_num_data = num_to_keep;
#else
AT_ERROR("iou3d_nms_normal is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_nms_normal is not implemented on CPU");
}
*keep_num_data = num_to_keep;
}
@@ -8,7 +8,7 @@
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void iou3d_boxes_iou_bev_forward_cuda_parrots(
void iou3d_boxes_overlap_bev_forward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto boxes_a = buildATensor(ctx, ins[0]);
@@ -16,12 +16,12 @@ void iou3d_boxes_iou_bev_forward_cuda_parrots(
auto ans_iou = buildATensor(ctx, outs[0]);
iou3d_boxes_iou_bev_forward(boxes_a, boxes_b, ans_iou);
iou3d_boxes_overlap_bev_forward(boxes_a, boxes_b, ans_iou);
}
void iou3d_nms_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
void iou3d_nms3d_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float nms_overlap_thresh;
SSAttrs(attr).get<float>("nms_overlap_thresh", nms_overlap_thresh).done();
@@ -30,13 +30,13 @@ void iou3d_nms_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
auto keep = buildATensor(ctx, outs[0]);
auto keep_num = buildATensor(ctx, outs[1]);
iou3d_nms_forward(boxes, keep, keep_num, nms_overlap_thresh);
iou3d_nms3d_forward(boxes, keep, keep_num, nms_overlap_thresh);
}
void iou3d_nms_normal_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
void iou3d_nms3d_normal_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float nms_overlap_thresh;
SSAttrs(attr).get<float>("nms_overlap_thresh", nms_overlap_thresh).done();
@@ -45,26 +45,26 @@ void iou3d_nms_normal_forward_cuda_parrots(CudaContext& ctx,
auto keep = buildATensor(ctx, outs[0]);
auto keep_num = buildATensor(ctx, outs[1]);
iou3d_nms_normal_forward(boxes, keep, keep_num, nms_overlap_thresh);
iou3d_nms3d_normal_forward(boxes, keep, keep_num, nms_overlap_thresh);
}
PARROTS_EXTENSION_REGISTER(iou3d_boxes_iou_bev_forward)
PARROTS_EXTENSION_REGISTER(iou3d_boxes_overlap_bev_forward)
.input(2)
.output(1)
.apply(iou3d_boxes_iou_bev_forward_cuda_parrots)
.apply(iou3d_boxes_overlap_bev_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(iou3d_nms_forward)
PARROTS_EXTENSION_REGISTER(iou3d_nms3d_forward)
.attr("nms_overlap_thresh")
.input(1)
.output(2)
.apply(iou3d_nms_forward_cuda_parrots)
.apply(iou3d_nms3d_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(iou3d_nms_normal_forward)
PARROTS_EXTENSION_REGISTER(iou3d_nms3d_normal_forward)
.attr("nms_overlap_thresh")
.input(1)
.output(2)
.apply(iou3d_nms_normal_forward_cuda_parrots)
.apply(iou3d_nms3d_normal_forward_cuda_parrots)
.done();
#endif
@@ -4,13 +4,13 @@
#include <torch/extension.h>
using namespace at;
void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_iou);
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap);
void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
#endif // IOU_3D_PYTORCH_H
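As a sketch of how the renamed entry points above are driven: the caller allocates the `keep` index buffer and the `keep_num` counter and reads them back after the call. The CPU int64 buffers and the `(N, 7)` CUDA box layout follow mmcv's Python wrapper and are stated here as assumptions, not taken from this header.

```cpp
#include <torch/extension.h>

// Illustrative caller for iou3d_nms3d_forward declared above.
// boxes: (N, 7) CUDA tensor [x, y, z, dx, dy, dz, heading] (assumed layout).
torch::Tensor nms3d_example(const torch::Tensor &boxes, float thresh) {
  auto opts = torch::dtype(torch::kLong);           // CPU int64 buffers (assumed)
  auto keep = torch::zeros({boxes.size(0)}, opts);  // kept box indices
  auto keep_num = torch::zeros({1}, opts);          // number of kept boxes
  iou3d_nms3d_forward(boxes, keep, keep_num, thresh);
  const int64_t n = keep_num.item<int64_t>();
  return keep.slice(/*dim=*/0, /*start=*/0, /*end=*/n);
}
```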
@@ -2,31 +2,16 @@
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
const Tensor xyz, const Tensor new_xyz,
Tensor idx, Tensor dist2);
void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
const Tensor new_xyz, Tensor idx, Tensor dist2) {
KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);
DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
dist2);
}
#endif
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
Tensor dist2_tensor, int b, int n, int m, int nsample) {
if (new_xyz_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(new_xyz_tensor);
CHECK_CUDA_INPUT(xyz_tensor);
knn_forward_cuda(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
dist2_tensor);
#else
AT_ERROR("knn is not compiled with GPU support");
#endif
} else {
AT_ERROR("knn is not implemented on CPU");
}
knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
dist2_tensor);
}
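The knn entry point above no longer branches on the device itself; `DISPATCH_DEVICE_IMPL` routes the call through the registry from `pytorch_device_registry.hpp`. A sketch of how a backend plugs into that registry is below; `REGISTER_DEVICE_IMPL` and the `knn_forward_cuda` name follow the pattern used elsewhere in this tree, but treat the exact wiring as illustrative.

```cpp
// In a CUDA-only translation unit (sketch): forward to the kernel launcher
// and register the function as the CUDA implementation of knn_forward_impl.
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
                                  const Tensor xyz, const Tensor new_xyz,
                                  Tensor idx, Tensor dist2);

void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);
}

REGISTER_DEVICE_IMPL(knn_forward_impl, CUDA, knn_forward_cuda);
```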
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w);
void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int height,
const int width, const int channels);
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w);
DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
col, kernel_h, kernel_w, pad_h, pad_w);
}
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh)
MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
width, channels);
DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
im, height, width, channels);
}
#endif
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
if (im.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(im);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(col);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
}
void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) {
if (col.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(col);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(im);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
}
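For orientation, the shapes the masked-conv entry points above expect can be reconstructed from the inline comments: `col` holds one `(kh * kw * ic)`-long column per masked output location. A minimal allocation sketch follows; the exact shape convention mirrors mmcv's Python wrapper and is an assumption here.

```cpp
#include <torch/extension.h>

// Sketch: allocate the col buffer and run the masked im2col entry point
// defined above. mask_h_idx / mask_w_idx list the masked output positions.
torch::Tensor masked_im2col_example(const torch::Tensor &im,        // (1, ic, h, w)
                                    const torch::Tensor &mask_h_idx,
                                    const torch::Tensor &mask_w_idx,
                                    int kernel_h, int kernel_w,
                                    int pad_h, int pad_w) {
  const int64_t channels = im.size(1);
  const int64_t mask_cnt = mask_h_idx.size(0);
  // One column of length ic * kh * kw per masked location (assumed layout).
  auto col = torch::zeros({channels * kernel_h * kernel_w, mask_cnt},
                          im.options());
  masked_im2col_forward(im, mask_h_idx, mask_w_idx, col, kernel_h, kernel_w,
                        pad_h, pad_w);
  return col;
}
```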
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons) {
DISPATCH_DEVICE_IMPL(min_area_polygons_impl, pointsets, polygons);
}
void min_area_polygons(const Tensor pointsets, Tensor polygons) {
min_area_polygons_impl(pointsets, polygons);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "min_area_polygons_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void min_area_polygons_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto pointsets = buildATensor(ctx, ins[0]);
auto polygons = buildATensor(ctx, outs[0]);
min_area_polygons(pointsets, polygons);
}
PARROTS_EXTENSION_REGISTER(min_area_polygons)
.input(1)
.output(1)
.apply(min_area_polygons_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MIN_AREA_POLYGONS_PYTORCH_H
#define MIN_AREA_POLYGONS_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void min_area_polygons(const Tensor pointsets, Tensor polygons);
#endif // MIN_AREA_POLYGONS_PYTORCH_H
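A brief usage sketch for the declaration above. The `(N, 18)` point-set input and `(N, 8)` polygon output follow the mmcv.ops.min_area_polygons convention and are assumptions here, not taken from this header; the op is dispatched through the device registry, so the inputs are expected on a supported device (CUDA).

```cpp
#include <torch/extension.h>

// Sketch: allocate the output polygons and call the entry point above.
// pointsets: (N, 18) coordinates; polygons: (N, 8) corner coordinates (assumed).
torch::Tensor min_area_polygons_example(const torch::Tensor &pointsets) {
  auto polygons = torch::zeros({pointsets.size(0), 8}, pointsets.options());
  min_area_polygons(pointsets, polygons);
  return polygons;
}
```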