Unverified commit a4dc2a72, authored by pc, committed by GitHub

support device dispatch in parrots (#1588)

parent 0bcbeadb
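
The change below removes the per-backend wrappers that were guarded by #ifdef MMCV_WITH_CUDA and replaces them with generic *_impl functions that forward through DISPATCH_DEVICE_IMPL from pytorch_device_registry.hpp, so parrots and other backends can plug in their own kernels. As orientation only, a backend is expected to register a function with the same signature under the matching dispatch key; the sketch below assumes a REGISTER_DEVICE_IMPL(key, device, func) macro provided by the registry header, which is not shown in this diff.

// Hypothetical sketch, not part of this commit: how a CUDA backend could hook
// its kernel into the registry so that
// DISPATCH_DEVICE_IMPL(ball_query_forward_impl, ...) reaches it for CUDA
// tensors. REGISTER_DEVICE_IMPL is assumed to be provided by
// pytorch_device_registry.hpp.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
                                        float max_radius, int nsample,
                                        const Tensor new_xyz, const Tensor xyz,
                                        Tensor idx);

// Thin wrapper with the same signature as ball_query_forward_impl.
void ball_query_forward_cuda(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx) {
  BallQueryForwardCUDAKernelLauncher(b, n, m, min_radius, max_radius, nsample,
                                     new_xyz, xyz, idx);
}

// Associate the wrapper with the ball_query_forward_impl dispatch key for the
// CUDA device type.
REGISTER_DEVICE_IMPL(ball_query_forward_impl, CUDA, ball_query_forward_cuda);

With a registration like this in place, the dispatcher files below stay device-agnostic, and the old runtime is_cuda() branching and AT_ERROR fallbacks become unnecessary.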
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor& points,
                                     const Tensor& centers,
                                     const Tensor& scores,
                                     const Tensor& knn_idx, Tensor& output) {
  DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O,
                       aggregate, points, centers, scores, knn_idx, output);
}

void assign_score_withk_backward_impl(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores) {
  DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O,
                       aggregate, grad_out, points, centers, scores, knn_idx,
                       grad_points, grad_centers, grad_scores);
}

void assign_score_withk_forward(const Tensor& points, const Tensor& centers,
                                const Tensor& scores, const Tensor& knn_idx,
                                Tensor& output, int B, int N0, int N1, int M,
                                int K, int O, int aggregate) {
  assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points,
                                  centers, scores, knn_idx, output);
}

void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
@@ -62,24 +36,7 @@ void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
                                 Tensor& grad_centers, Tensor& grad_scores,
                                 int B, int N0, int N1, int M, int K, int O,
                                 int aggregate) {
  assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out,
                                   points, centers, scores, knn_idx,
                                   grad_points, grad_centers, grad_scores);
}
@@ -2,36 +2,19 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void ball_query_forward_impl(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx) {
  DISPATCH_DEVICE_IMPL(ball_query_forward_impl, b, n, m, min_radius, max_radius,
                       nsample, new_xyz, xyz, idx);
}

void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
                        Tensor idx_tensor, int b, int n, int m,
                        float min_radius, float max_radius, int nsample) {
  ball_query_forward_impl(b, n, m, min_radius, max_radius, nsample,
                          new_xyz_tensor, xyz_tensor, idx_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset) {
  DISPATCH_DEVICE_IMPL(bbox_overlaps_impl, bboxes1, bboxes2, ious, mode,
                       aligned, offset);
}

void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                   const int mode, const bool aligned, const int offset) {
  bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void border_align_forward_impl(const Tensor &input, const Tensor &boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size) {
  DISPATCH_DEVICE_IMPL(border_align_forward_impl, input, boxes, output,
                       argmax_idx, pool_size);
}

void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
                                const Tensor &argmax_idx, Tensor grad_input,
                                const int pool_size) {
  DISPATCH_DEVICE_IMPL(border_align_backward_impl, grad_output, boxes,
                       argmax_idx, grad_input, pool_size);
}

void border_align_forward(const Tensor &input, const Tensor &boxes,
                          Tensor output, Tensor argmax_idx,
                          const int pool_size) {
  border_align_forward_impl(input, boxes, output, argmax_idx, pool_size);
}

void border_align_backward(const Tensor &grad_output, const Tensor &boxes,
                           const Tensor &argmax_idx, Tensor grad_input,
                           const int pool_size) {
  border_align_backward_impl(grad_output, boxes, argmax_idx, grad_input,
                             pool_size);
}
@@ -2,28 +2,18 @@
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned) {
  DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, boxes1, boxes2, ious, mode_flag,
                       aligned);
}

// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                     const int mode_flag, const bool aligned) {
  box_iou_rotated_impl(boxes1, boxes2, ious, mode_flag, aligned);
}
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
template <typename T>
void box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
Tensor ious, const int mode_flag,
const bool aligned) {
int output_size = ious.numel();
auto num_boxes1 = boxes1.size(0);
auto num_boxes2 = boxes2.size(0);
if (aligned) {
for (int i = 0; i < output_size; i++) {
ious[i] = single_box_iou_rotated<T>(boxes1[i].data_ptr<T>(),
boxes2[i].data_ptr<T>(), mode_flag);
}
} else {
for (int i = 0; i < num_boxes1; i++) {
for (int j = 0; j < num_boxes2; j++) {
ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>(), mode_flag);
}
}
}
}
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
}
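
For the dispatcher above to reach this CPU kernel through box_iou_rotated_impl, the CPU entry point presumably has to be registered with the same device registry. This is only a hedged sketch of that wiring, assuming the same REGISTER_DEVICE_IMPL macro as before; the actual registration is not shown in this excerpt.

// Hypothetical registration, not shown in this excerpt: expose the CPU kernel
// above through the device registry so that
// DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, ...) can resolve it for CPU
// tensors.
#include "pytorch_device_registry.hpp"

void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned);

REGISTER_DEVICE_IMPL(box_iou_rotated_impl, CPU, box_iou_rotated_cpu);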
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_forward_impl, features, masks, rfeatures, routput,
                       rmasks, output, kernel_size, group_size, scale_factor);
}

void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_backward_impl, top_grad, rfeatures, masks,
                       rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,
                       bottom_grad, mask_grad, kernel_size, group_size,
                       scale_factor);
}

void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
                    Tensor routput, Tensor rmasks, Tensor output,
                    int kernel_size, int group_size, int scale_factor) {
  carafe_forward_impl(features, masks, rfeatures, routput, rmasks, output,
                      kernel_size, group_size, scale_factor);
}

void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
@@ -61,24 +32,7 @@ void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
                     Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
                     Tensor mask_grad, int kernel_size, int group_size,
                     int scale_factor) {
  carafe_backward_impl(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
                       rbottom_grad, rmask_grad, bottom_grad, mask_grad,
                       kernel_size, group_size, scale_factor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_naive_forward_impl, features, masks, output,
                       kernel_size, group_size, scale_factor);
}

void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_naive_backward_impl, top_grad, features, masks,
                       bottom_grad, mask_grad, kernel_size, group_size,
                       scale_factor);
}

void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
                          int kernel_size, int group_size, int scale_factor) {
  carafe_naive_forward_impl(features, masks, output, kernel_size, group_size,
                            scale_factor);
}

void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
                           Tensor bottom_grad, Tensor mask_grad,
                           int kernel_size, int group_size, int scale_factor) {
  carafe_naive_backward_impl(top_grad, features, masks, bottom_grad, mask_grad,
                             kernel_size, group_size, scale_factor);
}
@@ -2,65 +2,37 @@
#include <iostream>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,
                              int kH, int kW, int patchH, int patchW, int padH,
                              int padW, int dilationH, int dilationW,
                              int dilation_patchH, int dilation_patchW, int dH,
                              int dW) {
  DISPATCH_DEVICE_IMPL(correlation_forward_impl, input1, input2, output, kH, kW,
                       patchH, patchW, padH, padW, dilationH, dilationW,
                       dilation_patchH, dilation_patchW, dH, dW);
}

void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,
                               Tensor grad_input1, Tensor grad_input2, int kH,
                               int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW) {
  DISPATCH_DEVICE_IMPL(correlation_backward_impl, grad_output, input1, input2,
                       grad_input1, grad_input2, kH, kW, patchH, patchW, padH,
                       padW, dilationH, dilationW, dilation_patchH,
                       dilation_patchW, dH, dW);
}

void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,
                         int kW, int patchH, int patchW, int padH, int padW,
                         int dilationH, int dilationW, int dilation_patchH,
                         int dilation_patchW, int dH, int dW) {
  correlation_forward_impl(input1, input2, output, kH, kW, patchH, patchW, padH,
                           padW, dilationH, dilationW, dilation_patchH,
                           dilation_patchW, dH, dW);
}

void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
@@ -68,20 +40,8 @@ void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
                          int kW, int patchH, int patchW, int padH, int padW,
                          int dilationH, int dilationW, int dilation_patchH,
                          int dilation_patchW, int dH, int dW) {
  correlation_backward_impl(grad_output, input1, input2, grad_input1,
                            grad_input2, kH, kW, patchH, patchW, padH, padW,
                            dilationH, dilationW, dilation_patchH,
                            dilation_patchW, dH, dW);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col) {
  DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels,
                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
                       stride_w, dilation_h, dilation_w, parallel_imgs,
                       deformable_group, data_col);
}

void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im) {
  DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels,
                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
                       stride_w, dilation_h, dilation_w, parallel_imgs,
                       deformable_group, grad_im);
}

void deformable_col2im_coord_impl(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset) {
  DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im,
                       data_offset, channels, height, width, ksize_h, ksize_w,
                       pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                       parallel_imgs, deformable_group, grad_offset);
}

void deform_conv_shape_check(at::Tensor input, at::Tensor offset,
                             at::Tensor *gradOutput, at::Tensor weight, int kH,
@@ -227,17 +216,9 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
       output_buffer.size(2), output_buffer.size(3)});

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group, columns);

    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
@@ -373,29 +354,15 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
        {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
         gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});

    deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,
                                 inputHeight, inputWidth, kH, kW, padH, padW,
                                 dH, dW, dilationH, dilationW, im2col_step,
                                 deformable_group, gradOffset[elt]);

    deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group,
                           gradInput[elt]);

    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
@@ -508,17 +475,9 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset,
                        deformable_group * 2 * kH * kW, outputHeight, outputWidth});

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group, columns);

    // divide into group
    gradOutputBuffer = gradOutputBuffer.view(
...
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
template <typename T>
T deformable_im2col_bilinear_cpu(const T *input, const int data_width,
const int height, const int width, T h, T w) {
if (h <= -1 || height <= h || w <= -1 || width <= w) {
return 0;
}
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
T get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,
const int height, const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename T>
T get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,
const int width, const T *im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template <typename T>
void deformable_im2col_cpu_kernel(
const int n, const T *data_im, const T *data_offset, const int height,
const int width, const int kernel_h, const int kernel_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
for (int index = 0; index < n; index++) {
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = deformable_im2col_bilinear_cpu(data_im_ptr, width, height,
width, h_im, w_im);
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
void deformable_col2im_cpu_kernel(
const int n, const T *data_col, const T *data_offset, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
for (int index = 0; index < n; index++) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data,
cur_h + dy, cur_w + dx, height, width);
*(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad;
}
}
}
}
}
template <typename T>
void deformable_col2im_coord_cpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int offset_channels, const int deformable_group, const int height_col,
const int width_col, T *grad_offset) {
for (int index = 0; index < n; index++) {
T val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
const T weight = get_coordinate_weight_cpu(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
void deformable_im2col_cpu(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "deformable_im2col_cpu", [&] {
deformable_im2col_cpu_kernel<scalar_t>(
num_kernels, data_im.data_ptr<scalar_t>(),
data_offset.data_ptr<scalar_t>(), height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col,
data_col.data_ptr<scalar_t>());
});
}
void deformable_col2im_cpu(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im) {
// todo: make sure parallel_imgs is passed in correctly
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels =
channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_cpu_kernel<scalar_t>(
num_kernels, data_col_, data_offset_, channels, height, width,
ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, channel_per_deformable_group, parallel_imgs,
deformable_group, height_col, width_col, grad_im_);
}));
}
void deformable_col2im_coord_cpu(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
deformable_group * parallel_imgs;
int channel_per_deformable_group =
channels * ksize_h * ksize_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_coord_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
deformable_col2im_coord_cpu_kernel<scalar_t>(
num_kernels, data_col_, data_im_, data_offset_, channels, height,
width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,
2 * ksize_h * ksize_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_);
}));
}
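
The CPU kernels above sample the input at fractional locations with deformable_im2col_bilinear_cpu. The following standalone snippet is only an illustration of that bilinear weighting on a tiny 2x2 grid; it is independent of the code above and uses no MMCV headers.

// Standalone illustration of the bilinear weighting used by
// deformable_im2col_bilinear_cpu: sample a 2x2 image at (h, w) = (0.25, 0.75).
#include <cstdio>

int main() {
  // Row-major 2x2 image: v1 = img[0], v2 = img[1], v3 = img[2], v4 = img[3].
  const float img[4] = {1.f, 2.f, 3.f, 4.f};
  const float h = 0.25f, w = 0.75f;
  const float lh = h, lw = w;                // h_low = w_low = 0 for this point
  const float hh = 1.f - lh, hw = 1.f - lw;  // complementary weights
  // Same weight layout as the kernel: w1 = hh*hw, w2 = hh*lw, w3 = lh*hw, w4 = lh*lw.
  const float val =
      hh * hw * img[0] + hh * lw * img[1] + lh * hw * img[2] + lh * lw * img[3];
  std::printf("interpolated value = %f\n", val);  // prints 2.250000
  return 0;
}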
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma) {
  DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,
                       output, pooled_height, pooled_width, spatial_scale,
                       sampling_ratio, gamma);
}

void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma) {
  DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,
                       offset, grad_input, grad_offset, pooled_height,
                       pooled_width, spatial_scale, sampling_ratio, gamma);
}

void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
                             Tensor output, int pooled_height, int pooled_width,
                             float spatial_scale, int sampling_ratio,
                             float gamma) {
  deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,
                               pooled_width, spatial_scale, sampling_ratio,
                               gamma);
}

void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
@@ -61,22 +36,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
                              Tensor grad_offset, int pooled_height,
                              int pooled_width, float spatial_scale,
                              int sampling_ratio, float gamma) {
  deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,
                                grad_offset, pooled_height, pooled_width,
                                spatial_scale, sampling_ratio, gamma);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,
                       output, gamma, alpha);
}

void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,
                       grad_input, gamma, alpha);
}

void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,
                       output, gamma, alpha);
}

void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
  DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,
                       buff, grad_input, gamma, alpha);
}

void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}

void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor grad_input, float gamma, float alpha) {
  sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
                                   alpha);
}

void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}

void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor buff, Tensor grad_input, float gamma,
                                 float alpha) {
  softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,
                                   gamma, alpha);
}
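The *_impl functions above only route through the device registry; each backend still has to register a concrete implementation for the impl symbol. A minimal sketch of a CUDA-side binding, assuming the registry header also provides a REGISTER_DEVICE_IMPL macro, and reusing the kernel launcher that the old CUDA-only path declared:

// Illustrative only, not part of this diff.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Kernel launcher as declared by the previous CUDA-only code path.
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha);
}

// Bind the CUDA function to the generic impl symbol so that
// DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, ...) can resolve it
// when the input tensors live on a CUDA device.
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, CUDA,
                     sigmoid_focal_loss_forward_cuda);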
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void furthest_point_sampling_forward_impl(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m) {
  DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,
                       temp_tensor, idx_tensor, b, n, m);
}

void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m) {
  DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,
                       points_tensor, temp_tensor, idx_tensor, b, n, m);
}

void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
                                     Tensor idx_tensor, int b, int n, int m) {
  furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,
                                       b, n, m);
}

void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
                                               Tensor temp_tensor,
                                               Tensor idx_tensor, int b, int n,
                                               int m) {
  furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,
                                                 idx_tensor, b, n, m);
}
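Note that the new impl functions take tensors rather than the raw pointers the removed *_cuda wrappers consumed, so pointer extraction moves into the device-specific binding. A hedged sketch of what that could look like for CUDA, again assuming the REGISTER_DEVICE_IMPL macro from the sketch above and the launcher the old code declared:

// Illustrative only, not part of this diff.
void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
                                                    const float *dataset,
                                                    float *temp, int *idxs);

void furthest_point_sampling_forward_cuda(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m) {
  // The pointer unpacking previously done in furthest_point_sampling_forward.
  const float *dataset = points_tensor.data_ptr<float>();
  float *temp = temp_tensor.data_ptr<float>();
  int *idxs = idx_tensor.data_ptr<int>();
  FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, dataset, temp, idxs);
}

REGISTER_DEVICE_IMPL(furthest_point_sampling_forward_impl, CUDA,
                     furthest_point_sampling_forward_cuda);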
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act.cpp

/*
Copyright (c) 2021, NVIDIA Corporation. All rights reserved.

NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
Augmentation (ADA)
=======================================================================
1. Definitions
"Licensor" means any person or entity that distributes its Work.
"Software" means the original work of authorship made available under
this License.
"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.
The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.
Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.
2. License Grants
2.1 Copyright Grant. Subject to the terms and conditions of this
License, each Licensor grants to you a perpetual, worldwide,
non-exclusive, royalty-free, copyright license to reproduce,
prepare derivative works of, publicly display, publicly perform,
sublicense and distribute its Work and any resulting derivative
works in any form.
3. Limitations
3.1 Redistribution. You may reproduce or distribute the Work only
if (a) you do so under this License, (b) you include a complete
copy of this License with your distribution, and (c) you retain
without modification any copyright, patent, trademark, or
attribution notices that are present in the Work.
3.2 Derivative Works. You may specify that additional or different
terms apply to the use, reproduction, and distribution of your
derivative works of the Work ("Your Terms") only if (a) Your Terms
provide that the use limitation in Section 3.3 applies to your
derivative works, and (b) you identify the specific derivative
works that are subject to Your Terms. Notwithstanding Your Terms,
this License (including the redistribution requirements in Section
3.1) will continue to apply to the Work itself.
3.3 Use Limitation. The Work and any derivative works thereof only
may be used or intended for use non-commercially. Notwithstanding
the foregoing, NVIDIA and its affiliates may use the Work and any
derivative works commercially. As used herein, "non-commercially"
means for research or evaluation purposes only.
3.4 Patent Claims. If you bring or threaten to bring a patent claim
against any Licensor (including any claim, cross-claim or
counterclaim in a lawsuit) to enforce any patents that you allege
are infringed by any Work, then your rights under this License from
such Licensor (including the grant in Section 2.1) will terminate
immediately.
3.5 Trademarks. This License does not grant any rights to use any
Licensor’s or its affiliates’ names, logos, or trademarks, except
as necessary to reproduce the notices described in this License.
3.6 Termination. If you violate any term of this License, then your
rights under this License (including the grant in Section 2.1) will
terminate immediately.
4. Disclaimer of Warranty.
THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.
5. Limitation of Liability.
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.
=======================================================================
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale) {
  return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
                              act, grad, alpha, scale);
}

torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
                                   const torch::Tensor& bias,
                                   const torch::Tensor& refer, int act,
                                   int grad, float alpha, float scale) {
  return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
                                      scale);
}
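Unlike the void-returning ops above, this dispatch forwards a Tensor return value. A sketch, under the same REGISTER_DEVICE_IMPL assumption, of how a CUDA build could bind the operator that the old code declared behind MMCV_WITH_CUDA:

// Illustrative only, not part of this diff.
torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input,
                                      const torch::Tensor& bias,
                                      const torch::Tensor& refer, int act,
                                      int grad, float alpha, float scale);

// DISPATCH_DEVICE_IMPL then returns the CUDA operator's result directly when
// the inputs are CUDA tensors.
REGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, CUDA,
                     fused_bias_leakyrelu_op);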
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void gather_points_forward_impl(int b, int c, int n, int npoints,
void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor points,
const Tensor idx, Tensor out);
void gather_points_forward_cuda(int b, int c, int n, int npoints,
const Tensor points, const Tensor idx, const Tensor points, const Tensor idx,
Tensor out) { Tensor out) {
GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out); DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
}; idx, out);
}
void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
void gather_points_backward_cuda(int b, int c, int n, int npoints, void gather_points_backward_impl(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx, const Tensor grad_out, const Tensor idx,
Tensor grad_points) { Tensor grad_points) {
GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx, DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
grad_points); idx, grad_points);
}; }
#endif
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor, void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor out_tensor, int b, int c, int n, Tensor out_tensor, int b, int c, int n,
int npoints) { int npoints) {
if (points_tensor.device().is_cuda()) { gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA out_tensor);
gather_points_forward_cuda(b, c, n, npoints, points_tensor, idx_tensor,
out_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
} }
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor, void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_points_tensor, int b, int c, int n, Tensor grad_points_tensor, int b, int c, int n,
int npoints) { int npoints) {
if (grad_out_tensor.device().is_cuda()) { gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA grad_points_tensor);
gather_points_backward_cuda(b, c, n, npoints, grad_out_tensor, idx_tensor,
grad_points_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
} }
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points, idx, out);
}

void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
                       grad_out, idx, grad_points);
}

void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points_tensor, idx_tensor, out_tensor);
}

void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample) {
  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
                             idx_tensor, grad_points_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
#ifndef HIP_DIFF
#include <cuda_runtime_api.h>
int get_cudart_version() { return CUDART_VERSION; }
#endif
#endif
std::string get_compiling_cuda_version() {
#ifdef MMCV_WITH_CUDA
#ifndef HIP_DIFF
std::ostringstream oss;
// copied from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
auto printCudaStyleVersion = [&](int v) {
oss << (v / 1000) << "." << (v / 10 % 100);
if (v % 10 != 0) {
oss << "." << (v % 10);
}
};
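// For example, CUDART_VERSION 11030 is printed as "11.3" and 10010 as "10.1";
// a non-zero final digit would add a third component, e.g. 11031 -> "11.3.1".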
printCudaStyleVersion(get_cudart_version());
return oss.str();
#else
return std::string("rocm not available");
#endif
#else
return std::string("not available");
#endif
}
// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
std::string get_compiler_version() {
std::ostringstream ss;
#if defined(__GNUC__)
#ifndef __clang__
{ ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
#endif
#endif
#if defined(__clang_major__)
{
ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
<< __clang_patchlevel__;
}
#endif
#if defined(_MSC_VER)
{ ss << "MSVC " << _MSC_FULL_VER; }
#endif
return ss.str();
}
All Rights Reserved 2019-2020.
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap) {
  DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
                       num_b, boxes_b, ans_overlap);
}

void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                      const int num_b, const Tensor boxes_b,
                                      Tensor ans_iou) {
  DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b,
                       boxes_b, ans_iou);
}

void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
                            int boxes_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num,
                       nms_overlap_thresh);
}

void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask,
                                   int boxes_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num,
                       nms_overlap_thresh);
}

void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,
                                       ans_overlap);
}

void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                 Tensor ans_iou) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou);
}

void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                       float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
      (unsigned long long *)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv_cpu(col_blocks);
  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }

  *keep_num_data = num_to_keep;
}
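The host-side decode above (and its twin in the normal-NMS variant below) treats mask as boxes_num rows of col_blocks 64-bit words; as the loop assumes, a set bit j in word k of row i means box k * 64 + j overlaps box i above the threshold and is suppressed once box i is kept. A small worked example of the index arithmetic:

// Worked example (illustrative): with boxes_num = 100,
// col_blocks = DIVUP(100, 64) = 2, so mask is a (100 x 2) int64 tensor.
// Box i = 70 is tracked in word nblock = 70 / 64 = 1 at bit inblock = 70 % 64 = 6;
// it is kept only if that bit of remv_cpu[1] is still zero, and once kept its
// two mask words are ORed into remv_cpu so every box it suppresses is skipped.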
void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                              float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num,
                                nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
      (unsigned long long *)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv_cpu(col_blocks);
  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  *keep_num_data = num_to_keep;
}