Unverified Commit 230f9a3b authored by q.yao's avatar q.yao Committed by GitHub
Browse files

Refactor csrc with device dispatcher (#1463)

* Add device registry for pytorch ops

* add declaration of CheckDeviceConsistency

* fix for torch130

* assert with torch check

* Refactor ops with dispatch

* update rest ops

* faster install

* update compatibility

* update compatibility, rename parameter

* move CPU implementation to pytorch/cpu

* update ops/csrc/README.md

* fix rocm support

* update cn document

* update docs

* list instead of map
parent ef8ba752
......@@ -100,25 +100,20 @@ THE POSSIBILITY OF SUCH DAMAGES.
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatcher entry for fused bias + LeakyReLU. Forwards to whichever backend
// registered an implementation for the device of the input tensors.
// act/grad select the activation/gradient mode; alpha is the negative slope,
// scale a post-activation multiplier (exact semantics defined by the
// registered kernel — confirm against the backend implementation).
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale) {
  return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
                              act, grad, alpha, scale);
}
// Public binding for fused bias + LeakyReLU. Device availability checks now
// live in the per-device implementations registered with the dispatcher, so
// this wrapper simply forwards all arguments.
torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
                                   const torch::Tensor& bias,
                                   const torch::Tensor& refer, int act,
                                   int grad, float alpha, float scale) {
  return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
                                      scale);
}
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor points,
const Tensor idx, Tensor out);
void gather_points_forward_cuda(int b, int c, int n, int npoints,
void gather_points_forward_impl(int b, int c, int n, int npoints,
const Tensor points, const Tensor idx,
Tensor out) {
GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out);
};
void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
idx, out);
}
void gather_points_backward_cuda(int b, int c, int n, int npoints,
void gather_points_backward_impl(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx,
grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
idx, grad_points);
}
// Python-facing binding for gather_points forward; device selection is
// handled by the dispatcher, so no explicit CUDA/CPU branching here.
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
                           Tensor out_tensor, int b, int c, int n,
                           int npoints) {
  gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
                             out_tensor);
}
// Python-facing binding for gather_points backward; forwards directly to the
// device dispatcher.
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                            Tensor grad_points_tensor, int b, int c, int n,
                            int npoints) {
  gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
                              grad_points_tensor);
}
......@@ -3,56 +3,32 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
int nsample, const Tensor points,
const Tensor idx, Tensor out);
void group_points_forward_cuda(int b, int c, int n, int npoints, int nsample,
void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
const Tensor points, const Tensor idx,
Tensor out) {
GroupPointsForwardCUDAKernelLauncher(b, c, n, npoints, nsample, points, idx,
out);
};
DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
points, idx, out);
}
void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
int nsample, const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
void group_points_backward_cuda(int b, int c, int n, int npoints, int nsample,
void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
GroupPointsBackwardCUDAKernelLauncher(b, c, n, npoints, nsample, grad_out,
idx, grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
grad_out, idx, grad_points);
}
// Python-facing binding for group_points forward; device selection is handled
// by the dispatcher.
void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points_tensor, idx_tensor, out_tensor);
}
// Python-facing binding for group_points backward; forwards to the device
// dispatcher through group_points_backward_impl.
void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample) {
  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
                             idx_tensor, grad_points_tensor);
}
......@@ -8,68 +8,35 @@ All Rights Reserved 2019-2020.
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
#ifdef MMCV_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#define CHECK_ERROR(state) \
{ gpuAssert((state), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort) exit(code);
}
}
void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_overlap);
void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_overlap) {
IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
};
void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_iou);
void iou3d_boxes_iou_bev_forward_cuda(const int num_a, const Tensor boxes_a,
DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
num_b, boxes_b, ans_overlap);
}
void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_iou) {
IoU3DBoxesIoUBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_iou);
};
void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask, int boxes_num,
float nms_overlap_thresh);
DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b,
boxes_b, ans_iou);
}
void iou3d_nms_forward_cuda(const Tensor boxes, unsigned long long *mask,
void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSForwardCUDAKernelLauncher(boxes, mask, boxes_num, nms_overlap_thresh);
};
void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask,
int boxes_num,
float nms_overlap_thresh);
DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
void iou3d_nms_normal_forward_cuda(const Tensor boxes, unsigned long long *mask,
void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
nms_overlap_thresh);
};
#endif
DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
// Public binding: BEV overlap between two box sets.
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap) {
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,
                                       ans_overlap);
}
// Public binding: BEV IoU between two box sets.
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
// params boxes_b: (M, 5)
// params ans_iou: (N, M)
void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                 Tensor ans_iou) {
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou);
}
// Public binding: rotated 3D NMS.
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
// params keep: (N) output indices of kept boxes (int64)
// params keep_num: scalar output, number of kept boxes
// The device kernel fills a pairwise suppression bitmask; the greedy sweep
// that selects survivors runs on the host over a CPU copy of that mask.
void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                       float nms_overlap_thresh) {
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

  // One bit per box pair, packed into 64-bit words per row.
  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
      (unsigned long long *)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv_cpu(col_blocks);
  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;

  // Greedy sweep: keep box i unless an earlier kept box suppresses it, then
  // OR its suppression row into the running mask.
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }

  *keep_num_data = num_to_keep;
}
......@@ -180,53 +110,42 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
// params keep: (N)
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CONTIGUOUS(keep);
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_normal_forward_cuda(boxes, mask_data, boxes_num,
nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
}
CHECK_CONTIGUOUS(boxes);
CHECK_CONTIGUOUS(keep);
if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
*keep_num_data = num_to_keep;
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num,
nms_overlap_thresh);
#else
AT_ERROR("iou3d_nms_normal is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_nms_normal is not implemented on CPU");
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
}
*keep_num_data = num_to_keep;
}
......@@ -2,31 +2,16 @@
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
const Tensor xyz, const Tensor new_xyz,
Tensor idx, Tensor dist2);
void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
const Tensor new_xyz, Tensor idx, Tensor dist2) {
KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);
DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
dist2);
}
#endif
// Python-facing binding for KNN; input validation lives in the per-device
// implementation, so this simply forwards to the dispatcher.
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
                 Tensor dist2_tensor, int b, int n, int m, int nsample) {
  knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
                   dist2_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w);
void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int height,
const int width, const int channels);
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w);
DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
col, kernel_h, kernel_w, pad_h, pad_w);
}
// Dispatch masked col2im (inverse of masked im2col) to the backend registered
// for the device of the input tensors.
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh)
void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
                       im, height, width, channels);
}
// Python-facing binding for masked im2col; device/input checks live in the
// registered per-device implementations.
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w) {
  masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
                             kernel_w, pad_h, pad_w);
}
// Python-facing binding for masked col2im; forwards directly to the device
// dispatcher.
void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor im, int height,
                           int width, int channels) {
  masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
                             channels);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void modulated_deformable_im2col_cuda(
void modulated_deformable_im2col_impl(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col);
void modulated_deformable_col2im_cuda(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im);
void modulated_deformable_col2im_coord_cuda(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask);
#endif
void modulated_deformable_im2col_cpu(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col);
const int dilation_w, const int deformable_group, Tensor data_col) {
DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,
data_mask, batch_size, channels, height_im, width_im,
height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
deformable_group, data_col);
}
void modulated_deformable_col2im_cpu(
void modulated_deformable_col2im_impl(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im);
const int dilation_w, const int deformable_group, Tensor grad_im) {
DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,
data_mask, batch_size, channels, height_im, width_im,
height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
deformable_group, grad_im);
}
void modulated_deformable_col2im_coord_cpu(
void modulated_deformable_col2im_coord_impl(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask);
Tensor grad_offset, Tensor grad_mask) {
DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,
data_im, data_offset, data_mask, batch_size, channels,
height_im, width_im, height_col, width_col, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, deformable_group, grad_offset, grad_mask);
}
void modulated_deform_conv_forward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
......@@ -61,31 +51,6 @@ void modulated_deform_conv_forward(
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(columns);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(weight);
CHECK_CPU_INPUT(bias);
CHECK_CPU_INPUT(ones);
CHECK_CPU_INPUT(offset);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(columns);
}
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
......@@ -127,19 +92,10 @@ void modulated_deform_conv_forward(
output.size(2), output.size(3)});
for (int b = 0; b < batch; b++) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
#endif
} else {
modulated_deformable_im2col_cpu(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
}
modulated_deformable_im2col_impl(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
// divide into group
weight = weight.view({group, weight.size(0) / group, weight.size(1),
......@@ -174,41 +130,6 @@ void modulated_deform_conv_backward(
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(grad_offset);
CHECK_CUDA_INPUT(grad_mask);
CHECK_CUDA_INPUT(grad_output);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(weight);
CHECK_CPU_INPUT(bias);
CHECK_CPU_INPUT(ones);
CHECK_CPU_INPUT(offset);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(columns);
CHECK_CPU_INPUT(grad_input);
CHECK_CPU_INPUT(grad_weight);
CHECK_CPU_INPUT(grad_bias);
CHECK_CPU_INPUT(grad_offset);
CHECK_CPU_INPUT(grad_mask);
CHECK_CPU_INPUT(grad_output);
}
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
......@@ -261,46 +182,24 @@ void modulated_deform_conv_backward(
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
#endif
} else {
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cpu(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cpu(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cpu(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
}
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_impl(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_impl(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_impl(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
......
......@@ -10,43 +10,39 @@
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
Tensor ms_deform_attn_cuda_forward(const Tensor &value,
Tensor ms_deform_attn_impl_forward(const Tensor &value,
const Tensor &spatial_shapes,
const Tensor &level_start_index,
const Tensor &sampling_loc,
const Tensor &attn_weight,
const int im2col_step);
const int im2col_step) {
return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,
spatial_shapes, level_start_index, sampling_loc,
attn_weight, im2col_step);
}
void ms_deform_attn_cuda_backward(
void ms_deform_attn_impl_backward(
const Tensor &value, const Tensor &spatial_shapes,
const Tensor &level_start_index, const Tensor &sampling_loc,
const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);
#endif
Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
const int im2col_step) {
DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,
level_start_index, sampling_loc, attn_weight,
grad_output, grad_value, grad_sampling_loc,
grad_attn_weight, im2col_step);
}
// Public binding for multi-scale deformable attention forward. Pins the
// current device to that of `value` before dispatching; input validation is
// handled by the registered per-device implementations.
Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
                              const Tensor &level_start_index,
                              const Tensor &sampling_loc,
                              const Tensor &attn_weight,
                              const int im2col_step) {
  at::DeviceGuard guard(value.device());
  return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index,
                                     sampling_loc, attn_weight, im2col_step);
}
void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
......@@ -56,26 +52,9 @@ void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc,
Tensor &grad_attn_weight, const int im2col_step) {
if (value.type().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(value)
CHECK_CUDA_INPUT(spatial_shapes)
CHECK_CUDA_INPUT(level_start_index)
CHECK_CUDA_INPUT(sampling_loc)
CHECK_CUDA_INPUT(attn_weight)
CHECK_CUDA_INPUT(grad_output)
CHECK_CUDA_INPUT(grad_value)
CHECK_CUDA_INPUT(grad_sampling_loc)
CHECK_CUDA_INPUT(grad_attn_weight)
at::DeviceGuard guard(value.device());
ms_deform_attn_cuda_backward(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, grad_output,
grad_value, grad_sampling_loc,
grad_attn_weight, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
} else {
AT_ERROR("Not implemented on the CPU");
}
at::DeviceGuard guard(value.device());
ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, grad_output,
grad_value, grad_sampling_loc, grad_attn_weight,
im2col_step);
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height,
int pooled_width, float spatial_scale);
void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale);
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale) {
ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
DISPATCH_DEVICE_IMPL(roi_pool_forward_impl, input, rois, output, argmax,
pooled_height, pooled_width, spatial_scale);
}
void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input,
pooled_height, pooled_width, spatial_scale);
DISPATCH_DEVICE_IMPL(roi_pool_backward_impl, grad_output, rois, argmax,
grad_input, pooled_height, pooled_width, spatial_scale);
}
#endif
// Public RoIPool forward op: thin wrapper over the device-dispatched
// implementation (the old per-device CHECK_CUDA_INPUT branch now lives in
// the backend registration).
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
                      int pooled_height, int pooled_width,
                      float spatial_scale) {
  roi_pool_forward_impl(input, rois, output, argmax, pooled_height,
                        pooled_width, spatial_scale);
}
// Public RoIPool backward op: thin wrapper over the device-dispatched
// implementation; grad_input receives the computed gradient in place.
void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
                       Tensor grad_input, int pooled_height, int pooled_width,
                       float spatial_scale) {
  roi_pool_backward_impl(grad_output, rois, argmax, grad_input, pooled_height,
                         pooled_width, spatial_scale);
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment