Unverified Commit 230f9a3b authored by q.yao, committed by GitHub

Refactor csrc with device dispatcher (#1463)

* Add device registry for pytorch ops

* add declaration of CheckDeviceConsistency

* fix for torch130

* assert with torch check

* Refactor ops with dispatch

* update rest ops

* faster install

* update compatibility

* update compatibility, rename parameter

* move cpu implement to pytorch/cpu

* update ops/csrc/README.md

* fix rocm support

* update cn document

* update docs

* list instead of map
parent ef8ba752
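For readers skimming the diff below: the heart of this refactor is a small device registry that replaces the per-backend #ifdef MMCV_WITH_CUDA branches in the op front-ends. The following is a minimal, hypothetical sketch of the pattern (the op name my_op and its body are invented for illustration); the macro usage mirrors what appears in the changed files, while the macro definitions themselves live in pytorch_device_registry.hpp, which is not shown in this excerpt.

// Sketch only: a hypothetical op wired through the new dispatcher.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-agnostic entry point. DISPATCH_DEVICE_IMPL inspects the device of
// the tensor arguments and forwards to whichever implementation has been
// registered for that device.
void my_op_impl(Tensor input, Tensor output) {
  DISPATCH_DEVICE_IMPL(my_op_impl, input, output);
}

// Backend implementation; after this PR, CPU code of this kind lives under
// ops/csrc/pytorch/cpu/.
void my_op_cpu(Tensor input, Tensor output) {
  output.copy_(input);  // placeholder body for the sketch
}

// Bind the CPU implementation to the dispatcher key. In the real ops the
// *_impl definition and the registration sit in separate files; they are
// shown together here only for brevity.
REGISTER_DEVICE_IMPL(my_op_impl, CPU, my_op_cpu);

With the registry in place, op front-ends such as deform_roi_pool_forward further down in this diff simply call the corresponding *_impl function and no longer need any device branching.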
// Copyright (c) OpenMMLab. All rights reserved
// It is modified from https://github.com/WenmuZhou/PAN.pytorch
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
std::vector<std::vector<float>> estimate_confidence(int32_t* label,
float* score, int label_num,
int height, int width) {
std::vector<std::vector<float>> point_vector;
for (int i = 0; i < label_num; i++) {
std::vector<float> point;
point.push_back(0);
point.push_back(0);
point_vector.push_back(point);
}
for (int y = 0; y < height; y++) {
auto label_tmp = label + y * width;
auto score_tmp = score + y * width;
for (int x = 0; x < width; x++) {
auto l = label_tmp[x];
if (l > 0) {
float confidence = score_tmp[x];
point_vector[l].push_back(x);
point_vector[l].push_back(y);
point_vector[l][0] += confidence;
point_vector[l][1] += 1;
}
}
}
for (size_t l = 0; l < point_vector.size(); l++)
if (point_vector[l][1] > 0) {
point_vector[l][0] /= point_vector[l][1];
}
return point_vector;
}
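// Group text pixels around their kernels (PANet post-processing): first
// average the embedding of every kernel region, then grow each region with a
// BFS that starts from the kernel contour pixels and absorbs a neighbouring
// masked, unlabelled pixel whenever its embedding lies within dis_threshold
// (L2 distance) of the kernel's mean embedding.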
std::vector<std::vector<float>> pixel_group_cpu(
Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
assert(score.dim() == 2);
assert(mask.dim() == 2);
assert(embedding.dim() == 3);
int height = score.size(0);
int width = score.size(1);
assert(height == mask.size(0) && height == embedding.size(0) && height == kernel_label.size(0));
assert(width == mask.size(1) && width == embedding.size(1) && width == kernel_label.size(1));
auto threshold_square = dis_threshold * dis_threshold;
auto ptr_score = score.data_ptr<float>();
auto ptr_mask = mask.data_ptr<bool>();
auto ptr_kernel_contour = kernel_contour.data_ptr<uint8_t>();
auto ptr_embedding = embedding.data_ptr<float>();
auto ptr_kernel_label = kernel_label.data_ptr<int32_t>();
std::queue<std::tuple<int, int, int32_t>> contour_pixels;
auto embedding_dim = embedding.size(2);
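// kernel_vector[l] accumulates the embedding sum of kernel l in its first
// embedding_dim entries and the member-pixel count in the last entry; the sums
// are normalised to means after the scan below.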
std::vector<std::vector<float>> kernel_vector(
kernel_region_num, std::vector<float>(embedding_dim + 1, 0));
Tensor text_label;
text_label = kernel_label.clone();
auto ptr_text_label = text_label.data_ptr<int32_t>();
for (int i = 0; i < height; i++) {
auto ptr_embedding_tmp = ptr_embedding + i * width * embedding_dim;
auto ptr_kernel_label_tmp = ptr_kernel_label + i * width;
auto ptr_kernel_contour_tmp = ptr_kernel_contour + i * width;
for (int j = 0, k = 0; j < width && k < width * embedding_dim;
j++, k += embedding_dim) {
int32_t label = ptr_kernel_label_tmp[j];
if (label > 0) {
for (int d = 0; d < embedding_dim; d++)
kernel_vector[label][d] += ptr_embedding_tmp[k + d];
kernel_vector[label][embedding_dim] += 1;
// kernel pixel number
if (ptr_kernel_contour_tmp[j]) {
contour_pixels.push(std::make_tuple(i, j, label));
}
}
}
}
for (int i = 0; i < kernel_region_num; i++) {
for (int j = 0; j < embedding_dim; j++) {
kernel_vector[i][j] /= kernel_vector[i][embedding_dim];
}
}
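// 4-connected neighbourhood offsets used by the BFS region growing below.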
int dx[4] = {-1, 1, 0, 0};
int dy[4] = {0, 0, -1, 1};
while (!contour_pixels.empty()) {
auto query_pixel = contour_pixels.front();
contour_pixels.pop();
int y = std::get<0>(query_pixel);
int x = std::get<1>(query_pixel);
int32_t l = std::get<2>(query_pixel);
auto kernel_cv = kernel_vector[l];
for (int idx = 0; idx < 4; idx++) {
int tmpy = y + dy[idx];
int tmpx = x + dx[idx];
auto ptr_text_label_tmp = ptr_text_label + tmpy * width;
if (tmpy < 0 || tmpy >= height || tmpx < 0 || tmpx >= width) continue;
if (!ptr_mask[tmpy * width + tmpx] || ptr_text_label_tmp[tmpx] > 0)
continue;
float dis = 0;
auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;
for (size_t i = 0; i < embedding_dim; i++) {
dis +=
pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);
// ignore further computing if dis is big enough
if (dis >= threshold_square) break;
}
if (dis >= threshold_square) continue;
contour_pixels.push(std::make_tuple(tmpy, tmpx, l));
ptr_text_label_tmp[tmpx] = l;
}
}
return estimate_confidence(ptr_text_label, ptr_score, kernel_region_num,
height, width);
}
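// pixel_group_impl is the device-agnostic entry point used by the dispatcher;
// the CPU implementation above is registered for it below.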
std::vector<std::vector<float>> pixel_group_impl(
Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
Tensor kernel_contour, int kernel_region_num, float dis_threshold);
REGISTER_DEVICE_IMPL(pixel_group_impl, CPU, pixel_group_cpu);
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
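// CPU kernels for PSAMask. A (h_mask x w_mask) attention window is centred at
// each feature position (h, w), and only the part of the window that falls
// inside the feature map is copied. In COLLECT mode the buffer is indexed by
// (n, mask position mapped to feature coordinates, h, w); DISTRIBUTE mode
// swaps the two spatial index groups.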
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor buffer_diff,
Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w];
}
}
}
}
}
}
void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const Tensor buffer_diff, Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
}
}
}
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
}
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (psa_type == 0)
psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output, grad_input);
else
psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output,
grad_input);
}
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
REGISTER_DEVICE_IMPL(psamask_forward_impl, CPU, psamask_forward_cpu);
REGISTER_DEVICE_IMPL(psamask_backward_impl, CPU, psamask_backward_cpu);
@@ -5,6 +5,7 @@
#include <ATen/TensorUtils.h>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
// implementation taken from Caffe2
template <typename T>
@@ -429,3 +430,37 @@ void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
n_stride, c_stride, h_stride, w_stride);
});
}
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CPU, roi_align_forward_cpu);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, CPU, roi_align_backward_cpu);
@@ -5,6 +5,7 @@
#include <ATen/TensorUtils.h>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
// implementation taken from Caffe2
template <typename T>
@@ -415,3 +416,43 @@ void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
n_stride, c_stride, h_stride, w_stride);
});
}
void roi_align_rotated_forward_cpu(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
ROIAlignRotatedForwardCPULauncher(input, rois, output, aligned_height,
aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
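// Rotated RoIs are expected to have 6 values per row:
// (batch_index, center_x, center_y, width, height, angle).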
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
ROIAlignRotatedBackwardCPULauncher(
top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sample_ratio,
bool aligned, bool clockwise);
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sample_ratio, bool aligned,
bool clockwise);
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CPU,
roi_align_rotated_forward_cpu);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CPU,
roi_align_rotated_backward_cpu);
// Copyright (c) OpenMMLab. All rights reserved.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T, typename T_int>
void dynamic_voxelize_forward_cpu_kernel(
@@ -150,3 +151,20 @@ int hard_voxelize_forward_cpu(const at::Tensor& points, at::Tensor& voxels,
return voxel_num;
}
int hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors,
at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim);
void dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim);
REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CPU,
hard_voxelize_forward_cpu);
REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CPU,
dynamic_voxelize_forward_cpu);
...@@ -7,8 +7,8 @@ ...@@ -7,8 +7,8 @@
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious, void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) { const int mode_flag, const bool aligned) {
using scalar_t = float; using scalar_t = float;
AT_ASSERTM(boxes1.type().is_cuda(), "boxes1 must be a CUDA tensor"); AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
AT_ASSERTM(boxes2.type().is_cuda(), "boxes2 must be a CUDA tensor"); AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");
int output_size = ious.numel(); int output_size = ious.numel();
int num_boxes1 = boxes1.size(0); int num_boxes1 = boxes1.size(0);
......
This diff is collapsed.
...@@ -2,8 +2,9 @@ ...@@ -2,8 +2,9 @@
#include "deform_conv_cuda_kernel.cuh" #include "deform_conv_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels, void deformable_im2col_cuda(Tensor data_im, Tensor data_offset,
const int height, const int width, const int ksize_h, const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
...@@ -35,8 +36,9 @@ void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels, ...@@ -35,8 +36,9 @@ void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels,
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels, void deformable_col2im_cuda(Tensor data_col, Tensor data_offset,
const int height, const int width, const int ksize_h, const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
...@@ -68,7 +70,7 @@ void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels, ...@@ -68,7 +70,7 @@ void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels,
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void deformable_col2im_coord( void deformable_col2im_coord_cuda(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels, Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w, const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w,
......
...@@ -14,8 +14,8 @@ inline int opt_n_threads(int work_size) { ...@@ -14,8 +14,8 @@ inline int opt_n_threads(int work_size) {
} }
void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m, void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
const float *dataset, const float* dataset,
float *temp, int *idxs) { float* temp, int* idxs) {
// dataset: (B, N, 3) // dataset: (B, N, 3)
// tmp: (B, N) // tmp: (B, N)
// output: // output:
...@@ -79,7 +79,7 @@ void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m, ...@@ -79,7 +79,7 @@ void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
} }
void FurthestPointSamplingWithDistForwardCUDAKernelLauncher( void FurthestPointSamplingWithDistForwardCUDAKernelLauncher(
int b, int n, int m, const float *dataset, float *temp, int *idxs) { int b, int n, int m, const float* dataset, float* temp, int* idxs) {
// dataset: (B, N, N) // dataset: (B, N, N)
// temp: (B, N) // temp: (B, N)
// output: // output:
......
...@@ -6,7 +6,7 @@ void modulated_deformable_im2col_cuda( ...@@ -6,7 +6,7 @@ void modulated_deformable_im2col_cuda(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask, const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im, const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col, const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h, const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col) { const int dilation_w, const int deformable_group, Tensor data_col) {
// num_axes should be smaller than block size // num_axes should be smaller than block size
...@@ -24,7 +24,7 @@ void modulated_deformable_im2col_cuda( ...@@ -24,7 +24,7 @@ void modulated_deformable_im2col_cuda(
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>( at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_im_, data_offset_, data_mask_, height_im, num_kernels, data_im_, data_offset_, data_mask_, height_im,
width_im, kernel_h, kenerl_w, pad_h, pad_w, stride_h, stride_w, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, batch_size, dilation_h, dilation_w, channel_per_deformable_group, batch_size,
channels, deformable_group, height_col, width_col, data_col_); channels, deformable_group, height_col, width_col, data_col_);
})); }));
......
...@@ -232,14 +232,12 @@ at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value, ...@@ -232,14 +232,12 @@ at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value,
AT_ASSERTM(attn_weight.is_contiguous(), AT_ASSERTM(attn_weight.is_contiguous(),
"attn_weight tensor has to be contiguous"); "attn_weight tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
"spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.is_cuda(),
AT_ASSERTM(level_start_index.type().is_cuda(),
"level_start_index must be a CUDA tensor"); "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
"sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
const int batch = value.size(0); const int batch = value.size(0);
const int spatial_size = value.size(1); const int spatial_size = value.size(1);
...@@ -268,17 +266,18 @@ at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value, ...@@ -268,17 +266,18 @@ at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value,
for (int n = 0; n < batch / im2col_step_; ++n) { for (int n = 0; n < batch / im2col_step_; ++n) {
auto columns = output_n.select(0, n); auto columns = output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES(
value.type(), "ms_deform_attn_forward_cuda", ([&] { value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
ms_deformable_im2col_cuda( ms_deformable_im2col_cuda(
at::cuda::getCurrentCUDAStream(), at::cuda::getCurrentCUDAStream(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size, value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(), level_start_index.data<int64_t>(), spatial_shapes.data_ptr<int64_t>(),
sampling_loc.data<scalar_t>() + level_start_index.data_ptr<int64_t>(),
sampling_loc.data_ptr<scalar_t>() +
n * im2col_step_ * per_sample_loc_size, n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + attn_weight.data_ptr<scalar_t>() +
n * im2col_step_ * per_attn_weight_size, n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, batch_n, spatial_size, num_heads, channels, num_levels, num_query,
num_point, columns.data<scalar_t>()); num_point, columns.data_ptr<scalar_t>());
})); }));
} }
...@@ -305,15 +304,13 @@ void ms_deform_attn_cuda_backward( ...@@ -305,15 +304,13 @@ void ms_deform_attn_cuda_backward(
AT_ASSERTM(grad_output.is_contiguous(), AT_ASSERTM(grad_output.is_contiguous(),
"grad_output tensor has to be contiguous"); "grad_output tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
"spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.is_cuda(),
AT_ASSERTM(level_start_index.type().is_cuda(),
"level_start_index must be a CUDA tensor"); "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
"sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
const int batch = value.size(0); const int batch = value.size(0);
const int spatial_size = value.size(1); const int spatial_size = value.size(1);
...@@ -340,21 +337,24 @@ void ms_deform_attn_cuda_backward( ...@@ -340,21 +337,24 @@ void ms_deform_attn_cuda_backward(
for (int n = 0; n < batch / im2col_step_; ++n) { for (int n = 0; n < batch / im2col_step_; ++n) {
auto grad_output_g = grad_output_n.select(0, n); auto grad_output_g = grad_output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES(
value.type(), "ms_deform_attn_backward_cuda", ([&] { value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
ms_deformable_col2im_cuda( ms_deformable_col2im_cuda(
at::cuda::getCurrentCUDAStream(), grad_output_g.data<scalar_t>(), at::cuda::getCurrentCUDAStream(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size, grad_output_g.data_ptr<scalar_t>(),
spatial_shapes.data<int64_t>(), level_start_index.data<int64_t>(), value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
sampling_loc.data<scalar_t>() + spatial_shapes.data_ptr<int64_t>(),
level_start_index.data_ptr<int64_t>(),
sampling_loc.data_ptr<scalar_t>() +
n * im2col_step_ * per_sample_loc_size, n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + attn_weight.data_ptr<scalar_t>() +
n * im2col_step_ * per_attn_weight_size, n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, batch_n, spatial_size, num_heads, channels, num_levels, num_query,
num_point, num_point,
grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size, grad_value.data_ptr<scalar_t>() +
grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_value_size,
grad_sampling_loc.data_ptr<scalar_t>() +
n * im2col_step_ * per_sample_loc_size, n * im2col_step_ * per_sample_loc_size,
grad_attn_weight.data<scalar_t>() + grad_attn_weight.data_ptr<scalar_t>() +
n * im2col_step_ * per_attn_weight_size); n * im2col_step_ * per_attn_weight_size);
})); }));
} }
......
...@@ -8,8 +8,8 @@ Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores, ...@@ -8,8 +8,8 @@ Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
const Tensor order_t, const Tensor dets_sorted, const Tensor order_t, const Tensor dets_sorted,
float iou_threshold, const int multi_label) { float iou_threshold, const int multi_label) {
// using scalar_t = float; // using scalar_t = float;
AT_ASSERTM(dets.type().is_cuda(), "dets must be a CUDA tensor"); AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
AT_ASSERTM(scores.type().is_cuda(), "scores must be a CUDA tensor"); AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(dets.device()); at::cuda::CUDAGuard device_guard(dets.device());
int dets_num = dets.size(0); int dets_num = dets.size(0);
...@@ -24,21 +24,22 @@ Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores, ...@@ -24,21 +24,22 @@ Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
dets_sorted.type(), "nms_rotated_kernel_cuda", [&] { dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {
nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>( nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num, iou_threshold, dets_sorted.data<scalar_t>(), dets_num, iou_threshold, dets_sorted.data_ptr<scalar_t>(),
(unsigned long long*)mask.data<int64_t>(), multi_label); (unsigned long long*)mask.data_ptr<int64_t>(), multi_label);
}); });
Tensor mask_cpu = mask.to(at::kCPU); Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host = (unsigned long long*)mask_cpu.data<int64_t>(); unsigned long long* mask_host =
(unsigned long long*)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv(col_blocks); std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
Tensor keep = Tensor keep =
at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU)); at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data<int64_t>(); int64_t* keep_out = keep.data_ptr<int64_t>();
int num_to_keep = 0; int num_to_keep = 0;
for (int i = 0; i < dets_num; i++) { for (int i = 0; i < dets_num; i++) {
......
...@@ -9,10 +9,10 @@ void ROIAlignRotatedForwardCUDAKernelLauncher( ...@@ -9,10 +9,10 @@ void ROIAlignRotatedForwardCUDAKernelLauncher(
const int pooled_height, const int pooled_width, at::Tensor output) { const int pooled_height, const int pooled_width, at::Tensor output) {
const int output_size = num_rois * pooled_height * pooled_width * channels; const int output_size = num_rois * pooled_height * pooled_width * channels;
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.type(), "ROIAlignRotatedLaucherForward", ([&] { features.scalar_type(), "ROIAlignRotatedLaucherForward", ([&] {
const scalar_t *bottom_data = features.data<scalar_t>(); const scalar_t *bottom_data = features.data_ptr<scalar_t>();
const scalar_t *rois_data = rois.data<scalar_t>(); const scalar_t *rois_data = rois.data_ptr<scalar_t>();
scalar_t *top_data = output.data<scalar_t>(); scalar_t *top_data = output.data_ptr<scalar_t>();
roi_align_rotated_forward_cuda_kernel<scalar_t> roi_align_rotated_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
...@@ -31,10 +31,10 @@ void ROIAlignRotatedBackwardCUDAKernelLauncher( ...@@ -31,10 +31,10 @@ void ROIAlignRotatedBackwardCUDAKernelLauncher(
const int pooled_height, const int pooled_width, at::Tensor bottom_grad) { const int pooled_height, const int pooled_width, at::Tensor bottom_grad) {
const int output_size = num_rois * pooled_height * pooled_width * channels; const int output_size = num_rois * pooled_height * pooled_width * channels;
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.type(), "ROIAlignLaucherBackward", ([&] { top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] {
const scalar_t *top_diff = top_grad.data<scalar_t>(); const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
const scalar_t *rois_data = rois.data<scalar_t>(); const scalar_t *rois_data = rois.data_ptr<scalar_t>();
scalar_t *bottom_diff = bottom_grad.data<scalar_t>(); scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();
roi_align_rotated_backward_cuda_kernel<scalar_t> roi_align_rotated_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
output_size, top_diff, rois_data, spatial_scale, sample_num, output_size, top_diff, rois_data, spatial_scale, sample_num,
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cuda_helper.hpp"
#include "pytorch_device_registry.hpp"
#include "tin_shift_cuda_kernel.cuh"
void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,
......
...@@ -6,8 +6,8 @@ ...@@ -6,8 +6,8 @@
#include "voxelization_cuda_kernel.cuh" #include "voxelization_cuda_kernel.cuh"
int HardVoxelizeForwardCUDAKernelLauncher( int HardVoxelizeForwardCUDAKernelLauncher(
const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors, const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size, at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const int max_points, const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim = 3) { const int max_voxels, const int NDim = 3) {
// current version tooks about 0.04s for one frame on cpu // current version tooks about 0.04s for one frame on cpu
...@@ -146,7 +146,7 @@ int HardVoxelizeForwardCUDAKernelLauncher( ...@@ -146,7 +146,7 @@ int HardVoxelizeForwardCUDAKernelLauncher(
} }
void DynamicVoxelizeForwardCUDAKernelLauncher( void DynamicVoxelizeForwardCUDAKernelLauncher(
const at::Tensor& points, at::Tensor& coors, const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size, const std::vector<float> coors_range, const std::vector<float> voxel_size, const std::vector<float> coors_range,
const int NDim = 3) { const int NDim = 3) {
// current version tooks about 0.04s for one frame on cpu // current version tooks about 0.04s for one frame on cpu
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col);
void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im);
void deformable_col2im_coord(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset);
#endif
void deformable_im2col_cpu(Tensor data_im, Tensor data_offset,
const int channels, const int height, const int channels, const int height,
const int width, const int ksize_h, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, const int parallel_imgs, const int deformable_group,
Tensor data_col); Tensor data_col) {
DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels,
height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, parallel_imgs,
deformable_group, data_col);
}
void deformable_col2im_cpu(Tensor data_col, Tensor data_offset, void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
const int channels, const int height, const int channels, const int height,
const int width, const int ksize_h, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, const int parallel_imgs, const int deformable_group,
Tensor grad_im); Tensor grad_im) {
DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels,
height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, parallel_imgs,
deformable_group, grad_im);
}
void deformable_col2im_coord_cpu( void deformable_col2im_coord_impl(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels, Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w, const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs, const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset); const int deformable_group, Tensor grad_offset) {
DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im,
data_offset, channels, height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
parallel_imgs, deformable_group, grad_offset);
}
void deform_conv_shape_check(at::Tensor input, at::Tensor offset, void deform_conv_shape_check(at::Tensor input, at::Tensor offset,
at::Tensor *gradOutput, at::Tensor weight, int kH, at::Tensor *gradOutput, at::Tensor weight, int kH,
...@@ -227,17 +216,9 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset, ...@@ -227,17 +216,9 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
output_buffer.size(2), output_buffer.size(3)}); output_buffer.size(2), output_buffer.size(3)});
for (int elt = 0; elt < batchSize / im2col_step; elt++) { for (int elt = 0; elt < batchSize / im2col_step; elt++) {
if (input.device().is_cuda()) { deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
#ifdef MMCV_WITH_CUDA
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
#endif
} else {
deformable_im2col_cpu(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns); dilationW, im2col_step, deformable_group, columns);
}
columns = columns.view({group, columns.size(0) / group, columns.size(1)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1), weight = weight.view({group, weight.size(0) / group, weight.size(1),
...@@ -373,29 +354,15 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput, ...@@ -373,29 +354,15 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
{gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
if (input.device().is_cuda()) { deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,
#ifdef MMCV_WITH_CUDA
deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW, dH,
dW, dilationH, dilationW, im2col_step,
deformable_group, gradOffset[elt]);
deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group,
gradInput[elt]);
#endif
} else {
deformable_col2im_coord_cpu(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW, inputHeight, inputWidth, kH, kW, padH, padW,
dH, dW, dilationH, dilationW, im2col_step, dH, dW, dilationH, dilationW, im2col_step,
deformable_group, gradOffset[elt]); deformable_group, gradOffset[elt]);
deformable_col2im_cpu(columns, offset[elt], nInputPlane, inputHeight, deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, dilationW, im2col_step, deformable_group,
gradInput[elt]); gradInput[elt]);
}
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)}); weight.size(3), weight.size(4)});
...@@ -508,17 +475,9 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset, ...@@ -508,17 +475,9 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset,
deformable_group * 2 * kH * kW, outputHeight, outputWidth}); deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) { for (int elt = 0; elt < batchSize / im2col_step; elt++) {
if (input.device().is_cuda()) { deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
#ifdef MMCV_WITH_CUDA
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns); dilationW, im2col_step, deformable_group, columns);
#endif
} else {
deformable_im2col_cpu(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
}
// divide into group // divide into group
gradOutputBuffer = gradOutputBuffer.view( gradOutputBuffer = gradOutputBuffer.view(
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor offset, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma);
void DeformRoIPoolBackwardCUDAKernelLauncher(
Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float gamma);
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, Tensor output, int pooled_height,
int pooled_width, float spatial_scale, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) { int sampling_ratio, float gamma) {
DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output, DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,
pooled_height, pooled_width, output, pooled_height, pooled_width, spatial_scale,
spatial_scale, sampling_ratio, gamma); sampling_ratio, gamma);
} }
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input, void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
Tensor rois, Tensor offset, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, Tensor grad_input, Tensor grad_offset,
int pooled_height, int pooled_width, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float spatial_scale, int sampling_ratio,
float gamma) { float gamma) {
DeformRoIPoolBackwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma); pooled_width, spatial_scale, sampling_ratio, gamma);
} }
#endif
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset, void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, int pooled_width, Tensor output, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float spatial_scale, int sampling_ratio,
float gamma) { float gamma) {
if (input.device().is_cuda()) { deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(output);
deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
pooled_width, spatial_scale, sampling_ratio, pooled_width, spatial_scale, sampling_ratio,
gamma); gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
} }
void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois, void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
...@@ -61,22 +36,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois, ...@@ -61,22 +36,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
Tensor grad_offset, int pooled_height, Tensor grad_offset, int pooled_height,
int pooled_width, float spatial_scale, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) { int sampling_ratio, float gamma) {
if (grad_output.device().is_cuda()) { deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_offset);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width, grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma); spatial_scale, sampling_ratio, gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
} }
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha);
void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight,
Tensor grad_input,
const float gamma,
const float alpha);
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input,
const float gamma,
const float alpha);
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) { Tensor output, float gamma, float alpha) {
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output, DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,
gamma, alpha); output, gamma, alpha);
} }
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target, void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor grad_input, Tensor weight, Tensor grad_input,
float gamma, float alpha) { float gamma, float alpha) {
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input, DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,
gamma, alpha); grad_input, gamma, alpha);
} }
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight, void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) { Tensor output, float gamma, float alpha) {
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output, DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,
gamma, alpha); output, gamma, alpha);
} }
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target, void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor buff, Tensor weight, Tensor buff,
Tensor grad_input, float gamma, Tensor grad_input, float gamma,
float alpha) { float alpha) {
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff, DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,
grad_input, gamma, alpha); buff, grad_input, gamma, alpha);
} }
#endif
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight, void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) { Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) { sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
} }
void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight, void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma, float alpha) { Tensor grad_input, float gamma, float alpha) {
if (input.device().is_cuda()) { sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_input);
sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
alpha); alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
} }
void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight, void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) { Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) { softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
softmax_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
} }
void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight, void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input, float gamma, Tensor buff, Tensor grad_input, float gamma,
float alpha) { float alpha) {
if (input.device().is_cuda()) { softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(buff);
CHECK_CUDA_INPUT(grad_input);
softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
gamma, alpha); gamma, alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
} }
@@ -2,61 +2,33 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void furthest_point_sampling_forward_impl(Tensor points_tensor,
void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m, Tensor temp_tensor, Tensor idx_tensor,
const float *dataset, int b, int n, int m) {
float *temp, int *idxs); DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,
temp_tensor, idx_tensor, b, n, m);
void furthest_point_sampling_forward_cuda(int b, int n, int m,
const float *dataset, float *temp,
int *idxs) {
FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, dataset, temp, idxs);
} }
void FurthestPointSamplingWithDistForwardCUDAKernelLauncher( void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
int b, int n, int m, const float *dataset, float *temp, int *idxs); Tensor temp_tensor,
Tensor idx_tensor, int b,
void furthest_point_sampling_with_dist_forward_cuda(int b, int n, int m, int n, int m) {
const float *dataset, DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,
float *temp, int *idxs) { points_tensor, temp_tensor, idx_tensor, b, n, m);
FurthestPointSamplingWithDistForwardCUDAKernelLauncher(b, n, m, dataset, temp,
idxs);
} }
#endif
void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor, void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
Tensor idx_tensor, int b, int n, int m) { Tensor idx_tensor, int b, int n, int m) {
if (points_tensor.device().is_cuda()) { furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA b, n, m);
const float *points = points_tensor.data_ptr<float>();
float *temp = temp_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
furthest_point_sampling_forward_cuda(b, n, m, points, temp, idx);
#else
AT_ERROR("furthest_point_sampling is not compiled with GPU support");
#endif
} else {
AT_ERROR("furthest_point_sampling is not implemented on CPU");
}
} }
void furthest_point_sampling_with_dist_forward(Tensor points_tensor, void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
Tensor temp_tensor, Tensor temp_tensor,
Tensor idx_tensor, int b, int n, Tensor idx_tensor, int b, int n,
int m) { int m) {
if (points_tensor.device().is_cuda()) { furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,
#ifdef MMCV_WITH_CUDA idx_tensor, b, n, m);
const float *points = points_tensor.data<float>();
float *temp = temp_tensor.data<float>();
int *idx = idx_tensor.data<int>();
furthest_point_sampling_with_dist_forward_cuda(b, n, m, points, temp, idx);
#else
AT_ERROR(
"furthest_point_sampling_with_dist is not compiled with GPU support");
#endif
} else {
AT_ERROR("furthest_point_sampling_with_dist is not implemented on CPU");
}
} }