Unverified Commit 230f9a3b authored by q.yao, committed by GitHub

Refactor csrc with device dispatcher (#1463)

* Add device registry for pytorch ops

* add declaration of CheckDeviceConsistency

* fix for torch130

* assert with torch check

* Refactor ops with dispatch

* update rest ops

* faster install

* update compatibility

* update compatibility, rename parameter

* move cpu implement to pytorch/cpu

* update ops/csrc/README.md

* fix rocm support

* update cn document

* update docs

* list instead of map
parent ef8ba752
### v1.3.18
Some ops have different implementations on different devices. Lots of macros and type checks are scattered in several files, which makes the code hard to maintain. For example:
```c++
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
```
A registry and a dispatcher are added to manage these implementations.
```c++
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
}
// register cuda implementation
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);
// roi_align.cpp
// use the dispatcher to invoke different implementations depending on the device type of the input tensors.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
argmax_x, aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
```
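With this mechanism, supporting another backend only needs one more registration. Below is a minimal sketch (hedged; it assumes a `roi_align_forward_cpu` wrapper exists under `ops/csrc/pytorch/cpu`, mirroring the CPU branch in the old code above) showing how the CPU implementation plugs into the same dispatcher:
```c++
// pytorch/cpu/roi_align.cpp (sketch)
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax_y, Tensor argmax_x,
                           int aligned_height, int aligned_width,
                           float spatial_scale, int sampling_ratio,
                           int pool_mode, bool aligned);

// declaration of the shared interface, identical to the CUDA side
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

// register cpu implementation; DISPATCH_DEVICE_IMPL in roi_align.cpp will now
// route CPU tensors here and CUDA tensors to roi_align_forward_cuda
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CPU, roi_align_forward_cpu);
```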
### v1.3.11
In order to flexibly support more backends and hardware such as `NVIDIA GPUs` and `AMD GPUs`, the `mmcv/ops/csrc` directory has been refactored. Note that this refactoring will not affect the API usage. For related information, please refer to [PR1206](https://github.com/open-mmlab/mmcv/pull/1206).
......
### v1.3.18
Some custom ops have different implementations on different devices. The large number of macros and type checks added for this makes the code hard to maintain. For example:
```c++
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
```
To address this, we designed a registration and dispatch mechanism to better manage these op implementations.
```c++
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
}
// register the cuda implementation of the op
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);
// roi_align.cpp
// use the dispatcher to select the implementation according to the device type of the tensor arguments
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
argmax_x, aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
```
### v1.3.11
In order to flexibly support more backends and hardware such as `NVIDIA GPUs` and `AMD GPUs`, we refactored the `mmcv/ops/csrc` directory. Note that this refactoring will not affect the API usage. For more information, please refer to [PR1206](https://github.com/open-mmlab/mmcv/pull/1206).
......
......@@ -12,6 +12,7 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│ ├── parrots_cuda_helper.hpp
│ ├── pytorch_cpp_helper.hpp
│ ├── pytorch_cuda_helper.hpp
│ ├── pytorch_device_registry.hpp
│   └── cuda
│   ├── common_cuda_helper.hpp
│   ├── parrots_cudawarpfunction.cuh
......@@ -37,9 +38,12 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│   ├── pybind.cpp
│   ├── ...
│   ├── ops.cpp
│   ├── cuda
│   │   ├── ...
│   │   └── ops_cuda.cu
│   └── cpu
│      ├── ...
│      └── ops.cpp
└── tensorrt
├── trt_cuda_helper.cuh
├── trt_plugin_helper.hpp
......@@ -64,6 +68,7 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory.
- `cuda`: This directory contains cuda kernel launchers, which feed tensor memory pointers to the cuda kernels in `common/cuda`. The launchers provide the c++ interface to the cuda implementations of the corresponding custom ops (see the sketch after this list).
- `cpu`: This directory contains cpu implementations of the corresponding custom ops.
- `tensorrt`: **TensorRT** support for custom ops.
- `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`.
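To make the launcher pattern concrete, here is a hedged, self-contained sketch (a toy `toy_relu` op, not an actual MMCV op): the launcher extracts raw pointers from the tensors and feeds them to a kernel that would normally live under `common/cuda`.
```c++
#include <ATen/cuda/CUDAContext.h>
#include <torch/types.h>

// toy kernel; real kernels live in common/cuda/*_cuda_kernel.cuh
template <typename T>
__global__ void toy_relu_forward_cuda_kernel(const int n, const T* in, T* out) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] > T(0) ? in[i] : T(0);
}

// the launcher: a plain c++ interface that feeds tensor memory pointers to the
// cuda kernel on the current stream
void ToyReLUForwardCUDAKernelLauncher(at::Tensor input, at::Tensor output) {
  const int num = input.numel();
  const int threads = 256;
  const int blocks = (num + threads - 1) / threads;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "toy_relu_forward_cuda_kernel", [&] {
        toy_relu_forward_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            num, input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());
      });
}
```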
......@@ -102,42 +107,38 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
}
```
2. Register implementation for different devices.
```c++
// src/pytorch/cuda/cudabind.cpp
...
Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
// implement cuda forward here
// use `NewOpsForwardCUDAKernelLauncher` here
}
// declare interface here.
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
// register the implementation for given device (CUDA here).
REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
```
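If the op also has a CPU implementation, it is registered the same way from the `cpu` directory (a hedged sketch; `new_ops_forward_cpu` is the hypothetical CPU counterpart of the op above):
```c++
// src/pytorch/cpu/new_ops.cpp (sketch)
Tensor new_ops_forward_cpu(Tensor input, Tensor output, ...){
  // implement cpu forward here
}

// declare interface here.
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);

// register the implementation for the CPU device.
REGISTER_DEVICE_IMPL(new_ops_forward_impl, CPU, new_ops_forward_cpu);
```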
3. Add ops implementation in `pytorch` directory. Select different implementations according to device type.
```c++
// src/pytorch/new_ops.cpp
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
// dispatch the implementation according to the device type of input.
DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
}
...
Tensor new_ops_forward(Tensor input, Tensor output, ...){
return new_ops_forward_impl(input, output, ...);
}
```
4. Binding the implementation in `pytorch/pybind.cpp`
```c++
// src/pytorch/pybind.cpp
......@@ -156,7 +157,7 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
```
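The binding itself is elided in the snippet above; a hedged sketch of what such an entry usually looks like for the hypothetical op (argument names are illustrative):
```c++
// inside PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) in src/pytorch/pybind.cpp
m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
      py::arg("input"), py::arg("output"));
```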
5. Build MMCV again. Enjoy new ops in python
```python
from ..utils import ext_loader
......
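# A hedged usage sketch (not part of the diff): `new_ops_forward` is the
# hypothetical op bound above; mmcv ops load the compiled extension through
# ext_loader and then call the bound function.
ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
# output = input.new_zeros(...)  # allocate the output tensor first
# ext_module.new_ops_forward(input, output)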
......@@ -14,7 +14,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/types.h>
#include <torch/extension.h>
#include <iostream>
#include <vector>
......
......@@ -217,7 +217,6 @@ __global__ void ms_deformable_im2col_gpu_kernel(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -278,7 +277,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -369,7 +367,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -463,7 +460,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -555,7 +551,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -658,7 +653,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -757,7 +751,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......
......@@ -34,6 +34,14 @@ __device__ __forceinline__ static void reduceMax(double *address, double val) {
}
// get rid of meaningless warnings when compiling host code
#ifdef HIP_DIFF
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
atomicAdd(address, val);
}
__device__ __forceinline__ static void reduceAdd(double *address, double val) {
atomicAdd(address, val);
}
#else
#ifdef __CUDA_ARCH__
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
#if (__CUDA_ARCH__ < 200)
......@@ -77,7 +85,8 @@ __device__ __forceinline__ static void reduceAdd(double *address, double val) {
atomicAdd(address, val);
#endif
}
#endif // __CUDA_ARCH__
#endif // HIP_DIFF
template <typename T>
__global__ void feats_reduce_kernel(
......
#ifndef PYTORCH_DEVICE_REGISTRY_H
#define PYTORCH_DEVICE_REGISTRY_H
#include <torch/extension.h>
#include <cassert>
#include <functional>
#include <map>
#include <type_traits>
inline std::string GetDeviceStr(const at::Device& device) {
std::string str = DeviceTypeName(device.type(), true);
if (device.has_index()) {
str.push_back(':');
str.append(std::to_string(device.index()));
}
return str;
}
// Registry
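// DeviceRegistry keeps one function pointer per at::DeviceType for a single
// dispatched function f. A fixed-size array indexed by the device type is used
// instead of a map, so lookup is a plain array access.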
template <typename F, F f>
class DeviceRegistry;
template <typename Ret, typename... Args, Ret (*f)(Args...)>
class DeviceRegistry<Ret (*)(Args...), f> {
public:
using FunctionType = Ret (*)(Args...);
static const int MAX_DEVICE_TYPES =
int8_t(at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);
void Register(at::DeviceType device, FunctionType function) {
funcs_[int8_t(device)] = function;
}
FunctionType Find(at::DeviceType device) const {
return funcs_[int8_t(device)];
}
static DeviceRegistry& instance() {
static DeviceRegistry inst;
return inst;
}
private:
DeviceRegistry() {
for (size_t i = 0; i < MAX_DEVICE_TYPES; ++i) {
funcs_[i] = nullptr;
}
};
FunctionType funcs_[MAX_DEVICE_TYPES];
};
// get device of first tensor param
template <typename T, typename... Args,
std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,
bool> = true>
at::Device GetFirstTensorDevice(T&& t, Args&&... args) {
return std::forward<T>(t).device();
}
template <typename T, typename... Args,
std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,
bool> = true>
at::Device GetFirstTensorDevice(T&& t, Args&&... args) {
return GetFirstTensorDevice(std::forward<Args>(args)...);
}
// check device consistency
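// Returns {index_of_first_mismatching_argument, its_device}. If every tensor
// argument is on `device`, the returned index equals sizeof...(Args), which is
// what Dispatch() checks below.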
inline std::pair<int, at::Device> CheckDeviceConsistency(
const at::Device& device, int index) {
return {index, device};
}
template <typename T, typename... Args,
std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,
bool> = true>
std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
int index, T&& t,
Args&&... args);
template <typename T, typename... Args,
std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,
bool> = true>
std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
int index, T&& t,
Args&&... args) {
auto new_device = std::forward<T>(t).device();
if (new_device.type() != device.type() ||
new_device.index() != device.index()) {
return {index, new_device};
}
return CheckDeviceConsistency(device, index + 1, std::forward<Args>(args)...);
}
template <
typename T, typename... Args,
std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>
std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
int index, T&& t,
Args&&... args) {
return CheckDeviceConsistency(device, index + 1, std::forward<Args>(args)...);
}
// dispatch
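// Dispatch: check that all tensor arguments share the device of the first
// tensor argument, look up the implementation registered for that device type,
// and forward the arguments to it.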
template <typename R, typename... Args>
auto Dispatch(const R& registry, const char* name, Args&&... args) {
auto device = GetFirstTensorDevice(std::forward<Args>(args)...);
auto inconsist =
CheckDeviceConsistency(device, 0, std::forward<Args>(args)...);
TORCH_CHECK(inconsist.first >= int(sizeof...(Args)), name, ": at param ",
inconsist.first,
", inconsistent device: ", GetDeviceStr(inconsist.second).c_str(),
" vs ", GetDeviceStr(device).c_str(), "\n")
auto f_ptr = registry.Find(device.type());
TORCH_CHECK(f_ptr != nullptr, name, ": implementation for device ",
GetDeviceStr(device).c_str(), " not found.\n")
return f_ptr(std::forward<Args>(args)...);
}
// helper macro
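// DEVICE_REGISTRY(key) names the singleton registry of the function `key`.
// REGISTER_DEVICE_IMPL defines a static object whose constructor registers
// `value` as the implementation of `key` for `device` at load time.
// DISPATCH_DEVICE_IMPL routes a call through Dispatch() above.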
#define DEVICE_REGISTRY(key) DeviceRegistry<decltype(&(key)), key>::instance()
#define REGISTER_DEVICE_IMPL(key, device, value) \
struct key##_##device##_registerer { \
key##_##device##_registerer() { \
DEVICE_REGISTRY(key).Register(at::k##device, value); \
} \
}; \
static key##_##device##_registerer _##key##_##device##_registerer;
#define DISPATCH_DEVICE_IMPL(key, ...) \
Dispatch(DEVICE_REGISTRY(key), #key, __VA_ARGS__)
#endif // PYTORCH_DEVICE_REGISTRY
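// A minimal usage sketch of this registry (toy names, not MMCV ops), assuming
// this header and the usual torch headers are available:
//
//   // 1. declare the dispatchable interface
//   at::Tensor toy_add_impl(at::Tensor a, at::Tensor b);
//   // 2. provide and register a backend implementation (CPU here)
//   at::Tensor toy_add_cpu(at::Tensor a, at::Tensor b) { return a + b; }
//   REGISTER_DEVICE_IMPL(toy_add_impl, CPU, toy_add_cpu);
//   // 3. define the interface: dispatch on the device of the first tensor
//   at::Tensor toy_add_impl(at::Tensor a, at::Tensor b) {
//     return DISPATCH_DEVICE_IMPL(toy_add_impl, a, b);
//   }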
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
int aggregate, const Tensor& points,
const Tensor& centers,
const Tensor& scores,
const Tensor& knn_idx, Tensor& output) {
DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O,
aggregate, points, centers, scores, knn_idx, output);
}
void assign_score_withk_backward_impl(
int B, int N0, int N1, int M, int K, int O, int aggregate,
const Tensor& grad_out, const Tensor& points, const Tensor& centers,
const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
Tensor& grad_centers, Tensor& grad_scores) {
DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O,
aggregate, grad_out, points, centers, scores, knn_idx,
grad_points, grad_centers, grad_scores);
}
void assign_score_withk_forward(const Tensor& points, const Tensor& centers,
const Tensor& scores, const Tensor& knn_idx,
Tensor& output, int B, int N0, int N1, int M,
int K, int O, int aggregate) {
assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points,
centers, scores, knn_idx, output);
}
void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
......@@ -62,24 +36,7 @@ void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
Tensor& grad_centers, Tensor& grad_scores,
int B, int N0, int N1, int M, int K, int O,
int aggregate) {
assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out,
points, centers, scores, knn_idx,
grad_points, grad_centers, grad_scores);
}
......@@ -2,36 +2,19 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void ball_query_forward_impl(int b, int n, int m, float min_radius,
float max_radius, int nsample,
const Tensor new_xyz, const Tensor xyz,
Tensor idx) {
DISPATCH_DEVICE_IMPL(ball_query_forward_impl, b, n, m, min_radius, max_radius,
nsample, new_xyz, xyz, idx);
}
void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
Tensor idx_tensor, int b, int n, int m,
float min_radius, float max_radius, int nsample) {
ball_query_forward_impl(b, n, m, min_radius, max_radius, nsample,
new_xyz_tensor, xyz_tensor, idx_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
DISPATCH_DEVICE_IMPL(bbox_overlaps_impl, bboxes1, bboxes2, ious, mode,
aligned, offset);
}
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void border_align_forward_impl(const Tensor &input, const Tensor &boxes,
Tensor output, Tensor argmax_idx,
const int pool_size) {
DISPATCH_DEVICE_IMPL(border_align_forward_impl, input, boxes, output,
argmax_idx, pool_size);
}
void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
const Tensor &argmax_idx, Tensor grad_input,
const int pool_size) {
DISPATCH_DEVICE_IMPL(border_align_backward_impl, grad_output, boxes,
argmax_idx, grad_input, pool_size);
}
void border_align_forward(const Tensor &input, const Tensor &boxes,
Tensor output, Tensor argmax_idx,
const int pool_size) {
border_align_forward_impl(input, boxes, output, argmax_idx, pool_size);
}
void border_align_backward(const Tensor &grad_output, const Tensor &boxes,
const Tensor &argmax_idx, Tensor grad_input,
const int pool_size) {
border_align_backward_impl(grad_output, boxes, argmax_idx, grad_input,
pool_size);
}
......@@ -2,28 +2,18 @@
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, boxes1, boxes2, ious, mode_flag,
aligned);
}
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
box_iou_rotated_impl(boxes1, boxes2, ious, mode_flag, aligned);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
DISPATCH_DEVICE_IMPL(carafe_forward_impl, features, masks, rfeatures, routput,
rmasks, output, kernel_size, group_size, scale_factor);
}
void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad,
Tensor bottom_grad, Tensor mask_grad, int kernel_size,
int group_size, int scale_factor) {
DISPATCH_DEVICE_IMPL(carafe_backward_impl, top_grad, rfeatures, masks,
rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,
bottom_grad, mask_grad, kernel_size, group_size,
scale_factor);
}
void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
carafe_forward_impl(features, masks, rfeatures, routput, rmasks, output,
kernel_size, group_size, scale_factor);
}
void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
......@@ -61,24 +32,7 @@ void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
Tensor mask_grad, int kernel_size, int group_size,
int scale_factor) {
carafe_backward_impl(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
rbottom_grad, rmask_grad, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size,
int scale_factor) {
DISPATCH_DEVICE_IMPL(carafe_naive_forward_impl, features, masks, output,
kernel_size, group_size, scale_factor);
}
void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size,
int scale_factor) {
DISPATCH_DEVICE_IMPL(carafe_naive_backward_impl, top_grad, features, masks,
bottom_grad, mask_grad, kernel_size, group_size,
scale_factor);
}
void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
carafe_naive_forward_impl(features, masks, output, kernel_size, group_size,
scale_factor);
}
void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size, int scale_factor) {
carafe_naive_backward_impl(top_grad, features, masks, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
}
......@@ -2,65 +2,37 @@
#include <iostream>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,
int kH, int kW, int patchH, int patchW, int padH,
int padW, int dilationH, int dilationW,
int dilation_patchH, int dilation_patchW, int dH,
int dW) {
DISPATCH_DEVICE_IMPL(correlation_forward_impl, input1, input2, output, kH, kW,
patchH, patchW, padH, padW, dilationH, dilationW,
dilation_patchH, dilation_patchW, dH, dW);
}
void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,
Tensor grad_input1, Tensor grad_input2, int kH,
int kW, int patchH, int patchW, int padH,
int padW, int dilationH, int dilationW,
int dilation_patchH, int dilation_patchW, int dH,
int dW) {
DISPATCH_DEVICE_IMPL(correlation_backward_impl, grad_output, input1, input2,
grad_input1, grad_input2, kH, kW, patchH, patchW, padH,
padW, dilationH, dilationW, dilation_patchH,
dilation_patchW, dH, dW);
}
void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,
int kW, int patchH, int patchW, int padH, int padW,
int dilationH, int dilationW, int dilation_patchH,
int dilation_patchW, int dH, int dW) {
correlation_forward_impl(input1, input2, output, kH, kW, patchH, patchW, padH,
padW, dilationH, dilationW, dilation_patchH,
dilation_patchW, dH, dW);
}
void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
......@@ -68,20 +40,8 @@ void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
int kW, int patchH, int patchW, int padH, int padW,
int dilationH, int dilationW, int dilation_patchH,
int dilation_patchW, int dH, int dW) {
correlation_backward_impl(grad_output, input1, input2, grad_input1,
grad_input2, kH, kW, patchH, patchW, padH, padW,
dilationH, dilationW, dilation_patchH,
dilation_patchW, dH, dW);
}
......@@ -3,6 +3,7 @@
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T>
void box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
......@@ -31,3 +32,7 @@ void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
}
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
REGISTER_DEVICE_IMPL(box_iou_rotated_impl, CPU, box_iou_rotated_cpu);
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T>
T deformable_im2col_bilinear_cpu(const T *input, const int data_width,
......@@ -375,3 +376,33 @@ void deformable_col2im_coord_cpu(
height_col, width_col, grad_offset_);
}));
}
void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col);
void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im);
void deformable_col2im_coord_impl(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset);
REGISTER_DEVICE_IMPL(deformable_im2col_impl, CPU, deformable_im2col_cpu);
REGISTER_DEVICE_IMPL(deformable_col2im_impl, CPU, deformable_col2im_cpu);
REGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CPU,
deformable_col2im_coord_cpu);
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T>
T dmcn_im2col_bilinear_cpu(const T *input, const int data_width,
......@@ -322,7 +323,7 @@ void modulated_deformable_im2col_cpu(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col) {
// num_axes should be smaller than block size
......@@ -338,7 +339,7 @@ void modulated_deformable_im2col_cpu(
modulated_deformable_im2col_cpu_kernel(
num_kernels, data_im_, data_offset_, data_mask_, height_im,
width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, batch_size,
channels, deformable_group, height_col, width_col, data_col_);
}));
......@@ -401,3 +402,35 @@ void modulated_deformable_col2im_coord_cpu(
height_col, width_col, grad_offset_, grad_mask_);
}));
}
void modulated_deformable_im2col_impl(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col);
void modulated_deformable_col2im_impl(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im);
void modulated_deformable_col2im_coord_impl(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask);
REGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CPU,
modulated_deformable_im2col_cpu);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CPU,
modulated_deformable_col2im_cpu);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CPU,
modulated_deformable_col2im_coord_cpu);
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto nboxes = boxes.size(0);
Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));
auto select = select_t.data_ptr<bool>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold) select[_j] = false;
}
}
return order_t.masked_select(select_t);
}
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);
REGISTER_DEVICE_IMPL(nms_impl, CPU, nms_cpu);
Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
auto scores_t = scores.clone();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto nboxes = boxes.size(0);
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto sc = scores_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
auto de = dets.data_ptr<float>();
int64_t pos = 0;
Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
auto inds = inds_t.data_ptr<int64_t>();
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < sc[pos]) {
max_score = sc[pos];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = x1[max_pos];
auto iy1 = de[i * 5 + 1] = y1[max_pos];
auto ix2 = de[i * 5 + 2] = x2[max_pos];
auto iy2 = de[i * 5 + 3] = y2[max_pos];
auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = areas[max_pos];
auto iind = inds[max_pos];
x1[max_pos] = x1[i];
y1[max_pos] = y1[i];
x2[max_pos] = x2[i];
y2[max_pos] = y2[i];
sc[max_pos] = sc[i];
areas[max_pos] = areas[i];
inds[max_pos] = inds[i];
x1[i] = ix1;
y1[i] = iy1;
x2[i] = ix2;
y2[i] = iy2;
sc[i] = iscore;
areas[i] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = std::max(ix1, x1[pos]);
auto yy1 = std::max(iy1, y1[pos]);
auto xx2 = std::min(ix2, x2[pos]);
auto yy2 = std::min(iy2, y2[pos]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[pos] - inter);
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = std::exp(-(ovr * ovr) / sigma);
}
sc[pos] *= weight;
// if box score falls below threshold, discard the box by
// swapping with last box update N
if (sc[pos] < min_score) {
x1[pos] = x1[nboxes - 1];
y1[pos] = y1[nboxes - 1];
x2[pos] = x2[nboxes - 1];
y2[pos] = y2[nboxes - 1];
sc[pos] = sc[nboxes - 1];
areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
return inds_t.slice(0, 0, nboxes);
}
Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset);
REGISTER_DEVICE_IMPL(softnms_impl, CPU, softnms_cpu);
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
std::vector<int> keep;
std::vector<std::vector<int> > matched;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
keep.push_back(i);
std::vector<int> v_i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
v_i.push_back(j);
}
}
matched.push_back(v_i);
}
for (size_t i = 0; i < keep.size(); i++)
matched[i].insert(matched[i].begin(), keep[i]);
return matched;
}
std::vector<std::vector<int> > nms_match_impl(Tensor dets, float iou_threshold);
REGISTER_DEVICE_IMPL(nms_match_impl, CPU, nms_match_cpu);
......@@ -11,9 +11,9 @@ Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
// however, the code in this function is much shorter because
// we delegate the IoU computation for rotated boxes to
// the single_box_iou_rotated function in box_iou_rotated_utils.h
AT_ASSERTM(!dets.is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.scalar_type() == scores.scalar_type(),
"dets should have the same type as scores");
if (dets.numel() == 0) {
......