Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format

Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
/*
* Copyright (c) 2019, SenseTime.
*/
#ifndef INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_
#define INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_
#ifndef __CUDACC__
#error cudawarpfunction.cuh should only be included by .cu files
#endif
#include <cuda.h>
#include <parrots/foundation/common.hpp>
#ifdef PARROTS_USE_HALF
#include <cuda_fp16.h>
#endif
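// CUDA_INTRINSIC_FUNC(Expr) expands to Expr only in the device compilation
// pass (when __CUDA_ARCH__ is defined), so the wrappers below compile as empty
// stubs in the host pass while still calling the real warp intrinsics on the
// device. The float16 overloads shuffle the underlying storage (var.y) so
// Parrots half values can be exchanged like scalar types.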
#ifdef __CUDA_ARCH__
#define CUDA_INTRINSIC_FUNC(Expr) Expr
#else
#define CUDA_INTRINSIC_FUNC(Expr)
#endif
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#ifdef PARROTS_USE_HALF
#if CUDA_VERSION < 9000
__device__ inline float16 __shfl(float16 var, int srcLane, int width) {
CUDA_INTRINSIC_FUNC(return __shfl(var.y, srcLane, width););
}
__device__ inline float16 __shfl_up(float16 var, unsigned delta, int width) {
CUDA_INTRINSIC_FUNC(return __shfl_up(var.y, delta, width););
}
__device__ inline float16 __shfl_down(float16 var, unsigned delta, int width) {
CUDA_INTRINSIC_FUNC(return __shfl_down(var.y, delta, width););
}
__device__ inline float16 __shfl_xor(float16 var, int laneMask, int width) {
CUDA_INTRINSIC_FUNC(return __shfl_xor(var.y, laneMask, width););
}
#else // CUDA_VERSION >= 9000
__device__ inline float16 __shfl_sync(unsigned mask, float16 var, int srcLane,
int width = warpSize) {
CUDA_INTRINSIC_FUNC(float16 r; r.y = __shfl_sync(mask, var.y, srcLane, width);
return r;);
}
__device__ inline float16 __shfl_up_sync(unsigned mask, float16 var,
unsigned delta, int width = warpSize) {
CUDA_INTRINSIC_FUNC(
float16 r; r.y = __shfl_up_sync(mask, var.y, delta, width); return r;);
}
__device__ inline float16 __shfl_down_sync(unsigned mask, float16 var,
unsigned delta,
int width = warpSize) {
CUDA_INTRINSIC_FUNC(
float16 r; r.y = __shfl_down_sync(mask, var.y, delta, width); return r;);
}
__device__ inline float16 __shfl_xor_sync(unsigned mask, float16 var,
int laneMask, int width) {
CUDA_INTRINSIC_FUNC(float16 r;
r.y = __shfl_xor_sync(mask, var.y, laneMask, width);
return r;);
}
#endif // CUDA_VERSION < 9000
#endif // PARROTS_USE_HALF
// Warp shuffle interface with a dummy mask: for CUDA < 9.0, provide the
// *_sync names by forwarding to the legacy intrinsics (the mask is ignored).
#if CUDA_VERSION < 9000
template <typename T>
__device__ inline T __shfl_sync(unsigned mask, T var, int srcLane,
int width = warpSize) {
CUDA_INTRINSIC_FUNC(return __shfl(var, srcLane, width););
}
template <typename T>
__device__ inline T __shfl_up_sync(unsigned mask, T var, unsigned delta,
int width = warpSize) {
CUDA_INTRINSIC_FUNC(return __shfl_up(var, delta, width););
}
template <typename T>
__device__ inline T __shfl_down_sync(unsigned mask, T var, unsigned delta,
int width = warpSize) {
CUDA_INTRINSIC_FUNC(return __shfl_down(var, delta, width););
}
template <typename T>
__device__ inline T __shfl_xor_sync(unsigned mask, T var, int laneMask,
int width = warpSize) {
CUDA_INTRINSIC_FUNC(return __shfl_xor(var, laneMask, width););
}
#endif // CUDA_VERSION < 9000
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#endif // INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_
#ifndef PSAMASK_CUDA_CUH
#define PSAMASK_CUDA_CUH
// CUDA: grid stride looping
#ifndef CUDA_KERNEL_LOOP
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
#endif
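// Each thread of the PSA mask kernels handles one (n, h, w) feature position
// per grid-stride iteration and copies the valid mask window
// [hstart, hend) x [wstart, wend) into an (h_feature*w_feature) x
// (h_feature*w_feature) attention buffer. COLLECT indexes the buffer as
// [offset position][current position]; DISTRIBUTE uses the transposed layout.
// The backward kernels copy gradients back along the same index mapping.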
template <typename T>
__global__ void psamask_collect_forward_cuda(
const int nthreads, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask, const T* mask_data, T* buffer_data) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % w_feature;
const int h = (index / w_feature) % h_feature;
const int n = index / w_feature / h_feature;
// effective mask region: [hstart, hend) x [wstart, wend) in mask coordinates
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx) is in mask coordinates;
// (hidx + h - half_h_mask, widx + w - half_w_mask) is in feature coordinates
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] = mask_data
[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *
w_feature +
w];
}
}
}
}
template <typename T>
__global__ void psamask_distribute_forward_cuda(
const int nthreads, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask, const T* mask_data, T* buffer_data) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % w_feature;
const int h = (index / w_feature) % h_feature;
const int n = index / w_feature / h_feature;
// effective mask region: [hstart, hend) x [wstart, wend) in mask coordinates
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx) is in mask coordinates;
// (hidx + h - half_h_mask, widx + w - half_w_mask) is in feature coordinates
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] = mask_data
[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *
w_feature +
w];
}
}
}
}
template <typename T>
__global__ void psamask_collect_backward_cuda(
const int nthreads, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask, const T* buffer_diff, T* mask_diff) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % w_feature;
const int h = (index / w_feature) % h_feature;
const int n = index / w_feature / h_feature;
// effective mask region: [hstart, hend) x [wstart, wend) in mask coordinates
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx) is in mask coordinates;
// (hidx + h - half_h_mask, widx + w - half_w_mask) is in feature coordinates
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature +
h) *
w_feature +
w] = buffer_diff[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w];
}
}
}
}
template <typename T>
__global__ void psamask_distribute_backward_cuda(
const int nthreads, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask, const T* buffer_diff, T* mask_diff) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % w_feature;
const int h = (index / w_feature) % h_feature;
const int n = index / w_feature / h_feature;
// effective mask region: [hstart, hend) x [wstart, wend) in mask coordinates
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx) is in mask coordinates;
// (hidx + h - half_h_mask, widx + w - half_w_mask) is in feature coordinates
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature +
h) *
w_feature +
w] =
buffer_diff[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
}
#endif  // PSAMASK_CUDA_CUH
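// PyTorch binding for bbox_overlaps: checks that the inputs live on a CUDA
// device and forwards to the kernel launcher; only a GPU implementation is
// provided, so CPU tensors raise an error.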
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int mode,
const bool aligned, const int offset);
void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}
#endif
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
if (bboxes1.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(bboxes1);
CHECK_CUDA_INPUT(bboxes2);
CHECK_CUDA_INPUT(ious);
bbox_overlaps_cuda(bboxes1, bboxes2, ious, mode, aligned, offset);
#else
AT_ERROR("bbox_overlaps is not compiled with GPU support");
#endif
} else {
AT_ERROR("bbox_overlaps is not implemented on CPU");
}
}
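// CUDA launcher for bbox_overlaps: dispatches over floating point (and half)
// dtypes and launches one thread per output IoU element on the current CUDA
// stream of the input's device.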
#include "bbox_overlaps_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int mode,
const bool aligned, const int offset) {
int output_size = ious.numel();
int num_bbox1 = bboxes1.size(0);
int num_bbox2 = bboxes2.size(0);
at::cuda::CUDAGuard device_guard(bboxes1.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bboxes1.scalar_type(), "bbox_overlaps_cuda_kernel", ([&] {
bbox_overlaps_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
bboxes1.data_ptr<scalar_t>(), bboxes2.data_ptr<scalar_t>(),
ious.data_ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,
offset);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
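// The remaining ops follow the same layout: a C++ dispatcher that validates
// CUDA inputs (and raises a clear error when built without GPU support),
// paired with a CUDA translation unit that owns the kernel launchers.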
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
Tensor rfeatures, Tensor routput,
Tensor rmasks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor);
void CARAFEBackwardCUDAKernelLauncher(
const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
const int kernel_size, const int group_size, const int scale_factor);
void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,
output, kernel_size, group_size,
scale_factor);
}
void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad,
Tensor bottom_grad, Tensor mask_grad, int kernel_size,
int group_size, int scale_factor) {
CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
rbottom_grad_hs, rbottom_grad, rmask_grad,
bottom_grad, mask_grad, kernel_size,
group_size, scale_factor);
}
#endif
void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
if (features.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(features);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(rfeatures);
CHECK_CUDA_INPUT(routput);
CHECK_CUDA_INPUT(rmasks);
CHECK_CUDA_INPUT(output);
carafe_forward_cuda(features, masks, rfeatures, routput, rmasks, output,
kernel_size, group_size, scale_factor);
#else
AT_ERROR("Carafe is not compiled with GPU support");
#endif
} else {
AT_ERROR("Carafe is not implemented on CPU");
}
}
void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
Tensor mask_grad, int kernel_size, int group_size,
int scale_factor) {
if (top_grad.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(top_grad);
CHECK_CUDA_INPUT(rfeatures);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(rtop_grad);
CHECK_CUDA_INPUT(rbottom_grad_hs);
CHECK_CUDA_INPUT(rbottom_grad);
CHECK_CUDA_INPUT(rmask_grad);
CHECK_CUDA_INPUT(bottom_grad);
CHECK_CUDA_INPUT(mask_grad);
carafe_backward_cuda(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
rbottom_grad, rmask_grad, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
#else
AT_ERROR("Carafe is not compiled with GPU support");
#endif
} else {
AT_ERROR("Carafe is not implemented on CPU");
}
}
#include "carafe_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
Tensor rfeatures, Tensor routput,
Tensor rmasks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor) {
const int batch_size = output.size(0);
const int channels = output.size(1);
const int output_height = output.size(2);
const int output_width = output.size(3);
const int input_height = features.size(2);
const int input_width = features.size(3);
const int mask_channels = masks.size(1);
rfeatures.resize_({batch_size, input_height, input_width, channels});
routput.resize_({batch_size, output_height, output_width, channels});
rmasks.resize_({batch_size, output_height, output_width, mask_channels});
// one warp per pixel
at::cuda::CUDAGuard device_guard(features.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "NCHW2NHWC_Feature", ([&] {
const scalar_t *bottom_data = features.data_ptr<scalar_t>();
scalar_t *top_data = rfeatures.data_ptr<scalar_t>();
const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(input_height * input_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, input_height * input_width, dh, dw,
bottom_data, top_data);
}));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "NCHW2NHWC_Masks", ([&] {
const scalar_t *bottom_data = masks.data_ptr<scalar_t>();
scalar_t *top_data = rmasks.data_ptr<scalar_t>();
const int dh = divideUP(mask_channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, mask_channels, output_height * output_width, dh, dw,
bottom_data, top_data);
}));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "CARAFELauncherForward", ([&] {
const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL;
const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
const scalar_t *bottom_masks = rmasks.data_ptr<scalar_t>();
scalar_t *top_data = routput.data_ptr<scalar_t>();
CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, bottom_data, bottom_masks, kernel_size, group_size,
scale_factor, channels, input_height, input_width, output_height,
output_width, mask_channels, top_data);
}));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "NHWC2NCHW", ([&] {
const scalar_t *bottom_data = routput.data_ptr<scalar_t>();
scalar_t *top_data = output.data_ptr<scalar_t>();
const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, channels, dh, dw,
bottom_data, top_data);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void CARAFEBackwardCUDAKernelLauncher(
const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
const int kernel_size, const int group_size, const int scale_factor) {
const int batch_size = top_grad.size(0);
const int channels = top_grad.size(1);
const int output_height = top_grad.size(2);
const int output_width = top_grad.size(3);
const int input_height = bottom_grad.size(2);
const int input_width = bottom_grad.size(3);
const int mask_channels = masks.size(1);
rtop_grad.resize_({batch_size, output_height, output_width, channels});
rbottom_grad.resize_({batch_size, input_height, input_width, channels});
rbottom_grad_hs.resize_({batch_size, output_height, output_width, channels});
rmask_grad.resize_({batch_size, output_height, output_width, mask_channels});
at::cuda::CUDAGuard device_guard(top_grad.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "NCHW2NHWC_Top_Grad", ([&] {
const scalar_t *bottom_data = top_grad.data_ptr<scalar_t>();
scalar_t *top_data = rtop_grad.data_ptr<scalar_t>();
const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, output_height * output_width, dh, dw,
bottom_data, top_data);
}));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFELauncherBackward_Feature", ([&] {
const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL;
const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();
scalar_t *bottom_diff = rbottom_grad_hs.data_ptr<scalar_t>();
CARAFEBackward_Feature<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, top_diff, bottom_masks, kernel_size,
group_size, scale_factor, channels, input_height,
input_width, output_height, output_width,
mask_channels, bottom_diff);
}));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "FeatureSum", ([&] {
const int num_kernels =
batch_size * input_height * input_width * THREADS_PER_PIXEL;
const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr<scalar_t>();
scalar_t *bottom_diff = rbottom_grad.data_ptr<scalar_t>();
FeatureSum<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, bottom_diff_hs, scale_factor, channels,
input_height, input_width, bottom_diff);
}));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "NHWC2NCHW_Bottom_Grad", ([&] {
const scalar_t *bottom_data = rbottom_grad.data_ptr<scalar_t>();
scalar_t *top_data = bottom_grad.data_ptr<scalar_t>();
const int dh = divideUP(input_height * input_width, kTileDim);
const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, input_height * input_width, channels, dh, dw,
bottom_data, top_data);
}));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFELauncherBackward_Mask", ([&] {
const int num_kernels = batch_size * output_height * output_width *
mask_channels * WARP_SIZE;
const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
scalar_t *mask_diff = rmask_grad.data_ptr<scalar_t>();
CARAFEBackward_Mask<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, top_diff, bottom_data, kernel_size,
group_size, scale_factor, channels, input_height,
input_width, output_height, output_width,
mask_channels, mask_diff);
}));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "NHWC2NCHW_Mask_Grad", ([&] {
const scalar_t *bottom_data = rmask_grad.data_ptr<scalar_t>();
scalar_t *top_data = mask_grad.data_ptr<scalar_t>();
const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(mask_channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, mask_channels, dh, dw,
bottom_data, top_data);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
const Tensor masks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor);
void CARAFENAIVEBackwardCUDAKernelLauncher(
const Tensor top_grad, const Tensor features, const Tensor masks,
Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
const int group_size, const int scale_factor);
void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size,
int scale_factor) {
CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size,
group_size, scale_factor);
}
void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size,
int scale_factor) {
CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,
mask_grad, kernel_size, group_size,
scale_factor);
}
#endif
void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
if (features.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(features);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(output);
carafe_naive_forward_cuda(features, masks, output, kernel_size, group_size,
scale_factor);
#else
AT_ERROR("CarafeNaive is not compiled with GPU support");
#endif
} else {
AT_ERROR("CarafeNaive is not implemented on CPU");
}
}
void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size, int scale_factor) {
if (top_grad.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(top_grad);
CHECK_CUDA_INPUT(features);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(bottom_grad);
CHECK_CUDA_INPUT(mask_grad);
carafe_naive_backward_cuda(top_grad, features, masks, bottom_grad,
mask_grad, kernel_size, group_size,
scale_factor);
#else
AT_ERROR("CarafeNaive is not compiled with GPU support");
#endif
} else {
AT_ERROR("CarafeNaive is not implemented on CPU");
}
}
#include "carafe_naive_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
const Tensor masks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor) {
int output_size = output.numel();
int channels = output.size(1);
int height = output.size(2);
int width = output.size(3);
at::cuda::CUDAGuard device_guard(features.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "CARAFENAIVEForward", ([&] {
carafe_naive_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, features.data_ptr<scalar_t>(),
masks.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
kernel_size, group_size, scale_factor, channels, height, width);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void CARAFENAIVEBackwardCUDAKernelLauncher(
const Tensor top_grad, const Tensor features, const Tensor masks,
Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
const int group_size, const int scale_factor) {
int output_size = top_grad.numel();
int channels = top_grad.size(1);
int height = top_grad.size(2);
int width = top_grad.size(3);
at::cuda::CUDAGuard device_guard(top_grad.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFENAIVEBackward", ([&] {
carafe_naive_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, top_grad.data_ptr<scalar_t>(),
features.data_ptr<scalar_t>(), masks.data_ptr<scalar_t>(),
bottom_grad.data_ptr<scalar_t>(),
mask_grad.data_ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, height, width);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
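// Criss-cross attention (CCNet) bindings: ca_forward/ca_backward compute the
// criss-cross attention weights and their gradients from the projected
// feature maps t and f; ca_map_forward/ca_map_backward apply those weights to
// the value map g.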
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void CAForwardCUDAKernelLauncher(const Tensor t, const Tensor f, Tensor weight);
void CABackwardCUDAKernelLauncher(const Tensor dw, const Tensor t,
const Tensor f, Tensor dt, Tensor df);
void CAMapForwardCUDAKernelLauncher(const Tensor weight, const Tensor g,
Tensor out);
void CAMapBackwardCUDAKernelLauncher(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg);
void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight) {
CAForwardCUDAKernelLauncher(t, f, weight);
}
void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
Tensor dt, Tensor df) {
CABackwardCUDAKernelLauncher(dw, t, f, dt, df);
}
void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out) {
CAMapForwardCUDAKernelLauncher(weight, g, out);
}
void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg) {
CAMapBackwardCUDAKernelLauncher(dout, weight, g, dw, dg);
}
#endif
void ca_forward(const Tensor t, const Tensor f, Tensor weight) {
if (t.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(t);
CHECK_CUDA_INPUT(f);
CHECK_CUDA_INPUT(weight);
ca_forward_cuda(t, f, weight);
#else
AT_ERROR("ca is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
void ca_backward(const Tensor dw, const Tensor t, const Tensor f, Tensor dt,
Tensor df) {
if (dw.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(dw);
CHECK_CUDA_INPUT(t);
CHECK_CUDA_INPUT(f);
CHECK_CUDA_INPUT(dt);
CHECK_CUDA_INPUT(df);
ca_backward_cuda(dw, t, f, dt, df);
#else
AT_ERROR("ca is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
void ca_map_forward(const Tensor weight, const Tensor g, Tensor out) {
if (weight.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(g);
CHECK_CUDA_INPUT(out);
ca_map_forward_cuda(weight, g, out);
#else
AT_ERROR("ca_map is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
void ca_map_backward(const Tensor dout, const Tensor weight, const Tensor g,
Tensor dw, Tensor dg) {
if (dout.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(dout);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(g);
CHECK_CUDA_INPUT(dw);
CHECK_CUDA_INPUT(dg);
ca_map_backward_cuda(dout, weight, g, dw, dg);
#else
AT_ERROR("ca_map is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
// Modified from
// https://github.com/LikeLy-Journey/SegmenTron/blob/master/segmentron/modules/csrc/criss_cross_attention/ca_cuda.cu
#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>
#include "cc_attention_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void CAForwardCUDAKernelLauncher(const Tensor t, const Tensor f,
Tensor weight) {
AT_ASSERTM(t.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(f.device().is_cuda(), "input must be a CUDA tensor");
auto n = t.size(0);
auto c = t.size(1);
auto h = t.size(2);
auto w = t.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(t.scalar_type(), "ca_forward", [&] {
ca_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CABackwardCUDAKernelLauncher(const Tensor dw, const Tensor t,
const Tensor f, Tensor dt, Tensor df) {
AT_ASSERTM(dw.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(t.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(f.device().is_cuda(), "input must be a CUDA tensor");
auto n = t.size(0);
auto c = t.size(1);
auto h = t.size(2);
auto w = t.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(t.scalar_type(), "ca_backward_kernel_t", [&] {
ca_backward_kernel_t<scalar_t><<<blocks, threads, 0, stream>>>(
dw.contiguous().data_ptr<scalar_t>(),
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
dt.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
AT_DISPATCH_FLOATING_TYPES(f.scalar_type(), "ca_backward_kernel_f", [&] {
ca_backward_kernel_f<scalar_t><<<blocks, threads, 0, stream>>>(
dw.contiguous().data_ptr<scalar_t>(),
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
df.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CAMapForwardCUDAKernelLauncher(const Tensor weight, const Tensor g,
Tensor out) {
AT_ASSERTM(weight.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(g.device().is_cuda(), "input must be a CUDA tensor");
auto n = g.size(0);
auto c = g.size(1);
auto h = g.size(2);
auto w = g.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(g.scalar_type(), "ca_map_forward", [&] {
ca_map_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
out.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CAMapBackwardCUDAKernelLauncher(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg) {
AT_ASSERTM(dout.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(weight.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(g.device().is_cuda(), "input must be a CUDA tensor");
auto n = dout.size(0);
auto c = dout.size(1);
auto h = dout.size(2);
auto w = dout.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(
weight.scalar_type(), "ca_map_backward_kernel_w", [&] {
ca_map_backward_kernel_w<scalar_t><<<blocks, threads, 0, stream>>>(
dout.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
dw.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
AT_DISPATCH_FLOATING_TYPES(g.scalar_type(), "ca_map_backward_kernel_g", [&] {
ca_map_backward_kernel_g<scalar_t><<<blocks, threads, 0, stream>>>(
dout.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
dg.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
// Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "pytorch_cpp_helper.hpp"
Tensor bottom_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, ind, height);
Tensor cur_temp = at::slice(output, 2, ind, height).clone();
Tensor next_temp = at::slice(output, 2, 0, height - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor bottom_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(2, 0);
auto grad_output_temp = grad_output.select(2, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 0; ind < height - 1; ++ind) {
input_temp = input.select(2, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
Tensor left_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, 0, width - ind);
Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone();
Tensor next_temp = at::slice(output, 3, ind, width).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor left_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, width - 1);
max_val.copy_(input_temp);
max_ind.fill_(width - 1);
auto output_temp = output.select(3, width - 1);
auto grad_output_temp = grad_output.select(3, width - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < width; ++ind) {
input_temp = input.select(3, width - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, width - ind - 1);
grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor right_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, ind, width);
Tensor cur_temp = at::slice(output, 3, ind, width).clone();
Tensor next_temp = at::slice(output, 3, 0, width - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor right_pool_backward(Tensor input, Tensor grad_output) {
Tensor output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(3, 0);
auto grad_output_temp = grad_output.select(3, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 0; ind < width - 1; ++ind) {
input_temp = input.select(3, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor top_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, 0, height - ind);
Tensor cur_temp = at::slice(output, 2, 0, height - ind).clone();
Tensor next_temp = at::slice(output, 2, ind, height).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor top_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, height - 1);
max_val.copy_(input_temp);
max_ind.fill_(height - 1);
auto output_temp = output.select(2, height - 1);
auto grad_output_temp = grad_output.select(2, height - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < height; ++ind) {
input_temp = input.select(2, height - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, height - ind - 1);
grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
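// Deformable convolution bindings: forward, backward w.r.t. the input and
// offsets, and backward w.r.t. the convolution weight, each dispatched to a
// CUDA launcher when compiled with GPU support.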
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void DeformConvForwardCUDAKernelLauncher(Tensor input, Tensor weight,
Tensor offset, Tensor output,
Tensor columns, Tensor ones, int kW,
int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH,
int group, int deformable_group,
int im2col_step);
void DeformConvBackwardInputCUDAKernelLauncher(
Tensor input, Tensor offset, Tensor gradOutput, Tensor gradInput,
Tensor gradOffset, Tensor weight, Tensor columns, int kW, int kH, int dW,
int dH, int padW, int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step);
void DeformConvBackwardParametersCUDAKernelLauncher(
Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
float scale, int im2col_step);
void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
Tensor output, Tensor columns, Tensor ones,
int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step) {
DeformConvForwardCUDAKernelLauncher(
input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH,
dilationW, dilationH, group, deformable_group, im2col_step);
}
void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
Tensor gradOutput, Tensor gradInput,
Tensor gradOffset, Tensor weight,
Tensor columns, int kW, int kH, int dW,
int dH, int padW, int padH, int dilationW,
int dilationH, int group,
int deformable_group, int im2col_step) {
DeformConvBackwardInputCUDAKernelLauncher(
input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH,
dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step);
}
void deform_conv_backward_parameters_cuda(
Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
float scale, int im2col_step) {
DeformConvBackwardParametersCUDAKernelLauncher(
input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH,
padW, padH, dilationW, dilationH, group, deformable_group, scale,
im2col_step);
}
#endif
void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
Tensor output, Tensor columns, Tensor ones, int kW,
int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, int im2col_step) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(ones);
deform_conv_forward_cuda(input, weight, offset, output, columns, ones, kW,
kH, dW, dH, padW, padH, dilationW, dilationH,
group, deformable_group, im2col_step);
#else
AT_ERROR("DeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformConv is not implemented on CPU");
}
}
void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
Tensor gradInput, Tensor gradOffset,
Tensor weight, Tensor columns, int kW, int kH,
int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, int im2col_step) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(gradOutput);
CHECK_CUDA_INPUT(gradInput);
CHECK_CUDA_INPUT(gradOffset);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(columns);
deform_conv_backward_input_cuda(input, offset, gradOutput, gradInput,
gradOffset, weight, columns, kW, kH, dW, dH,
padW, padH, dilationW, dilationH, group,
deformable_group, im2col_step);
#else
AT_ERROR("DeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformConv is not implemented on CPU");
}
}
void deform_conv_backward_parameters(Tensor input, Tensor offset,
Tensor gradOutput, Tensor gradWeight,
Tensor columns, Tensor ones, int kW,
int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, float scale,
int im2col_step) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(gradOutput);
CHECK_CUDA_INPUT(gradWeight);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(ones);
deform_conv_backward_parameters_cuda(input, offset, gradOutput, gradWeight,
columns, ones, kW, kH, dW, dH, padW,
padH, dilationW, dilationH, group,
deformable_group, scale, im2col_step);
#else
AT_ERROR("DeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformConv is not implemented on CPU");
}
}
#include "deform_conv_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col) {
// num_axes should be smaller than block size
// todo: check parallel_imgs is correctly passed in
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels),
THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_im_, data_offset_, height, width, ksize_h,
ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col, data_col_);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im) {
// todo: make sure parallel_imgs is passed in correctly
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels =
channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels),
THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_col_, data_offset_, channels, height, width,
ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, channel_per_deformable_group, parallel_imgs,
deformable_group, height_col, width_col, grad_im_);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void deformable_col2im_coord(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
deformable_group * parallel_imgs;
int channel_per_deformable_group =
channels * ksize_h * ksize_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_col_, data_im_, data_offset_, channels, height,
width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,
2 * ksize_h * ksize_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void deform_conv_shape_check(Tensor input, Tensor offset, Tensor *gradOutput,
Tensor weight, int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW,
int group, int deformable_group) {
TORCH_CHECK(
weight.ndimension() == 4,
"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: %s",
weight.ndimension());
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(kW > 0 && kH > 0,
"kernel size should be greater than zero, but got kH: %d kW: %d",
kH, kW);
TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
"kernel size should be consistent with weight, ",
"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d",
kH, kW, weight.size(2), weight.size(3));
TORCH_CHECK(dW > 0 && dH > 0,
"stride should be greater than zero, but got dH: %d dW: %d", dH,
dW);
TORCH_CHECK(
dilationW > 0 && dilationH > 0,
"dilation should be greater than 0, but got dilationH: %d dilationW: %d",
dilationH, dilationW);
int ndim = input.ndimension();
int dimf = 0;
int dimh = 1;
int dimw = 2;
if (ndim == 4) {
dimf++;
dimh++;
dimw++;
}
TORCH_CHECK(ndim == 3 || ndim == 4,
"3D or 4D input tensor expected but got: %s", ndim);
long nInputPlane = weight.size(1) * group;
long inputHeight = input.size(dimh);
long inputWidth = input.size(dimw);
long nOutputPlane = weight.size(0);
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
TORCH_CHECK(nInputPlane % deformable_group == 0,
"input channels must divide deformable group size");
if (outputWidth < 1 || outputHeight < 1)
AT_ERROR(
"Given input size: (%ld x %ld x %ld). "
"Calculated output size: (%ld x %ld x %ld). Output size is too small",
nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
outputWidth);
TORCH_CHECK(input.size(1) == nInputPlane,
"invalid number of input planes, expected: %d, but got: %d",
nInputPlane, input.size(1));
TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
"input image is smaller than kernel");
TORCH_CHECK(
(offset.size(2) == outputHeight && offset.size(3) == outputWidth),
"invalid spatial size of offset, expected height: %d width: %d, but "
"got height: %d width: %d",
outputHeight, outputWidth, offset.size(2), offset.size(3));
TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
"invalid number of channels of offset");
if (gradOutput != NULL) {
TORCH_CHECK(
gradOutput->size(dimf) == nOutputPlane,
"invalid number of gradOutput planes, expected: %d, but got: %d",
nOutputPlane, gradOutput->size(dimf));
TORCH_CHECK(
(gradOutput->size(dimh) == outputHeight &&
gradOutput->size(dimw) == outputWidth),
"invalid size of gradOutput, expected height: %d width: %d , but "
"got height: %d width: %d",
outputHeight, outputWidth, gradOutput->size(dimh),
gradOutput->size(dimw));
}
}
void DeformConvForwardCUDAKernelLauncher(Tensor input, Tensor weight,
Tensor offset, Tensor output,
Tensor columns, Tensor ones, int kW,
int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH,
int group, int deformable_group,
int im2col_step) {
// todo: resize columns to include im2col: done
// todo: add im2col_step as input
// todo: add new output buffer and transpose it to output (or directly
//       transpose output)
// todo: possibly change data indexing because of parallel_imgs
deform_conv_shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH,
padW, dilationH, dilationW, group, deformable_group);
at::DeviceGuard guard(input.device());
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input.unsqueeze_(0);
offset.unsqueeze_(0);
}
// todo: assert batchsize dividable by im2col_step
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = weight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
outputHeight, outputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
ones = at::ones({outputHeight, outputWidth}, input.options());
}
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
inputHeight, inputWidth});
offset =
offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane,
im2col_step * outputHeight, outputWidth},
output.options());
output_buffer = output_buffer.view(
{output_buffer.size(0), group, output_buffer.size(1) / group,
output_buffer.size(2), output_buffer.size(3)});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.size(2), weight.size(3)});
for (int g = 0; g < group; g++) {
output_buffer[elt][g] = output_buffer[elt][g]
.flatten(1)
.addmm_(weight[g].flatten(1), columns[g])
.view_as(output_buffer[elt][g]);
}
}
output_buffer = output_buffer.view(
{output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
output_buffer.size(3), output_buffer.size(4)});
output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
im2col_step, outputHeight, outputWidth});
output_buffer.transpose_(1, 2);
output.copy_(output_buffer);
output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
output = output.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
}
}
void DeformConvBackwardInputCUDAKernelLauncher(
Tensor input, Tensor offset, Tensor gradOutput, Tensor gradInput,
Tensor gradOffset, Tensor weight, Tensor columns, int kW, int kH, int dW,
int dH, int padW, int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step) {
deform_conv_shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW,
padH, padW, dilationH, dilationW, group,
deformable_group);
at::DeviceGuard guard(input.device());
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input = input.view({1, input.size(0), input.size(1), input.size(2)});
offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
gradOutput = gradOutput.view(
{1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
}
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = weight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
// change order of grad output
gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
nOutputPlane, outputHeight, outputWidth});
gradOutput.transpose_(1, 2);
gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
inputHeight, inputWidth});
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
inputHeight, inputWidth});
gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight,
outputWidth});
offset =
offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
// divide into groups
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.size(2), weight.size(3)});
gradOutput = gradOutput.view(
{gradOutput.size(0), group, gradOutput.size(1) / group,
gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
for (int g = 0; g < group; g++) {
columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradOutput = gradOutput.view(
{gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
dilationH, dilationW, im2col_step, deformable_group,
gradOffset[elt]);
deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, gradInput[elt]);
}
gradOutput.transpose_(1, 2);
gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
gradOffset = gradOffset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
gradOffset =
gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
}
}
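// Accumulates the gradient w.r.t. the convolution weights. gradOutput is
// first transposed and copied into a contiguous buffer; each im2col_step
// chunk is then unfolded with deformable_im2col and multiplied into
// gradWeight by a group-wise GEMM, scaled by `scale`.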
void DeformConvBackwardParametersCUDAKernelLauncher(
Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
float scale, int im2col_step) {
  // TODO: transpose and reshape outGrad
  // TODO: reshape columns
  // TODO: add im2col_step as input
deform_conv_shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH,
dW, padH, padW, dilationH, dilationW, group,
deformable_group);
at::DeviceGuard guard(input.device());
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input = input.view(
at::IntList({1, input.size(0), input.size(1), input.size(2)}));
gradOutput = gradOutput.view(
{1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
}
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = gradWeight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
nOutputPlane, outputHeight, outputWidth});
gradOutput.transpose_(1, 2);
Tensor gradOutputBuffer = at::zeros_like(gradOutput);
gradOutputBuffer =
gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
outputHeight, outputWidth});
gradOutputBuffer = gradOutputBuffer.contiguous();
gradOutputBuffer.copy_(gradOutput);
gradOutputBuffer =
gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
im2col_step * outputHeight, outputWidth});
gradOutput.transpose_(1, 2);
gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
inputHeight, inputWidth});
offset =
offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
    // divide into groups
gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
gradWeight =
gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
gradWeight.size(2), gradWeight.size(3)});
for (int g = 0; g < group; g++) {
gradWeight[g] = gradWeight[g]
.flatten(1)
.addmm_(gradOutputBuffer[elt][g].flatten(1),
columns[g].transpose(1, 0), 1.0, scale)
.view_as(gradWeight[g]);
}
gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.size(0),
gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
gradWeight.size(2), gradWeight.size(3),
gradWeight.size(4)});
}
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
}
}
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
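// Forward declarations of the CUDA kernel launchers; the definitions follow
// in the CUDA translation unit that includes deform_roi_pool_cuda_kernel.cuh.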
void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor offset, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma);
void DeformRoIPoolBackwardCUDAKernelLauncher(
Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float gamma);
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) {
DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output,
pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
}
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset,
int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
DeformRoIPoolBackwardCUDAKernelLauncher(
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma);
}
#endif
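// Device dispatchers: route CUDA tensors to the launchers above and raise an
// error otherwise (no CPU implementation is provided). A minimal calling
// sketch -- the shapes below are illustrative assumptions (rois taken as
// (batch_idx, x1, y1, x2, y2), offset as (num_rois, 2, pooled_h, pooled_w)),
// not something this file checks:
//   at::Tensor input  = at::randn({2, 16, 32, 32}, at::kCUDA);  // NCHW
//   at::Tensor rois   = at::zeros({8, 5}, at::kCUDA);
//   at::Tensor offset = at::zeros({8, 2, 7, 7}, at::kCUDA);
//   at::Tensor output = at::zeros({8, 16, 7, 7}, at::kCUDA);
//   deform_roi_pool_forward(input, rois, offset, output,
//                           /*pooled_height=*/7, /*pooled_width=*/7,
//                           /*spatial_scale=*/0.25f, /*sampling_ratio=*/2,
//                           /*gamma=*/0.1f);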
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(output);
deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
pooled_width, spatial_scale, sampling_ratio,
gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
}
void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
Tensor offset, Tensor grad_input,
Tensor grad_offset, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) {
if (grad_output.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_offset);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
}
#include "deform_roi_pool_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor offset, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "deform_roi_pool_forward_cuda_kernel", [&] {
deform_roi_pool_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), offset.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(), pooled_height, pooled_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio,
static_cast<scalar_t>(gamma), channels, height, width);
});
AT_CUDA_CHECK(cudaGetLastError());
}
void DeformRoIPoolBackwardCUDAKernelLauncher(
Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float gamma) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "deform_roi_pool_backward_cuda_kernel", [&] {
deform_roi_pool_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.data_ptr<scalar_t>(),
input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
offset.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
grad_offset.data_ptr<scalar_t>(), pooled_height, pooled_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio,
static_cast<scalar_t>(gamma), channels, height, width);
});
AT_CUDA_CHECK(cudaGetLastError());
}
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha);
void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight,
Tensor grad_input,
const float gamma,
const float alpha);
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input,
const float gamma,
const float alpha);
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha);
}
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
Tensor weight, Tensor grad_input,
float gamma, float alpha) {
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha);
}
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha);
}
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input, float gamma,
float alpha) {
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
grad_input, gamma, alpha);
}
#endif
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
}
void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_input);
sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
}
void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
softmax_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
}
void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input, float gamma,
float alpha) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(buff);
CHECK_CUDA_INPUT(grad_input);
softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
gamma, alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
}
#include "pytorch_cuda_helper.hpp"
#include "sigmoid_focal_loss_kernel.cuh"
#include "softmax_focal_loss_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha) {
int output_size = output.numel();
int num_classes = input.size(1);
  AT_ASSERTM(target.max().item<long>() <= (long)num_classes,
             "target label should be smaller than or equal to the number of classes");
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sigmoid_focal_loss_forward_cuda_kernel", [&] {
sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.data_ptr<scalar_t>(),
target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(), gamma, alpha, num_classes);
});
AT_CUDA_CHECK(cudaGetLastError());
}
void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight,
Tensor grad_input,
const float gamma,
const float alpha) {
int output_size = grad_input.numel();
int num_classes = input.size(1);
at::cuda::CUDAGuard device_guard(grad_input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sigmoid_focal_loss_backward_cuda_kernel", [&] {
sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.data_ptr<scalar_t>(),
target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
grad_input.data_ptr<scalar_t>(), gamma, alpha, num_classes);
});
AT_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha) {
int output_size = output.numel();
int num_classes = softmax.size(1);
  AT_ASSERTM(target.max().item<long>() <= (long)num_classes,
             "target label should be smaller than or equal to the number of classes");
at::cuda::CUDAGuard device_guard(softmax.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
softmax.scalar_type(), "softmax_focal_loss_forward_cuda_kernel", [&] {
softmax_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.data_ptr<scalar_t>(),
target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(), gamma, alpha, num_classes);
});
AT_CUDA_CHECK(cudaGetLastError());
}
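// The softmax focal loss backward pass runs two kernels: the first fills
// `buff` (one thread per buff element) with intermediate factors, the second
// expands them into grad_input (one thread per grad_input element).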
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input,
const float gamma,
const float alpha) {
int num_classes = softmax.size(1);
int output_size = buff.numel();
at::cuda::CUDAGuard device_guard(grad_input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.scalar_type(), "softmax_focal_loss_backward_cuda1_kernel",
[&] {
softmax_focal_loss_backward_cuda1_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.data_ptr<scalar_t>(),
target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
buff.data_ptr<scalar_t>(), gamma, alpha, num_classes);
});
AT_CUDA_CHECK(cudaGetLastError());
output_size = grad_input.numel();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.scalar_type(), "softmax_focal_loss_backward_cuda2_kernel",
[&] {
softmax_focal_loss_backward_cuda2_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.data_ptr<scalar_t>(),
target.data_ptr<int64_t>(), buff.data_ptr<scalar_t>(),
grad_input.data_ptr<scalar_t>(), num_classes);
});
AT_CUDA_CHECK(cudaGetLastError());
}
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
#include <cuda_runtime_api.h>
int get_cudart_version() { return CUDART_VERSION; }
#endif
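// Formats CUDART_VERSION, which encodes major * 1000 + minor * 10 (+ patch),
// as a dotted string. Worked examples of the arithmetic below:
//   10020 -> 10020/1000 = 10, 10020/10 % 100 = 2, 10020 % 10 = 0  => "10.2"
//    9000 ->  9000/1000 =  9,  9000/10 % 100 = 0,  9000 % 10 = 0  => "9.0"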
std::string get_compiling_cuda_version() {
#ifdef WITH_CUDA
std::ostringstream oss;
// copied from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
auto printCudaStyleVersion = [&](int v) {
oss << (v / 1000) << "." << (v / 10 % 100);
if (v % 10 != 0) {
oss << "." << (v % 10);
}
};
printCudaStyleVersion(get_cudart_version());
return oss.str();
#else
return std::string("not available");
#endif
}
// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
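// Example outputs, derived from the predefined macros: __GNUC__ = 7,
// __GNUC_MINOR__ = 5 yields "GCC 7.5"; clang 10.0.1 yields "clang 10.0.1";
// MSVC reports "MSVC " followed by the numeric _MSC_FULL_VER.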
std::string get_compiler_version() {
std::ostringstream ss;
#if defined(__GNUC__)
#ifndef __clang__
{ ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
#endif
#endif
#if defined(__clang_major__)
{
ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
<< __clang_patchlevel__;
}
#endif
#if defined(_MSC_VER)
{ ss << "MSVC " << _MSC_FULL_VER; }
#endif
return ss.str();
}
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w);
void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int height,
const int width, const int channels);
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w);
}
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) {
// im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
width, channels);
}
#endif
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
if (im.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(im);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(col);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
}
void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) {
if (col.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(col);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(im);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
}
#include "masked_conv2d_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w) {
int channels = bottom_data.size(1);
int height = bottom_data.size(2);
int width = bottom_data.size(3);
int mask_cnt = mask_h_idx.size(0);
int output_size = mask_cnt * channels;
at::cuda::CUDAGuard device_guard(bottom_data.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] {
const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
MaskedIm2colForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data_, height, width, kernel_h, kernel_w,
pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
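// Inverse of the gather above: scatters the column buffer back to the
// (channel, masked position) entries of the output image, again with one
// thread per mask_cnt * channels element.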
void MaskedCol2imForwardCUDAKernelLauncher(
const Tensor bottom_data, const Tensor mask_h_idx, const Tensor mask_w_idx,
Tensor top_data, const int height, const int width, const int channels) {
int mask_cnt = mask_h_idx.size(0);
int output_size = mask_cnt * channels;
at::cuda::CUDAGuard device_guard(bottom_data.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] {
const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
MaskedCol2imForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data_, height, width, channels, mask_h_idx_,
mask_w_idx_, mask_cnt, top_data_);
}));
AT_CUDA_CHECK(cudaGetLastError());
}