Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
#ifndef CA_CUDA_KERNEL_CUH
#define CA_CUDA_KERNEL_CUH
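// The "ca" kernels below implement criss-cross attention: for each spatial
// location (x, y), ca_forward_kernel accumulates the affinity between the
// query map t and the key map f over every position in the same row and the
// same column, giving height + width - 1 attention weights per location.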
template <typename T>
__global__ void ca_forward_kernel(const T *t, const T *f, T *weight, int num,
int chn, int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int z = blockIdx.z;
if (x < width && y < height && z < height + width - 1) {
for (int batch = 0; batch < num; ++batch) {
for (int plane = 0; plane < chn; ++plane) {
T _t = t[(batch * chn + plane) * sp + y * width + x];
if (z < width) {
int i = z;
T _f = f[(batch * chn + plane) * sp + y * width + i];
weight[(batch * len + i) * sp + y * width + x] += _t * _f;
} else {
int i = z - width;
int j = i < y ? i : i + 1;
T _f = f[(batch * chn + plane) * sp + j * width + x];
weight[(batch * len + width + i) * sp + y * width + x] += _t * _f;
}
}
}
}
}
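// Backward pass of the affinity computation: ca_backward_kernel_t accumulates
// dw * f into the gradient of t, and ca_backward_kernel_f (further below)
// accumulates dw * t into the gradient of f, walking the same row/column
// pattern as the forward kernel.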
template <typename T>
__global__ void ca_backward_kernel_t(const T *dw, const T *t, const T *f, T *dt,
int num, int chn, int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int plane = blockIdx.z;
if (x < width && y < height && plane < chn) {
for (int batch = 0; batch < num; ++batch) {
for (int i = 0; i < width; ++i) {
T _dw = dw[(batch * len + i) * sp + y * width + x];
T _f = f[(batch * chn + plane) * sp + y * width + i];
dt[(batch * chn + plane) * sp + y * width + x] += _dw * _f;
}
for (int i = 0; i < height; ++i) {
if (i == y) continue;
int j = i < y ? i : i - 1;
T _dw = dw[(batch * len + width + j) * sp + y * width + x];
T _f = f[(batch * chn + plane) * sp + i * width + x];
dt[(batch * chn + plane) * sp + y * width + x] += _dw * _f;
}
}
}
}
template <typename T>
__global__ void ca_backward_kernel_f(const T *dw, const T *t, const T *f, T *df,
int num, int chn, int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int plane = blockIdx.z;
if (x < width && y < height && plane < chn) {
for (int batch = 0; batch < num; ++batch) {
for (int i = 0; i < width; ++i) {
T _dw = dw[(batch * len + x) * sp + y * width + i];
T _t = t[(batch * chn + plane) * sp + y * width + i];
df[(batch * chn + plane) * sp + y * width + x] += _dw * _t;
}
for (int i = 0; i < height; ++i) {
if (i == y) continue;
int j = i > y ? y : y - 1;
T _dw = dw[(batch * len + width + j) * sp + i * width + x];
T _t = t[(batch * chn + plane) * sp + i * width + x];
df[(batch * chn + plane) * sp + y * width + x] += _dw * _t;
}
}
}
}
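// Aggregation step: ca_map_forward_kernel forms the attention output as the
// weight-weighted sum of g over the criss-cross neighbourhood; the two
// ca_map_backward kernels propagate gradients to weight and g respectively.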
template <typename T>
__global__ void ca_map_forward_kernel(const T *weight, const T *g, T *out,
int num, int chn, int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int plane = blockIdx.z;
if (x < width && y < height && plane < chn) {
for (int batch = 0; batch < num; ++batch) {
for (int i = 0; i < width; ++i) {
T _g = g[(batch * chn + plane) * sp + y * width + i];
T _w = weight[(batch * len + i) * sp + y * width + x];
out[(batch * chn + plane) * sp + y * width + x] += _g * _w;
}
for (int i = 0; i < height; ++i) {
if (i == y) continue;
int j = i < y ? i : i - 1;
T _g = g[(batch * chn + plane) * sp + i * width + x];
T _w = weight[(batch * len + width + j) * sp + y * width + x];
out[(batch * chn + plane) * sp + y * width + x] += _g * _w;
}
}
}
}
template <typename T>
__global__ void ca_map_backward_kernel_w(const T *dout, const T *weight,
const T *g, T *dw, int num, int chn,
int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int z = blockIdx.z;
if (x < width && y < height && z < height + width - 1) {
for (int batch = 0; batch < num; ++batch) {
for (int plane = 0; plane < chn; ++plane) {
T _dout = dout[(batch * chn + plane) * sp + y * width + x];
if (z < width) {
int i = z;
T _g = g[(batch * chn + plane) * sp + y * width + i];
dw[(batch * len + i) * sp + y * width + x] += _dout * _g;
} else {
int i = z - width;
int j = i < y ? i : i + 1;
T _g = g[(batch * chn + plane) * sp + j * width + x];
dw[(batch * len + width + i) * sp + y * width + x] += _dout * _g;
}
}
}
}
}
template <typename T>
__global__ void ca_map_backward_kernel_g(const T *dout, const T *weight,
const T *g, T *dg, int num, int chn,
int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int plane = blockIdx.z;
if (x < width && y < height && plane < chn) {
for (int batch = 0; batch < num; ++batch) {
for (int i = 0; i < width; ++i) {
T _dout = dout[(batch * chn + plane) * sp + y * width + i];
T _w = weight[(batch * len + x) * sp + y * width + i];
dg[(batch * chn + plane) * sp + y * width + x] += _dout * _w;
}
for (int i = 0; i < height; ++i) {
if (i == y) continue;
int j = i > y ? y : y - 1;
T _dout = dout[(batch * chn + plane) * sp + i * width + x];
T _w = weight[(batch * len + width + j) * sp + i * width + x];
dg[(batch * chn + plane) * sp + y * width + x] += _dout * _w;
}
}
}
}
#endif
#ifndef COMMON_CUDA_HELPER
#define COMMON_CUDA_HELPER
#include <cuda.h>
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
#define THREADS_PER_BLOCK 512
inline int GET_BLOCKS(const int N) {
int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
int max_block_num = 4096;
return min(optimal_block_num, max_block_num);
}
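// Usage sketch (illustrative only, not part of the original header): every
// kernel in this commit follows the same grid-stride pattern and is launched
// with GET_BLOCKS / THREADS_PER_BLOCK. The kernel and launcher names below
// are hypothetical.
template <typename T>
__global__ void example_scale_kernel(const int n, const T* in, T* out,
                                     const T alpha) {
  // Each thread strides over the flattened tensor, so any grid size is valid.
  CUDA_1D_KERNEL_LOOP(i, n) { out[i] = alpha * in[i]; }
}

template <typename T>
void example_scale_launcher(const int n, const T* in, T* out, const T alpha,
                            cudaStream_t stream) {
  example_scale_kernel<T>
      <<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(n, in, out, alpha);
}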
template <typename T>
__device__ T bilinear_interpolate(const T* input, const int height,
const int width, T y, T x,
const int index /* index for debug only*/) {
// deal with cases where the sampled locations are outside the feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = input[y_low * width + x_low];
T v2 = input[y_low * width + x_high];
T v3 = input[y_high * width + x_low];
T v4 = input[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
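// bilinear_interpolate_gradient below mirrors the sampling above: instead of
// returning the interpolated value, it returns the four corner indices
// (x_low/x_high, y_low/y_high) and their weights
// w1 = (1-ly)(1-lx), w2 = (1-ly)lx, w3 = ly(1-lx), w4 = ly*lx,
// so a backward kernel can scatter the upstream gradient into the corners
// with atomicAdd.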
template <typename T>
__device__ void bilinear_interpolate_gradient(
const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
int& x_low, int& x_high, int& y_low, int& y_high,
const int index /* index for debug only*/) {
// deal with cases where the sampled locations are outside the feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
#endif // COMMON_CUDA_HELPER
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer
*****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer
*********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
#ifndef DEFORM_CONV_KERNEL_CUH
#define DEFORM_CONV_KERNEL_CUH
template <typename T>
__device__ T deformable_im2col_bilinear(const T *input, const int data_width,
const int height, const int width, T h,
T w) {
if (h <= -1 || height <= h || w <= -1 || width <= w) {
return 0;
}
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,
const int w, const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename T>
__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height,
const int width, const T *im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
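// deformable_im2col_gpu_kernel: one thread per element of the output column
// buffer. For kernel tap (i, j) the sampling point is the regular grid
// location (h_in + i * dilation_h, w_in + j * dilation_w) shifted by the
// learned offset read from data_offset, and the value is fetched with
// bilinear interpolation (zero outside the image).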
template <typename T>
__global__ void deformable_im2col_gpu_kernel(
const int n, const T *data_im, const T *data_offset, const int height,
const int width, const int kernel_h, const int kernel_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
CUDA_1D_KERNEL_LOOP(index, n) {
// `index` is the linear index into the output (column) matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = deformable_im2col_bilinear(data_im_ptr, width, height, width,
h_im, w_im);
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
__global__ void deformable_col2im_gpu_kernel(
const int n, const T *data_col, const T *data_offset, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
CUDA_1D_KERNEL_LOOP(index, n) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
cur_h + dy, cur_w + dx, height, width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
template <typename T>
__global__ void deformable_col2im_coord_gpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int offset_channels, const int deformable_group, const int height_col,
const int width_col, T *grad_offset) {
CUDA_1D_KERNEL_LOOP(index, n) {
T val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
const T weight = get_coordinate_weight(inv_h, inv_w, height, width,
data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
#endif
#ifndef DEFORM_POOL_KERNEL_CUH
#define DEFORM_POOL_KERNEL_CUH
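// Deformable RoI pooling: each output bin averages bilinear samples taken on
// a roi_bin_grid_h x roi_bin_grid_w grid. When an offset tensor is provided,
// the bin's start coordinates are shifted by gamma * (roi size) * offset
// before sampling.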
template <typename T>
__global__ void deform_roi_pool_forward_cuda_kernel(
const int nthreads, const T* input, const T* rois, const T* offset,
T* output, const int pooled_height, const int pooled_width,
const T spatial_scale, const int sampling_ratio, const T gamma,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
T roi_end_h = offset_rois[4] * spatial_scale - 0.5;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral pooling
int roi_bin_grid_h =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_height / pooled_height));
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_width / pooled_width));
// Compute roi offset
if (offset != NULL) {
const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
ph * pooled_width + pw;
T offset_roi_w = gamma * roi_width * offset_cur_w[0];
T offset_roi_h =
gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
roi_start_w += offset_roi_w;
roi_start_h += offset_roi_h;
}
// We do average pooling inside a bin
const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val = bilinear_interpolate(offset_input, height, width, y, x, index);
output_val += val;
}
}
output[index] = output_val / count;
}
}
template <typename T>
__global__ void deform_roi_pool_backward_cuda_kernel(
const int nthreads, const T* grad_output, const T* input, const T* rois,
const T* offset, T* grad_input, T* grad_offset, const int pooled_height,
const int pooled_width, const T spatial_scale, const int sampling_ratio,
const T gamma, const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
const T* offset_input =
input + ((roi_batch_ind * channels + c) * height * width);
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
T roi_end_h = offset_rois[4] * spatial_scale - 0.5;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral pooling
int roi_bin_grid_h =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_height / pooled_height));
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_width / pooled_width));
// Compute roi offset
if (offset != NULL) {
const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
ph * pooled_width + pw;
T offset_roi_w = gamma * roi_width * offset_cur_w[0];
T offset_roi_h =
gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
roi_start_w += offset_roi_w;
roi_start_h += offset_roi_h;
}
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
const T grad_output_this_bin = grad_output[index] / count;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high, index);
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(offset_grad_input + y_low * width + x_low,
grad_output_this_bin * w1);
atomicAdd(offset_grad_input + y_low * width + x_high,
grad_output_this_bin * w2);
atomicAdd(offset_grad_input + y_high * width + x_low,
grad_output_this_bin * w3);
atomicAdd(offset_grad_input + y_high * width + x_high,
grad_output_this_bin * w4);
if (offset != NULL) {
T input_00 = offset_input[y_low * width + x_low];
T input_10 = offset_input[y_low * width + x_high];
T input_01 = offset_input[y_high * width + x_low];
T input_11 = offset_input[y_high * width + x_high];
T ogx = gamma * roi_width * grad_output_this_bin *
(input_11 * (y - y_low) + input_10 * (y_high - y) +
input_01 * (y_low - y) + input_00 * (y - y_high));
T ogy = gamma * roi_height * grad_output_this_bin *
(input_11 * (x - x_low) + input_01 * (x_high - x) +
input_10 * (x_low - x) + input_00 * (x - x_high));
atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
ph * pooled_width + pw,
ogx);
atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
pooled_width * pooled_height + ph * pooled_width + pw,
ogy);
}
}
}
}
}
}
#endif
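// Masked im2col / col2im: only the spatial positions listed in mask_h_idx /
// mask_w_idx are expanded into columns (MaskedIm2colForward) or written back
// to the image (MaskedCol2imForward), so a convolution can be evaluated at
// the masked locations only.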
template <typename scalar_t>
__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,
const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int64_t *mask_h_idx,
const int64_t *mask_w_idx,
const int mask_cnt, scalar_t *data_col) {
// mask_cnt * channels
CUDA_1D_KERNEL_LOOP(index, n) {
const int m_index = index % mask_cnt;
const int h_col = mask_h_idx[m_index];
const int w_col = mask_w_idx[m_index];
const int c_im = index / mask_cnt;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col - pad_h;
const int w_offset = w_col - pad_w;
scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index;
for (int i = 0; i < kernel_h; ++i) {
int h_im = h_offset + i;
for (int j = 0; j < kernel_w; ++j) {
int w_im = w_offset + j;
if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
*data_col_ptr =
(scalar_t)data_im[(c_im * height + h_im) * width + w_im];
} else {
*data_col_ptr = 0.0;
}
data_col_ptr += mask_cnt;
}
}
}
}
template <typename scalar_t>
__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,
const int height, const int width,
const int channels,
const int64_t *mask_h_idx,
const int64_t *mask_w_idx,
const int mask_cnt, scalar_t *data_im) {
CUDA_1D_KERNEL_LOOP(index, n) {
const int m_index = index % mask_cnt;
const int h_im = mask_h_idx[m_index];
const int w_im = mask_w_idx[m_index];
const int c_im = index / mask_cnt;
// compute the start and end of the output
data_im[(c_im * height + h_im) * width + w_im] = data_col[index];
}
}
#ifndef MODULATED_DEFORM_CONV_KERNEL_CUH
#define MODULATED_DEFORM_CONV_KERNEL_CUH
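// Modulated deformable convolution (DCNv2) kernels: same structure as the
// plain deformable kernels above, with a per-sample modulation scalar read
// from data_mask and multiplied into every im2col value and gradient, plus an
// extra grad_mask output in the coordinate-gradient kernel.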
template <typename T>
__device__ T dmcn_im2col_bilinear(const T *input, const int data_width,
const int height, const int width, T h, T w) {
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h,
const int w, const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename T>
__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w,
const int height, const int width,
const T *im_data, const int data_width,
const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template <typename T>
__global__ void modulated_deformable_im2col_gpu_kernel(
const int n, const T *data_im, const T *data_offset, const T *data_mask,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
CUDA_1D_KERNEL_LOOP(index, n) {
// `index` is the linear index into the output (column) matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b_col * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im,
w_im);
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
__global__ void modulated_deformable_col2im_gpu_kernel(
const int n, const T *data_col, const T *data_offset, const T *data_mask,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
CUDA_1D_KERNEL_LOOP(index, n) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index] * mask;
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
cur_h + dy, cur_w + dx, height, width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
template <typename T>
__global__ void modulated_deformable_col2im_coord_gpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const T *data_mask, const int channels, const int height, const int width,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col, T *grad_offset, T *grad_mask) {
CUDA_1D_KERNEL_LOOP(index, n) {
T val = 0, mval = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
else
mval += data_col_ptr[col_pos] *
dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width,
height, width, inv_h, inv_w);
const T weight = dmcn_get_coordinate_weight(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
grad_offset[index] = val;
if (offset_c % 2 == 0)
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
// deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
// height_col + h) * width_col + w], mask_req, mval);
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
#endif
#ifndef NMS_KERNEL_CUH
#define NMS_KERNEL_CUH
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int const threadsPerBlock = sizeof(unsigned long long int) * 8;
__device__ inline bool devIoU(float const *const a, float const *const b,
const int offset, const float threshold) {
float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
float width = fmaxf(right - left + offset, 0.f),
height = fmaxf(bottom - top + offset, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
return interS > threshold * (Sa + Sb - interS);
}
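// nms_cuda: boxes are tiled into blocks of threadsPerBlock (= 64) entries.
// Block (col_start, row_start) loads its "column" boxes into shared memory;
// each thread owns one "row" box and sets bit i of a 64-bit word whenever the
// IoU with column box i exceeds iou_threshold. The per-box masks written to
// dev_mask are reduced on the host to decide which boxes survive.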
__global__ void nms_cuda(const int n_boxes, const float iou_threshold,
const int offset, const float *dev_boxes,
unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
const int tid = threadIdx.x;
if (row_start > col_start) return;
const int row_size =
fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 4];
if (tid < col_size) {
block_boxes[tid * 4 + 0] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
block_boxes[tid * 4 + 1] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
block_boxes[tid * 4 + 2] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
block_boxes[tid * 4 + 3] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
}
__syncthreads();
if (tid < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + tid;
const float *cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
unsigned long long int t = 0;
int start = 0;
if (row_start == col_start) {
start = tid + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
t |= 1ULL << i;
}
}
dev_mask[cur_box_idx * gridDim.y + col_start] = t;
}
}
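// Host-side selection sketch (an assumption for illustration; the binding
// that consumes dev_mask is not shown in this file). Boxes are visited in
// score order and kept only if no previously kept box has marked them as
// overlapping.
#include <vector>
inline std::vector<int> nms_select_kept_boxes(
    const unsigned long long* mask_host, int n_boxes) {
  const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
  std::vector<unsigned long long> removed(col_blocks, 0ULL);
  std::vector<int> keep;
  for (int i = 0; i < n_boxes; ++i) {
    const int block = i / threadsPerBlock;
    const int bit = i % threadsPerBlock;
    if (!(removed[block] & (1ULL << bit))) {
      keep.push_back(i);
      // Merge the suppression mask of the kept box into the running mask.
      const unsigned long long* row = mask_host + i * col_blocks;
      for (int j = block; j < col_blocks; ++j) removed[j] |= row[j];
    }
  }
  return keep;
}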
#endif
#include "parrots_cpp_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const DArrayLite bboxes1,
const DArrayLite bboxes2, DArrayLite ious,
const int mode, const bool aligned,
const int offset, cudaStream_t stream);
void bbox_overlaps_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int mode, offset;
bool aligned;
SSAttrs(attr)
.get<int>("mode", mode)
.get<bool>("aligned", aligned)
.get<int>("offset", offset)
.done();
const auto& bboxes1 = ins[0];
const auto& bboxes2 = ins[1];
auto& ious = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset,
stream);
}
PARROTS_EXTENSION_REGISTER(bbox_overlaps)
.attr("mode")
.attr("aligned")
.attr("offset")
.input(2)
.output(1)
.apply(bbox_overlaps_cuda)
.done();
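// Binding pattern used throughout the Parrots glue code in this commit: each
// extension declares its scalar attributes with .attr(), the number of input
// and output DArrayLite tensors with .input() / .output(), and binds the host
// entry point with .apply(). The host function unpacks the attributes via
// SSAttrs, fetches the CUDA stream from the context, and forwards everything
// to the corresponding *KernelLauncher.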
#include "bbox_overlaps_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const DArrayLite bboxes1,
const DArrayLite bboxes2, DArrayLite ious,
const int mode, const bool aligned,
const int offset, cudaStream_t stream) {
int output_size = ious.size();
int num_bbox1 = bboxes1.dim(0);
int num_bbox2 = bboxes2.dim(0);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bboxes1.elemType().prim(), ([&] {
bbox_overlaps_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
bboxes1.ptr<scalar_t>(), bboxes2.ptr<scalar_t>(),
ious.ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,
offset);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
void CARAFEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite rfeatures,
DArrayLite routput, DArrayLite rmasks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
void CARAFEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite rfeatures,
const DArrayLite masks, DArrayLite rtop_grad, DArrayLite rbottom_grad_hs,
DArrayLite rbottom_grad, DArrayLite rmask_grad, DArrayLite bottom_grad,
DArrayLite mask_grad, const int kernel_size, const int group_size,
const int scale_factor, cudaStream_t stream);
void carafe_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = ins[0];
const auto& masks = ins[1];
auto& rfeatures = outs[0];
auto& routput = outs[1];
auto& rmasks = outs[2];
auto& output = outs[3];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,
output, kernel_size, group_size, scale_factor,
stream);
}
void carafe_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = ins[0];
const auto& rfeatures = ins[1];
const auto& masks = ins[2];
auto& rtop_grad = outs[0];
auto& rbottom_grad_hs = outs[1];
auto& rbottom_grad = outs[2];
auto& rmask_grad = outs[3];
auto& bottom_grad = outs[4];
auto& mask_grad = outs[5];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
rbottom_grad_hs, rbottom_grad, rmask_grad,
bottom_grad, mask_grad, kernel_size,
group_size, scale_factor, stream);
}
PARROTS_EXTENSION_REGISTER(carafe_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(4)
.apply(carafe_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(carafe_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(6)
.apply(carafe_backward_cuda)
.done();
#include "carafe_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void CARAFEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite rfeatures,
DArrayLite routput, DArrayLite rmasks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
const int batch_size = output.dim(0);
const int channels = output.dim(1);
const int output_height = output.dim(2);
const int output_width = output.dim(3);
const int input_height = features.dim(2);
const int input_width = features.dim(3);
const int mask_channels = masks.dim(1);
// one warp per pixel
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(input_height * input_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, input_height * input_width, dh, dw,
features.ptr<scalar_t>(), rfeatures.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
const int dh = divideUP(mask_channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, mask_channels, output_height * output_width, dh, dw,
masks.ptr<scalar_t>(), rmasks.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL;
CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, rfeatures.ptr<scalar_t>(), rmasks.ptr<scalar_t>(),
kernel_size, group_size, scale_factor, channels, input_height,
input_width, output_height, output_width, mask_channels,
routput.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, channels, dh, dw,
routput.ptr<scalar_t>(), output.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CARAFEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite rfeatures,
const DArrayLite masks, DArrayLite rtop_grad, DArrayLite rbottom_grad_hs,
DArrayLite rbottom_grad, DArrayLite rmask_grad, DArrayLite bottom_grad,
DArrayLite mask_grad, const int kernel_size, const int group_size,
const int scale_factor, cudaStream_t stream) {
const int batch_size = top_grad.dim(0);
const int channels = top_grad.dim(1);
const int output_height = top_grad.dim(2);
const int output_width = top_grad.dim(3);
const int input_height = bottom_grad.dim(2);
const int input_width = bottom_grad.dim(3);
const int mask_channels = masks.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, output_height * output_width, dh, dw,
top_grad.ptr<scalar_t>(), rtop_grad.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL;
CARAFEBackward_Feature<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, rtop_grad.ptr<scalar_t>(),
masks.ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, input_height, input_width,
output_height, output_width, mask_channels,
rbottom_grad_hs.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int num_kernels =
batch_size * input_height * input_width * THREADS_PER_PIXEL;
FeatureSum<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, rbottom_grad_hs.ptr<scalar_t>(), scale_factor,
channels, input_height, input_width, rbottom_grad.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int dh = divideUP(input_height * input_width, kTileDim);
const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, input_height * input_width, channels, dh, dw,
rbottom_grad.ptr<scalar_t>(), bottom_grad.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int num_kernels = batch_size * output_height * output_width *
mask_channels * WARP_SIZE;
CARAFEBackward_Mask<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, rtop_grad.ptr<scalar_t>(),
rfeatures.ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, input_height, input_width,
output_height, output_width, mask_channels,
rmask_grad.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(mask_channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, mask_channels, dh, dw,
rmask_grad.ptr<scalar_t>(), mask_grad.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
void CARAFENAIVEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite features,
const DArrayLite masks, DArrayLite bottom_grad, DArrayLite mask_grad,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
void carafe_naive_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = ins[0];
const auto& masks = ins[1];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size,
group_size, scale_factor, stream);
}
void carafe_naive_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = ins[0];
const auto& features = ins[1];
const auto& masks = ins[2];
auto& bottom_grad = outs[0];
auto& mask_grad = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,
mask_grad, kernel_size, group_size,
scale_factor, stream);
}
PARROTS_EXTENSION_REGISTER(carafe_naive_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(1)
.apply(carafe_naive_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(carafe_naive_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(2)
.apply(carafe_naive_backward_cuda)
.done();
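// --- Illustrative sketch (not part of the build) ----------------------------
// The two bindings above pull scalar attributes out of the op with a chained
// SSAttrs(attr).get<T>(name, var)...done() call. The toy class below mimics
// that fluent pattern over a plain std::map so the control flow is easy to
// follow; it is a sketch of the pattern only, not the parrots SSAttrs
// implementation, and every name in it is made up for illustration.
#include <cassert>
#include <map>
#include <string>

class ToyAttrs {
 public:
  explicit ToyAttrs(const std::map<std::string, int> &attrs) : attrs_(attrs) {}
  ToyAttrs &get(const std::string &name, int &out) {
    auto it = attrs_.find(name);
    assert(it != attrs_.end() && "missing attribute");
    out = it->second;
    return *this;  // returning *this is what makes the calls chainable
  }
  void done() const {}  // terminator; a real implementation could validate here

 private:
  const std::map<std::string, int> &attrs_;
};

inline int toy_attrs_demo() {
  std::map<std::string, int> attr{
      {"kernel_size", 3}, {"group_size", 1}, {"scale_factor", 2}};
  int kernel_size, group_size, scale_factor;
  ToyAttrs(attr)
      .get("kernel_size", kernel_size)
      .get("group_size", group_size)
      .get("scale_factor", scale_factor)
      .done();
  return kernel_size + group_size + scale_factor;  // 3 + 1 + 2 = 6
}
// -----------------------------------------------------------------------------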
#include "carafe_naive_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
int output_size = output.size();
int channels = output.dim(1);
int height = output.dim(2);
int width = output.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
carafe_naive_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, features.ptr<scalar_t>(), masks.ptr<scalar_t>(),
output.ptr<scalar_t>(), kernel_size, group_size, scale_factor,
channels, height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CARAFENAIVEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite features,
const DArrayLite masks, DArrayLite bottom_grad, DArrayLite mask_grad,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
int output_size = top_grad.size();
int channels = top_grad.dim(1);
int height = top_grad.dim(2);
int width = top_grad.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
carafe_naive_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, top_grad.ptr<scalar_t>(), features.ptr<scalar_t>(),
masks.ptr<scalar_t>(), bottom_grad.ptr<scalar_t>(),
mask_grad.ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
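// --- Illustrative sketch (not part of the build) ----------------------------
// Both launchers above follow the one-thread-per-output-element pattern:
// output_size threads are requested as GET_BLOCKS(output_size) blocks of
// THREADS_PER_BLOCK threads, and inside the kernel each thread typically
// decodes its flat index back into NCHW coordinates. The helpers below show
// that arithmetic on the host side; the block size of 1024 is only an assumed
// value for the worked numbers (the real constant is defined in the CUDA
// helper header).
#include <cstdio>

constexpr int kThreadsPerBlockGuess = 1024;  // assumption for illustration

inline int toy_get_blocks(int n, int threads = kThreadsPerBlockGuess) {
  return (n + threads - 1) / threads;  // ceil(n / threads)
}

inline void toy_decode_nchw(int index, int channels, int height, int width) {
  // Standard NCHW flattening: index = ((n * C + c) * H + h) * W + w
  const int w = index % width;
  const int h = (index / width) % height;
  const int c = (index / width / height) % channels;
  const int n = index / width / height / channels;
  std::printf("index %d -> n=%d c=%d h=%d w=%d\n", index, n, c, h, w);
}

inline int toy_grid_demo() {
  const int output_size = 2 * 16 * 56 * 56;  // e.g. a 2x16x56x56 tensor: 100352
  return toy_get_blocks(output_size);        // ceil(100352 / 1024) = 98 blocks
}
// -----------------------------------------------------------------------------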
#include "parrots_cpp_helper.hpp"
void CAForwardCUDAKernelLauncher(const DArrayLite t, const DArrayLite f,
DArrayLite weight, CudaContext &ctx,
cudaStream_t stream);
void CABackwardCUDAKernelLauncher(const DArrayLite dw, const DArrayLite t,
const DArrayLite f, DArrayLite dt,
DArrayLite df, CudaContext &ctx,
cudaStream_t stream);
void CAMapForwardCUDAKernelLauncher(const DArrayLite weight, const DArrayLite g,
DArrayLite out, CudaContext &ctx,
cudaStream_t stream);
void CAMapBackwardCUDAKernelLauncher(const DArrayLite dout,
const DArrayLite weight,
const DArrayLite g, DArrayLite dw,
DArrayLite dg, CudaContext &ctx,
cudaStream_t stream);
void ca_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &t = ins[0];
const auto &f = ins[1];
auto &weight = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAForwardCUDAKernelLauncher(t, f, weight, ctx, stream);
}
void ca_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dw = ins[0];
const auto &t = ins[1];
const auto &f = ins[2];
auto &dt = outs[0];
auto &df = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CABackwardCUDAKernelLauncher(dw, t, f, dt, df, ctx, stream);
}
void ca_map_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &weight = ins[0];
const auto &g = ins[1];
auto &out = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAMapForwardCUDAKernelLauncher(weight, g, out, ctx, stream);
}
void ca_map_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dout = ins[0];
const auto &weight = ins[1];
const auto &g = ins[2];
auto &dw = outs[0];
auto &dg = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAMapBackwardCUDAKernelLauncher(dout, weight, g, dw, dg, ctx, stream);
}
PARROTS_EXTENSION_REGISTER(ca_forward)
.input(2)
.output(1)
.apply(ca_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(ca_backward)
.input(3)
.output(2)
.apply(ca_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(ca_map_forward)
.input(2)
.output(1)
.apply(ca_map_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(ca_map_backward)
.input(3)
.output(2)
.apply(ca_map_backward_cuda)
.done();
#include "cc_attention_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void CAForwardCUDAKernelLauncher(const DArrayLite t, const DArrayLite f,
DArrayLite weight, CudaContext &ctx,
cudaStream_t stream) {
auto n = t.dim(0);
auto c = t.dim(1);
auto h = t.dim(2);
auto w = t.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(t.elemType().prim(), [&] {
ca_forward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(t.ptr<scalar_t>(), f.ptr<scalar_t>(),
weight.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
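// --- Illustrative sketch (not part of the build) ----------------------------
// The launcher above covers the (h, w) plane with 32x32 thread blocks and adds
// h + w layers along the grid's z dimension. Worked numbers for a 65x97
// feature map (sizes chosen arbitrarily for illustration):
//   d1 = ceil(97 / 32) = 4, d2 = ceil(65 / 32) = 3, d3 = 65 + 97 = 162
//   => 4 * 3 * 162 = 1944 blocks of 32 * 32 = 1024 threads each.
// The helper below reproduces that arithmetic on the host side.
#include <cuda_runtime.h>

inline dim3 ca_forward_grid_sketch(int h, int w, dim3 threads = dim3(32, 32)) {
  const int d1 = (w + threads.x - 1) / threads.x;
  const int d2 = (h + threads.y - 1) / threads.y;
  const int d3 = h + w;  // matches the launcher above
  return dim3(d1, d2, d3);
}
// -----------------------------------------------------------------------------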
void CABackwardCUDAKernelLauncher(const DArrayLite dw, const DArrayLite t,
const DArrayLite f, DArrayLite dt,
DArrayLite df, CudaContext &ctx,
cudaStream_t stream) {
auto n = t.dim(0);
auto c = t.dim(1);
auto h = t.dim(2);
auto w = t.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(t.elemType().prim(), [&] {
ca_backward_kernel_t<scalar_t><<<blocks, threads, 0, stream>>>(
dw.ptr<scalar_t>(), t.ptr<scalar_t>(), f.ptr<scalar_t>(),
dt.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_DISPATCH_FLOATING_TYPES(f.elemType().prim(), [&] {
ca_backward_kernel_f<scalar_t><<<blocks, threads, 0, stream>>>(
dw.ptr<scalar_t>(), t.ptr<scalar_t>(), f.ptr<scalar_t>(),
df.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CAMapForwardCUDAKernelLauncher(const DArrayLite weight, const DArrayLite g,
DArrayLite out, CudaContext &ctx,
cudaStream_t stream) {
auto n = g.dim(0);
auto c = g.dim(1);
auto h = g.dim(2);
auto w = g.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(g.elemType().prim(), [&] {
ca_map_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
weight.ptr<scalar_t>(), g.ptr<scalar_t>(), out.ptr<scalar_t>(), n, c, h,
w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CAMapBackwardCUDAKernelLauncher(const DArrayLite dout,
const DArrayLite weight,
const DArrayLite g, DArrayLite dw,
DArrayLite dg, CudaContext &ctx,
cudaStream_t stream) {
auto n = dout.dim(0);
auto c = dout.dim(1);
auto h = dout.dim(2);
auto w = dout.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(weight.elemType().prim(), [&] {
ca_map_backward_kernel_w<scalar_t><<<blocks, threads, 0, stream>>>(
dout.ptr<scalar_t>(), weight.ptr<scalar_t>(), g.ptr<scalar_t>(),
dw.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_DISPATCH_FLOATING_TYPES(g.elemType().prim(), [&] {
ca_map_backward_kernel_g<scalar_t><<<blocks, threads, 0, stream>>>(
dout.ptr<scalar_t>(), weight.ptr<scalar_t>(), g.ptr<scalar_t>(),
dg.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
// Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "parrots_cpp_helper.hpp"
void bottom_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void bottom_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void top_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void top_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void left_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void left_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void right_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void right_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
.input(1)
.output(1)
.apply(bottom_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
.input(2)
.output(1)
.apply(bottom_pool_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_forward)
.input(1)
.output(1)
.apply(top_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_backward)
.input(2)
.output(1)
.apply(top_pool_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_forward)
.input(1)
.output(1)
.apply(left_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_backward)
.input(2)
.output(1)
.apply(left_pool_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_forward)
.input(1)
.output(1)
.apply(right_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_backward)
.input(2)
.output(1)
.apply(right_pool_backward_cuda)
.done();
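// --- Illustrative sketch (not part of the build) ----------------------------
// The corner pooling handlers above are registered but currently have empty
// bodies. For orientation only: in the CornerNet reference linked above,
// bottom pooling is a running maximum down each column, so every output
// element is the max of all inputs at or above it. The minimal CPU sketch
// below works on a single H x W channel (the CUDA/parrots version would act on
// NCHW tensors); treat it as one reading of that reference, not as the missing
// implementation.
#include <algorithm>
#include <vector>

inline void bottom_pool_sketch(const std::vector<float> &in,
                               std::vector<float> &out, int height, int width) {
  out = in;  // start from a copy; row 0 is already final
  for (int y = 1; y < height; ++y)
    for (int x = 0; x < width; ++x)
      out[y * width + x] =
          std::max(out[y * width + x], out[(y - 1) * width + x]);
}
// top/left/right pooling follow the same pattern with the scan direction flipped.
// -----------------------------------------------------------------------------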
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
void DeformConvForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite offset,
DArrayLite output, DArrayLite columns, DArrayLite ones, int kW, int kH,
int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step, CudaContext& ctx,
cudaStream_t stream);
void DeformConvBackwardInputCUDAKernelLauncher(
const DArrayLite input, const DArrayLite offset,
const DArrayLite gradOutput, DArrayLite gradInput, DArrayLite gradOffset,
DArrayLite weight, DArrayLite columns, int kW, int kH, int dW, int dH,
int padW, int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step, CudaContext& ctx,
cudaStream_t stream);
void DeformConvBackwardParametersCUDAKernelLauncher(
const DArrayLite input, const DArrayLite offset,
const DArrayLite gradOutput, DArrayLite gradWeight, DArrayLite columns,
DArrayLite ones, int kW, int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group, int deformable_group, float scale,
int im2col_step, CudaContext& ctx, cudaStream_t stream);
void deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
const auto input = ins[0];
const auto weight = ins[1];
const auto offset = ins[2];
auto output = outs[0];
auto columns = outs[1];
auto ones = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvForwardCUDAKernelLauncher(
input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH,
dilationW, dilationH, group, deformable_group, im2col_step, ctx, stream);
}
void deform_conv_backward_input_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
auto input = ins[0];
auto offset = ins[1];
auto gradOutput = ins[2];
auto gradInput = outs[0];
auto gradOffset = outs[1];
auto weight = outs[2];
auto columns = outs[3];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvBackwardInputCUDAKernelLauncher(
input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH,
dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step, ctx, stream);
}
void deform_conv_backward_parameters_cuda(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
float scale;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<float>("scale", scale)
.get<int>("im2col_step", im2col_step)
.done();
auto input = ins[0];
auto offset = ins[1];
auto gradOutput = ins[2];
auto gradWeight = outs[0];
auto columns = outs[1];
auto ones = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvBackwardParametersCUDAKernelLauncher(
input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH,
padW, padH, dilationW, dilationH, group, deformable_group, scale,
im2col_step, ctx, stream);
}
PARROTS_EXTENSION_REGISTER(deform_conv_forward)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("im2col_step")
.input(3)
.output(3)
.apply(deform_conv_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(deform_conv_backward_input)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("im2col_step")
.input(3)
.output(4)
.apply(deform_conv_backward_input_cuda)
.done();
PARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("scale")
.attr("im2col_step")
.input(3)
.output(3)
.apply(deform_conv_backward_parameters_cuda)
.done();
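// --- Illustrative sketch (not part of the build) ----------------------------
// The deform_conv bindings above pass the usual convolution geometry
// attributes (kW/kH, stride dW/dH, padW/padH, dilationW/dilationH) through to
// the launchers. The output spatial size implied by those attributes follows
// the standard convolution formula:
//   outW = (inW + 2*padW - dilationW*(kW - 1) - 1) / dW + 1   (likewise for H)
// The helper below is a worked example of that formula only.
#include <cassert>

inline int conv_out_size(int in, int k, int stride, int pad, int dilation) {
  return (in + 2 * pad - dilation * (k - 1) - 1) / stride + 1;
}

inline int conv_out_size_demo() {
  // A 3x3 kernel with stride 1, pad 1, dilation 1 keeps a 56-wide input at 56.
  int w = conv_out_size(56, 3, 1, 1, 1);
  assert(w == 56);
  return w;
}
// -----------------------------------------------------------------------------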
#include "parrots_cpp_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite rois, const DArrayLite offset,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream);
void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset,
int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream);
void deform_roi_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
const auto& offset = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolForwardCUDAKernelLauncher(
input, rois, offset, output, pooled_height, pooled_width, spatial_scale,
sampling_ratio, gamma, stream);
}
void deform_roi_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& grad_output = ins[0];
const auto& input = ins[1];
const auto& rois = ins[2];
const auto& offset = ins[3];
auto& grad_input = outs[0];
auto& grad_offset = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolBackwardCUDAKernelLauncher(
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma, stream);
}
PARROTS_EXTENSION_REGISTER(deform_roi_pool_forward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("gamma")
.input(3)
.output(1)
.apply(deform_roi_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(deform_roi_pool_backward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("gamma")
.input(4)
.output(2)
.apply(deform_roi_pool_backward_cuda)
.done();
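// --- Illustrative sketch (not part of the build) ----------------------------
// deform_roi_pool takes RoIs in input-image coordinates, scales them by
// spatial_scale onto the feature map, and divides each RoI into
// pooled_height x pooled_width bins (with sampling_ratio and gamma controlling
// the per-bin sampling and the offset magnitude). The bin-size arithmetic
// below follows the usual RoIAlign-style convention and is a sketch of that
// geometry only, not the kernel.
struct RoiBinSketch {
  float bin_h, bin_w;  // size of one pooling bin, in feature-map pixels
};

inline RoiBinSketch roi_bin_size(float x1, float y1, float x2, float y2,
                                 float spatial_scale, int pooled_height,
                                 int pooled_width) {
  const float roi_w = (x2 - x1) * spatial_scale;  // RoI width on the feature map
  const float roi_h = (y2 - y1) * spatial_scale;
  return {roi_h / pooled_height, roi_w / pooled_width};
}
// Example: a 224x224 RoI with spatial_scale 1/16 and a 7x7 output spans 14x14
// feature-map pixels, i.e. bins of 2x2 pixels each.
// -----------------------------------------------------------------------------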
#include "deform_roi_pool_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite rois, const DArrayLite offset,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream) {
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
deform_roi_pool_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
offset.ptr<scalar_t>(), output.ptr<scalar_t>(), pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma, channels,
height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset,
int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream) {
int output_size = grad_output.size();
int channels = grad_input.dim(1);
int height = grad_input.dim(2);
int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
deform_roi_pool_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), input.ptr<scalar_t>(),
rois.ptr<scalar_t>(), offset.ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), grad_offset.ptr<scalar_t>(),
pooled_height, pooled_width, spatial_scale, sampling_ratio,
gamma, channels, height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
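// --- Illustrative sketch (not part of the build) ----------------------------
// Every launcher in these files wraps its kernel launch in a
// PARROTS_DISPATCH_FLOATING_TYPES(_AND_HALF) macro. Conceptually, such a
// dispatch macro switches on the runtime element type, binds scalar_t to the
// matching C++ type, and runs the lambda once. The toy function below shows
// that idea with a plain switch; it is not the parrots macro, and all names in
// it are made up for illustration.
#include <cstddef>

enum class ToyDType { kFloat32, kFloat64 };

template <typename Fn>
void toy_dispatch_floating(ToyDType dtype, Fn &&fn) {
  switch (dtype) {
    case ToyDType::kFloat32: fn(float{}); break;   // lambda sees a float tag
    case ToyDType::kFloat64: fn(double{}); break;  // lambda sees a double tag
  }
}

inline std::size_t toy_dispatch_demo(ToyDType dtype) {
  std::size_t elem_size = 0;
  toy_dispatch_floating(dtype, [&](auto tag) {
    using scalar_t = decltype(tag);  // plays the role of scalar_t in the macro
    elem_size = sizeof(scalar_t);
  });
  return elem_size;  // 4 for kFloat32, 8 for kFloat64 on typical platforms
}
// -----------------------------------------------------------------------------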