Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support CPU-only compilation
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream);
void sigmoid_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream);
}
void sigmoid_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha, stream);
}
void softmax_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
  auto& output = outs[0];
  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha, stream);
}
void softmax_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& buff = outs[0];
auto& grad_input = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
grad_input, gamma, alpha, stream);
}
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(softmax_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(2)
.apply(softmax_focal_loss_backward_cuda)
.done();
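// For reference, these ops implement the focal loss of Lin et al., "Focal
// Loss for Dense Object Detection" (ICCV 2017). The following host-side
// sketch of the per-element sigmoid variant is illustrative only and is not
// used by the extension; the CUDA kernels parallelize the same computation
// over every (sample, class) pair.
#include <cmath>
static inline float sigmoid_focal_loss_ref(float x, bool is_target,
                                           float gamma, float alpha) {
  const float p = 1.0f / (1.0f + std::exp(-x));       // sigmoid probability
  const float pt = is_target ? p : 1.0f - p;          // prob. of the true class
  const float at = is_target ? alpha : 1.0f - alpha;  // class-balance factor
  return -at * std::pow(1.0f - pt, gamma) * std::log(std::fmax(pt, 1e-12f));
}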
#include "parrots_cuda_helper.hpp"
#include "sigmoid_focal_loss_kernel.cuh"
#include "softmax_focal_loss_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream) {
int output_size = grad_input.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), gamma,
alpha, num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = softmax.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
softmax.elemType().prim(), ([&] {
softmax_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream) {
int output_size = buff.size();
int num_classes = softmax.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda1_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), buff.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
output_size = grad_input.size();
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda2_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
buff.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
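// Note: the softmax focal loss backward pass above is deliberately split into
// two kernels: cuda1 writes the per-sample focal weight into `buff` (launched
// over buff.size() elements), and cuda2 then applies the softmax chain rule
// across all classes using that buffer (launched over grad_input.size()
// elements), so the weight is computed once per sample rather than once per
// (sample, class) pair.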
#include "parrots_cpp_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream);
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
                                           const DArrayLite mask_h_idx,
                                           const DArrayLite mask_w_idx,
                                           DArrayLite top_data, const int height,
                                           const int width, const int channels,
                                           cudaStream_t stream);
void masked_im2col_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& col = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w,
stream);
}
void masked_col2im_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int height, width, channels;
SSAttrs(attr)
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& im = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
                                        width, channels, stream);
}
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("pad_h")
.attr("pad_w")
.input(3)
.output(1)
.apply(masked_im2col_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
.attr("height")
.attr("width")
.attr("channels")
.input(3)
.output(1)
.apply(masked_col2im_forward_cuda)
.done();
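// Note: masked_im2col gathers kernel_h x kernel_w patches only at the
// mask_cnt spatial positions listed in (mask_h_idx, mask_w_idx), so the
// masked convolution reduces to a single GEMM over mask_cnt columns instead
// of a dense im2col over the whole feature map; masked_col2im scatters the
// GEMM result back to those same positions.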
#include "masked_conv2d_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream) {
int channels = bottom_data.dim(1);
int height = bottom_data.dim(2);
int width = bottom_data.dim(3);
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedIm2colForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
kernel_h, kernel_w, pad_h, pad_w, mask_h_idx.ptr<int64_t>(),
mask_w_idx.ptr<int64_t>(), mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
                                           const DArrayLite mask_h_idx,
                                           const DArrayLite mask_w_idx,
                                           DArrayLite top_data, const int height,
                                           const int width, const int channels,
                                           cudaStream_t stream) {
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedCol2imForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
channels, mask_h_idx.ptr<int64_t>(), mask_w_idx.ptr<int64_t>(),
mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
// Copyright (c) 2019, SenseTime.
#include "parrots_cpp_helper.hpp"
void ModulatedDeformConvForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite output, DArrayLite columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void ModulatedDeformConvBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite columns, DArrayLite grad_input, DArrayLite grad_weight,
DArrayLite grad_bias, DArrayLite grad_offset, DArrayLite grad_mask,
DArrayLite grad_output, int kernel_h, int kernel_w, int stride_h,
int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w,
int group, int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void modulated_deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto output = outs[0];
auto columns = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvForwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
void modulated_deform_conv_backward_cuda(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto columns = outs[0];
auto grad_input = outs[1];
auto grad_weight = outs[2];
auto grad_bias = outs[3];
auto grad_offset = outs[4];
auto grad_mask = outs[5];
auto grad_output = outs[6];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvBackwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(2)
.apply(modulated_deform_conv_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(7)
.apply(modulated_deform_conv_backward_cuda)
.done();
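// For reference: modulated deformable convolution (DCNv2, Zhu et al., 2019)
// augments every kernel sampling location with a learned offset and a learned
// modulation scalar, roughly
//   y(p) = sum_k w_k * x(p + p_k + delta_p_k) * delta_m_k,
// and the launchers below realize it as an offset-aware im2col
// (modulated_deformable_im2col_cuda), a grouped GEMM against the weights, and
// an optional bias add.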
#include "modulated_deform_conv_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void modulated_deformable_im2col_cuda(
const DArrayLite data_im, const DArrayLite data_offset,
const DArrayLite data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
DArrayLite data_col, cudaStream_t stream) {
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.elemType().prim(), ([&] {
modulated_deformable_im2col_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, data_im.ptr<scalar_t>(), data_offset.ptr<scalar_t>(),
            data_mask.ptr<scalar_t>(), height_im, width_im, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, batch_size, channels,
deformable_group, height_col, width_col, data_col.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void modulated_deformable_col2im_cuda(
const DArrayLite data_col, const DArrayLite data_offset,
const DArrayLite data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
DArrayLite grad_im, cudaStream_t stream) {
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels =
channels * kernel_h * kernel_w * batch_size * height_col * width_col;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.elemType().prim(), ([&] {
modulated_deformable_col2im_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, data_col.ptr<scalar_t>(), data_offset.ptr<scalar_t>(),
data_mask.ptr<scalar_t>(), channels, height_im, width_im, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, batch_size, deformable_group,
height_col, width_col, grad_im.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void modulated_deformable_col2im_coord_cuda(
const DArrayLite data_col, const DArrayLite data_im,
const DArrayLite data_offset, const DArrayLite data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, DArrayLite grad_offset,
DArrayLite grad_mask, cudaStream_t stream) {
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
kernel_w * deformable_group;
const int channel_per_deformable_group =
channels * kernel_h * kernel_w / deformable_group;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.elemType().prim(), ([&] {
modulated_deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, data_col.ptr<scalar_t>(), data_im.ptr<scalar_t>(),
data_offset.ptr<scalar_t>(), data_mask.ptr<scalar_t>(), channels,
height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, channel_per_deformable_group,
batch_size, 2 * kernel_h * kernel_w * deformable_group,
deformable_group, height_col, width_col,
grad_offset.ptr<scalar_t>(), grad_mask.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void ModulatedDeformConvForwardCUDAKernelLauncher(
DArrayLite input, DArrayLite weight, DArrayLite bias, DArrayLite ones,
DArrayLite offset, DArrayLite mask, DArrayLite output, DArrayLite columns,
int kernel_h, int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group, const int deformable_group,
const bool with_bias, CudaContext& ctx, cudaStream_t stream) {
const int batch = input.dim(0);
const int channels = input.dim(1);
const int height = input.dim(2);
const int width = input.dim(3);
const int channels_out = weight.dim(0);
const int channels_kernel = weight.dim(1);
const int kernel_h_ = weight.dim(2);
const int kernel_w_ = weight.dim(3);
PARROTS_CHECKARGS(kernel_h_ == kernel_h && kernel_w_ == kernel_w)
<< "Input shape and kernel shape wont match: (" << kernel_h << " x "
<< kernel_w << " vs " << kernel_h_ << " x " << kernel_w_ << ").";
PARROTS_CHECKARGS(channels == channels_kernel * group)
<< "Input shape and kernel channels wont match: (" << channels << " vs "
<< channels_kernel * group << ").";
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndims() != 2 || ones.dim(0) * ones.dim(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = ctx.createDArrayLite(input.elemType(),
DArrayShape(height_out, width_out));
fill(ctx, ones, *toScalar(1));
}
// resize output
output = output.view({batch, channels_out, height_out, width_out});
output.setZeros(ctx.getStream());
// resize temporary columns
columns = ctx.createDArrayLite(
input.elemType(),
DArrayShape(channels * kernel_h * kernel_w, 1 * height_out * width_out));
columns.setZeros(ctx.getStream());
output = output.view({output.dim(0), group, output.dim(1) / group,
output.dim(2), output.dim(3)});
for (size_t b = 0; b < batch; b++) {
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns, stream);
// divide into group
weight = weight.view({group, weight.dim(0) / group, weight.dim(1),
weight.dim(2), weight.dim(3)});
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)});
for (size_t g = 0; g < group; g++) {
auto output_g = output[b][g];
gemm(ctx, 1, false,
weight[g].view(
{weight.dim(1), weight.dim(2) * weight.dim(3) * weight.dim(4)}),
false, columns[g], 1, output_g);
}
weight = weight.view({weight.dim(0) * weight.dim(1), weight.dim(2),
weight.dim(3), weight.dim(4)});
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)});
}
output = output.view({output.dim(0), output.dim(1) * output.dim(2),
output.dim(3), output.dim(4)});
if (with_bias) {
bias = bias.view({1, bias.dim(0), 1, 1});
add(ctx, output, bias, output);
}
}
void ModulatedDeformConvBackwardCUDAKernelLauncher(
DArrayLite input, DArrayLite weight, DArrayLite bias, DArrayLite ones,
DArrayLite offset, DArrayLite mask, DArrayLite columns,
DArrayLite grad_input, DArrayLite grad_weight, DArrayLite grad_bias,
DArrayLite grad_offset, DArrayLite grad_mask, DArrayLite grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias, CudaContext& ctx, cudaStream_t stream) {
const int batch = input.dim(0);
const int channels = input.dim(1);
const int height = input.dim(2);
const int width = input.dim(3);
const int channels_kernel = weight.dim(1);
const int kernel_h_ = weight.dim(2);
const int kernel_w_ = weight.dim(3);
PARROTS_CHECKARGS(kernel_h_ == kernel_h && kernel_w_ == kernel_w)
<< "Input shape and kernel shape wont match: (" << kernel_h << " x "
<< kernel_w << " vs " << kernel_h_ << " x " << kernel_w_ << ").";
PARROTS_CHECKARGS(channels == channels_kernel * group)
<< "Input shape and kernel channels wont match: (" << channels << " vs "
<< channels_kernel * group << ").";
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndims() != 2 || ones.dim(0) * ones.dim(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = ctx.createDArrayLite(input.elemType(),
DArrayShape(height_out, width_out));
fill(ctx, ones, *toScalar(1));
}
grad_input = grad_input.view({batch, channels, height, width});
columns = ctx.createDArrayLite(
input.elemType(),
DArrayShape(channels * kernel_h * kernel_w, height_out * width_out));
grad_output =
grad_output.view({grad_output.dim(0), group, grad_output.dim(1) / group,
grad_output.dim(2), grad_output.dim(3)});
for (size_t b = 0; b < batch; b++) {
    // divide into group
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)});
weight = weight.view({group, weight.dim(0) / group, weight.dim(1),
weight.dim(2), weight.dim(3)});
for (size_t g = 0; g < group; g++) {
auto columns_g = columns[g];
gemm(ctx, 1, true,
weight[g].view(
{weight.dim(1), weight.dim(2) * weight.dim(3) * weight.dim(4)}),
false,
grad_output[b][g].view(
{grad_output.dim(2), grad_output.dim(3) * grad_output.dim(4)}),
0, columns_g);
}
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)});
weight = weight.view({weight.dim(0) * weight.dim(1), weight.dim(2),
weight.dim(3), weight.dim(4)});
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b], stream);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b], stream);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns, stream);
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)});
grad_weight =
grad_weight.view({group, grad_weight.dim(0) / group, grad_weight.dim(1),
grad_weight.dim(2), grad_weight.dim(3)});
if (with_bias) {
grad_bias = grad_bias.view({group, grad_bias.dim(0) / group});
}
for (size_t g = 0; g < group; g++) {
auto grad_weight_g = grad_weight[g].view(
{grad_weight.dim(1),
grad_weight.dim(2) * grad_weight.dim(3) * grad_weight.dim(4)});
gemm(ctx, 1, false,
grad_output[b][g].view(
{grad_output.dim(2), grad_output.dim(3) * grad_output.dim(4)}),
true, columns[g], 1, grad_weight_g);
if (with_bias) {
auto grad_bias_g = grad_bias[g].view({grad_bias.dim(1), 1});
gemm(ctx, 1, false,
grad_output[b][g].view(
{grad_output.dim(2), grad_output.dim(3) * grad_output.dim(4)}),
false, ones.view({ones.dim(0) * ones.dim(1), 1}), 1, grad_bias_g);
}
}
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)});
grad_weight = grad_weight.view({grad_weight.dim(0) * grad_weight.dim(1),
grad_weight.dim(2), grad_weight.dim(3),
grad_weight.dim(4)});
if (with_bias)
grad_bias =
grad_bias.view(DArrayShape{grad_bias.dim(0) * grad_bias.dim(1)});
}
grad_output = grad_output.view({grad_output.dim(0) * grad_output.dim(1),
grad_output.dim(2), grad_output.dim(3),
grad_output.dim(4)});
}
#include "parrots_cpp_helper.hpp"
#define DIVUP(x, y) (((x) + (y)-1) / (y))
int const threadsPerBlock = sizeof(unsigned long long) * 8;
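// The CUDA path encodes suppression decisions as a bitmask: threadsPerBlock
// equals the number of bits in an unsigned long long (64), and the device
// kernel fills a (boxes_num x DIVUP(boxes_num, 64)) mask in which bit j of
// row i, word b marks that box i suppresses box 64 * b + j. Checking whether
// box i is already removed is then a single bit probe,
//   remv_ptr[i / threadsPerBlock] & (1ULL << (i % threadsPerBlock)),
// as done in NMSCUDAKernelLauncher.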
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream);
void nms_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes_sorted = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
outs[0] = NMSCUDAKernelLauncher(boxes_sorted, order, areas, iou_threshold,
offset, ctx, stream);
}
void nms_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
size_t nboxes = boxes.shape().dim(0);
size_t boxes_dim = boxes.shape().dim(1);
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, nboxes),
getHostProxy());
select.setZeros(syncStream());
if (boxes.size() == 0) {
outs[0] = select;
return;
}
fill(ctx, select, *toScalar(1));
auto select_ptr = select.ptr<int64_t>();
auto boxes_ptr = boxes.ptr<float>();
auto order_ptr = order.ptr<int64_t>();
auto areas_ptr = areas.ptr<float>();
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select_ptr[_i] == 0) continue;
auto i = order_ptr[_i];
auto ix1 = boxes_ptr[i * boxes_dim];
auto iy1 = boxes_ptr[i * boxes_dim + 1];
auto ix2 = boxes_ptr[i * boxes_dim + 2];
auto iy2 = boxes_ptr[i * boxes_dim + 3];
auto iarea = areas_ptr[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select_ptr[_j] == 0) continue;
auto j = order_ptr[_j];
auto xx1 = fmaxf(ix1, boxes_ptr[j * boxes_dim]);
auto yy1 = fmaxf(iy1, boxes_ptr[j * boxes_dim + 1]);
auto xx2 = fminf(ix2, boxes_ptr[j * boxes_dim + 2]);
auto yy2 = fminf(iy2, boxes_ptr[j * boxes_dim + 3]);
auto w = fmaxf(0.0, xx2 - xx1 + offset);
auto h = fmaxf(0.0, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas_ptr[j] - inter);
if (ovr >= iou_threshold) select_ptr[_j] = 0;
}
}
outs[0] = select;
}
void softnms_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
float sigma;
float min_score;
int method;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<float>("sigma", sigma)
.get<float>("min_score", min_score)
.get<int>("method", method)
.get<int>("offset", offset)
.done();
const auto& boxes = ins[0];
const auto& scores = ins[1];
const auto& areas = ins[2];
size_t nboxes = boxes.shape().dim(0);
size_t boxes_dim = boxes.shape().dim(1);
auto boxes_ptr = boxes.ptr<float>();
auto scores_ptr = scores.ptr<float>();
auto areas_ptr = areas.ptr<float>();
auto inputs = ctx.createDArrayLite(
DArraySpec::array(Prim::Float32, DArrayShape(nboxes, 6)));
auto inputs_ptr = inputs.ptr<float>();
auto dets = ctx.createDArrayLite(
DArraySpec::array(Prim::Float32, DArrayShape(nboxes, 5)));
  auto dets_ptr = dets.ptr<float>();
for (size_t i = 0; i < nboxes; i++) {
inputs_ptr[i * 6 + 0] = boxes_ptr[i * boxes_dim + 0];
inputs_ptr[i * 6 + 1] = boxes_ptr[i * boxes_dim + 1];
inputs_ptr[i * 6 + 2] = boxes_ptr[i * boxes_dim + 2];
inputs_ptr[i * 6 + 3] = boxes_ptr[i * boxes_dim + 3];
inputs_ptr[i * 6 + 4] = scores_ptr[i];
inputs_ptr[i * 6 + 5] = areas_ptr[i];
}
size_t pos = 0;
auto inds_t = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, nboxes));
arange(ctx, *toScalar(0), *toScalar(nboxes), *toScalar(1), inds_t);
auto inds = inds_t.ptr<int64_t>();
auto num_out = ctx.createDArrayLite(DArraySpec::scalar(Prim::Int64));
for (size_t i = 0; i < nboxes; i++) {
auto max_score = inputs_ptr[i * 6 + 4];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < inputs_ptr[pos * 6 + 4]) {
max_score = inputs_ptr[pos * 6 + 4];
max_pos = pos;
}
pos = pos + 1;
}
// swap
    auto ix1 = dets_ptr[i * 5 + 0] = inputs_ptr[max_pos * 6 + 0];
    auto iy1 = dets_ptr[i * 5 + 1] = inputs_ptr[max_pos * 6 + 1];
    auto ix2 = dets_ptr[i * 5 + 2] = inputs_ptr[max_pos * 6 + 2];
    auto iy2 = dets_ptr[i * 5 + 3] = inputs_ptr[max_pos * 6 + 3];
    auto iscore = dets_ptr[i * 5 + 4] = inputs_ptr[max_pos * 6 + 4];
auto iarea = inputs_ptr[max_pos * 6 + 5];
auto iind = inds[max_pos];
inputs_ptr[max_pos * 6 + 0] = inputs_ptr[i * 6 + 0];
inputs_ptr[max_pos * 6 + 1] = inputs_ptr[i * 6 + 1];
inputs_ptr[max_pos * 6 + 2] = inputs_ptr[i * 6 + 2];
inputs_ptr[max_pos * 6 + 3] = inputs_ptr[i * 6 + 3];
inputs_ptr[max_pos * 6 + 4] = inputs_ptr[i * 6 + 4];
inputs_ptr[max_pos * 6 + 5] = inputs_ptr[i * 6 + 5];
inds[max_pos] = inds[i];
inputs_ptr[i * 6 + 0] = ix1;
inputs_ptr[i * 6 + 1] = iy1;
inputs_ptr[i * 6 + 2] = ix2;
inputs_ptr[i * 6 + 3] = iy2;
inputs_ptr[i * 6 + 4] = iscore;
inputs_ptr[i * 6 + 5] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = fmaxf(ix1, inputs_ptr[pos * 6 + 0]);
auto yy1 = fmaxf(iy1, inputs_ptr[pos * 6 + 1]);
auto xx2 = fminf(ix2, inputs_ptr[pos * 6 + 2]);
auto yy2 = fminf(iy2, inputs_ptr[pos * 6 + 3]);
auto w = fmaxf(0.0, xx2 - xx1 + offset);
auto h = fmaxf(0.0, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + inputs_ptr[pos * 6 + 5] - inter);
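      // Soft-NMS score decay (Bodla et al., 2017): method 0 is classic hard
      // NMS (zero the score when ovr >= iou_threshold), method 1 decays
      // overlapping scores linearly by (1 - ovr), and method 2 applies a
      // Gaussian penalty exp(-ovr^2 / sigma).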
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = exp(-(ovr * ovr) / sigma);
}
inputs_ptr[pos * 6 + 4] *= weight;
      // if the box score falls below min_score, discard the box by swapping
      // it with the last box and updating the box count N
if (inputs_ptr[pos * 6 + 4] < min_score) {
inputs_ptr[pos * 6 + 0] = inputs_ptr[(nboxes - 1) * 6 + 0];
inputs_ptr[pos * 6 + 1] = inputs_ptr[(nboxes - 1) * 6 + 1];
inputs_ptr[pos * 6 + 2] = inputs_ptr[(nboxes - 1) * 6 + 2];
inputs_ptr[pos * 6 + 3] = inputs_ptr[(nboxes - 1) * 6 + 3];
inputs_ptr[pos * 6 + 4] = inputs_ptr[(nboxes - 1) * 6 + 4];
inputs_ptr[pos * 6 + 5] = inputs_ptr[(nboxes - 1) * 6 + 5];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
setScalar(num_out, int64_t{nboxes});
outs[0] = dets;
outs[1] = inds_t;
outs[2] = num_out;
}
void nms_match_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
SSAttrs(attr).get<float>("iou_threshold", iou_threshold).done();
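  // NOTE: this CPU binding is currently a stub: iou_threshold is parsed but
  // no match groups are written to outs.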
}
PARROTS_EXTENSION_REGISTER(nms)
.attr("iou_threshold")
.attr("offset")
.input(3)
.output(1)
.apply(nms_cpu)
#ifdef PARROTS_USE_CUDA
.apply(nms_cuda)
#endif
.done();
PARROTS_EXTENSION_REGISTER(softnms)
.attr("iou_threshold")
.attr("sigma")
.attr("min_score")
.attr("method")
.attr("offset")
.input(3)
.output(3)
.apply(softnms_cpu)
.done();
PARROTS_EXTENSION_REGISTER(nms_match)
.attr("iou_threshold")
.input(1)
.output(1)
.apply(nms_match_cpu)
.done();
#include "nms_kernel.cuh"
#include "parrots_cuda_helper.hpp"
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream) {
size_t boxes_num = boxes_sorted.dim(0);
if (boxes_sorted.size() == 0) {
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, 0));
return select;
}
const size_t col_blocks = DIVUP(boxes_num, threadsPerBlock);
auto mask = ctx.createDArrayLite(
DArraySpec::array(Prim::Int64, DArrayShape(boxes_num, col_blocks)));
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
PARROTS_CUDA_CHECK(cudaGetLastError());
nms_cuda<<<blocks, threads, 0, stream>>>(
boxes_num, iou_threshold, offset, boxes_sorted.ptr<float>(),
(unsigned long long*)mask.ptr<int64_t>());
PARROTS_CUDA_CHECK(cudaGetLastError());
auto mask_cpu = ctx.createDArrayLite(mask, getHostProxy());
auto mask_host = mask_cpu.ptr<int64_t>();
auto remv = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, col_blocks),
getHostProxy());
remv.setZeros(syncStream());
auto remv_ptr = remv.ptr<int64_t>();
auto keep_t = ctx.createDArrayLite(DArraySpec::array(Prim::Uint8, boxes_num),
getHostProxy());
keep_t.setZeros(syncStream());
auto keep = keep_t.ptr<uint8_t>();
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv_ptr[nblock] & (1ULL << inblock))) {
keep[i] = 1;
int64_t* p = mask_host + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_ptr[j] |= p[j];
}
}
}
auto keep_cuda = ctx.createDArrayLite(keep_t, ctx.getProxy());
PARROTS_CUDA_CHECK(cudaGetLastError());
return keep_cuda;
}
#include "parrots_cpp_helper.hpp"
using namespace parrots;
#include "parrots_cuda_helper.hpp"
using namespace parrots;
#include "parrots_cpp_helper.hpp"
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
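// PSAMask (from PSANet, Zhao et al., 2018) expands a per-position attention
// mask of size h_mask x w_mask into a full h_feature x w_feature attention
// map. The "collect" and "distribute" variants below write the same values
// but swap the roles of the feature position (h, w) and the mask offset
// (hidx, widx) in the buffer index, i.e. each is the transpose of the other.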
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *mask_data,
float *buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *mask_data,
float *buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *buffer_diff,
float *mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] = buffer_diff[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w];
}
}
}
}
}
}
void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const float *buffer_diff, float *mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
}
}
}
void psamask_forward_cpu(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
auto input_ptr = input.ptr<float>();
auto output_ptr = output.ptr<float>();
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr, output_ptr);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr, output_ptr);
}
void psamask_backward_cpu(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
  const auto &grad_output = ins[0];
  auto &grad_input = outs[0];
  auto grad_output_ptr = grad_output.ptr<float>();
  auto grad_input_ptr = grad_input.ptr<float>();
  if (psa_type == 0)
    psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
                             half_h_mask, half_w_mask, grad_output_ptr,
                             grad_input_ptr);
  else
    psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
                                half_h_mask, half_w_mask, grad_output_ptr,
                                grad_input_ptr);
}
void PSAMaskForwardCUDAKernelLauncher(const int psa_type,
const DArrayLite input, DArrayLite output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, CudaContext &ctx);
void PSAMaskBackwardCUDAKernelLauncher(const int psa_type,
const DArrayLite grad_output,
DArrayLite grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask, CudaContext &ctx);
void psamask_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, ctx);
}
void psamask_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
  const auto &grad_output = ins[0];
  auto &grad_input = outs[0];
  PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_,
                                    h_feature, w_feature, h_mask, w_mask,
                                    half_h_mask, half_w_mask, ctx);
}
PARROTS_EXTENSION_REGISTER(psamask_forward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_forward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(psamask_forward_cuda)
#endif
.done();
PARROTS_EXTENSION_REGISTER(psamask_backward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_backward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(psamask_backward_cuda)
#endif
.done();
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "parrots_cuda_helper.hpp"
#include "psamask_cuda_kernel.cuh"
void PSAMaskForwardCUDAKernelLauncher(const int psa_type,
const DArrayLite input, DArrayLite output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, CudaContext& ctx) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_distribute_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
}
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const DArrayLite grad_output, DArrayLite grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask,
CudaContext& ctx) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_distribute_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
}
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
void ROIAlignForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite rois, DArrayLite output,
DArrayLite argmax_y, DArrayLite argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned,
cudaStream_t stream);
void ROIAlignBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite rois,
const DArrayLite argmax_y, const DArrayLite argmax_x, DArrayLite grad_input,
int aligned_height, int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream);
void roi_align_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
auto& output = outs[0];
auto& argmax_y = outs[1];
auto& argmax_x = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, stream);
}
void roi_align_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
const auto& grad_output = ins[0];
const auto& rois = ins[1];
const auto& argmax_y = ins[2];
const auto& argmax_x = ins[3];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIAlignBackwardCUDAKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned, stream);
}
PARROTS_EXTENSION_REGISTER(roi_align_forward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(2)
.output(3)
.apply(roi_align_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(roi_align_backward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(4)
.output(1)
.apply(roi_align_backward_cuda)
.done();
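// Note: pool_mode selects max pooling (0; the sampled coordinates of the
// maximum are recorded in argmax_y / argmax_x for the backward pass) or
// average pooling (1), and `aligned` enables the half-pixel coordinate
// correction of the aligned RoIAlign variant.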
#include "parrots_cuda_helper.hpp"
#include "roi_align_kernel.cuh"
void ROIAlignForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite rois, DArrayLite output,
DArrayLite argmax_y, DArrayLite argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned,
cudaStream_t stream) {
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
roi_align_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
output.ptr<scalar_t>(), argmax_y.ptr<scalar_t>(),
argmax_x.ptr<scalar_t>(), aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, channels,
height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void ROIAlignBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite rois,
const DArrayLite argmax_y, const DArrayLite argmax_x, DArrayLite grad_input,
int aligned_height, int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream) {
int output_size = grad_output.size();
int channels = grad_input.dim(1);
int height = grad_input.dim(2);
int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
roi_align_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), rois.ptr<scalar_t>(),
argmax_y.ptr<scalar_t>(), argmax_x.ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, channels,
height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
void ROIPoolForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite rois, DArrayLite output,
DArrayLite argmax, int pooled_height,
int pooled_width, float spatial_scale,
cudaStream_t stream);
void ROIPoolBackwardCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite rois,
const DArrayLite argmax,
DArrayLite grad_input, int pooled_height,
int pooled_width, float spatial_scale,
cudaStream_t stream);
void roi_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
auto& output = outs[0];
auto& argmax = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale, stream);
}
void roi_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.done();
const auto& grad_output = ins[0];
const auto& rois = ins[1];
const auto& argmax = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input,
pooled_height, pooled_width, spatial_scale,
stream);
}
PARROTS_EXTENSION_REGISTER(roi_pool_forward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.input(2)
.output(2)
.apply(roi_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(roi_pool_backward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.input(3)
.output(1)
.apply(roi_pool_backward_cuda)
.done();
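// Note: the forward op records, for each pooled output element, the
// flattened h * w index of the winning input location in `argmax`; the
// backward op uses that index to scatter each grad_output element back into
// grad_input (with an atomic add, since RoIs may overlap).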
#include "parrots_cuda_helper.hpp"
#include "roi_pool_kernel.cuh"
void ROIPoolForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite rois, DArrayLite output,
DArrayLite argmax, int pooled_height,
int pooled_width, float spatial_scale,
cudaStream_t stream) {
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(input.elemType().prim(), [&] {
roi_pool_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
output.ptr<scalar_t>(), argmax.ptr<int>(), pooled_height,
pooled_width, spatial_scale, channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void ROIPoolBackwardCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite rois,
const DArrayLite argmax,
DArrayLite grad_input, int pooled_height,
int pooled_width, float spatial_scale,
cudaStream_t stream) {
int output_size = grad_output.size();
  // channels / height / width describe the input feature map (grad_input),
  // not the pooled grad_output
  int channels = grad_input.dim(1);
  int height = grad_input.dim(2);
  int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(grad_output.elemType().prim(), [&] {
roi_pool_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), rois.ptr<scalar_t>(),
argmax.ptr<int>(), grad_input.ptr<scalar_t>(), pooled_height,
pooled_width, channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
void SyncBNForwardMeanCUDAKernelLauncher(const DArrayLite input,
DArrayLite mean, cudaStream_t stream);
void SyncBNForwardVarCUDAKernelLauncher(const DArrayLite input,
const DArrayLite mean, DArrayLite var,
cudaStream_t stream);
void SyncBNForwardOutputCUDAKernelLauncher(
const DArrayLite input, const DArrayLite mean, const DArrayLite var,
DArrayLite running_mean, DArrayLite running_var, const DArrayLite weight,
const DArrayLite bias, DArrayLite norm, DArrayLite std, DArrayLite output,
const float eps, const float momentum, size_t group_size,
cudaStream_t stream);
void SyncBNBackwardParamCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite norm,
DArrayLite weight_diff,
DArrayLite bias_diff,
cudaStream_t stream);
void SyncBNBackwardDataCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite weight,
const DArrayLite weight_diff, const DArrayLite bias_diff,
const DArrayLite norm, const DArrayLite std, DArrayLite grad_input,
cudaStream_t stream);
void sync_bn_forward_mean_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& input = ins[0];
auto& mean = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardMeanCUDAKernelLauncher(input, mean, stream);
}
void sync_bn_forward_var_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& input = ins[0];
const auto& mean = ins[1];
auto& var = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardVarCUDAKernelLauncher(input, mean, var, stream);
}
void sync_bn_forward_output_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
size_t group_size;
float eps, momentum;
SSAttrs(attr)
.get<float>("eps", eps)
.get<float>("momentum", momentum)
.get<size_t>("group_size", group_size)
.done();
const auto& input = ins[0];
const auto& mean = ins[1];
const auto& var = ins[2];
const auto& weight = ins[3];
const auto& bias = ins[4];
auto& running_mean = outs[0];
auto& running_var = outs[1];
auto& norm = outs[2];
auto& std = outs[3];
auto& output = outs[4];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardOutputCUDAKernelLauncher(
input, mean, var, running_mean, running_var, weight, bias, norm, std,
output, eps, momentum, group_size, stream);
}
void sync_bn_backward_param_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& grad_output = ins[0];
const auto& norm = ins[1];
auto& grad_weight = outs[0];
auto& grad_bias = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight,
grad_bias, stream);
}
void sync_bn_backward_data_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& grad_output = ins[0];
const auto& weight = ins[1];
const auto& grad_weight = ins[2];
const auto& grad_bias = ins[3];
const auto& norm = ins[4];
const auto& std = ins[5];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight,
grad_bias, norm, std, grad_input,
stream);
}
PARROTS_EXTENSION_REGISTER(sync_bn_forward_mean)
.input(1)
.output(1)
.apply(sync_bn_forward_mean_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_forward_var)
.input(2)
.output(1)
.apply(sync_bn_forward_var_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_forward_output)
.attr("eps")
.attr("momentum")
.attr("group_size")
.input(5)
.output(5)
.apply(sync_bn_forward_output_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_backward_param)
.input(2)
.output(2)
.apply(sync_bn_backward_param_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_backward_data)
.input(6)
.output(1)
.apply(sync_bn_backward_data_cuda)
.done();
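
// The forward pass is deliberately split into three ops (mean -> var ->
// output) rather than one: between stages the Python caller can all-reduce
// the per-GPU statistics across the process group (whose size is passed in as
// "group_size"), which is what makes the batch norm synchronized. The same
// reasoning splits the backward pass into a parameter-gradient op and a
// data-gradient op.

// --- SyncBN CUDA kernel launchers ---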
#include "parrots_cuda_helper.hpp"
#include "sync_bn_cuda_kernel.cuh"
void SyncBNForwardMeanCUDAKernelLauncher(const DArrayLite input,
DArrayLite mean, cudaStream_t stream) {
int num = input.dim(0);
int channels = input.dim(1);
int spatial = input.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sync_bn_forward_mean_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(input.ptr<scalar_t>(),
mean.ptr<float>(), num,
channels, spatial);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SyncBNForwardVarCUDAKernelLauncher(const DArrayLite input,
const DArrayLite mean, DArrayLite var,
cudaStream_t stream) {
int num = input.dim(0);
int channels = input.dim(1);
int spatial = input.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sync_bn_forward_var_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
input.ptr<scalar_t>(), mean.ptr<float>(), var.ptr<float>(), num,
channels, spatial);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SyncBNForwardOutputCUDAKernelLauncher(
const DArrayLite input, const DArrayLite mean, const DArrayLite var,
DArrayLite running_mean, DArrayLite running_var, const DArrayLite weight,
const DArrayLite bias, DArrayLite norm, DArrayLite std, DArrayLite output,
float eps, float momentum, size_t group_size, cudaStream_t stream) {
int num = input.dim(0);
int channels = input.dim(1);
int spatial = input.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sync_bn_forward_output_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
input.ptr<scalar_t>(), mean.ptr<float>(), var.ptr<float>(),
running_mean.ptr<float>(), running_var.ptr<float>(),
weight.ptr<float>(), bias.ptr<float>(), norm.ptr<float>(),
std.ptr<float>(), output.ptr<scalar_t>(), num, channels,
spatial, eps, momentum, group_size);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SyncBNBackwardParamCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite norm,
DArrayLite grad_weight,
DArrayLite grad_bias,
cudaStream_t stream) {
int num = grad_output.dim(0);
int channels = grad_output.dim(1);
int spatial = grad_output.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
sync_bn_backward_param_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
grad_output.ptr<scalar_t>(), norm.ptr<float>(),
grad_weight.ptr<float>(), grad_bias.ptr<float>(), num, channels,
spatial);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SyncBNBackwardDataCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite weight,
const DArrayLite grad_weight, const DArrayLite grad_bias,
const DArrayLite norm, const DArrayLite std, DArrayLite grad_input,
cudaStream_t stream) {
int output_size = grad_input.size();
int num = grad_input.dim(0);
int channels = grad_input.dim(1);
int spatial = grad_input.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
sync_bn_backward_data_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), weight.ptr<float>(),
grad_weight.ptr<float>(), grad_bias.ptr<float>(),
norm.ptr<float>(), std.ptr<float>(), grad_input.ptr<scalar_t>(),
num, channels, spatial);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
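
// --- parrots_cpp_helper.hpp: common includes for the CPU-side bindings ---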
#ifndef PARROTS_CPP_HELPER
#define PARROTS_CPP_HELPER
#include <parrots/darray/darraymath.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/darraylite.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include <vector>
using namespace parrots;
#endif // PARROTS_CPP_HELPER
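
// --- parrots_cuda_helper.hpp: includes and macros shared by the .cu files ---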
#ifndef PARROTS_CUDA_HELPER
#define PARROTS_CUDA_HELPER
#include <cuda.h>
#include <float.h>
#include <stdio.h>   // fprintf in PARROTS_CUDA_CHECK
#include <stdlib.h>  // exit in PARROTS_CUDA_CHECK
#include <parrots/darray/darraymath.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/darrayutil.hpp>
#include <parrots/foundation/exceptions.hpp>
#include <parrots/foundation/float16.hpp>
#include <parrots/foundation/mathfunction.hpp>
#include "common_cuda_helper.hpp"
#include "parrots_cudawarpfunction.cuh"
using namespace parrots;
using phalf = float16;
#define __PHALF(x) (x.y)
#define PARROTS_CUDA_CHECK(exp) \
do { \
cudaError_t err = exp; \
if (err != cudaSuccess) { \
fprintf(stderr, "cudaCheckError() failed : %s\n", \
cudaGetErrorString(err)); \
exit(-1); \
} \
} while (0)
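// Typical use, as in the launchers above: query the launch status right after
// a kernel launch and abort on failure:
//
//   PARROTS_CUDA_CHECK(cudaGetLastError());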
#define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \
case prim_type: { \
using scalar_t = type; \
return __VA_ARGS__(); \
}
#define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...) \
[&] { \
const auto& the_type = TYPE; \
switch (the_type) { \
PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \
PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \
default: \
PARROTS_NOTSUPPORTED; \
} \
}()
#define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...) \
[&] { \
const auto& the_type = TYPE; \
switch (the_type) { \
PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \
PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \
PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \
default: \
PARROTS_NOTSUPPORTED; \
} \
}()
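// Usage sketch for the dispatch macros (mirrors the launchers above): the
// switch binds scalar_t to double/float/float16 according to the runtime
// element type and then invokes the lambda, so one templated kernel covers
// every floating type:
//
//   PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(arr.elemType().prim(), [&] {
//     some_kernel<scalar_t><<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(
//         arr.ptr<scalar_t>(), n);
//   });
//
// An unsupported element type falls through to PARROTS_NOTSUPPORTED.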
/** atomicAdd **/
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
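// Standard polyfill from the CUDA C Programming Guide: devices older than
// sm_60 have no native double-precision atomicAdd, so emulate it with a
// 64-bit compare-and-swap loop on the same address.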
static __inline__ __device__ double atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
if (val == 0.0) return __longlong_as_double(old);
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
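// CUDA has no CAS on 16-bit words, so this half-precision atomicAdd widens to
// the enclosing 4-byte-aligned word: read it, splice the updated half into
// the high or low 16 bits depending on the address parity, and publish the
// whole word with a 32-bit atomicCAS, retrying until no other thread raced.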
static __inline__ __device__ float16 atomicAdd(float16* address, float16 val) {
unsigned int* aligned =
(unsigned int*)((size_t)address - ((size_t)address & 2));
unsigned int old = *aligned;
unsigned int assumed;
unsigned short old_as_us;
do {
assumed = old;
old_as_us =
(unsigned short)((size_t)address & 2 ? old >> 16 : old & 0xffff);
#if __CUDACC_VER_MAJOR__ >= 9
float16 tmp;
tmp.x = old_as_us;
float16 sum = tmp + val;
unsigned short sum_as_us = sum.x;
// half sum = __float2half_rn(__half2float(__ushort_as_half(old_as_us))
// + (float)(val)); unsigned short sum_as_us = __half_as_ushort(sum);
#else
unsigned short sum_as_us =
__float2half_rn(__half2float(old_as_us) + (float)(val));
#endif
unsigned int sum_as_ui = (size_t)address & 2
? (sum_as_us << 16) | (old & 0xffff)
: (old & 0xffff0000) | sum_as_us;
old = atomicCAS(aligned, assumed, sum_as_ui);
} while (assumed != old);
//__half_raw raw = {old_as_us};
// return float16(raw);
return *reinterpret_cast<float16*>(&old_as_us);
}
#endif // PARROTS_CUDA_HELPER