Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test without importing mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in PyTorch 1.5
2. support CPU-only compilation
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream);
void sigmoid_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream);
}
void sigmoid_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha, stream);
}
void softmax_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream);
}
void softmax_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& buff = outs[0];
auto& grad_input = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
grad_input, gamma, alpha, stream);
}
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(softmax_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(2)
.apply(softmax_focal_loss_backward_cuda)
.done();
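Every op in this patch follows the same Parrots registration pattern: a host entry point with the `(CudaContext&, const SSElement&, ins, outs)` signature reads attributes through `SSAttrs`, unpacks `ins`/`outs`, and forwards to a CUDA kernel launcher, while a `PARROTS_EXTENSION_REGISTER` chain declares the attribute names and the input/output counts. The sketch below shows that skeleton for a hypothetical `example_scale_forward` op; the op name, the `scale` attribute, and the launcher are illustrative only and not part of this commit (the launcher definition would live in a matching .cu file, just like the ops above).

// Illustrative sketch of the registration pattern used throughout this commit.
#include "parrots_cpp_helper.hpp"

// Defined in a companion .cu file (hypothetical, for illustration only).
void ExampleScaleForwardCUDAKernelLauncher(const DArrayLite input,
                                           DArrayLite output, float scale,
                                           cudaStream_t stream);

void example_scale_forward_cuda(CudaContext& ctx, const SSElement& attr,
                                const OperatorBase::in_list_t& ins,
                                OperatorBase::out_list_t& outs) {
  float scale;
  SSAttrs(attr).get<float>("scale", scale).done();
  const auto& input = ins[0];
  auto& output = outs[0];
  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  ExampleScaleForwardCUDAKernelLauncher(input, output, scale, stream);
}

PARROTS_EXTENSION_REGISTER(example_scale_forward)
    .attr("scale")
    .input(1)
    .output(1)
    .apply(example_scale_forward_cuda)
    .done();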
#include "parrots_cuda_helper.hpp"
#include "sigmoid_focal_loss_kernel.cuh"
#include "softmax_focal_loss_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream) {
int output_size = grad_input.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), gamma,
alpha, num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = softmax.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
softmax.elemType().prim(), ([&] {
softmax_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream) {
int output_size = buff.size();
int num_classes = softmax.dim(1);
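// Two-pass backward: the first kernel fills `buff` with per-sample focal
// coefficients, the second kernel consumes `buff` to write grad_input.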
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda1_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), buff.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
output_size = grad_input.size();
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda2_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
buff.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
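All launchers in this patch size their grids the same way: one thread per output element, THREADS_PER_BLOCK threads per block, and GET_BLOCKS rounding the block count up. The sketch below is the assumed shape of that helper (the real definition lives in the shared CUDA helper header and may additionally cap the block count).

// Assumed grid-sizing helper, for illustration only.
#define THREADS_PER_BLOCK 512

inline int GET_BLOCKS(const int N) {
  // Ceiling division: enough blocks so that blocks * THREADS_PER_BLOCK >= N.
  return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
}
// e.g. N = 10000 elements -> GET_BLOCKS(10000) = 20 blocks of 512 threads.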
#include "parrots_cpp_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream);
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx,
DArrayLite top_data, const int height,
const int width, const int channels,
cudaStream_t stream);
void masked_im2col_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& col = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w,
stream);
}
void masked_col2im_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int height, width, channels;
SSAttrs(attr)
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& im = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
width, channels, stream);
}
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("pad_h")
.attr("pad_w")
.input(3)
.output(1)
.apply(masked_im2col_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
.attr("height")
.attr("width")
.attr("channels")
.input(3)
.output(1)
.apply(masked_col2im_forward_cuda)
.done();
#include "masked_conv2d_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream) {
int channels = bottom_data.dim(1);
int height = bottom_data.dim(2);
int width = bottom_data.dim(3);
int mask_cnt = mask_h_idx.dim(0);
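// One thread per (channel, masked position) pair; each thread copies its
// kernel_h x kernel_w patch into the column buffer.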
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedIm2colForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
kernel_h, kernel_w, pad_h, pad_w, mask_h_idx.ptr<int64_t>(),
mask_w_idx.ptr<int64_t>(), mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx,
DArrayLite top_data, const int height,
const int width, const int channels,
cudaStream_t stream) {
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedCol2imForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
channels, mask_h_idx.ptr<int64_t>(), mask_w_idx.ptr<int64_t>(),
mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
// Copyright (c) 2019, SenseTime.
#include "parrots_cpp_helper.hpp"
void ModulatedDeformConvForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite output, DArrayLite columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void ModulatedDeformConvBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite columns, DArrayLite grad_input, DArrayLite grad_weight,
DArrayLite grad_bias, DArrayLite grad_offset, DArrayLite grad_mask,
DArrayLite grad_output, int kernel_h, int kernel_w, int stride_h,
int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w,
int group, int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void modulated_deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto output = outs[0];
auto columns = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvForwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
void modulated_deform_conv_backward_cuda(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto columns = outs[0];
auto grad_input = outs[1];
auto grad_weight = outs[2];
auto grad_bias = outs[3];
auto grad_offset = outs[4];
auto grad_mask = outs[5];
auto grad_output = outs[6];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvBackwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(2)
.apply(modulated_deform_conv_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(7)
.apply(modulated_deform_conv_backward_cuda)
.done();
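The attribute list above mirrors an ordinary 2-D convolution configuration; for reference, the output spatial size implied by these attributes follows the usual convolution formula. The helper below is a worked check for illustration only, not code from this commit.

// Standard conv output-size arithmetic for the attrs registered above.
// Example: h = 64, kernel_h = 3, stride_h = 1, pad_h = 1, dilation_h = 1
//   out_h = (h + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1
//         = (64 + 2 - 3) / 1 + 1 = 64
inline int conv_out_size(int in, int kernel, int stride, int pad, int dilation) {
  return (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}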
#include "nms_kernel.cuh"
#include "parrots_cuda_helper.hpp"
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream) {
size_t boxes_num = boxes_sorted.dim(0);
if (boxes_sorted.size() == 0) {
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, 0));
return select;
}
const size_t col_blocks = DIVUP(boxes_num, threadsPerBlock);
auto mask = ctx.createDArrayLite(
DArraySpec::array(Prim::Int64, DArrayShape(boxes_num, col_blocks)));
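// Each row i of `mask` holds col_blocks 64-bit words; bit j of word b marks an
// IoU above iou_threshold between box i and box (b * threadsPerBlock + j).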
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
PARROTS_CUDA_CHECK(cudaGetLastError());
nms_cuda<<<blocks, threads, 0, stream>>>(
boxes_num, iou_threshold, offset, boxes_sorted.ptr<float>(),
(unsigned long long*)mask.ptr<int64_t>());
PARROTS_CUDA_CHECK(cudaGetLastError());
auto mask_cpu = ctx.createDArrayLite(mask, getHostProxy());
auto mask_host = mask_cpu.ptr<int64_t>();
auto remv = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, col_blocks),
getHostProxy());
remv.setZeros(syncStream());
auto remv_ptr = remv.ptr<int64_t>();
auto keep_t = ctx.createDArrayLite(DArraySpec::array(Prim::Uint8, boxes_num),
getHostProxy());
keep_t.setZeros(syncStream());
auto keep = keep_t.ptr<uint8_t>();
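// Greedy suppression on the host: keep box i unless a previously kept box has
// already suppressed it (its bit is set in remv), then fold box i's overlap
// mask into remv so it suppresses later boxes.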
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv_ptr[nblock] & (1ULL << inblock))) {
keep[i] = 1;
int64_t* p = mask_host + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_ptr[j] |= p[j];
}
}
}
auto keep_cuda = ctx.createDArrayLite(keep_t, ctx.getProxy());
PARROTS_CUDA_CHECK(cudaGetLastError());
return keep_cuda;
}
#include "parrots_cpp_helper.hpp"
using namespace parrots;
#include "parrots_cuda_helper.hpp"
using namespace parrots;
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "parrots_cuda_helper.hpp"
#include "psamask_cuda_kernel.cuh"
void PSAMaskForwardCUDAKernelLauncher(const int psa_type,
const DArrayLite input, DArrayLite output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, CudaContext& ctx) {
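// psa_type selects the PSANet mask mode: 0 = COLLECT, 1 = DISTRIBUTE.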
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_distribute_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
}
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const DArrayLite grad_output, DArrayLite grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask,
CudaContext& ctx) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_distribute_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
}
#include "parrots_cuda_helper.hpp"
#include "roi_align_kernel.cuh"
void ROIAlignForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite rois, DArrayLite output,
DArrayLite argmax_y, DArrayLite argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned,
cudaStream_t stream) {
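// pool_mode: 0 = max pooling (argmax_y/argmax_x record the sampled positions),
// 1 = average pooling; `aligned` applies the half-pixel coordinate offset.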
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
roi_align_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
output.ptr<scalar_t>(), argmax_y.ptr<scalar_t>(),
argmax_x.ptr<scalar_t>(), aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, channels,
height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void ROIAlignBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite rois,
const DArrayLite argmax_y, const DArrayLite argmax_x, DArrayLite grad_input,
int aligned_height, int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream) {
int output_size = grad_output.size();
int channels = grad_input.dim(1);
int height = grad_input.dim(2);
int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
roi_align_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), rois.ptr<scalar_t>(),
argmax_y.ptr<scalar_t>(), argmax_x.ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, channels,
height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#ifndef PARROTS_CPP_HELPER
#define PARROTS_CPP_HELPER
#include <parrots/darray/darraymath.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/darraylite.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include <vector>
using namespace parrots;
#endif // PARROTS_CPP_HELPER