Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream);
void sigmoid_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream);
}
void sigmoid_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha, stream);
}
void softmax_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream);
}
void softmax_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& buff = outs[0];
auto& grad_input = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
grad_input, gamma, alpha, stream);
}
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(softmax_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(2)
.apply(softmax_focal_loss_backward_cuda)
.done();
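
For reference, the per-element math these sigmoid focal loss kernels are expected to compute, assuming the standard mmdetection formulation, can be sketched on the CPU as follows (the helper name and signature are illustrative, not part of this commit):

#include <algorithm>
#include <cmath>
#include <cstdint>

// CPU reference sketch: input is an N x C logit array, target holds N class
// indices, weight (optional) holds C per-class weights, output is N x C.
void sigmoid_focal_loss_forward_cpu_ref(const float* input,
                                        const int64_t* target,
                                        const float* weight, float* output,
                                        int n, int num_classes, float gamma,
                                        float alpha) {
  for (int i = 0; i < n; ++i) {
    for (int c = 0; c < num_classes; ++c) {
      // p = sigmoid(logit)
      float p = 1.f / (1.f + std::exp(-input[i * num_classes + c]));
      float loss;
      if (target[i] == c) {
        // positive class: -alpha * (1 - p)^gamma * log(p)
        loss = -alpha * std::pow(1.f - p, gamma) *
               std::log(std::max(p, 1e-12f));
      } else {
        // negative class: -(1 - alpha) * p^gamma * log(1 - p)
        loss = -(1.f - alpha) * std::pow(p, gamma) *
               std::log(std::max(1.f - p, 1e-12f));
      }
      if (weight != nullptr) loss *= weight[target[i]];
      output[i * num_classes + c] = loss;
    }
  }
}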
#include "parrots_cuda_helper.hpp"
#include "sigmoid_focal_loss_kernel.cuh"
#include "softmax_focal_loss_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream) {
int output_size = grad_input.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), gamma,
alpha, num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = softmax.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
softmax.elemType().prim(), ([&] {
softmax_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream) {
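// Backward runs in two passes: the first kernel computes a per-sample
// coefficient into buff, the second combines buff with the softmax output
// to fill grad_input.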
int output_size = buff.size();
int num_classes = softmax.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda1_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), buff.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
output_size = grad_input.size();
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda2_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
buff.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream);
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx,
DArrayLite top_data, const int height,
const int width, const int channels,
cudaStream_t stream);
void masked_im2col_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& col = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w,
stream);
}
void masked_col2im_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int height, width, channels;
SSAttrs(attr)
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& im = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
width, channels, stream);
}
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("pad_h")
.attr("pad_w")
.input(3)
.output(1)
.apply(masked_im2col_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
.attr("height")
.attr("width")
.attr("channels")
.input(3)
.output(1)
.apply(masked_col2im_forward_cuda)
.done();
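
For reference, MaskedIm2colForward gathers, for every masked spatial location and every (channel, kernel-offset) pair, one padded input pixel into the column buffer; out-of-range reads produce zeros. A minimal CPU sketch under those assumptions (illustrative, not part of this commit):

#include <cstdint>

// bottom_data: 1 x C x H x W input; top_data: (C * kernel_h * kernel_w) x
// mask_cnt column buffer; mask_h_idx / mask_w_idx: masked positions.
void masked_im2col_cpu_ref(const float* bottom_data, const int64_t* mask_h_idx,
                           const int64_t* mask_w_idx, float* top_data,
                           int channels, int height, int width, int kernel_h,
                           int kernel_w, int pad_h, int pad_w, int mask_cnt) {
  for (int c = 0; c < channels; ++c) {
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        int row = (c * kernel_h + kh) * kernel_w + kw;
        for (int m = 0; m < mask_cnt; ++m) {
          int h = static_cast<int>(mask_h_idx[m]) + kh - pad_h;
          int w = static_cast<int>(mask_w_idx[m]) + kw - pad_w;
          bool inside = h >= 0 && h < height && w >= 0 && w < width;
          // out-of-bounds taps read as zero (implicit zero padding)
          top_data[row * mask_cnt + m] =
              inside ? bottom_data[(c * height + h) * width + w] : 0.f;
        }
      }
    }
  }
}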
#include "masked_conv2d_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream) {
int channels = bottom_data.dim(1);
int height = bottom_data.dim(2);
int width = bottom_data.dim(3);
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedIm2colForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
kernel_h, kernel_w, pad_h, pad_w, mask_h_idx.ptr<int64_t>(),
mask_w_idx.ptr<int64_t>(), mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx,
DArrayLite top_data, const int height,
const int width, const int channels,
cudaStream_t stream) {
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedCol2imForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
channels, mask_h_idx.ptr<int64_t>(), mask_w_idx.ptr<int64_t>(),
mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
// Copyright (c) 2019, SenseTime.
#include "parrots_cpp_helper.hpp"
void ModulatedDeformConvForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite output, DArrayLite columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void ModulatedDeformConvBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite columns, DArrayLite grad_input, DArrayLite grad_weight,
DArrayLite grad_bias, DArrayLite grad_offset, DArrayLite grad_mask,
DArrayLite grad_output, int kernel_h, int kernel_w, int stride_h,
int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w,
int group, int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void modulated_deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto output = outs[0];
auto columns = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvForwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
void modulated_deform_conv_backward_cuda(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto columns = outs[0];
auto grad_input = outs[1];
auto grad_weight = outs[2];
auto grad_bias = outs[3];
auto grad_offset = outs[4];
auto grad_mask = outs[5];
auto grad_output = outs[6];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvBackwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(2)
.apply(modulated_deform_conv_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(7)
.apply(modulated_deform_conv_backward_cuda)
.done();
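
For context, each tap of a modulated deformable convolution (the DCNv2 formulation) samples the input at a fractionally offset location by bilinear interpolation and scales the result by a learned modulation mask before the usual weighted sum. A minimal bilinear-sampling sketch (illustrative, not part of this commit):

#include <cmath>

// Sample a single-channel H x W image at fractional coordinates (y, x);
// out-of-bounds corners contribute zero, as with zero padding at the border.
float bilinear_sample(const float* im, int height, int width, float y,
                      float x) {
  int y0 = static_cast<int>(std::floor(y));
  int x0 = static_cast<int>(std::floor(x));
  float ly = y - y0, lx = x - x0, v = 0.f;
  for (int dy = 0; dy <= 1; ++dy) {
    for (int dx = 0; dx <= 1; ++dx) {
      int yy = y0 + dy, xx = x0 + dx;
      if (yy < 0 || yy >= height || xx < 0 || xx >= width) continue;
      // each corner's weight is the product of the opposite fractional parts
      v += ((dy ? ly : 1.f - ly) * (dx ? lx : 1.f - lx)) * im[yy * width + xx];
    }
  }
  return v;
}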
#include "parrots_cpp_helper.hpp"
#define DIVUP(x, y) (((x) + (y)-1) / (y))
int const threadsPerBlock = sizeof(unsigned long long) * 8;
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream);
void nms_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes_sorted = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
outs[0] = NMSCUDAKernelLauncher(boxes_sorted, order, areas, iou_threshold,
offset, ctx, stream);
}
void nms_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
size_t nboxes = boxes.shape().dim(0);
size_t boxes_dim = boxes.shape().dim(1);
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, nboxes),
getHostProxy());
select.setZeros(syncStream());
if (boxes.size() == 0) {
outs[0] = select;
return;
}
fill(ctx, select, *toScalar(1));
auto select_ptr = select.ptr<int64_t>();
auto boxes_ptr = boxes.ptr<float>();
auto order_ptr = order.ptr<int64_t>();
auto areas_ptr = areas.ptr<float>();
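// Greedy NMS over boxes in descending score order (given by `order`):
// keep each surviving box and suppress every later box whose IoU with it
// reaches iou_threshold.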
for (size_t _i = 0; _i < nboxes; _i++) {
if (select_ptr[_i] == 0) continue;
auto i = order_ptr[_i];
auto ix1 = boxes_ptr[i * boxes_dim];
auto iy1 = boxes_ptr[i * boxes_dim + 1];
auto ix2 = boxes_ptr[i * boxes_dim + 2];
auto iy2 = boxes_ptr[i * boxes_dim + 3];
auto iarea = areas_ptr[i];
for (size_t _j = _i + 1; _j < nboxes; _j++) {
if (select_ptr[_j] == 0) continue;
auto j = order_ptr[_j];
auto xx1 = fmaxf(ix1, boxes_ptr[j * boxes_dim]);
auto yy1 = fmaxf(iy1, boxes_ptr[j * boxes_dim + 1]);
auto xx2 = fminf(ix2, boxes_ptr[j * boxes_dim + 2]);
auto yy2 = fminf(iy2, boxes_ptr[j * boxes_dim + 3]);
auto w = fmaxf(0.0, xx2 - xx1 + offset);
auto h = fmaxf(0.0, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas_ptr[j] - inter);
if (ovr >= iou_threshold) select_ptr[_j] = 0;
}
}
outs[0] = select;
}
void softnms_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
float sigma;
float min_score;
int method;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<float>("sigma", sigma)
.get<float>("min_score", min_score)
.get<int>("method", method)
.get<int>("offset", offset)
.done();
const auto& boxes = ins[0];
const auto& scores = ins[1];
const auto& areas = ins[2];
size_t nboxes = boxes.shape().dim(0);
size_t boxes_dim = boxes.shape().dim(1);
auto boxes_ptr = boxes.ptr<float>();
auto scores_ptr = scores.ptr<float>();
auto areas_ptr = areas.ptr<float>();
auto inputs = ctx.createDArrayLite(
DArraySpec::array(Prim::Float32, DArrayShape(nboxes, 6)));
auto inputs_ptr = inputs.ptr<float>();
auto dets = ctx.createDArrayLite(
DArraySpec::array(Prim::Float32, DArrayShape(nboxes, 5)));
auto de = dets.ptr<float>();
for (size_t i = 0; i < nboxes; i++) {
inputs_ptr[i * 6 + 0] = boxes_ptr[i * boxes_dim + 0];
inputs_ptr[i * 6 + 1] = boxes_ptr[i * boxes_dim + 1];
inputs_ptr[i * 6 + 2] = boxes_ptr[i * boxes_dim + 2];
inputs_ptr[i * 6 + 3] = boxes_ptr[i * boxes_dim + 3];
inputs_ptr[i * 6 + 4] = scores_ptr[i];
inputs_ptr[i * 6 + 5] = areas_ptr[i];
}
size_t pos = 0;
auto inds_t = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, nboxes));
arange(ctx, *toScalar(0), *toScalar(nboxes), *toScalar(1), inds_t);
auto inds = inds_t.ptr<int64_t>();
auto num_out = ctx.createDArrayLite(DArraySpec::scalar(Prim::Int64));
for (size_t i = 0; i < nboxes; i++) {
auto max_score = inputs_ptr[i * 6 + 4];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < inputs_ptr[pos * 6 + 4]) {
max_score = inputs_ptr[pos * 6 + 4];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = inputs_ptr[max_pos * 6 + 0];
auto iy1 = de[i * 5 + 1] = inputs_ptr[max_pos * 6 + 1];
auto ix2 = de[i * 5 + 2] = inputs_ptr[max_pos * 6 + 2];
auto iy2 = de[i * 5 + 3] = inputs_ptr[max_pos * 6 + 3];
auto iscore = de[i * 5 + 4] = inputs_ptr[max_pos * 6 + 4];
auto iarea = inputs_ptr[max_pos * 6 + 5];
auto iind = inds[max_pos];
inputs_ptr[max_pos * 6 + 0] = inputs_ptr[i * 6 + 0];
inputs_ptr[max_pos * 6 + 1] = inputs_ptr[i * 6 + 1];
inputs_ptr[max_pos * 6 + 2] = inputs_ptr[i * 6 + 2];
inputs_ptr[max_pos * 6 + 3] = inputs_ptr[i * 6 + 3];
inputs_ptr[max_pos * 6 + 4] = inputs_ptr[i * 6 + 4];
inputs_ptr[max_pos * 6 + 5] = inputs_ptr[i * 6 + 5];
inds[max_pos] = inds[i];
inputs_ptr[i * 6 + 0] = ix1;
inputs_ptr[i * 6 + 1] = iy1;
inputs_ptr[i * 6 + 2] = ix2;
inputs_ptr[i * 6 + 3] = iy2;
inputs_ptr[i * 6 + 4] = iscore;
inputs_ptr[i * 6 + 5] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = fmaxf(ix1, inputs_ptr[pos * 6 + 0]);
auto yy1 = fmaxf(iy1, inputs_ptr[pos * 6 + 1]);
auto xx2 = fminf(ix2, inputs_ptr[pos * 6 + 2]);
auto yy2 = fminf(iy2, inputs_ptr[pos * 6 + 3]);
auto w = fmaxf(0.0, xx2 - xx1 + offset);
auto h = fmaxf(0.0, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + inputs_ptr[pos * 6 + 5] - inter);
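// Decay the score of box `pos`: method 0 = hard NMS (zero the score above
// the IoU threshold), 1 = linear decay, 2 = Gaussian decay.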
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = exp(-(ovr * ovr) / sigma);
}
inputs_ptr[pos * 6 + 4] *= weight;
// if the box score falls below min_score, discard it by swapping it with
// the last box and shrinking the effective box count
if (inputs_ptr[pos * 6 + 4] < min_score) {
inputs_ptr[pos * 6 + 0] = inputs_ptr[(nboxes - 1) * 6 + 0];
inputs_ptr[pos * 6 + 1] = inputs_ptr[(nboxes - 1) * 6 + 1];
inputs_ptr[pos * 6 + 2] = inputs_ptr[(nboxes - 1) * 6 + 2];
inputs_ptr[pos * 6 + 3] = inputs_ptr[(nboxes - 1) * 6 + 3];
inputs_ptr[pos * 6 + 4] = inputs_ptr[(nboxes - 1) * 6 + 4];
inputs_ptr[pos * 6 + 5] = inputs_ptr[(nboxes - 1) * 6 + 5];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
setScalar(num_out, static_cast<int64_t>(nboxes));
outs[0] = dets;
outs[1] = inds_t;
outs[2] = num_out;
}
void nms_match_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
SSAttrs(attr).get<float>("iou_threshold", iou_threshold).done();
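// NOTE: only the attribute is parsed here; the match computation itself is
// not implemented in this file.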
}
PARROTS_EXTENSION_REGISTER(nms)
.attr("iou_threshold")
.attr("offset")
.input(3)
.output(1)
.apply(nms_cpu)
#ifdef PARROTS_USE_CUDA
.apply(nms_cuda)
#endif
.done();
PARROTS_EXTENSION_REGISTER(softnms)
.attr("iou_threshold")
.attr("sigma")
.attr("min_score")
.attr("method")
.attr("offset")
.input(3)
.output(3)
.apply(softnms_cpu)
.done();
PARROTS_EXTENSION_REGISTER(nms_match)
.attr("iou_threshold")
.input(1)
.output(1)
.apply(nms_match_cpu)
.done();
#include "parrots_cpp_helper.hpp"
using namespace parrots;