Unverified commit 48d99025, authored by z55250825, committed by GitHub

Add new parrots extension implementation for all ops (#794)

* delete all old parrots files and add the new parrots op impl for bbox_overlaps

* support the first new-impl parrots op (bbox_overlaps); test succeeded

* add box_iou_rotated op, test succeeded

* add carafe and carafe_naive ops, test succeeded (one parrots bug needs fixing)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test success (one open question remains)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
#ifndef MASKED_CONV2D_PYTORCH_H
#define MASKED_CONV2D_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w);
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels);
#endif // MASKED_CONV2D_PYTORCH_H
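// Not part of this header: a minimal sketch of how a parrots-side wrapper for
// masked_im2col_forward_cuda could look, following the same buildATensor
// pattern used by the other ops in this commit. The wrapper name and the
// attr/input/output layout below are illustrative assumptions, not the actual
// masked_conv2d registration.
void masked_im2col_forward_cuda_parrots(CudaContext& ctx,
                                        const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int kernel_h, kernel_w, pad_h, pad_w;
  SSAttrs(attr)
      .get<int>("kernel_h", kernel_h)
      .get<int>("kernel_w", kernel_w)
      .get<int>("pad_h", pad_h)
      .get<int>("pad_w", pad_w)
      .done();
  const auto& im = buildATensor(ctx, ins[0]);
  const auto& mask_h_idx = buildATensor(ctx, ins[1]);
  const auto& mask_w_idx = buildATensor(ctx, ins[2]);
  auto col = buildATensor(ctx, outs[0]);
  masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
                             kernel_w, pad_h, pad_w);
}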
// Copyright (c) 2019, SenseTime.
#include "pytorch_cpp_helper.hpp"

#ifdef MMCV_WITH_CUDA
void ModulatedDeformConvForwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias);
void ModulatedDeformConvBackwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias);
void modulated_deform_conv_forward_cuda(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  ModulatedDeformConvForwardCUDAKernelLauncher(
      input, weight, bias, ones, offset, mask, output, columns, kernel_h,
      kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
}
void modulated_deform_conv_backward_cuda(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  ModulatedDeformConvBackwardCUDAKernelLauncher(
      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
}
#endif
void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(bias);
    CHECK_CUDA_INPUT(ones);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(mask);
    CHECK_CUDA_INPUT(output);
    CHECK_CUDA_INPUT(columns);

    modulated_deform_conv_forward_cuda(
        input, weight, bias, ones, offset, mask, output, columns, kernel_h,
        kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
        group, deformable_group, with_bias);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("ModulatedDeformConv is not implemented on CPU");
}
}
void modulated_deform_conv_backward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(grad_offset);
CHECK_CUDA_INPUT(grad_mask);
CHECK_CUDA_INPUT(grad_output);
modulated_deform_conv_backward_cuda(
input, weight, bias, ones, offset, mask, columns, grad_input,
grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
group, deformable_group, with_bias);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("ModulatedDeformConv is not implemented on CPU");
}
}
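// Not part of this diff: a minimal sketch of how the two dispatcher functions
// above could be exposed to Python with pybind11 / torch extensions. The
// module setup and doc strings are illustrative assumptions; mmcv's actual
// binding file may differ.
#include <torch/extension.h>

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward,
        "modulated deform conv forward (CUDA)");
  m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward,
        "modulated deform conv backward (CUDA)");
}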
#include "modulated_deform_conv_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void modulated_deformable_im2col_cuda(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  // num_axes should be smaller than block size
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels = channels * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();

        modulated_deformable_im2col_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            at::cuda::getCurrentCUDAStream()>>>(
            num_kernels, data_im_, data_offset_, data_mask_, height_im,
            width_im, kernel_h, kenerl_w, pad_h, pad_w, stride_h, stride_w,
            dilation_h, dilation_w, channel_per_deformable_group, batch_size,
            channels, deformable_group, height_col, width_col, data_col_);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}
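// Launch-size arithmetic for the dispatch above, as a worked example: with
// batch_size = 1, channels = 64 and a 56 x 56 output, num_kernels is
// 64 * 1 * 56 * 56 = 200704, i.e. one thread per (channel, output location)
// pair. Assuming GET_BLOCKS is the usual ceil-division helper and
// THREADS_PER_BLOCK is 512 (both come from the cuda helper header and may be
// defined differently), the kernel is launched with 200704 / 512 = 392 blocks
// of 512 threads.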
void modulated_deformable_col2im_cuda(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels =
      channels * kernel_h * kernel_w * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();

        modulated_deformable_col2im_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            at::cuda::getCurrentCUDAStream()>>>(
            num_kernels, data_col_, data_offset_, data_mask_, channels,
            height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
            stride_w, dilation_h, dilation_w, channel_per_deformable_group,
            batch_size, deformable_group, height_col, width_col, grad_im_);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}
void modulated_deformable_col2im_coord_cuda(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
                          kernel_w * deformable_group;
  const int channel_per_deformable_group =
      channels * kernel_h * kernel_w / deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
        scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();

        modulated_deformable_col2im_coord_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            at::cuda::getCurrentCUDAStream()>>>(
            num_kernels, data_col_, data_im_, data_offset_, data_mask_,
            channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,
            stride_h, stride_w, dilation_h, dilation_w,
            channel_per_deformable_group, batch_size,
            2 * kernel_h * kernel_w * deformable_group, deformable_group,
            height_col, width_col, grad_offset_, grad_mask_);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}
void ModulatedDeformConvForwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  const int channels_out = weight.size(0);
  const int channels_kernel = weight.size(1);
  const int kernel_h_ = weight.size(2);
  const int kernel_w_ = weight.size(3);

  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
    AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
             kernel_h_, kernel_w, kernel_h_, kernel_w_);
  if (channels != channels_kernel * group)
    AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
             channels, channels_kernel * group);

  const int height_out =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

  if (ones.ndimension() != 2 ||
      ones.size(0) * ones.size(1) < height_out * width_out) {
    // Resize plane and fill with ones...
    ones = at::ones({height_out, width_out}, input.options());
  }

  // resize output
  output = output.view({batch, channels_out, height_out, width_out}).zero_();
  // resize temporary columns
  columns =
      at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
                input.options());

  output = output.view({output.size(0), group, output.size(1) / group,
                        output.size(2), output.size(3)});

  for (int b = 0; b < batch; b++) {
    modulated_deformable_im2col_cuda(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    // divide into group
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});

    for (int g = 0; g < group; g++) {
      output[b][g] = output[b][g]
                         .flatten(1)
                         .addmm_(weight[g].flatten(1), columns[g])
                         .view_as(output[b][g]);
    }

    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
  }

  output = output.view({output.size(0), output.size(1) * output.size(2),
                        output.size(3), output.size(4)});

  if (with_bias) {
    output += bias.view({1, bias.size(0), 1, 1});
  }
}
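// Worked example of the height_out / width_out formula used above: for
// height = 64, pad_h = 1, dilation_h = 1, kernel_h = 3, stride_h = 1 the
// effective kernel extent is dilation_h * (kernel_h - 1) + 1 = 3, so
// height_out = (64 + 2 - 3) / 1 + 1 = 64 (a "same"-size convolution); with
// stride_h = 2 the integer division gives height_out = 63 / 2 + 1 = 32.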
void ModulatedDeformConvBackwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  const int channels_kernel = weight.size(1);
  const int kernel_h_ = weight.size(2);
  const int kernel_w_ = weight.size(3);
  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
    AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
             kernel_h_, kernel_w, kernel_h_, kernel_w_);
  if (channels != channels_kernel * group)
    AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
             channels, channels_kernel * group);

  const int height_out =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

  if (ones.ndimension() != 2 ||
      ones.size(0) * ones.size(1) < height_out * width_out) {
    // Resize plane and fill with ones...
    ones = at::ones({height_out, width_out}, input.options());
  }

  grad_input = grad_input.view({batch, channels, height, width});
  columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
                      input.options());

  grad_output =
      grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
                        grad_output.size(2), grad_output.size(3)});

  for (int b = 0; b < batch; b++) {
    // divide int group
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});

    for (int g = 0; g < group; g++) {
      columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
                        grad_output[b][g].flatten(1), 0.0f, 1.0f);
    }

    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});

    // gradient w.r.t. input coordinate data
    modulated_deformable_col2im_coord_cuda(
        columns, input[b], offset[b], mask[b], 1, channels, height, width,
        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
        grad_mask[b]);
    // gradient w.r.t. input data
    modulated_deformable_col2im_cuda(
        columns, offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, grad_input[b]);

    // gradient w.r.t. weight, dWeight should accumulate across the batch and
    // group
    modulated_deformable_im2col_cuda(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
                                    grad_weight.size(1), grad_weight.size(2),
                                    grad_weight.size(3)});
    if (with_bias)
      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});

    for (int g = 0; g < group; g++) {
      grad_weight[g] =
          grad_weight[g]
              .flatten(1)
              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
              .view_as(grad_weight[g]);
      if (with_bias) {
        grad_bias[g] =
            grad_bias[g]
                .view({-1, 1})
                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
                .view(-1);
      }
    }

    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
                                    grad_weight.size(2), grad_weight.size(3),
                                    grad_weight.size(4)});
    if (with_bias)
      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
  }
  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
                                  grad_output.size(2), grad_output.size(3),
                                  grad_output.size(4)});
}
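// Note on the in-place GEMMs above: x.addmm_(a, b, beta, alpha) computes
// x = beta * x + alpha * (a @ b). The backward pass therefore overwrites
// columns with weight^T @ grad_output by passing beta = 0, alpha = 1, while
// the grad_weight / grad_bias updates use the default beta = alpha = 1 so the
// products accumulate across the batch and group loops.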
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "modulated_deform_conv_pytorch.h"
using namespace parrots;
void modulated_deform_conv_forward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& weight = buildATensor(ctx, ins[1]);
const auto& bias = buildATensor(ctx, ins[2]);
const auto& ones = buildATensor(ctx, ins[3]);
const auto& offset = buildATensor(ctx, ins[4]);
const auto& mask = buildATensor(ctx, ins[5]);
auto output = buildATensor(ctx, outs[0]);
auto columns = buildATensor(ctx, outs[1]);
modulated_deform_conv_forward_cuda(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
}
void modulated_deform_conv_backward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& weight = buildATensor(ctx, ins[1]);
const auto& bias = buildATensor(ctx, ins[2]);
const auto& ones = buildATensor(ctx, ins[3]);
const auto& offset = buildATensor(ctx, ins[4]);
const auto& mask = buildATensor(ctx, ins[5]);
auto columns = buildATensor(ctx, outs[0]);
auto grad_input = buildATensor(ctx, outs[1]);
auto grad_weight = buildATensor(ctx, outs[2]);
auto grad_bias = buildATensor(ctx, outs[3]);
auto grad_offset = buildATensor(ctx, outs[4]);
auto grad_mask = buildATensor(ctx, outs[5]);
auto grad_output = buildATensor(ctx, outs[6]);
modulated_deform_conv_backward_cuda(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
}
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(2)
.apply(modulated_deform_conv_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(7)
.apply(modulated_deform_conv_backward_cuda_parrots)
.done();
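// For reference, the registrations above mirror the order in which the
// wrappers unpack their arguments: the 6 inputs of
// modulated_deform_conv_forward are (input, weight, bias, ones, offset, mask)
// and its 2 outputs are (output, columns); the backward op reuses the same 6
// inputs and produces (columns, grad_input, grad_weight, grad_bias,
// grad_offset, grad_mask, grad_output) as its 7 outputs.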
#ifndef MODULATED_DEFORM_CONV_PYTORCH_H
#define MODULATED_DEFORM_CONV_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void modulated_deform_conv_forward_cuda(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias);
void modulated_deform_conv_backward_cuda(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias);
#endif // MODULATED_DEFORM_CONV_PYTORCH_H
#include "parrots_cpp_helper.hpp"
#define DIVUP(x, y) (((x) + (y)-1) / (y))
int const threadsPerBlock = sizeof(unsigned long long) * 8;
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream);
void nms_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes_sorted = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
outs[0] = NMSCUDAKernelLauncher(boxes_sorted, order, areas, iou_threshold,
offset, ctx, stream);
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset);
Tensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return NMSCUDAKernelLauncher(boxes, scores, iou_threshold, offset);
}
#endif
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }
  auto x1_t = boxes.select(1, 0).contiguous();
  auto y1_t = boxes.select(1, 1).contiguous();
  auto x2_t = boxes.select(1, 2).contiguous();
  auto y2_t = boxes.select(1, 3).contiguous();

  Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);

  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto nboxes = boxes.size(0);
  Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));

  auto select = select_t.data_ptr<bool>();
  auto order = order_t.data_ptr<int64_t>();
  auto x1 = x1_t.data_ptr<float>();
  auto y1 = y1_t.data_ptr<float>();
  auto x2 = x2_t.data_ptr<float>();
  auto y2 = y2_t.data_ptr<float>();
  auto areas = areas_t.data_ptr<float>();

  for (int64_t _i = 0; _i < nboxes; _i++) {
    if (select[_i] == false) continue;
    auto i = order[_i];
    auto ix1 = x1[i];
    auto iy1 = y1[i];
    auto ix2 = x2[i];
    auto iy2 = y2[i];
    auto iarea = areas[i];

    for (int64_t _j = _i + 1; _j < nboxes; _j++) {
      if (select[_j] == false) continue;
      auto j = order[_j];
      auto xx1 = std::max(ix1, x1[j]);
      auto yy1 = std::max(iy1, y1[j]);
      auto xx2 = std::min(ix2, x2[j]);
      auto yy2 = std::min(iy2, y2[j]);

      auto w = std::max(0.f, xx2 - xx1 + offset);
      auto h = std::max(0.f, yy2 - yy1 + offset);
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[j] - inter);
      if (ovr >= iou_threshold) select[_j] = false;
    }
  }
  return order_t.masked_select(select_t);
}
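// Worked example of the overlap test above: boxes (0, 0, 10, 10) and
// (5, 5, 15, 15) with offset = 0 intersect in a 5 x 5 = 25 region, each has
// area 100, so ovr = 25 / (100 + 100 - 25) is about 0.143; with offset = 1
// the areas become 11 * 11 = 121, the intersection 6 * 6 = 36, and
// ovr = 36 / (121 + 121 - 36) is about 0.175.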
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CUDA_INPUT(scores);
return nms_cuda(boxes, scores, iou_threshold, offset);
#else
AT_ERROR("nms is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(boxes);
CHECK_CPU_INPUT(scores);
return nms_cpu(boxes, scores, iou_threshold, offset);
}
}
Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
                   float iou_threshold, float sigma, float min_score,
                   int method, int offset) {
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  auto x1_t = boxes.select(1, 0).contiguous();
  auto y1_t = boxes.select(1, 1).contiguous();
  auto x2_t = boxes.select(1, 2).contiguous();
  auto y2_t = boxes.select(1, 3).contiguous();
  auto scores_t = scores.clone();

  Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);

  auto nboxes = boxes.size(0);
  auto x1 = x1_t.data_ptr<float>();
  auto y1 = y1_t.data_ptr<float>();
  auto x2 = x2_t.data_ptr<float>();
  auto y2 = y2_t.data_ptr<float>();
  auto sc = scores_t.data_ptr<float>();
  auto areas = areas_t.data_ptr<float>();
  auto de = dets.data_ptr<float>();

  int64_t pos = 0;
  Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
  auto inds = inds_t.data_ptr<int64_t>();

  for (int64_t i = 0; i < nboxes; i++) {
    auto max_score = sc[i];
    auto max_pos = i;

    pos = i + 1;
    // get max box
    while (pos < nboxes) {
      if (max_score < sc[pos]) {
        max_score = sc[pos];
        max_pos = pos;
      }
      pos = pos + 1;
    }
    // swap
    auto ix1 = de[i * 5 + 0] = x1[max_pos];
    auto iy1 = de[i * 5 + 1] = y1[max_pos];
    auto ix2 = de[i * 5 + 2] = x2[max_pos];
    auto iy2 = de[i * 5 + 3] = y2[max_pos];
    auto iscore = de[i * 5 + 4] = sc[max_pos];
    auto iarea = areas[max_pos];
    auto iind = inds[max_pos];
    x1[max_pos] = x1[i];
    y1[max_pos] = y1[i];
    x2[max_pos] = x2[i];
    y2[max_pos] = y2[i];
    sc[max_pos] = sc[i];
    areas[max_pos] = areas[i];
    inds[max_pos] = inds[i];
    x1[i] = ix1;
    y1[i] = iy1;
    x2[i] = ix2;
    y2[i] = iy2;
    sc[i] = iscore;
    areas[i] = iarea;
    inds[i] = iind;

    pos = i + 1;
    while (pos < nboxes) {
      auto xx1 = std::max(ix1, x1[pos]);
      auto yy1 = std::max(iy1, y1[pos]);
      auto xx2 = std::min(ix2, x2[pos]);
      auto yy2 = std::min(iy2, y2[pos]);

      auto w = std::max(0.f, xx2 - xx1 + offset);
      auto h = std::max(0.f, yy2 - yy1 + offset);
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[pos] - inter);

      float weight = 1.;
      if (method == 0) {
        // (branch body collapsed in the diff view)
      } else if (method == 1) {
        if (ovr >= iou_threshold) weight = 1 - ovr;
      } else if (method == 2) {
        weight = std::exp(-(ovr * ovr) / sigma);
      }
      sc[pos] *= weight;

      // if box score falls below threshold, discard the box by
      // swapping with last box update N
      if (sc[pos] < min_score) {
        x1[pos] = x1[nboxes - 1];
        y1[pos] = y1[nboxes - 1];
        x2[pos] = x2[nboxes - 1];
        y2[pos] = y2[nboxes - 1];
        sc[pos] = sc[nboxes - 1];
        areas[pos] = areas[nboxes - 1];
        inds[pos] = inds[nboxes - 1];
        nboxes = nboxes - 1;
        pos = pos - 1;
      }
      pos = pos + 1;
    }
  }
  return inds_t.slice(0, 0, nboxes);
}
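// Decay behaviour of the methods above, as a quick example with ovr = 0.5 and
// iou_threshold = 0.3: the linear method (method == 1) rescales the score by
// weight = 1 - 0.5 = 0.5, while the gaussian method (method == 2) with
// sigma = 0.5 uses weight = exp(-(0.5 * 0.5) / 0.5) = exp(-0.5), about 0.61;
// boxes whose decayed score drops below min_score are swapped out and nboxes
// shrinks accordingly.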
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
               float sigma, float min_score, int method, int offset) {
  if (boxes.device().is_cuda()) {
    AT_ERROR("softnms is not implemented on GPU");
  } else {
    return softnms_cpu(boxes, scores, dets, iou_threshold, sigma, min_score,
                       method, offset);
  }
}
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
std::vector<int> keep;
std::vector<std::vector<int> > matched;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
keep.push_back(i);
std::vector<int> v_i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
v_i.push_back(j);
}
}
matched.push_back(v_i);
}
for (int i = 0; i < keep.size(); i++)
matched[i].insert(matched[i].begin(), keep[i]);
return matched;
}
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
if (dets.device().is_cuda()) {
AT_ERROR("nms_match is not implemented on GPU");
} else {
return nms_match_cpu(dets, iou_threshold);
}
}
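// Shape of the result above: each inner vector starts with the index of a
// kept box followed by the indices of the boxes it suppressed, so e.g. a
// return value of {{2, 0, 5}, {3}} means box 2 was kept and matched boxes 0
// and 5, while box 3 was kept and matched nothing.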
#include "nms_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream) {
size_t boxes_num = boxes_sorted.dim(0);
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset) {
at::cuda::CUDAGuard device_guard(boxes.device());
if (boxes_sorted.size() == 0) {
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, 0));
return select;
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto order_t = std::get<1>(scores.sort(0, /*descending=*/true));
auto boxes_sorted = boxes.index_select(0, order_t);
const size_t col_blocks = DIVUP(boxes_num, threadsPerBlock);
auto mask = ctx.createDArrayLite(
DArraySpec::array(Prim::Int64, DArrayShape(boxes_num, col_blocks)));
int boxes_num = boxes.size(0);
const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
PARROTS_CUDA_CHECK(cudaGetLastError());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
nms_cuda<<<blocks, threads, 0, stream>>>(
boxes_num, iou_threshold, offset, boxes_sorted.ptr<float>(),
(unsigned long long*)mask.ptr<int64_t>());
PARROTS_CUDA_CHECK(cudaGetLastError());
boxes_num, iou_threshold, offset, boxes_sorted.data_ptr<float>(),
(unsigned long long*)mask.data_ptr<int64_t>());
auto mask_cpu = ctx.createDArrayLite(mask, getHostProxy());
auto mask_host = mask_cpu.ptr<int64_t>();
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host =
(unsigned long long*)mask_cpu.data_ptr<int64_t>();
auto remv = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, col_blocks),
getHostProxy());
remv.setZeros(syncStream());
auto remv_ptr = remv.ptr<int64_t>();
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
auto keep_t = ctx.createDArrayLite(DArraySpec::array(Prim::Uint8, boxes_num),
getHostProxy());
keep_t.setZeros(syncStream());
auto keep = keep_t.ptr<uint8_t>();
at::Tensor keep_t =
at::zeros({boxes_num}, boxes.options().dtype(at::kBool).device(at::kCPU));
bool* keep = keep_t.data_ptr<bool>();
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv_ptr[nblock] & (1ULL << inblock))) {
keep[i] = 1;
int64_t* p = mask_host + i * col_blocks;
if (!(remv[nblock] & (1ULL << inblock))) {
keep[i] = true;
// set every overlap box with bit 1 in remv
unsigned long long* p = mask_host + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_ptr[j] |= p[j];
remv[j] |= p[j];
}
}
}
auto keep_cuda = ctx.createDArrayLite(keep_t, ctx.getProxy());
PARROTS_CUDA_CHECK(cudaGetLastError());
return keep_cuda;
AT_CUDA_CHECK(cudaGetLastError());
return order_t.masked_select(keep_t.to(at::kCUDA));
}
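// Layout of the suppression mask consumed above (assuming the usual bitmask
// NMS kernel in nms_cuda_kernel.cuh): with threadsPerBlock =
// sizeof(unsigned long long) * 8 = 64, row i of the boxes_num x col_blocks
// int64 tensor holds one bit per candidate box, and bit j of word k is set
// when box i overlaps box (k * 64 + j) above iou_threshold; the CPU loop then
// walks the rows in score order, keeping a box only if no previously kept box
// has already set its bit in remv.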
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "nms_pytorch.h"
using namespace parrots;
// Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset);
template <typename T>
void nms_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get("iou_threshold", iou_threshold)
.get("offset", offset)
.done();
at::Tensor boxes, scores;
boxes = buildATensor(ctx, ins[0]);
scores = buildATensor(ctx, ins[1]);
auto out = nms(boxes, scores, iou_threshold, offset);
updateDArray(ctx, out, outs[0]);
}
/*Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
* float sigma, float min_score, int method, int offset);*/
template <typename T>
void softnms_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold, sigma, min_score;
int method, offset;
SSAttrs(attr)
.get("iou_threshold", iou_threshold)
.get("sigma", sigma)
.get("min_score", min_score)
.get("method", method)
.get("offset", offset)
.done();
at::Tensor boxes, scores, dets;
boxes = buildATensor(ctx, ins[0]);
scores = buildATensor(ctx, ins[1]);
dets = buildATensor(ctx, ins[2]);
auto out = softnms(boxes, scores, dets, iou_threshold, sigma, min_score,
method, offset);
updateDArray(ctx, out, outs[0]);
}
// std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold);
template <typename T>
void nms_match_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
SSAttrs(attr).get("iou_threshold", iou_threshold).done();
at::Tensor dets;
dets = buildATensor(ctx, ins[0]);
auto out = nms_match(dets, iou_threshold);
int n = out.size(), m = 0;
for (int i = 0; i < n; ++i)
if (m < out[i].size()) m = out[i].size();
auto options = torch::TensorOptions().dtype(at::kInt);
auto tensor = torch::zeros({n, m}, options);
for (int i = 0; i < n; i++)
tensor.slice(0, i, i + 1) =
torch::from_blob(out[i].data(), {out[i].size()}, options);
updateDArray(ctx, tensor, outs[0]);
}
/*Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
* const Tensor dets_sorted, const float iou_threshold,
* const int multi_label);*/
template <typename T>
void nms_rotated_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int multi_label;
SSAttrs(attr)
.get("iou_threshold", iou_threshold)
.get("multi_label", multi_label)
.done();
at::Tensor dets, scores, order, dets_sorted;
dets = buildATensor(ctx, ins[0]);
scores = buildATensor(ctx, ins[1]);
order = buildATensor(ctx, ins[2]);
dets_sorted = buildATensor(ctx, ins[3]);
auto out =
nms_rotated(dets, scores, order, dets_sorted, iou_threshold, multi_label);
updateDArray(ctx, out, outs[0]);
}
PARROTS_EXTENSION_REGISTER(nms)
.attr("iou_threshold")
.attr("offset")
.input(2)
.output(1)
.apply(nms_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(nms_parrots<CudaContext>)
#endif
.done();
PARROTS_EXTENSION_REGISTER(softnms)
.attr("iou_threshold")
.attr("sigma")
.attr("min_score")
.attr("method")
.attr("offset")
.input(3)
.output(1)
.apply(softnms_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(softnms_parrots<CudaContext>)
#endif
.done();
PARROTS_EXTENSION_REGISTER(nms_match)
.attr("iou_threshold")
.input(1)
.output(1)
.apply(nms_match_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(nms_match_parrots<CudaContext>)
#endif
.done();
PARROTS_EXTENSION_REGISTER(nms_rotated)
.attr("multi_label")
.attr("iou_threshold")
.input(4)
.output(1)
.apply(nms_rotated_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(nms_rotated_parrots<CudaContext>)
#endif
.done();
#ifndef NMS_PYTORCH_H
#define NMS_PYTORCH_H
#include <torch/extension.h>
at::Tensor nms(at::Tensor boxes, at::Tensor scores, float iou_threshold,
int offset);
at::Tensor softnms(at::Tensor boxes, at::Tensor scores, at::Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset);
std::vector<std::vector<int> > nms_match(at::Tensor dets, float iou_threshold);
at::Tensor nms_rotated(const at::Tensor dets, const at::Tensor scores,
const at::Tensor order, const at::Tensor dets_sorted,
const float iou_threshold, const int multi_label);
#endif // NMS_PYTORCH_H
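// Illustrative usage sketch, not part of the original sources: the tensor
// values below are made up, and it assumes this header is included and the
// extension that defines nms is linked. Boxes are (x1, y1, x2, y2) rows.
static inline void nms_usage_sketch() {
  auto boxes = torch::tensor({0.f, 0.f, 10.f, 10.f,
                              1.f, 1.f, 11.f, 11.f,
                              50.f, 50.f, 60.f, 60.f})
                   .reshape({3, 4});
  auto scores = torch::tensor({0.9f, 0.8f, 0.7f});
  // keep holds indices of the retained boxes; box 1 overlaps box 0 with
  // IoU ~0.68 > 0.5 and is suppressed, so keep would be {0, 2} here.
  auto keep = nms(boxes, scores, /*iou_threshold=*/0.5f, /*offset=*/0);
  (void)keep;
}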
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated.h
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
DArrayLite nms_rotated_cuda(const DArrayLite dets, const DArrayLite scores,
const DArrayLite dets_sorted, float iou_threshold,
const int multi_label, cudaStream_t stream,
CudaContext& ctx);
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
const float iou_threshold);
#ifdef MMCV_WITH_CUDA
Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
const Tensor order, const Tensor dets_sorted,
const float iou_threshold, const int multi_label);
#endif
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
void nms_rotated(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int multi_label;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("multi_label", multi_label)
.done();
const auto& dets = ins[0];
const auto& scores = ins[1];
const auto& dets_sorted = ins[2];
Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
const Tensor dets_sorted, const float iou_threshold,
const int multi_label) {
assert(dets.device().is_cuda() == scores.device().is_cuda());
if (dets.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
return nms_rotated_cuda(dets, scores, order, dets_sorted, iou_threshold,
multi_label);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
outs[0] = nms_rotated_cuda(dets, scores, dets_sorted, iou_threshold,
multi_label, stream, ctx);
return nms_rotated_cpu(dets, scores, iou_threshold);
}
PARROTS_EXTENSION_REGISTER(nms_rotated)
.attr("multi_label")
.attr("iou_threshold")
.input(3)
.output(1)
.apply(nms_rotated)
.done();
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
template <typename scalar_t>
Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
const float iou_threshold) {
// nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
// however, the code in this function is much shorter because
// we delegate the IoU computation for rotated boxes to
// the single_box_iou_rotated function in box_iou_rotated_utils.hpp
AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.type() == scores.type(),
"dets should have the same type as scores");
if (dets.numel() == 0) {
return at::empty({0}, dets.options().dtype(at::kLong));
}
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto keep = keep_t.data_ptr<int64_t>();
auto order = order_t.data_ptr<int64_t>();
int64_t num_to_keep = 0;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) {
continue;
}
keep[num_to_keep++] = i;
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) {
continue;
}
auto ovr = single_box_iou_rotated<scalar_t>(
dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>(), 0);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
}
}
}
return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
const float iou_threshold) {
auto result = at::empty({0}, dets.options());
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms_rotated", [&] {
result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
});
return result;
}
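// Illustrative usage sketch, not part of the original sources: rotated boxes
// are assumed to be rows of (cx, cy, w, h, angle) as consumed by
// single_box_iou_rotated; the values below are made up.
static inline void nms_rotated_cpu_usage_sketch() {
  auto dets = torch::tensor({10.f, 10.f, 8.f, 6.f, 0.f,
                             10.5f, 10.f, 8.f, 6.f, 0.1f,
                             40.f, 40.f, 8.f, 6.f, 0.f})
                  .reshape({3, 5});
  auto scores = torch::tensor({0.9f, 0.8f, 0.7f});
  // keep holds the indices of the boxes that survive suppression.
  auto keep = nms_rotated_cpu(dets, scores, /*iou_threshold=*/0.5f);
  (void)keep;
}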
@@ -2,45 +2,51 @@
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
#include "nms_rotated_cuda.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
DArrayLite nms_rotated_cuda(const DArrayLite dets, const DArrayLite scores,
const DArrayLite dets_sorted, float iou_threshold,
const int multi_label, cudaStream_t stream,
CudaContext& ctx) {
int dets_num = dets.dim(0);
Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
const Tensor order_t, const Tensor dets_sorted,
float iou_threshold, const int multi_label) {
// using scalar_t = float;
AT_ASSERTM(dets.type().is_cuda(), "dets must be a CUDA tensor");
AT_ASSERTM(scores.type().is_cuda(), "scores must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(dets.device());
const int col_blocks = divideUP(dets_num, threadsPerBlock);
int dets_num = dets.size(0);
auto mask = ctx.createDArrayLite(
DArraySpec::array(Prim::Int64, DArrayShape(dets_num * col_blocks)));
const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);
Tensor mask =
at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(dets_sorted.elemType().prim(), [&] {
nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num, iou_threshold, dets_sorted.ptr<scalar_t>(),
(unsigned long long*)mask.ptr<int64_t>(), multi_label);
});
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
dets_sorted.type(), "nms_rotated_kernel_cuda", [&] {
nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num, iou_threshold, dets_sorted.data<scalar_t>(),
(unsigned long long*)mask.data<int64_t>(), multi_label);
});
DArrayLite mask_cpu = ctx.createDArrayLite(mask, getHostProxy());
unsigned long long* mask_host = (unsigned long long*)mask_cpu.ptr<int64_t>();
Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host = (unsigned long long*)mask_cpu.data<int64_t>();
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
auto keep = ctx.createDArrayLite(
DArraySpec::array(Prim::Int64, DArrayShape(dets_num)), getHostProxy());
int64_t* keep_out = keep.ptr<int64_t>();
Tensor keep =
at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data<int64_t>();
int num_to_keep = 0;
for (int i = 0; i < dets_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
keep_out[i] = 1;
keep_out[num_to_keep++] = i;
unsigned long long* p = mask_host + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
@@ -48,7 +54,8 @@ DArrayLite nms_rotated_cuda(const DArrayLite dets, const DArrayLite scores,
}
}
auto keep_cuda = ctx.createDArrayLite(keep, ctx.getProxy());
PARROTS_CUDA_CHECK(cudaGetLastError());
return keep_cuda;
AT_CUDA_CHECK(cudaGetLastError());
return order_t.index(
{keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
.to(order_t.device(), keep.scalar_type())});
}
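// Illustrative sketch, not part of the original sources: each box i owns
// col_blocks 64-bit words of `mask`; the bit for box j inside word
// (i, j / threadsPerBlock) is set when box i suppresses box j
// (threadsPerBlock is assumed to be 64, i.e. sizeof(unsigned long long) * 8,
// as in the kernel header). A hypothetical helper that reads one pair from
// the host-side copy used by the reduction loop above:
static inline bool is_suppressed_pair_sketch(
    const unsigned long long* mask_host, int col_blocks, int i, int j,
    int threads_per_block = 64) {
  const int nblock = j / threads_per_block;
  const int inblock = j % threads_per_block;
  return (mask_host[i * col_blocks + nblock] >> inblock) & 1ULL;
}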
#include "parrots_cpp_helper.hpp"
using namespace parrots;
#include "parrots_cuda_helper.hpp"
using namespace parrots;
#include "parrots_cpp_helper.hpp"
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
@@ -11,8 +12,8 @@
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *mask_data,
float *buffer_data) {
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
@@ -27,12 +28,13 @@ void psamask_collect_forward(const int num_, const int h_feature,
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data[((n * h_mask * w_mask + hidx * w_mask + widx) *
buffer_data.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
@@ -47,8 +49,8 @@ void psamask_collect_forward(const int num_, const int h_feature,
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *mask_data,
float *buffer_data) {
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
@@ -63,11 +65,13 @@ void psamask_distribute_forward(const int num_, const int h_feature,
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data[((n * h_mask * w_mask + hidx * w_mask + widx) *
buffer_data.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
@@ -82,8 +86,8 @@ void psamask_distribute_forward(const int num_, const int h_feature,
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *buffer_diff,
float *mask_diff) {
const int half_w_mask, const Tensor buffer_diff,
Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
@@ -98,11 +102,12 @@ void psamask_collect_backward(const int num_, const int h_feature,
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] = buffer_diff[(n * h_feature * w_feature +
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
@@ -118,7 +123,7 @@ void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const float *buffer_diff, float *mask_diff) {
const Tensor buffer_diff, Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
@@ -133,15 +138,16 @@ void psamask_distribute_backward(const int num_, const int h_feature,
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
@@ -149,156 +155,101 @@ void psamask_distribute_backward(const int num_, const int h_feature,
}
}
void psamask_forward_cpu(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
auto input_ptr = input.ptr<float>();
auto output_ptr = output.ptr<float>();
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr, output_ptr);
half_h_mask, half_w_mask, input, output);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr, output_ptr);
half_h_mask, half_w_mask, input, output);
}
void psamask_backward_cpu(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
auto input_ptr = input.ptr<float>();
auto output_ptr = output.ptr<float>();
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (psa_type == 0)
psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr, output_ptr);
half_h_mask, half_w_mask, grad_output, grad_input);
else
psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr,
output_ptr);
half_h_mask, half_w_mask, grad_output,
grad_input);
}
void PSAMaskForwardCUDAKernelLauncher(const int psa_type,
const DArrayLite input, DArrayLite output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, CudaContext &ctx);
void PSAMaskBackwardCUDAKernelLauncher(const int psa_type,
const DArrayLite grad_output,
DArrayLite grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask, CudaContext &ctx);
void psamask_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
#ifdef MMCV_WITH_CUDA
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
Tensor output, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask);
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const Tensor grad_output, Tensor grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask);
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, ctx);
half_w_mask);
}
void psamask_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
PSAMaskBackwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, ctx);
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_,
h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask);
}
#endif
PARROTS_EXTENSION_REGISTER(psamask_forward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_forward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(psamask_forward_cuda)
void psamask_forward(const Tensor input, Tensor output, const int psa_type,
const int num_, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(output);
psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
#else
AT_ERROR("PSAMask is not compiled with GPU support");
#endif
.done();
} else {
psamask_forward_cpu(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
}
PARROTS_EXTENSION_REGISTER(psamask_backward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_backward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(psamask_backward_cuda)
void psamask_backward(Tensor grad_output, const Tensor grad_input,
const int psa_type, const int num_, const int h_feature,
const int w_feature, const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (grad_input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_output);
psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
#else
AT_ERROR("PSAMask is not compiled with GPU support");
#endif
.done();
} else {
psamask_backward_cpu(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
}
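// Illustrative sketch, not part of the original sources: judging from the
// flat indexing in the CPU kernels above, the expected layouts are assumed
// to be
//   input : {num_, h_mask * w_mask, h_feature, w_feature}
//   output: {num_, h_feature * w_feature, h_feature, w_feature}
// A hypothetical CPU invocation with small made-up sizes:
static inline void psamask_forward_usage_sketch() {
  const int num_ = 1, h_feature = 4, w_feature = 4, h_mask = 3, w_mask = 3;
  auto input = torch::rand({num_, h_mask * w_mask, h_feature, w_feature});
  auto output =
      torch::zeros({num_, h_feature * w_feature, h_feature, w_feature});
  psamask_forward(input, output, /*psa_type=*/0, num_, h_feature, w_feature,
                  h_mask, w_mask, /*half_h_mask=*/1, /*half_w_mask=*/1);
}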
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "parrots_cuda_helper.hpp"
#include <THC/THC.h>
#include <torch/serialize/tensor.h>
#include <THC/THCDeviceUtils.cuh>
#include "psamask_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void PSAMaskForwardCUDAKernelLauncher(const int psa_type,
const DArrayLite input, DArrayLite output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, CudaContext& ctx) {
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
Tensor output, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "psamask_collect_forward_cuda", [&] {
psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_distribute_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "psamask_distribute_forward_cuda", [&] {
psamask_distribute_forward_cuda<scalar_t>
<<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
}
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const DArrayLite grad_output, DArrayLite grad_input,
const int psa_type, const Tensor grad_output, Tensor grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask,
CudaContext& ctx) {
const int w_mask, const int half_h_mask, const int half_w_mask) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
AT_DISPATCH_FLOATING_TYPES(
grad_input.scalar_type(), "psamask_collect_backward_cuda", [&] {
psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.data_ptr<scalar_t>(),
grad_input.data_ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_distribute_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
AT_DISPATCH_FLOATING_TYPES(
grad_input.scalar_type(), "psamask_distribute_backward_cuda", [&] {
psamask_distribute_backward_cuda<scalar_t>
<<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.data_ptr<scalar_t>(),
grad_input.data_ptr<scalar_t>());
});
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "psamask_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void psamask_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = buildATensor(ctx, ins[0]);
auto output = buildATensor(ctx, outs[0]);
psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
void psamask_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &grad_output = buildATensor(ctx, ins[0]);
auto grad_input = buildATensor(ctx, outs[0]);
psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
#endif
void psamask_forward_cpu_parrots(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = buildATensor(ctx, ins[0]);
auto output = buildATensor(ctx, outs[0]);
psamask_forward_cpu(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
void psamask_backward_cpu_parrots(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &grad_output = buildATensor(ctx, ins[0]);
auto grad_input = buildATensor(ctx, outs[0]);
psamask_backward_cpu(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
PARROTS_EXTENSION_REGISTER(psamask_forward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(psamask_forward_cuda_parrots)
#endif
.done();
PARROTS_EXTENSION_REGISTER(psamask_backward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(psamask_backward_cuda_parrots)
#endif
.done();
#ifndef PSAMASK_PYTORCH_H
#define PSAMASK_PYTORCH_H
#include <torch/extension.h>
using namespace at;
#ifdef MMCV_WITH_CUDA
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
#endif
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
#endif // PSAMASK_PYTORCH_H
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCUDAKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}
#endif
void ROIAlignForwardCPULauncher(DArrayLite input, DArrayLite rois,
DArrayLite output, DArrayLite argmax_y,
DArrayLite argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCPULauncher(DArrayLite grad_output, DArrayLite rois,
DArrayLite argmax_y, DArrayLite argmax_x,
DArrayLite grad_input, int aligned_height,
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void ROIAlignForwardCUDAKernelLauncher(DArrayLite input, DArrayLite rois,
DArrayLite output, DArrayLite argmax_y,
DArrayLite argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned, cudaStream_t stream);
void ROIAlignBackwardCUDAKernelLauncher(
DArrayLite grad_output, DArrayLite rois, DArrayLite argmax_y,
DArrayLite argmax_x, DArrayLite grad_input, int aligned_height,
int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode,
bool aligned, cudaStream_t stream);
void roi_align_forward_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
auto& input = ins[0];
auto& rois = ins[1];
auto& output = outs[0];
auto& argmax_y = outs[1];
auto& argmax_x = outs[2];
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
auto& grad_output = ins[0];
auto& rois = ins[1];
auto& argmax_y = ins[2];
auto& argmax_x = ins[3];
auto& grad_input = outs[0];
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
auto& input = ins[0];
auto& rois = ins[1];
auto& output = outs[0];
auto& argmax_y = outs[1];
auto& argmax_x = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, stream);
}
void roi_align_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
auto& grad_output = ins[0];
auto& rois = ins[1];
auto& argmax_y = ins[2];
auto& argmax_x = ins[3];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIAlignBackwardCUDAKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned, stream);
}
PARROTS_EXTENSION_REGISTER(roi_align_forward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(2)
.output(3)
.apply(roi_align_forward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(roi_align_forward_cuda)
void roi_align_forward(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
.done();
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
}
PARROTS_EXTENSION_REGISTER(roi_align_backward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(4)
.output(1)
.apply(roi_align_backward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(roi_align_backward_cuda)
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
CHECK_CUDA_INPUT(grad_input);
roi_align_backward_cuda(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
.done();
} else {
CHECK_CPU_INPUT(grad_output);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
CHECK_CPU_INPUT(grad_input);
roi_align_backward_cpu(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
}
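// Illustrative usage sketch, not part of the original sources: rois are
// assumed to be {num_rois, 5} rows of (batch_idx, x1, y1, x2, y2); argmax_y
// and argmax_x match the output shape and are only meaningful in max mode
// (pool_mode is assumed to be 0 for max and 1 for average, following the
// kernel dispatch). All values below are made up.
static inline void roi_align_forward_usage_sketch() {
  auto input = torch::rand({2, 3, 32, 32});
  auto rois = torch::tensor({0.f, 4.f, 4.f, 20.f, 20.f,
                             1.f, 8.f, 8.f, 24.f, 24.f})
                  .reshape({2, 5});
  auto output = torch::zeros({2, 3, 7, 7});
  auto argmax_y = torch::zeros({2, 3, 7, 7});
  auto argmax_x = torch::zeros({2, 3, 7, 7});
  roi_align_forward(input, rois, output, argmax_y, argmax_x,
                    /*aligned_height=*/7, /*aligned_width=*/7,
                    /*spatial_scale=*/1.0f, /*sampling_ratio=*/2,
                    /*pool_mode=*/1, /*aligned=*/true);
}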
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <iostream>
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "parrots_cpp_helper.hpp"
#include "../pytorch_cpp_helper.hpp"
// implementation taken from Caffe2
template <typename T>
@@ -133,8 +134,8 @@ void ROIAlignForward(const int nthreads, const T* input, const T* rois,
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (aligned) {
PARROTS_CHECKARGS(roi_width >= 0 && roi_height >= 0)
<< "ROIs in ROIAlign cannot have non-negative size!";
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign cannot have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
@@ -294,8 +295,8 @@ void ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois,
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (aligned) {
PARROTS_CHECKARGS(roi_width >= 0 && roi_height >= 0)
<< "ROIs in ROIAlign do not have non-negative size!";
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign do not have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
@@ -378,38 +379,37 @@ void ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois,
} // for
} // ROIAlignBackward
void ROIAlignForwardCPULauncher(DArrayLite input, DArrayLite rois,
DArrayLite output, DArrayLite argmax_y,
DArrayLite argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned) {
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlign_forward", [&] {
ROIAlignForward<scalar_t>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
output.ptr<scalar_t>(), argmax_y.ptr<scalar_t>(),
argmax_x.ptr<scalar_t>(), aligned_height, aligned_width,
output_size, input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
argmax_x.data_ptr<scalar_t>(), aligned_height, aligned_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
aligned, channels, height, width);
}));
});
}
void ROIAlignBackwardCPULauncher(DArrayLite grad_output, DArrayLite rois,
DArrayLite argmax_y, DArrayLite argmax_x,
DArrayLite grad_input, int aligned_height,
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned) {
int output_size = grad_output.size();
int channels = grad_input.dim(1);
int height = grad_input.dim(2);
int width = grad_input.dim(3);
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad_output.stride(0);
@@ -417,14 +417,14 @@ void ROIAlignBackwardCPULauncher(DArrayLite grad_output, DArrayLite rois,
int h_stride = grad_output.stride(2);
int w_stride = grad_output.stride(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "ROIAlign_backward", [&] {
ROIAlignBackward<scalar_t>(
output_size, grad_output.ptr<scalar_t>(), rois.ptr<scalar_t>(),
argmax_y.ptr<scalar_t>(), argmax_x.ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), aligned_height, aligned_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
aligned, channels, height, width, n_stride, c_stride, h_stride,
w_stride);
}));
output_size, grad_output.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
aligned_height, aligned_width, static_cast<scalar_t>(spatial_scale),
sampling_ratio, pool_mode, aligned, channels, height, width,
n_stride, c_stride, h_stride, w_stride);
});
}
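// Illustrative sketch, not part of the original sources: the strides gathered
// above let ROIAlignBackward read a possibly non-contiguous grad_output, i.e.
// the flat offset of element (n, c, ph, pw) is assumed to be
//   n * n_stride + c * c_stride + ph * h_stride + pw * w_stride.
static inline int64_t grad_output_offset_sketch(int n, int c, int ph, int pw,
                                                int n_stride, int c_stride,
                                                int h_stride, int w_stride) {
  return static_cast<int64_t>(n) * n_stride +
         static_cast<int64_t>(c) * c_stride +
         static_cast<int64_t>(ph) * h_stride +
         static_cast<int64_t>(pw) * w_stride;
}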