Unverified Commit 48d99025 authored by z55250825, committed by GitHub

Add new parrots extension implementation for all ops (#794)

* delete all parrots files
add new parrots op impl for bbox_overlaps

* support the first op with the new parrots impl (bbox_overlaps), test succeeded

* add box_iou_rotated op, test succeeded

* add carafe and carafe_naive ops, test succeeded (one parrots bug needs a fix)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test succeeded (one open question remains)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
parent 72e4cc12
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const DArrayLite bboxes1,
const DArrayLite bboxes2, DArrayLite ious,
const int mode, const bool aligned,
const int offset, cudaStream_t stream);
#ifdef MMCV_WITH_CUDA
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int mode,
const bool aligned, const int offset);
void bbox_overlaps_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int mode, offset;
bool aligned;
SSAttrs(attr)
.get<int>("mode", mode)
.get<bool>("aligned", aligned)
.get<int>("offset", offset)
.done();
const auto& bboxes1 = ins[0];
const auto& bboxes2 = ins[1];
void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}
#endif
auto& ious = outs[0];
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
if (bboxes1.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(bboxes1);
CHECK_CUDA_INPUT(bboxes2);
CHECK_CUDA_INPUT(ious);
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset,
stream);
bbox_overlaps_cuda(bboxes1, bboxes2, ious, mode, aligned, offset);
#else
AT_ERROR("bbox_overlaps is not compiled with GPU support");
#endif
} else {
AT_ERROR("bbox_overlaps is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(bbox_overlaps)
.attr("mode")
.attr("aligned")
.attr("offset")
.input(2)
.output(1)
.apply(bbox_overlaps_cuda)
.done();
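For orientation, here is a minimal host-side sketch of how this dispatcher can be exercised from C++. It assumes a CUDA build linked against the compiled ops; the box values are placeholders, boxes are (x1, y1, x2, y2), and mode 0 selects plain IoU.

#include <torch/torch.h>

// Declaration of the dispatcher above; resolved at link time.
void bbox_overlaps(const at::Tensor bboxes1, const at::Tensor bboxes2,
                   at::Tensor ious, const int mode, const bool aligned,
                   const int offset);

int main() {
  auto opts = torch::dtype(torch::kFloat32).device(torch::kCUDA);
  auto bboxes1 = torch::rand({8, 4}, opts);   // placeholder boxes
  auto bboxes2 = torch::rand({16, 4}, opts);
  auto ious = torch::zeros({8, 16}, opts);    // filled in-place by the op
  bbox_overlaps(bboxes1, bboxes2, ious, /*mode=*/0, /*aligned=*/false,
                /*offset=*/0);
  return 0;
}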
#include "bbox_overlaps_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const DArrayLite bboxes1,
const DArrayLite bboxes2, DArrayLite ious,
const int mode, const bool aligned,
const int offset, cudaStream_t stream) {
int output_size = ious.size();
int num_bbox1 = bboxes1.dim(0);
int num_bbox2 = bboxes2.dim(0);
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int mode,
const bool aligned, const int offset) {
int output_size = ious.numel();
int num_bbox1 = bboxes1.size(0);
int num_bbox2 = bboxes2.size(0);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bboxes1.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(bboxes1.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bboxes1.scalar_type(), "bbox_overlaps_cuda_kernel", ([&] {
bbox_overlaps_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
bboxes1.ptr<scalar_t>(), bboxes2.ptr<scalar_t>(),
ious.ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,
bboxes1.data_ptr<scalar_t>(), bboxes2.data_ptr<scalar_t>(),
ious.data_ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,
offset);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
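The GET_BLOCKS / THREADS_PER_BLOCK helpers used in this launch come from the shared CUDA helper header. As a rough sketch (the constant and the exact definition are assumptions here, and the real helper may additionally cap the block count), the grid size amounts to one thread per output element rounded up to whole blocks:

// Sketch of the assumed helper; the real definition lives in the shared
// CUDA helper header and may clamp the result to a maximum block count.
#define THREADS_PER_BLOCK 512

inline int GET_BLOCKS(const int N) {
  return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
}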
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "bbox_overlaps_pytorch.h"
using namespace parrots;
/*
* void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor
* ious, const int mode, const bool aligned, const int offset);
*/
void bbox_overlaps_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int mode, offset;
bool aligned;
SSAttrs(attr)
.get<int>("mode", mode)
.get<bool>("aligned", aligned)
.get<int>("offset", offset)
.done();
const auto& bboxes1 = buildATensor(ctx, ins[0]);
const auto& bboxes2 = buildATensor(ctx, ins[1]);
auto ious = buildATensor(ctx, outs[0]);
bbox_overlaps_cuda(bboxes1, bboxes2, ious, mode, aligned, offset);
}
PARROTS_EXTENSION_REGISTER(bbox_overlaps)
.attr("mode")
.attr("aligned")
.attr("offset")
.input(2)
.output(1)
.apply(bbox_overlaps_parrots)
.done();
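The file above is the bridge template this PR repeats for every op: read scalar attributes with SSAttrs, wrap the DArray inputs and outputs as ATen tensors with buildATensor, and forward to the shared Tensor-based implementation. A minimal sketch of the same pattern for a hypothetical op (the op name, attribute, and input/output arity below are illustrative only):

#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include <torch/extension.h>

using namespace parrots;

// Hypothetical op used only to illustrate the bridge pattern.
void my_op_cuda(const at::Tensor input, at::Tensor output, int some_attr);

void my_op_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                        const OperatorBase::in_list_t& ins,
                        OperatorBase::out_list_t& outs) {
  int some_attr;
  SSAttrs(attr).get<int>("some_attr", some_attr).done();

  const auto& input = buildATensor(ctx, ins[0]);
  auto output = buildATensor(ctx, outs[0]);
  my_op_cuda(input, output, some_attr);
}

PARROTS_EXTENSION_REGISTER(my_op)
    .attr("some_attr")
    .input(1)
    .output(1)
    .apply(my_op_cuda_parrots)
    .done();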
#ifndef BBOX_OVERLAPS_PYTORCH_H
#define BBOX_OVERLAPS_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset);
#endif // BBOX_OVERLAPS_PYTORCH_H
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void box_iou_rotated_cpu_launcher(const DArrayLite boxes1,
const DArrayLite boxes2, DArrayLite ious,
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
void box_iou_rotated_cuda_launcher(const DArrayLite boxes1,
const DArrayLite boxes2, DArrayLite ious,
const int mode_flag, const bool aligned,
cudaStream_t stream);
void box_iou_rotated_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& boxes1 = ins[0];
const auto& boxes2 = ins[1];
bool aligned;
int mode_flag;
SSAttrs(attr)
.get<bool>("aligned", aligned)
.get<int>("mode_flag", mode_flag)
.done();
auto& ious = outs[0];
box_iou_rotated_cpu_launcher(boxes1, boxes2, ious, mode_flag, aligned);
}
void box_iou_rotated_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& boxes1 = ins[0];
const auto& boxes2 = ins[1];
bool aligned;
int mode_flag;
SSAttrs(attr)
.get<bool>("aligned", aligned)
.get<int>("mode_flag", mode_flag)
.done();
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
auto& ious = outs[0];
box_iou_rotated_cuda_launcher(boxes1, boxes2, ious, mode_flag, aligned,
stream);
}
#ifdef MMCV_WITH_CUDA
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
#endif
PARROTS_EXTENSION_REGISTER(box_iou_rotated)
.attr("aligned")
.attr("mode_flag")
.input(2)
.output(1)
.apply(box_iou_rotated_cpu)
#ifdef PARROTS_USE_CUDA
.apply(box_iou_rotated_cuda)
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
if (boxes1.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
box_iou_rotated_cuda(boxes1, boxes2, ious, mode_flag, aligned);
#else
AT_ERROR("Not compiled with GPU support");
#endif
.done();
} else {
box_iou_rotated_cpu(boxes1, boxes2, ious, mode_flag, aligned);
}
}
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
template <typename T>
void box_iou_rotated_cpu_kernel(const DArrayLite boxes1,
const DArrayLite boxes2, DArrayLite ious,
const int mode_flag, const bool aligned) {
int output_size = ious.size();
int num_boxes1 = boxes1.dim(0);
int num_boxes2 = boxes2.dim(0);
auto ious_ptr = ious.ptr<float>();
void box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
Tensor ious, const int mode_flag,
const bool aligned) {
int output_size = ious.numel();
auto num_boxes1 = boxes1.size(0);
auto num_boxes2 = boxes2.size(0);
if (aligned) {
for (int i = 0; i < output_size; i++) {
ious_ptr[i] = single_box_iou_rotated<T>(boxes1[i].ptr<T>(),
boxes2[i].ptr<T>(), mode_flag);
ious[i] = single_box_iou_rotated<T>(boxes1[i].data_ptr<T>(),
boxes2[i].data_ptr<T>(), mode_flag);
}
} else {
for (int i = 0; i < num_boxes1; i++) {
for (int j = 0; j < num_boxes2; j++) {
ious_ptr[i * num_boxes2 + j] = single_box_iou_rotated<T>(
boxes1[i].ptr<T>(), boxes2[j].ptr<T>(), mode_flag);
ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>(), mode_flag);
}
}
}
}
void box_iou_rotated_cpu_launcher(const DArrayLite boxes1,
const DArrayLite boxes2, DArrayLite ious,
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
}
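A small CPU-side sanity sketch for the kernel above. It assumes libtorch and linking against the compiled op; the (cx, cy, w, h, angle) box layout and mode_flag 0 meaning IoU are assumptions based on the rotated-IoU utilities. Two identical boxes should yield an IoU of 1.

#include <torch/torch.h>

void box_iou_rotated_cpu(const at::Tensor boxes1, const at::Tensor boxes2,
                         at::Tensor ious, const int mode_flag,
                         const bool aligned);

int main() {
  auto boxes1 = torch::zeros({1, 5});
  boxes1[0][2] = 4.0f;  // width
  boxes1[0][3] = 2.0f;  // height
  auto boxes2 = boxes1.clone();   // identical box, so IoU should be 1
  auto ious = torch::zeros({1});  // flattened (n1 * n2) output for aligned=false
  box_iou_rotated_cpu(boxes1, boxes2, ious, /*mode_flag=*/0, /*aligned=*/false);
  return 0;
}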
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
#include "box_iou_rotated_cuda.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void box_iou_rotated_cuda_launcher(const DArrayLite boxes1,
const DArrayLite boxes2, DArrayLite ious,
const int mode_flag, const bool aligned,
cudaStream_t stream) {
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
using scalar_t = float;
AT_ASSERTM(boxes1.type().is_cuda(), "boxes1 must be a CUDA tensor");
AT_ASSERTM(boxes2.type().is_cuda(), "boxes2 must be a CUDA tensor");
int output_size = ious.size();
int num_boxes1 = boxes1.dim(0);
int num_boxes2 = boxes2.dim(0);
int output_size = ious.numel();
int num_boxes1 = boxes1.size(0);
int num_boxes2 = boxes2.size(0);
at::cuda::CUDAGuard device_guard(boxes1.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
box_iou_rotated_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
num_boxes1, num_boxes2, boxes1.ptr<scalar_t>(),
boxes2.ptr<scalar_t>(), (scalar_t*)ious.ptr<scalar_t>(), mode_flag,
aligned);
PARROTS_CUDA_CHECK(cudaGetLastError());
num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),
boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),
mode_flag, aligned);
AT_CUDA_CHECK(cudaGetLastError());
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "box_iou_rotated_pytorch.h"
using namespace parrots;
/*
* void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor
* ious, const int mode_flag, const bool aligned);
*/
void box_iou_rotated_cpu_parrots(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
bool aligned;
int mode_flag;
SSAttrs(attr)
.get<bool>("aligned", aligned)
.get<int>("mode_flag", mode_flag)
.done();
const auto& boxes1 = buildATensor(ctx, ins[0]);
const auto& boxes2 = buildATensor(ctx, ins[1]);
auto ious = buildATensor(ctx, outs[0]);
box_iou_rotated_cpu(boxes1, boxes2, ious, mode_flag, aligned);
}
#ifdef MMCV_WITH_CUDA
/*
* void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor
* ious, const int mode_flag, const bool aligned);
*/
void box_iou_rotated_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
bool aligned;
int mode_flag;
SSAttrs(attr)
.get<bool>("aligned", aligned)
.get<int>("mode_flag", mode_flag)
.done();
const auto& boxes1 = buildATensor(ctx, ins[0]);
const auto& boxes2 = buildATensor(ctx, ins[1]);
auto ious = buildATensor(ctx, outs[0]);
box_iou_rotated_cuda(boxes1, boxes2, ious, mode_flag, aligned);
}
#endif
PARROTS_EXTENSION_REGISTER(box_iou_rotated)
.attr("aligned")
.attr("mode_flag")
.input(2)
.output(1)
.apply(box_iou_rotated_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(box_iou_rotated_cuda_parrots)
#endif
.done();
#ifndef BOX_IOU_ROTATED_PYTORCH_H
#define BOX_IOU_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
#ifdef MMCV_WITH_CUDA
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
#endif
#endif // BOX_IOU_ROTATED_PYTORCH_H
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void CARAFEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite rfeatures,
DArrayLite routput, DArrayLite rmasks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
#ifdef MMCV_WITH_CUDA
void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
Tensor rfeatures, Tensor routput,
Tensor rmasks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor);
void CARAFEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite rfeatures,
const DArrayLite masks, DArrayLite rtop_grad, DArrayLite rbottom_grad_hs,
DArrayLite rbottom_grad, DArrayLite rmask_grad, DArrayLite bottom_grad,
DArrayLite mask_grad, const int kernel_size, const int group_size,
const int scale_factor, cudaStream_t stream);
const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
const int kernel_size, const int group_size, const int scale_factor);
void carafe_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = ins[0];
const auto& masks = ins[1];
auto& rfeatures = outs[0];
auto& routput = outs[1];
auto& rmasks = outs[2];
auto& output = outs[3];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,
output, kernel_size, group_size, scale_factor,
stream);
output, kernel_size, group_size,
scale_factor);
}
void carafe_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = ins[0];
const auto& rfeatures = ins[1];
const auto& masks = ins[2];
auto& rtop_grad = outs[0];
auto rbottom_grad_hs = outs[1];
auto& rbottom_grad = outs[2];
auto& rmask_grad = outs[3];
auto& bottom_grad = outs[4];
auto& mask_grad = outs[5];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad,
Tensor bottom_grad, Tensor mask_grad, int kernel_size,
int group_size, int scale_factor) {
CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
rbottom_grad_hs, rbottom_grad, rmask_grad,
bottom_grad, mask_grad, kernel_size,
group_size, scale_factor, stream);
group_size, scale_factor);
}
#endif
PARROTS_EXTENSION_REGISTER(carafe_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(4)
.apply(carafe_forward_cuda)
.done();
void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
if (features.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(features);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(rfeatures);
CHECK_CUDA_INPUT(routput);
CHECK_CUDA_INPUT(rmasks);
CHECK_CUDA_INPUT(output);
carafe_forward_cuda(features, masks, rfeatures, routput, rmasks, output,
kernel_size, group_size, scale_factor);
#else
AT_ERROR("Carafe is not compiled with GPU support");
#endif
} else {
AT_ERROR("Carafe is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(carafe_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(6)
.apply(carafe_backward_cuda)
.done();
void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
Tensor mask_grad, int kernel_size, int group_size,
int scale_factor) {
if (top_grad.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(top_grad);
CHECK_CUDA_INPUT(rfeatures);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(rtop_grad);
CHECK_CUDA_INPUT(rbottom_grad_hs);
CHECK_CUDA_INPUT(rbottom_grad);
CHECK_CUDA_INPUT(rmask_grad);
CHECK_CUDA_INPUT(bottom_grad);
CHECK_CUDA_INPUT(mask_grad);
carafe_backward_cuda(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
rbottom_grad, rmask_grad, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
#else
AT_ERROR("Carafe is not compiled with GPU support");
#endif
} else {
AT_ERROR("Carafe is not implemented on CPU");
}
}
#include "carafe_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void CARAFEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite rfeatures,
DArrayLite routput, DArrayLite rmasks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
const int batch_size = output.dim(0);
const int channels = output.dim(1);
const int output_height = output.dim(2);
const int output_width = output.dim(3);
void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
Tensor rfeatures, Tensor routput,
Tensor rmasks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor) {
const int batch_size = output.size(0);
const int channels = output.size(1);
const int output_height = output.size(2);
const int output_width = output.size(3);
const int input_height = features.dim(2);
const int input_width = features.dim(3);
const int input_height = features.size(2);
const int input_width = features.size(3);
const int mask_channels = masks.dim(1);
const int mask_channels = masks.size(1);
rfeatures.resize_({batch_size, input_height, input_width, channels});
routput.resize_({batch_size, output_height, output_width, channels});
rmasks.resize_({batch_size, output_height, output_width, mask_channels});
// one warp per pixel
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(features.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "NCHW2NHWC_Feature", ([&] {
const scalar_t *bottom_data = features.data_ptr<scalar_t>();
scalar_t *top_data = rfeatures.data_ptr<scalar_t>();
const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(input_height * input_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, input_height * input_width, dh, dw,
features.ptr<scalar_t>(), rfeatures.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "NCHW2NHWC_Masks", ([&] {
const scalar_t *bottom_data = masks.data_ptr<scalar_t>();
scalar_t *top_data = rmasks.data_ptr<scalar_t>();
const int dh = divideUP(mask_channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, mask_channels, output_height * output_width, dh, dw,
masks.ptr<scalar_t>(), rmasks.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "CARAFELaucherForward", ([&] {
const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL;
const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
const scalar_t *bottom_masks = rmasks.data_ptr<scalar_t>();
scalar_t *top_data = routput.data_ptr<scalar_t>();
CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, rfeatures.ptr<scalar_t>(), rmasks.ptr<scalar_t>(),
kernel_size, group_size, scale_factor, channels, input_height,
input_width, output_height, output_width, mask_channels,
routput.ptr<scalar_t>());
num_kernels, bottom_data, bottom_masks, kernel_size, group_size,
scale_factor, channels, input_height, input_width, output_height,
output_width, mask_channels, top_data);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "NHWC2NCHW", ([&] {
const scalar_t *bottom_data = routput.data_ptr<scalar_t>();
scalar_t *top_data = output.data_ptr<scalar_t>();
const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, channels, dh, dw,
routput.ptr<scalar_t>(), output.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
void CARAFEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite rfeatures,
const DArrayLite masks, DArrayLite rtop_grad, DArrayLite rbottom_grad_hs,
DArrayLite rbottom_grad, DArrayLite rmask_grad, DArrayLite bottom_grad,
DArrayLite mask_grad, const int kernel_size, const int group_size,
const int scale_factor, cudaStream_t stream) {
const int batch_size = top_grad.dim(0);
const int channels = top_grad.dim(1);
const int output_height = top_grad.dim(2);
const int output_width = top_grad.dim(3);
const int input_height = bottom_grad.dim(2);
const int input_width = bottom_grad.dim(3);
const int mask_channels = masks.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
const int kernel_size, const int group_size, const int scale_factor) {
const int batch_size = top_grad.size(0);
const int channels = top_grad.size(1);
const int output_height = top_grad.size(2);
const int output_width = top_grad.size(3);
const int input_height = bottom_grad.size(2);
const int input_width = bottom_grad.size(3);
const int mask_channels = masks.size(1);
rtop_grad.resize_({batch_size, output_height, output_width, channels});
rbottom_grad.resize_({batch_size, input_height, input_width, channels});
rbottom_grad_hs.resize_({batch_size, output_height, output_width, channels});
rmask_grad.resize_({batch_size, output_height, output_width, mask_channels});
at::cuda::CUDAGuard device_guard(top_grad.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "NCHW2NHWC_Top_Grad", ([&] {
const scalar_t *bottom_data = top_grad.data_ptr<scalar_t>();
scalar_t *top_data = rtop_grad.data_ptr<scalar_t>();
const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, output_height * output_width, dh, dw,
top_grad.ptr<scalar_t>(), rtop_grad.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFELaucherBackward_Feature", ([&] {
const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL;
const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();
scalar_t *bottom_diff = rbottom_grad_hs.data_ptr<scalar_t>();
CARAFEBackward_Feature<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, rtop_grad.ptr<scalar_t>(),
masks.ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, input_height, input_width,
output_height, output_width, mask_channels,
rbottom_grad_hs.ptr<scalar_t>());
stream>>>(num_kernels, top_diff, bottom_masks, kernel_size,
group_size, scale_factor, channels, input_height,
input_width, output_height, output_width,
mask_channels, bottom_diff);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "FeatureSum", ([&] {
const int num_kernels =
batch_size * input_height * input_width * THREADS_PER_PIXEL;
const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr<scalar_t>();
scalar_t *bottom_diff = rbottom_grad.data_ptr<scalar_t>();
FeatureSum<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, rbottom_grad_hs.ptr<scalar_t>(), scale_factor,
channels, input_height, input_width, rbottom_grad.ptr<scalar_t>());
FeatureSum<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, bottom_diff_hs, scale_factor, channels,
input_height, input_width, bottom_diff);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "NHWC2NCHW_Bottom_Grad", ([&] {
const scalar_t *bottom_data = rbottom_grad.data_ptr<scalar_t>();
scalar_t *top_data = bottom_grad.data_ptr<scalar_t>();
const int dh = divideUP(input_height * input_width, kTileDim);
const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, input_height * input_width, channels, dh, dw,
rbottom_grad.ptr<scalar_t>(), bottom_grad.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFELaucherBackward_Mask", ([&] {
const int num_kernels = batch_size * output_height * output_width *
mask_channels * WARP_SIZE;
const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
scalar_t *mask_diff = rmask_grad.data_ptr<scalar_t>();
CARAFEBackward_Mask<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, rtop_grad.ptr<scalar_t>(),
rfeatures.ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, input_height, input_width,
output_height, output_width, mask_channels,
rmask_grad.ptr<scalar_t>());
stream>>>(num_kernels, top_diff, bottom_data, kernel_size,
group_size, scale_factor, channels, input_height,
input_width, output_height, output_width,
mask_channels, mask_diff);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "NHWC2NCHW_Mask_Grad", ([&] {
const scalar_t *bottom_data = rmask_grad.data_ptr<scalar_t>();
scalar_t *top_data = mask_grad.data_ptr<scalar_t>();
const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(mask_channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, mask_channels, dh, dw,
rmask_grad.ptr<scalar_t>(), mask_grad.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
#ifdef MMCV_WITH_CUDA
void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
const Tensor masks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor);
void CARAFENAIVEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite features,
const DArrayLite masks, DArrayLite bottom_grad, DArrayLite mask_grad,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
const Tensor top_grad, const Tensor features, const Tensor masks,
Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
const int group_size, const int scale_factor);
void carafe_naive_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = ins[0];
const auto& masks = ins[1];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size,
int scale_factor) {
CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size,
group_size, scale_factor, stream);
group_size, scale_factor);
}
void carafe_naive_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = ins[0];
const auto& features = ins[1];
const auto& masks = ins[2];
auto& bottom_grad = outs[0];
auto& mask_grad = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size,
int scale_factor) {
CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,
mask_grad, kernel_size, group_size,
scale_factor, stream);
scale_factor);
}
#endif
PARROTS_EXTENSION_REGISTER(carafe_naive_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(1)
.apply(carafe_naive_forward_cuda)
.done();
void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
if (features.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(features);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(output);
carafe_naive_forward_cuda(features, masks, output, kernel_size, group_size,
scale_factor);
#else
AT_ERROR("CarafeNaive is not compiled with GPU support");
#endif
} else {
AT_ERROR("CarafeNaive is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(carafe_naive_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(2)
.apply(carafe_naive_backward_cuda)
.done();
void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size, int scale_factor) {
if (top_grad.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(top_grad);
CHECK_CUDA_INPUT(features);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(bottom_grad);
CHECK_CUDA_INPUT(mask_grad);
carafe_naive_backward_cuda(top_grad, features, masks, bottom_grad,
mask_grad, kernel_size, group_size,
scale_factor);
#else
AT_ERROR("CarafeNaive is not compiled with GPU support");
#endif
} else {
AT_ERROR("CarafeNaive is not implemented on CPU");
}
}
#include "carafe_naive_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
int output_size = output.size();
int channels = output.dim(1);
int height = output.dim(2);
int width = output.dim(3);
void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
const Tensor masks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor) {
int output_size = output.numel();
int channels = output.size(1);
int height = output.size(2);
int width = output.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(features.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "CARAFENAIVEForward", ([&] {
carafe_naive_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, features.ptr<scalar_t>(), masks.ptr<scalar_t>(),
output.ptr<scalar_t>(), kernel_size, group_size, scale_factor,
channels, height, width);
output_size, features.data_ptr<scalar_t>(),
masks.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
kernel_size, group_size, scale_factor, channels, height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
void CARAFENAIVEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite features,
const DArrayLite masks, DArrayLite bottom_grad, DArrayLite mask_grad,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
int output_size = top_grad.size();
int channels = top_grad.dim(1);
int height = top_grad.dim(2);
int width = top_grad.dim(3);
const Tensor top_grad, const Tensor features, const Tensor masks,
Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
const int group_size, const int scale_factor) {
int output_size = top_grad.numel();
int channels = top_grad.size(1);
int height = top_grad.size(2);
int width = top_grad.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(top_grad.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFENAIVEBackward", ([&] {
carafe_naive_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, top_grad.ptr<scalar_t>(), features.ptr<scalar_t>(),
masks.ptr<scalar_t>(), bottom_grad.ptr<scalar_t>(),
mask_grad.ptr<scalar_t>(), kernel_size, group_size,
output_size, top_grad.data_ptr<scalar_t>(),
features.data_ptr<scalar_t>(), masks.data_ptr<scalar_t>(),
bottom_grad.data_ptr<scalar_t>(),
mask_grad.data_ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "carafe_naive_pytorch.h"
using namespace parrots;
/*void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
* int kernel_size, int group_size,
* int scale_factor)
*/
void carafe_naive_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = buildATensor(ctx, ins[0]);
const auto& masks = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
carafe_naive_forward_cuda(features, masks, output, kernel_size, group_size,
scale_factor);
}
/*void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor
* masks, Tensor bottom_grad, Tensor mask_grad, int kernel_size, int group_size,
* int scale_factor);
*/
void carafe_naive_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = buildATensor(ctx, ins[0]);
const auto& features = buildATensor(ctx, ins[1]);
const auto& masks = buildATensor(ctx, ins[2]);
auto bottom_grad = buildATensor(ctx, outs[0]);
auto mask_grad = buildATensor(ctx, outs[1]);
carafe_naive_backward_cuda(top_grad, features, masks, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
}
PARROTS_EXTENSION_REGISTER(carafe_naive_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(1)
.apply(carafe_naive_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(carafe_naive_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(2)
.apply(carafe_naive_backward_cuda_parrots)
.done();
#ifndef CARAFE_NAIVE_PYTORCH_H
#define CARAFE_NAIVE_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size,
int scale_factor);
void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size,
int scale_factor);
#endif // CARAFE_NAIVE_PYTORCH_H
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "carafe_pytorch.h"
using namespace parrots;
/*
* void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
* Tensor routput, Tensor rmasks, Tensor output,
* int kernel_size, int group_size, int scale_factor);
*/
void carafe_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = buildATensor(ctx, ins[0]);
const auto& masks = buildATensor(ctx, ins[1]);
auto rfeatures = buildATensor(ctx, outs[0]);
auto routput = buildATensor(ctx, outs[1]);
auto rmasks = buildATensor(ctx, outs[2]);
auto output = buildATensor(ctx, outs[3]);
carafe_forward_cuda(features, masks, rfeatures, routput, rmasks, output,
kernel_size, group_size, scale_factor);
}
/*
* void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
* Tensor rtop_grad, Tensor rbottom_grad_hs,
* Tensor rbottom_grad, Tensor rmask_grad,
* Tensor bottom_grad, Tensor mask_grad, int
* kernel_size, int group_size, int scale_factor);
*/
void carafe_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = buildATensor(ctx, ins[0]);
const auto& rfeatures = buildATensor(ctx, ins[1]);
const auto& masks = buildATensor(ctx, ins[2]);
auto rtop_grad = buildATensor(ctx, outs[0]);
auto rbottom_grad_hs = buildATensor(ctx, outs[1]);
auto rbottom_grad = buildATensor(ctx, outs[2]);
auto rmask_grad = buildATensor(ctx, outs[3]);
auto bottom_grad = buildATensor(ctx, outs[4]);
auto mask_grad = buildATensor(ctx, outs[5]);
carafe_backward_cuda(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
rbottom_grad, rmask_grad, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
}
PARROTS_EXTENSION_REGISTER(carafe_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(4)
.apply(carafe_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(carafe_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(6)
.apply(carafe_backward_cuda_parrots)
.done();
#ifndef CARAFE_PYTORCH_H
#define CARAFE_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor);
void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad,
Tensor bottom_grad, Tensor mask_grad, int kernel_size,
int group_size, int scale_factor);
#endif // CARAFE_PYTORCH_H
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void CAForwardCUDAKernelLauncher(const DArrayLite t, const DArrayLite f,
DArrayLite weight, CudaContext &ctx,
cudaStream_t stream);
#ifdef MMCV_WITH_CUDA
void CAForwardCUDAKernelLauncher(const Tensor t, const Tensor f, Tensor weight);
void CABackwardCUDAKernelLauncher(const DArrayLite dw, const DArrayLite t,
const DArrayLite f, DArrayLite dt,
DArrayLite df, CudaContext &ctx,
cudaStream_t stream);
void CABackwardCUDAKernelLauncher(const Tensor dw, const Tensor t,
const Tensor f, Tensor dt, Tensor df);
void CAMapForwardCUDAKernelLauncher(const DArrayLite weight, const DArrayLite g,
DArrayLite out, CudaContext &ctx,
cudaStream_t stream);
void CAMapForwardCUDAKernelLauncher(const Tensor weight, const Tensor g,
Tensor out);
void CAMapBackwardCUDAKernelLauncher(const DArrayLite dout,
const DArrayLite weight,
const DArrayLite g, DArrayLite dw,
DArrayLite dg, CudaContext &ctx,
cudaStream_t stream);
void CAMapBackwardCUDAKernelLauncher(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg);
void ca_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &t = ins[0];
const auto &f = ins[1];
auto &weight = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAForwardCUDAKernelLauncher(t, f, weight, ctx, stream);
void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight) {
CAForwardCUDAKernelLauncher(t, f, weight);
}
void ca_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dw = ins[0];
const auto &t = ins[1];
const auto &f = ins[2];
auto &dt = outs[0];
auto &df = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CABackwardCUDAKernelLauncher(dw, t, f, dt, df, ctx, stream);
void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
Tensor dt, Tensor df) {
CABackwardCUDAKernelLauncher(dw, t, f, dt, df);
}
void ca_map_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &weight = ins[0];
const auto &g = ins[1];
auto &out = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAMapForwardCUDAKernelLauncher(weight, g, out, ctx, stream);
void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out) {
CAMapForwardCUDAKernelLauncher(weight, g, out);
}
void ca_map_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dout = ins[0];
const auto &weight = ins[1];
const auto &g = ins[2];
auto &dw = outs[0];
auto &dg = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAMapBackwardCUDAKernelLauncher(dout, weight, g, dw, dg, ctx, stream);
void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg) {
CAMapBackwardCUDAKernelLauncher(dout, weight, g, dw, dg);
}
#endif
PARROTS_EXTENSION_REGISTER(ca_forward)
.input(2)
.output(1)
.apply(ca_forward_cuda)
.done();
void ca_forward(const Tensor t, const Tensor f, Tensor weight) {
if (t.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(t);
CHECK_CUDA_INPUT(f);
CHECK_CUDA_INPUT(weight);
ca_forward_cuda(t, f, weight);
#else
AT_ERROR("ca is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
PARROTS_EXTENSION_REGISTER(ca_backward)
.input(3)
.output(2)
.apply(ca_backward_cuda)
.done();
void ca_backward(const Tensor dw, const Tensor t, const Tensor f, Tensor dt,
Tensor df) {
if (dw.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(dw);
CHECK_CUDA_INPUT(t);
CHECK_CUDA_INPUT(f);
CHECK_CUDA_INPUT(dt);
CHECK_CUDA_INPUT(df);
ca_backward_cuda(dw, t, f, dt, df);
#else
AT_ERROR("ca is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
PARROTS_EXTENSION_REGISTER(ca_map_forward)
.input(2)
.output(1)
.apply(ca_map_forward_cuda)
.done();
void ca_map_forward(const Tensor weight, const Tensor g, Tensor out) {
if (weight.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(g);
CHECK_CUDA_INPUT(out);
ca_map_forward_cuda(weight, g, out);
#else
AT_ERROR("ca_map is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
PARROTS_EXTENSION_REGISTER(ca_map_backward)
.input(3)
.output(2)
.apply(ca_map_backward_cuda)
.done();
void ca_map_backward(const Tensor dout, const Tensor weight, const Tensor g,
Tensor dw, Tensor dg) {
if (dout.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(dout);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(g);
CHECK_CUDA_INPUT(dw);
CHECK_CUDA_INPUT(dg);
ca_map_backward_cuda(dout, weight, g, dw, dg);
#else
AT_ERROR("ca_map is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
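A host-side sketch of how the criss-cross attention forward wrapper is typically driven. It assumes a CUDA build linked against the compiled ops, and assumes the criss-cross layout in which weight stores H + W - 1 affinities per spatial position; the shapes below are illustrative only.

#include <torch/torch.h>

void ca_forward(const at::Tensor t, const at::Tensor f, at::Tensor weight);

int main() {
  auto opts = torch::dtype(torch::kFloat32).device(torch::kCUDA);
  const int64_t n = 2, c = 8, h = 16, w = 16;
  auto t = torch::rand({n, c, h, w}, opts);  // query projection
  auto f = torch::rand({n, c, h, w}, opts);  // key projection
  // Affinities for each pixel's row and column (assumed layout).
  auto weight = torch::zeros({n, h + w - 1, h, w}, opts);
  ca_forward(t, f, weight);
  return 0;
}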
// Modified from
// https://github.com/LikeLy-Journey/SegmenTron/blob/master/segmentron/modules/csrc/criss_cross_attention/ca_cuda.cu
#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>
#include "cc_attention_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void CAForwardCUDAKernelLauncher(const Tensor t, const Tensor f,
Tensor weight) {
AT_ASSERTM(t.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(f.device().is_cuda(), "input must be a CUDA tensor");
auto n = t.size(0);
auto c = t.size(1);
auto h = t.size(2);
auto w = t.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(t.scalar_type(), "ca_forward", [&] {
ca_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CABackwardCUDAKernelLauncher(const Tensor dw, const Tensor t,
const Tensor f, Tensor dt, Tensor df) {
AT_ASSERTM(dw.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(t.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(f.device().is_cuda(), "input must be a CUDA tensor");
auto n = t.size(0);
auto c = t.size(1);
auto h = t.size(2);
auto w = t.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(t.scalar_type(), "ca_backward_kernel_t", [&] {
ca_backward_kernel_t<scalar_t><<<blocks, threads, 0, stream>>>(
dw.contiguous().data_ptr<scalar_t>(),
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
dt.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
AT_DISPATCH_FLOATING_TYPES(f.scalar_type(), "ca_backward_kernel_f", [&] {
ca_backward_kernel_f<scalar_t><<<blocks, threads, 0, stream>>>(
dw.contiguous().data_ptr<scalar_t>(),
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
df.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CAMapForwardCUDAKernelLauncher(const Tensor weight, const Tensor g,
Tensor out) {
AT_ASSERTM(weight.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(g.device().is_cuda(), "input must be a CUDA tensor");
auto n = g.size(0);
auto c = g.size(1);
auto h = g.size(2);
auto w = g.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(g.scalar_type(), "ca_map_forward", [&] {
ca_map_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
out.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CAMapBackwardCUDAKernelLauncher(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg) {
AT_ASSERTM(dout.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(weight.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(g.device().is_cuda(), "input must be a CUDA tensor");
auto n = dout.size(0);
auto c = dout.size(1);
auto h = dout.size(2);
auto w = dout.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(
weight.scalar_type(), "ca_map_backward_kernel_w", [&] {
ca_map_backward_kernel_w<scalar_t><<<blocks, threads, 0, stream>>>(
dout.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
dw.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
AT_DISPATCH_FLOATING_TYPES(g.scalar_type(), "ca_map_backward_kernel_g", [&] {
ca_map_backward_kernel_g<scalar_t><<<blocks, threads, 0, stream>>>(
dout.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
dg.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
#include "cc_attention_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void CAForwardCUDAKernelLauncher(const DArrayLite t, const DArrayLite f,
DArrayLite weight, CudaContext &ctx,
cudaStream_t stream) {
auto n = t.dim(0);
auto c = t.dim(1);
auto h = t.dim(2);
auto w = t.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(t.elemType().prim(), [&] {
ca_forward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(t.ptr<scalar_t>(), f.ptr<scalar_t>(),
weight.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CABackwardCUDAKernelLauncher(const DArrayLite dw, const DArrayLite t,
const DArrayLite f, DArrayLite dt,
DArrayLite df, CudaContext &ctx,
cudaStream_t stream) {
auto n = t.dim(0);
auto c = t.dim(1);
auto h = t.dim(2);
auto w = t.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(t.elemType().prim(), [&] {
ca_backward_kernel_t<scalar_t><<<blocks, threads, 0, stream>>>(
dw.ptr<scalar_t>(), t.ptr<scalar_t>(), f.ptr<scalar_t>(),
dt.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_DISPATCH_FLOATING_TYPES(f.elemType().prim(), [&] {
ca_backward_kernel_f<scalar_t><<<blocks, threads, 0, stream>>>(
dw.ptr<scalar_t>(), t.ptr<scalar_t>(), f.ptr<scalar_t>(),
df.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CAMapForwardCUDAKernelLauncher(const DArrayLite weight, const DArrayLite g,
DArrayLite out, CudaContext &ctx,
cudaStream_t stream) {
auto n = g.dim(0);
auto c = g.dim(1);
auto h = g.dim(2);
auto w = g.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(g.elemType().prim(), [&] {
ca_map_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
weight.ptr<scalar_t>(), g.ptr<scalar_t>(), out.ptr<scalar_t>(), n, c, h,
w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CAMapBackwardCUDAKernelLauncher(const DArrayLite dout,
const DArrayLite weight,
const DArrayLite g, DArrayLite dw,
DArrayLite dg, CudaContext &ctx,
cudaStream_t stream) {
auto n = dout.dim(0);
auto c = dout.dim(1);
auto h = dout.dim(2);
auto w = dout.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(weight.elemType().prim(), [&] {
ca_map_backward_kernel_w<scalar_t><<<blocks, threads, 0, stream>>>(
dout.ptr<scalar_t>(), weight.ptr<scalar_t>(), g.ptr<scalar_t>(),
dw.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_DISPATCH_FLOATING_TYPES(g.elemType().prim(), [&] {
ca_map_backward_kernel_g<scalar_t><<<blocks, threads, 0, stream>>>(
dout.ptr<scalar_t>(), weight.ptr<scalar_t>(), g.ptr<scalar_t>(),
dg.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}