"composable_kernel/include/utility/array.hpp" did not exist on "0a2657312ec62a65e92a36cebd7d3b2a3c0712e1"
Unverified Commit 48d99025 authored by z55250825's avatar z55250825 Committed by GitHub
Browse files

Add new parrots extension implementation for all ops (#794)

* delete all parrots file
add bbox_overlaps new parrots op impl

* support first new parrots op impl (bbox_overlaps) (test succeeded)

* add box_iou_rotated op, test succeed

* add carafe and carafe_naive op, test succeed (one parrots bug need fix)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test success (but has question)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
parent 72e4cc12
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "cc_attention_pytorch.h"
using namespace parrots;
/*void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight);*/
void ca_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                             const OperatorBase::in_list_t &ins,
                             OperatorBase::out_list_t &outs) {
  // Bridge: wrap the parrots arrays as ATen tensors and invoke the
  // Criss-Cross attention forward launcher.
  // ins: (t, f); outs: (weight).
  auto t = buildATensor(ctx, ins[0]);
  auto f = buildATensor(ctx, ins[1]);
  auto weight = buildATensor(ctx, outs[0]);
  ca_forward_cuda(t, f, weight);
}
/* void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
* Tensor dt, Tensor df)
*/
void ca_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                              const OperatorBase::in_list_t &ins,
                              OperatorBase::out_list_t &outs) {
  // Bridge to the Criss-Cross attention backward launcher.
  // ins: (dw, t, f); outs: (dt, df).
  auto dw = buildATensor(ctx, ins[0]);
  auto t = buildATensor(ctx, ins[1]);
  auto f = buildATensor(ctx, ins[2]);
  auto dt = buildATensor(ctx, outs[0]);
  auto df = buildATensor(ctx, outs[1]);
  ca_backward_cuda(dw, t, f, dt, df);
}
/* void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out); */
void ca_map_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                 const OperatorBase::in_list_t &ins,
                                 OperatorBase::out_list_t &outs) {
  // Bridge to the attention-map forward launcher.
  // ins: (weight, g); outs: (out).
  auto weight = buildATensor(ctx, ins[0]);
  auto g = buildATensor(ctx, ins[1]);
  auto out = buildATensor(ctx, outs[0]);
  ca_map_forward_cuda(weight, g, out);
}
/* void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
* const Tensor g, Tensor dw, Tensor dg);
*/
void ca_map_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                  const OperatorBase::in_list_t &ins,
                                  OperatorBase::out_list_t &outs) {
  // Bridge to the attention-map backward launcher.
  // ins: (dout, weight, g); outs: (dw, dg).
  auto dout = buildATensor(ctx, ins[0]);
  auto weight = buildATensor(ctx, ins[1]);
  auto g = buildATensor(ctx, ins[2]);
  auto dw = buildATensor(ctx, outs[0]);
  auto dg = buildATensor(ctx, outs[1]);
  ca_map_backward_cuda(dout, weight, g, dw, dg);
}
// Register the Criss-Cross attention ops with the parrots runtime.
// The input/output arities below must match the ins/outs indexing used by
// the corresponding *_parrots wrapper above.

// ca_forward: ins (t, f) -> outs (weight)
PARROTS_EXTENSION_REGISTER(ca_forward)
    .input(2)
    .output(1)
    .apply(ca_forward_cuda_parrots)
    .done();

// ca_backward: ins (dw, t, f) -> outs (dt, df)
PARROTS_EXTENSION_REGISTER(ca_backward)
    .input(3)
    .output(2)
    .apply(ca_backward_cuda_parrots)
    .done();

// ca_map_forward: ins (weight, g) -> outs (out)
PARROTS_EXTENSION_REGISTER(ca_map_forward)
    .input(2)
    .output(1)
    .apply(ca_map_forward_cuda_parrots)
    .done();

// ca_map_backward: ins (dout, weight, g) -> outs (dw, dg)
PARROTS_EXTENSION_REGISTER(ca_map_backward)
    .input(3)
    .output(2)
    .apply(ca_map_backward_cuda_parrots)
    .done();
#ifndef CC_ATTENTION_PYTORCH_H
#define CC_ATTENTION_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Criss-Cross attention CUDA launchers (implemented elsewhere; only the
// declarations are visible here). Tensors passed by value are ATen handles,
// so the callee still writes into the caller's storage for the non-const
// output arguments.

// Forward: consumes t and f, writes the attention weights into `weight`.
void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight);
// Backward of ca_forward: consumes dw, t, f; writes gradients dt and df.
void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
                      Tensor dt, Tensor df);
// Forward of the map step: consumes weight and g, writes `out`.
void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out);
// Backward of the map step: consumes dout, weight, g; writes dw and dg.
void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
                          const Tensor g, Tensor dw, Tensor dg);
#endif  // CC_ATTENTION_PYTORCH_H
// Modified from // Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src // https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void bottom_pool_forward_cuda(CudaContext& ctx, const SSElement& attr, Tensor bottom_pool_forward(Tensor input) {
const OperatorBase::in_list_t& ins, // Initialize output
OperatorBase::out_list_t& outs) {} Tensor output = at::zeros_like(input);
// Get height
void bottom_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, int64_t height = input.size(2);
const OperatorBase::in_list_t& ins, output.copy_(input);
OperatorBase::out_list_t& outs) {}
for (int64_t ind = 1; ind < height; ind <<= 1) {
void top_pool_forward_cuda(CudaContext& ctx, const SSElement& attr, Tensor max_temp = at::slice(output, 2, ind, height);
const OperatorBase::in_list_t& ins, Tensor cur_temp = at::slice(output, 2, ind, height).clone();
OperatorBase::out_list_t& outs) {} Tensor next_temp = at::slice(output, 2, 0, height - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
void top_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, }
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {} return output;
}
void left_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins, Tensor bottom_pool_backward(Tensor input, Tensor grad_output) {
OperatorBase::out_list_t& outs) {} auto output = at::zeros_like(input);
void left_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, int32_t batch = input.size(0);
const OperatorBase::in_list_t& ins, int32_t channel = input.size(1);
OperatorBase::out_list_t& outs) {} int32_t height = input.size(2);
int32_t width = input.size(3);
void right_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins, auto max_val = torch::zeros({batch, channel, width},
OperatorBase::out_list_t& outs) {} at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
void right_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, at::device(at::kCUDA).dtype(at::kLong));
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {} auto input_temp = input.select(2, 0);
max_val.copy_(input_temp);
PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
.input(1) max_ind.fill_(0);
.output(1)
.apply(bottom_pool_forward_cuda) auto output_temp = output.select(2, 0);
.done(); auto grad_output_temp = grad_output.select(2, 0);
output_temp.copy_(grad_output_temp);
PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
.input(2) auto un_max_ind = max_ind.unsqueeze(2);
.output(1) auto gt_mask = torch::zeros({batch, channel, width},
.apply(bottom_pool_backward_cuda) at::device(at::kCUDA).dtype(at::kBool));
.done(); auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
PARROTS_EXTENSION_REGISTER(top_pool_forward) for (int32_t ind = 0; ind < height - 1; ++ind) {
.input(1) input_temp = input.select(2, ind + 1);
.output(1) at::gt_out(gt_mask, input_temp, max_val);
.apply(top_pool_forward_cuda)
.done(); at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
PARROTS_EXTENSION_REGISTER(top_pool_backward) max_ind.masked_fill_(gt_mask, ind + 1);
.input(2)
.output(1) grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2);
.apply(top_pool_backward_cuda) output.scatter_add_(2, un_max_ind, grad_output_temp);
.done(); }
PARROTS_EXTENSION_REGISTER(left_pool_forward) return output;
.input(1) }
.output(1)
.apply(left_pool_forward_cuda) Tensor left_pool_forward(Tensor input) {
.done(); // Initialize output
Tensor output = at::zeros_like(input);
PARROTS_EXTENSION_REGISTER(left_pool_backward) // Get width
.input(2) int64_t width = input.size(3);
.output(1) output.copy_(input);
.apply(left_pool_backward_cuda)
.done(); for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, 0, width - ind);
PARROTS_EXTENSION_REGISTER(right_pool_forward) Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone();
.input(1) Tensor next_temp = at::slice(output, 3, ind, width).clone();
.output(1) at::max_out(max_temp, cur_temp, next_temp);
.apply(right_pool_forward_cuda) }
.done();
return output;
PARROTS_EXTENSION_REGISTER(right_pool_backward) }
.input(2)
.output(1) Tensor left_pool_backward(Tensor input, Tensor grad_output) {
.apply(right_pool_backward_cuda) auto output = at::zeros_like(input);
.done();
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, width - 1);
max_val.copy_(input_temp);
max_ind.fill_(width - 1);
auto output_temp = output.select(3, width - 1);
auto grad_output_temp = grad_output.select(3, width - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < width; ++ind) {
input_temp = input.select(3, width - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, width - ind - 1);
grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor right_pool_forward(Tensor input) {
  // Running-max scan along the width axis (dim 3), left to right, done with
  // O(log W) doubling steps: after step s every column holds the max over a
  // window of the preceding columns twice as wide as before.
  Tensor output = at::zeros_like(input);
  const int64_t width = input.size(3);
  output.copy_(input);

  for (int64_t step = 1; step < width; step <<= 1) {
    // Destination view: columns [step, width) of the running result.
    Tensor dst = at::slice(output, 3, step, width);
    // Clone both operands so max_out never reads storage it is writing.
    Tensor lhs = dst.clone();
    Tensor rhs = at::slice(output, 3, 0, width - step).clone();
    at::max_out(dst, lhs, rhs);
  }

  return output;
}
// Backward pass of right_pool: each grad_output element is routed to the
// input column that produced the running maximum in the forward scan.
// NOTE(review): the temporaries are allocated as kFloat on kCUDA regardless
// of input dtype/device — this assumes float CUDA tensors; confirm callers.
Tensor right_pool_backward(Tensor input, Tensor grad_output) {
  Tensor output = at::zeros_like(input);

  int32_t batch = input.size(0);
  int32_t channel = input.size(1);
  int32_t height = input.size(2);
  int32_t width = input.size(3);

  // Per-(b, c, h) running maximum value and the column index holding it.
  auto max_val = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kFloat));
  auto max_ind = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kLong));

  // Seed the scan with column 0; its gradient passes straight through.
  auto input_temp = input.select(3, 0);
  max_val.copy_(input_temp);
  max_ind.fill_(0);

  auto output_temp = output.select(3, 0);
  auto grad_output_temp = grad_output.select(3, 0);
  output_temp.copy_(grad_output_temp);

  // un_max_ind is a view of max_ind, so scatter_add_ below always sees the
  // current argmax after masked_fill_ updates it.
  auto un_max_ind = max_ind.unsqueeze(3);
  auto gt_mask = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kBool));
  auto max_temp = torch::zeros({batch, channel, height},
                               at::device(at::kCUDA).dtype(at::kFloat));
  // Sweep left to right: where column ind+1 strictly exceeds the running
  // max, update the max value/index, then add that column's grad_output at
  // the (possibly updated) argmax position.
  for (int32_t ind = 0; ind < width - 1; ++ind) {
    input_temp = input.select(3, ind + 1);
    at::gt_out(gt_mask, input_temp, max_val);

    at::masked_select_out(max_temp, input_temp, gt_mask);
    max_val.masked_scatter_(gt_mask, max_temp);
    max_ind.masked_fill_(gt_mask, ind + 1);

    grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3);
    output.scatter_add_(3, un_max_ind, grad_output_temp);
  }

  return output;
}
Tensor top_pool_forward(Tensor input) {
  // Running-max scan along the height axis (dim 2), bottom to top, using
  // O(log H) doubling steps over ever-wider windows.
  Tensor output = at::zeros_like(input);
  const int64_t height = input.size(2);
  output.copy_(input);

  for (int64_t step = 1; step < height; step <<= 1) {
    // Destination view: rows [0, height - step) of the running result.
    Tensor dst = at::slice(output, 2, 0, height - step);
    // Clone both operands so max_out never reads storage it is writing.
    Tensor lhs = dst.clone();
    Tensor rhs = at::slice(output, 2, step, height).clone();
    at::max_out(dst, lhs, rhs);
  }

  return output;
}
// Backward pass of top_pool: each grad_output element is routed to the input
// row that produced the running maximum in the forward scan (which proceeds
// from the bottom row upward).
// NOTE(review): the temporaries are allocated as kFloat on kCUDA regardless
// of input dtype/device — this assumes float CUDA tensors; confirm callers.
Tensor top_pool_backward(Tensor input, Tensor grad_output) {
  auto output = at::zeros_like(input);

  int32_t batch = input.size(0);
  int32_t channel = input.size(1);
  int32_t height = input.size(2);
  int32_t width = input.size(3);

  // Per-(b, c, w) running maximum value and the row index holding it.
  auto max_val = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kFloat));
  auto max_ind = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kLong));

  // Seed the scan with the bottom row; its gradient passes straight through.
  auto input_temp = input.select(2, height - 1);
  max_val.copy_(input_temp);
  max_ind.fill_(height - 1);

  auto output_temp = output.select(2, height - 1);
  auto grad_output_temp = grad_output.select(2, height - 1);
  output_temp.copy_(grad_output_temp);

  // un_max_ind is a view of max_ind, so scatter_add_ below always sees the
  // current argmax after masked_fill_ updates it.
  auto un_max_ind = max_ind.unsqueeze(2);
  auto gt_mask = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kBool));
  auto max_temp = torch::zeros({batch, channel, width},
                               at::device(at::kCUDA).dtype(at::kFloat));
  // Sweep upward: where row (height - ind - 1) strictly exceeds the running
  // max, update the max value/index, then add that row's grad_output at the
  // (possibly updated) argmax position.
  for (int32_t ind = 1; ind < height; ++ind) {
    input_temp = input.select(2, height - ind - 1);
    at::gt_out(gt_mask, input_temp, max_val);

    at::masked_select_out(max_temp, input_temp, gt_mask);
    max_val.masked_scatter_(gt_mask, max_temp);
    max_ind.masked_fill_(gt_mask, height - ind - 1);

    grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2);
    output.scatter_add_(2, un_max_ind, grad_output_temp);
  }

  return output;
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "corner_pool_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void bottom_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
                                 const OperatorBase::in_list_t& ins,
                                 OperatorBase::out_list_t& outs) {
  // ins: (input); outs: (output). Bridge DArray -> ATen, run, write back.
  updateDArray(ctx, bottom_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void bottom_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
                                  const OperatorBase::in_list_t& ins,
                                  OperatorBase::out_list_t& outs) {
  // ins: (input, grad_output); outs: (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, bottom_pool_backward(input, grad_output), outs[0]);
}
void left_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
                               const OperatorBase::in_list_t& ins,
                               OperatorBase::out_list_t& outs) {
  // ins: (input); outs: (output). Bridge DArray -> ATen, run, write back.
  updateDArray(ctx, left_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void left_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
                                const OperatorBase::in_list_t& ins,
                                OperatorBase::out_list_t& outs) {
  // ins: (input, grad_output); outs: (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, left_pool_backward(input, grad_output), outs[0]);
}
void right_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
                                const OperatorBase::in_list_t& ins,
                                OperatorBase::out_list_t& outs) {
  // ins: (input); outs: (output). Bridge DArray -> ATen, run, write back.
  updateDArray(ctx, right_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void right_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
                                 const OperatorBase::in_list_t& ins,
                                 OperatorBase::out_list_t& outs) {
  // ins: (input, grad_output); outs: (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, right_pool_backward(input, grad_output), outs[0]);
}
void top_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
                              const OperatorBase::in_list_t& ins,
                              OperatorBase::out_list_t& outs) {
  // ins: (input); outs: (output). Bridge DArray -> ATen, run, write back.
  updateDArray(ctx, top_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void top_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
                               const OperatorBase::in_list_t& ins,
                               OperatorBase::out_list_t& outs) {
  // ins: (input, grad_output); outs: (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, top_pool_backward(input, grad_output), outs[0]);
}
#endif
void bottom_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input) -> outs (output).
  updateDArray(ctx, bottom_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void bottom_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input, grad_output) -> outs (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, bottom_pool_backward(input, grad_output), outs[0]);
}
void left_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                   const OperatorBase::in_list_t& ins,
                                   OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input) -> outs (output).
  updateDArray(ctx, left_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void left_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input, grad_output) -> outs (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, left_pool_backward(input, grad_output), outs[0]);
}
void right_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input) -> outs (output).
  updateDArray(ctx, right_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void right_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input, grad_output) -> outs (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, right_pool_backward(input, grad_output), outs[0]);
}
void top_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                  const OperatorBase::in_list_t& ins,
                                  OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input) -> outs (output).
  updateDArray(ctx, top_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void top_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                   const OperatorBase::in_list_t& ins,
                                   OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input, grad_output) -> outs (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, top_pool_backward(input, grad_output), outs[0]);
}
// Register the corner-pool ops with the parrots runtime.
// Forward ops take ins (input) -> outs (output); backward ops take
// ins (input, grad_output) -> outs (grad_input).
// When built with CUDA, the CUDA apply is registered in addition to the
// CPU apply; otherwise only the CPU apply is available.
PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
    .input(1)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(bottom_pool_forward_parrots)
#endif
    .apply(bottom_pool_forward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
    .input(2)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(bottom_pool_backward_parrots)
#endif
    .apply(bottom_pool_backward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(top_pool_forward)
    .input(1)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(top_pool_forward_parrots)
#endif
    .apply(top_pool_forward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(top_pool_backward)
    .input(2)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(top_pool_backward_parrots)
#endif
    .apply(top_pool_backward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(left_pool_forward)
    .input(1)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(left_pool_forward_parrots)
#endif
    .apply(left_pool_forward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(left_pool_backward)
    .input(2)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(left_pool_backward_parrots)
#endif
    .apply(left_pool_backward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(right_pool_forward)
    .input(1)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(right_pool_forward_parrots)
#endif
    .apply(right_pool_forward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(right_pool_backward)
    .input(2)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(right_pool_backward_parrots)
#endif
    .apply(right_pool_backward_parrots_cpu)
    .done();
#ifndef CORNER_POOL_PYTORCH_H
#define CORNER_POOL_PYTORCH_H
#include <torch/extension.h>

// Corner-pool ops (CornerNet): each forward computes a directional running
// max over height (top/bottom) or width (left/right) of an NCHW tensor and
// returns a new tensor; each backward routes grad_output to the argmax
// positions and returns the gradient w.r.t. the input.
at::Tensor bottom_pool_forward(at::Tensor input);
at::Tensor bottom_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor left_pool_forward(at::Tensor input);
at::Tensor left_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor right_pool_forward(at::Tensor input);
at::Tensor right_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor top_pool_forward(at::Tensor input);
at::Tensor top_pool_backward(at::Tensor input, at::Tensor grad_output);
#endif  // CORNER_POOL_PYTORCH_H
// Copyright (c) 2018, SenseTime. #include "pytorch_cpp_helper.hpp"
#include "parrots_cpp_helper.hpp"
void DeformConvForwardCUDAKernelLauncher( #ifdef MMCV_WITH_CUDA
const DArrayLite input, const DArrayLite weight, const DArrayLite offset, void DeformConvForwardCUDAKernelLauncher(Tensor input, Tensor weight,
DArrayLite output, DArrayLite columns, DArrayLite ones, int kW, int kH, Tensor offset, Tensor output,
int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, Tensor columns, Tensor ones, int kW,
int deformable_group, int im2col_step, CudaContext& ctx, int kH, int dW, int dH, int padW,
cudaStream_t stream); int padH, int dilationW, int dilationH,
int group, int deformable_group,
int im2col_step);
void DeformConvBackwardInputCUDAKernelLauncher( void DeformConvBackwardInputCUDAKernelLauncher(
const DArrayLite input, const DArrayLite offset, Tensor input, Tensor offset, Tensor gradOutput, Tensor gradInput,
const DArrayLite gradOutput, DArrayLite gradInput, DArrayLite gradOffset, Tensor gradOffset, Tensor weight, Tensor columns, int kW, int kH, int dW,
DArrayLite weight, DArrayLite columns, int kW, int kH, int dW, int dH, int dH, int padW, int padH, int dilationW, int dilationH, int group,
int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step);
int deformable_group, int im2col_step, CudaContext& ctx,
cudaStream_t stream);
void DeformConvBackwardParametersCUDAKernelLauncher( void DeformConvBackwardParametersCUDAKernelLauncher(
const DArrayLite input, const DArrayLite offset, Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
const DArrayLite gradOutput, DArrayLite gradWeight, DArrayLite columns, Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
DArrayLite ones, int kW, int kH, int dW, int dH, int padW, int padH, int padH, int dilationW, int dilationH, int group, int deformable_group,
int dilationW, int dilationH, int group, int deformable_group, float scale, float scale, int im2col_step);
int im2col_step, CudaContext& ctx, cudaStream_t stream);
void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
void deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr, Tensor output, Tensor columns, Tensor ones,
const OperatorBase::in_list_t& ins, int kW, int kH, int dW, int dH, int padW,
OperatorBase::out_list_t& outs) { int padH, int dilationW, int dilationH, int group,
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, int deformable_group, int im2col_step) {
im2col_step;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
const auto input = ins[0];
const auto weight = ins[1];
const auto offset = ins[2];
auto output = outs[0];
auto columns = outs[1];
auto ones = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvForwardCUDAKernelLauncher( DeformConvForwardCUDAKernelLauncher(
input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH, input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH,
dilationW, dilationH, group, deformable_group, im2col_step, ctx, stream); dilationW, dilationH, group, deformable_group, im2col_step);
} }
void deform_conv_backward_input_cuda(CudaContext& ctx, const SSElement& attr, void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
const OperatorBase::in_list_t& ins, Tensor gradOutput, Tensor gradInput,
OperatorBase::out_list_t& outs) { Tensor gradOffset, Tensor weight,
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, Tensor columns, int kW, int kH, int dW,
im2col_step; int dH, int padW, int padH, int dilationW,
SSAttrs(attr) int dilationH, int group,
.get<int>("kW", kW) int deformable_group, int im2col_step) {
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
auto input = ins[0];
auto offset = ins[1];
auto gradOutput = ins[2];
auto gradInput = outs[0];
auto gradOffset = outs[1];
auto weight = outs[2];
auto columns = outs[3];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvBackwardInputCUDAKernelLauncher( DeformConvBackwardInputCUDAKernelLauncher(
input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH, input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH,
dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step, ctx, stream); im2col_step);
} }
void deform_conv_backward_parameters_cuda(CudaContext& ctx, void deform_conv_backward_parameters_cuda(
const SSElement& attr, Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
const OperatorBase::in_list_t& ins, Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
OperatorBase::out_list_t& outs) { int padH, int dilationW, int dilationH, int group, int deformable_group,
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, float scale, int im2col_step) {
im2col_step;
float scale;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<float>("scale", scale)
.get<int>("im2col_step", im2col_step)
.done();
auto input = ins[0];
auto offset = ins[1];
auto gradOutput = ins[2];
auto gradWeight = outs[0];
auto columns = outs[1];
auto ones = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvBackwardParametersCUDAKernelLauncher( DeformConvBackwardParametersCUDAKernelLauncher(
input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH, input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH,
padW, padH, dilationW, dilationH, group, deformable_group, scale, padW, padH, dilationW, dilationH, group, deformable_group, scale,
im2col_step, ctx, stream); im2col_step);
}
#endif
void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
Tensor output, Tensor columns, Tensor ones, int kW,
int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, int im2col_step) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(ones);
deform_conv_forward_cuda(input, weight, offset, output, columns, ones, kW,
kH, dW, dH, padW, padH, dilationW, dilationH,
group, deformable_group, im2col_step);
#else
AT_ERROR("DeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformConv is not implemented on CPU");
}
} }
PARROTS_EXTENSION_REGISTER(deform_conv_forward) void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
.attr("kW") Tensor gradInput, Tensor gradOffset,
.attr("kH") Tensor weight, Tensor columns, int kW, int kH,
.attr("dW") int dW, int dH, int padW, int padH,
.attr("dH") int dilationW, int dilationH, int group,
.attr("padW") int deformable_group, int im2col_step) {
.attr("padH") if (input.device().is_cuda()) {
.attr("dilationW") #ifdef MMCV_WITH_CUDA
.attr("dilationH") CHECK_CUDA_INPUT(input);
.attr("group") CHECK_CUDA_INPUT(offset);
.attr("deformable_group") CHECK_CUDA_INPUT(gradOutput);
.attr("im2col_step") CHECK_CUDA_INPUT(gradInput);
.input(3) CHECK_CUDA_INPUT(gradOffset);
.output(3) CHECK_CUDA_INPUT(weight);
.apply(deform_conv_forward_cuda) CHECK_CUDA_INPUT(columns);
.done();
deform_conv_backward_input_cuda(input, offset, gradOutput, gradInput,
PARROTS_EXTENSION_REGISTER(deform_conv_backward_input) gradOffset, weight, columns, kW, kH, dW, dH,
.attr("kW") padW, padH, dilationW, dilationH, group,
.attr("kH") deformable_group, im2col_step);
.attr("dW") #else
.attr("dH") AT_ERROR("DeformConv is not compiled with GPU support");
.attr("padW") #endif
.attr("padH") } else {
.attr("dilationW") AT_ERROR("DeformConv is not implemented on CPU");
.attr("dilationH") }
.attr("group") }
.attr("deformable_group")
.attr("im2col_step")
.input(3)
.output(4)
.apply(deform_conv_backward_input_cuda)
.done();
PARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters) void deform_conv_backward_parameters(Tensor input, Tensor offset,
.attr("kW") Tensor gradOutput, Tensor gradWeight,
.attr("kH") Tensor columns, Tensor ones, int kW,
.attr("dW") int kH, int dW, int dH, int padW, int padH,
.attr("dH") int dilationW, int dilationH, int group,
.attr("padW") int deformable_group, float scale,
.attr("padH") int im2col_step) {
.attr("dilationW") if (input.device().is_cuda()) {
.attr("dilationH") #ifdef MMCV_WITH_CUDA
.attr("group") CHECK_CUDA_INPUT(input);
.attr("deformable_group") CHECK_CUDA_INPUT(offset);
.attr("scale") CHECK_CUDA_INPUT(gradOutput);
.attr("im2col_step") CHECK_CUDA_INPUT(gradWeight);
.input(3) CHECK_CUDA_INPUT(columns);
.output(3) CHECK_CUDA_INPUT(ones);
.apply(deform_conv_backward_parameters_cuda)
.done(); deform_conv_backward_parameters_cuda(input, offset, gradOutput, gradWeight,
columns, ones, kW, kH, dW, dH, padW,
padH, dilationW, dilationH, group,
deformable_group, scale, im2col_step);
#else
AT_ERROR("DeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformConv is not implemented on CPU");
}
}
#include "deform_conv_cuda_kernel.cuh" #include "deform_conv_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void deformable_im2col(DArrayLite data_im, DArrayLite data_offset, void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels,
const int channels, const int height, const int width, const int height, const int width, const int ksize_h,
const int ksize_h, const int ksize_w, const int pad_h, const int ksize_w, const int pad_h, const int pad_w,
const int pad_w, const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, const int parallel_imgs, const int deformable_group,
DArrayLite data_col, cudaStream_t stream) { Tensor data_col) {
// num_axes should be smaller than block size // num_axes should be smaller than block size
// todo: check parallel_imgs is correctly passed in // todo: check parallel_imgs is correctly passed in
int height_col = int height_col =
...@@ -17,28 +17,31 @@ void deformable_im2col(DArrayLite data_im, DArrayLite data_offset, ...@@ -17,28 +17,31 @@ void deformable_im2col(DArrayLite data_im, DArrayLite data_offset,
int num_kernels = channels * height_col * width_col * parallel_imgs; int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group; int channel_per_deformable_group = channels / deformable_group;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.elemType().prim(), ([&] { data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
deformable_im2col_gpu_kernel<scalar_t> const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>( const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
num_kernels, data_im.ptr<scalar_t>(), scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
data_offset.ptr<scalar_t>(), height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col,
data_col.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError()); deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels),
THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_im_, data_offset_, height, width, ksize_h,
ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col, data_col_);
}));
AT_CUDA_CHECK(cudaGetLastError());
} }
void deformable_col2im(DArrayLite data_col, DArrayLite data_offset, void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels,
const int channels, const int height, const int width, const int height, const int width, const int ksize_h,
const int ksize_h, const int ksize_w, const int pad_h, const int ksize_w, const int pad_h, const int pad_w,
const int pad_w, const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, const int parallel_imgs, const int deformable_group,
DArrayLite grad_im, cudaStream_t stream) { Tensor grad_im) {
// todo: make sure parallel_imgs is passed in correctly
int height_col = int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = int width_col =
...@@ -47,27 +50,29 @@ void deformable_col2im(DArrayLite data_col, DArrayLite data_offset, ...@@ -47,27 +50,29 @@ void deformable_col2im(DArrayLite data_col, DArrayLite data_offset,
channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group; int channel_per_deformable_group = channels / deformable_group;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.elemType().prim(), ([&] { data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels),
THREADS_PER_BLOCK, 0, stream>>>( THREADS_PER_BLOCK, 0,
num_kernels, data_col.ptr<scalar_t>(), data_offset.ptr<scalar_t>(), at::cuda::getCurrentCUDAStream()>>>(
channels, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, num_kernels, data_col_, data_offset_, channels, height, width,
stride_w, dilation_h, dilation_w, channel_per_deformable_group, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
parallel_imgs, deformable_group, height_col, width_col, dilation_w, channel_per_deformable_group, parallel_imgs,
grad_im.ptr<scalar_t>()); deformable_group, height_col, width_col, grad_im_);
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void deformable_col2im_coord( void deformable_col2im_coord(
DArrayLite data_col, DArrayLite data_im, DArrayLite data_offset, Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int channels, const int height, const int width, const int ksize_h, const int height, const int width, const int ksize_h, const int ksize_w,
const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int stride_w, const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w, const int parallel_imgs,
const int parallel_imgs, const int deformable_group, DArrayLite grad_offset, const int deformable_group, Tensor grad_offset) {
cudaStream_t stream) {
int height_col = int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = int width_col =
...@@ -77,51 +82,55 @@ void deformable_col2im_coord( ...@@ -77,51 +82,55 @@ void deformable_col2im_coord(
int channel_per_deformable_group = int channel_per_deformable_group =
channels * ksize_h * ksize_w / deformable_group; channels * ksize_h * ksize_w / deformable_group;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.elemType().prim(), ([&] { data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
THREADS_PER_BLOCK, 0, stream>>>( const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
num_kernels, data_col.ptr<scalar_t>(), data_im.ptr<scalar_t>(), const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
data_offset.ptr<scalar_t>(), channels, height, width, ksize_h, scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_col_, data_im_, data_offset_, channels, height,
width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,
2 * ksize_h * ksize_w * deformable_group, deformable_group, 2 * ksize_h * ksize_w * deformable_group, deformable_group,
height_col, width_col, grad_offset.ptr<scalar_t>()); height_col, width_col, grad_offset_);
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void deform_conv_shape_check(DArrayLite input, DArrayLite offset, void deform_conv_shape_check(Tensor input, Tensor offset, Tensor *gradOutput,
DArrayLite* gradOutput, DArrayLite weight, int kH, Tensor weight, int kH, int kW, int dH, int dW,
int kW, int dH, int dW, int padH, int padW, int padH, int padW, int dilationH, int dilationW,
int dilationH, int dilationW, int group, int group, int deformable_group) {
int deformable_group) { TORCH_CHECK(
PARROTS_CHECKARGS(weight.ndims() == 4) weight.ndimension() == 4,
<< "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: " "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: %s",
<< weight.ndims(); weight.ndimension());
PARROTS_CHECKARGS(weight.isContiguous()) TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
<< "weight tensor has to be contiguous";
PARROTS_CHECKARGS(kW > 0 && kH > 0) TORCH_CHECK(kW > 0 && kH > 0,
<< "kernel size should be greater than zero, but got kH: " << kH "kernel size should be greater than zero, but got kH: %d kW: %d",
<< " kW: " << kW; kH, kW);
PARROTS_CHECKARGS(weight.dim(2) == kH && weight.dim(3) == kW) TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
<< "kernel size should be consistent with weight, but got kH: " << kH "kernel size should be consistent with weight, ",
<< " kW: " << kW << " weight.dim(2): " << weight.dim(2) "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d",
<< ", weight.dim(3): " << weight.dim(3); kH, kW, weight.size(2), weight.size(3));
PARROTS_CHECKARGS(dW > 0 && dH > 0) TORCH_CHECK(dW > 0 && dH > 0,
<< "stride should be greater than zero, but got dH: " << dH "stride should be greater than zero, but got dH: %d dW: %d", dH,
<< " dW: " << dW; dW);
PARROTS_CHECKARGS(dilationW > 0 && dilationH > 0) TORCH_CHECK(
<< "dilation should be greater than 0, but got dilationH: " << dilationH dilationW > 0 && dilationH > 0,
<< " dilationW: " << dilationW; "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
dilationH, dilationW);
int ndim = input.ndims(); int ndim = input.ndimension();
int dimf = 0; int dimf = 0;
int dimh = 1; int dimh = 1;
int dimw = 2; int dimw = 2;
...@@ -132,62 +141,67 @@ void deform_conv_shape_check(DArrayLite input, DArrayLite offset, ...@@ -132,62 +141,67 @@ void deform_conv_shape_check(DArrayLite input, DArrayLite offset,
dimw++; dimw++;
} }
PARROTS_CHECKARGS(ndim == 3 || ndim == 4) TORCH_CHECK(ndim == 3 || ndim == 4,
<< "3D or 4D input tensor expected but got: " << ndim; "3D or 4D input tensor expected but got: %s", ndim);
size_t nInputPlane = weight.dim(1) * group; long nInputPlane = weight.size(1) * group;
size_t inputHeight = input.dim(dimh); long inputHeight = input.size(dimh);
size_t inputWidth = input.dim(dimw); long inputWidth = input.size(dimw);
size_t nOutputPlane = weight.dim(0); long nOutputPlane = weight.size(0);
size_t outputHeight = long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
size_t outputWidth = long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
PARROTS_CHECKARGS(nInputPlane % deformable_group == 0) TORCH_CHECK(nInputPlane % deformable_group == 0,
<< "input channels must divide deformable group size"; "input channels must divide deformable group size");
PARROTS_CHECKARGS(outputWidth >= 1 || outputHeight >= 1) if (outputWidth < 1 || outputHeight < 1)
<< "Given input size: (" << nInputPlane << " x " << inputHeight << " x " AT_ERROR(
<< inputWidth << "). Calculated output size: (" << nOutputPlane << " x " "Given input size: (%ld x %ld x %ld). "
<< outputHeight << " x " << outputWidth << "). Output size is too small"; "Calculated output size: (%ld x %ld x %ld). Output size is too small",
nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
outputWidth);
PARROTS_CHECKARGS(input.dim(1) == nInputPlane) TORCH_CHECK(input.size(1) == nInputPlane,
<< "invalid number of input planes, expected: " << nInputPlane "invalid number of input planes, expected: %d, but got: %d",
<< ", but got: " << input.dim(1); nInputPlane, input.size(1));
PARROTS_CHECKARGS(inputHeight >= kH && inputWidth >= kW) TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
<< "input image is smaller than kernel"; "input image is smaller than kernel");
PARROTS_CHECKARGS(offset.dim(2) == outputHeight && TORCH_CHECK(
offset.dim(3) == outputWidth) (offset.size(2) == outputHeight && offset.size(3) == outputWidth),
<< "invalid spatial dim of offset, expected height: " << outputHeight "invalid spatial size of offset, expected height: %d width: %d, but "
<< " width: " << outputWidth << ", but got height: " << offset.dim(2) "got height: %d width: %d",
<< " width: " << offset.dim(3); outputHeight, outputWidth, offset.size(2), offset.size(3));
PARROTS_CHECKARGS(offset.dim(1) == deformable_group * 2 * kH * kW) TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
<< "invalid number of channels of offset"; "invalid number of channels of offset");
if (gradOutput != NULL) { if (gradOutput != NULL) {
PARROTS_CHECKARGS(gradOutput->dim(dimf) == nOutputPlane) TORCH_CHECK(
<< "invalid number of gradOutput planes, expected: " << nOutputPlane gradOutput->size(dimf) == nOutputPlane,
<< ", but got: " << gradOutput->dim(dimf); "invalid number of gradOutput planes, expected: %d, but got: %d",
nOutputPlane, gradOutput->size(dimf));
PARROTS_CHECKARGS(gradOutput->dim(dimh) == outputHeight &&
gradOutput->dim(dimw) == outputWidth) TORCH_CHECK(
<< "invalid dim of gradOutput, expected height: " << outputHeight (gradOutput->size(dimh) == outputHeight &&
<< " width: " << outputWidth gradOutput->size(dimw) == outputWidth),
<< " , but got height: " << gradOutput->dim(dimh) "invalid size of gradOutput, expected height: %d width: %d , but "
<< " width: " << gradOutput->dim(dimw); "got height: %d width: %d",
outputHeight, outputWidth, gradOutput->size(dimh),
gradOutput->size(dimw));
} }
} }
void DeformConvForwardCUDAKernelLauncher( void DeformConvForwardCUDAKernelLauncher(Tensor input, Tensor weight,
DArrayLite input, DArrayLite weight, DArrayLite offset, DArrayLite output, Tensor offset, Tensor output,
DArrayLite columns, DArrayLite ones, int kW, int kH, int dW, int dH, Tensor columns, Tensor ones, int kW,
int padW, int padH, int dilationW, int dilationH, int group, int kH, int dW, int dH, int padW,
int deformable_group, int im2col_step, CudaContext& ctx, int padH, int dilationW, int dilationH,
cudaStream_t stream) { int group, int deformable_group,
int im2col_step) {
// todo: resize columns to include im2col: done // todo: resize columns to include im2col: done
// todo: add im2col_step as input // todo: add im2col_step as input
// todo: add new output buffer and transpose it to output (or directly // todo: add new output buffer and transpose it to output (or directly
...@@ -196,45 +210,41 @@ void DeformConvForwardCUDAKernelLauncher( ...@@ -196,45 +210,41 @@ void DeformConvForwardCUDAKernelLauncher(
deform_conv_shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, deform_conv_shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH,
padW, dilationH, dilationW, group, deformable_group); padW, dilationH, dilationW, group, deformable_group);
at::DeviceGuard guard(input.device());
int batch = 1; int batch = 1;
if (input.ndims() == 3) { if (input.ndimension() == 3) {
// Force batch // Force batch
batch = 0; batch = 0;
input = input.view({1, input.dim(0), input.dim(1), input.dim(2)}); input.unsqueeze_(0);
offset = offset.view({1, offset.dim(0), offset.dim(1), offset.dim(2)}); offset.unsqueeze_(0);
} }
// todo: assert batchsize dividable by im2col_step // todo: assert batchsize dividable by im2col_step
size_t batchSize = input.dim(0); long batchSize = input.size(0);
size_t nInputPlane = input.dim(1); long nInputPlane = input.size(1);
size_t inputHeight = input.dim(2); long inputHeight = input.size(2);
size_t inputWidth = input.dim(3); long inputWidth = input.size(3);
size_t nOutputPlane = weight.dim(0); long nOutputPlane = weight.size(0);
size_t outputWidth = long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
size_t outputHeight = long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
PARROTS_CHECKARGS(offset.dim(0) == batchSize) TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
<< "invalid batch size of offset";
output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
outputHeight, outputWidth}); outputHeight, outputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
columns = ctx.createDArrayLite( if (ones.ndimension() != 2 ||
input.elemType(), DArrayShape(nInputPlane * kW * kH, ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
im2col_step * outputHeight * outputWidth)); ones = at::ones({outputHeight, outputWidth}, input.options());
columns.setZeros(ctx.getStream());
if (ones.ndims() != 2 ||
ones.dim(0) * ones.dim(1) < outputHeight * outputWidth) {
ones = ctx.createDArrayLite(input.elemType(),
DArrayShape(outputHeight, outputWidth));
fill(ctx, ones, *toScalar(1));
} }
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
...@@ -243,45 +253,41 @@ void DeformConvForwardCUDAKernelLauncher( ...@@ -243,45 +253,41 @@ void DeformConvForwardCUDAKernelLauncher(
offset.view({batchSize / im2col_step, im2col_step, offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth}); deformable_group * 2 * kH * kW, outputHeight, outputWidth});
auto output_buffer = ctx.createDArrayLite( Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane,
input.elemType(), DArrayShape(batchSize / im2col_step, nOutputPlane, im2col_step * outputHeight, outputWidth},
im2col_step * outputHeight, outputWidth)); output.options());
output_buffer.setZeros(ctx.getStream());
output_buffer = output_buffer.view( output_buffer = output_buffer.view(
{output_buffer.dim(0), group, output_buffer.dim(1) / group, {output_buffer.size(0), group, output_buffer.size(1) / group,
output_buffer.dim(2) * output_buffer.dim(3)}); output_buffer.size(2), output_buffer.size(3)});
for (size_t elt = 0; elt < batchSize / im2col_step; elt++) { for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns, dilationW, im2col_step, deformable_group, columns);
stream);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)}); weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight = weight.view( weight.size(2), weight.size(3)});
{group, nOutputPlane / group, nInputPlane / group * kH * kW});
for (int g = 0; g < group; g++) {
for (size_t g = 0; g < group; g++) { output_buffer[elt][g] = output_buffer[elt][g]
auto output_g = output_buffer[elt][g]; .flatten(1)
auto weight_g = weight[g]; .addmm_(weight[g].flatten(1), columns[g])
auto columns_g = columns[g]; .view_as(output_buffer[elt][g]);
gemm(ctx, 1, false, weight_g, false, columns_g, 1, output_g);
} }
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)}); columns =
weight = weight.view({nOutputPlane, nInputPlane, kH, kW}); columns.view({columns.size(0) * columns.size(1), columns.size(2)});
} }
output_buffer = output_buffer.view( output_buffer = output_buffer.view(
{output_buffer.dim(0), output_buffer.dim(1) * output_buffer.dim(2), {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
output_buffer.dim(3)}); output_buffer.size(3), output_buffer.size(4)});
output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
im2col_step, outputHeight, outputWidth}); im2col_step, outputHeight, outputWidth});
output_buffer = transpose(ctx, output_buffer, 1, 2); output_buffer.transpose_(1, 2);
if (!output_buffer.isContiguous()) { output.copy_(output_buffer);
output_buffer = ctx.cloneDArrayLite(output_buffer);
}
copy(ctx, output, output_buffer);
output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
...@@ -291,58 +297,53 @@ void DeformConvForwardCUDAKernelLauncher( ...@@ -291,58 +297,53 @@ void DeformConvForwardCUDAKernelLauncher(
if (batch == 0) { if (batch == 0) {
output = output.view({nOutputPlane, outputHeight, outputWidth}); output = output.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.dim(1), offset.dim(2), offset.dim(3)}); offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
} }
} }
void DeformConvBackwardInputCUDAKernelLauncher( void DeformConvBackwardInputCUDAKernelLauncher(
DArrayLite input, DArrayLite offset, DArrayLite gradOutput, Tensor input, Tensor offset, Tensor gradOutput, Tensor gradInput,
DArrayLite gradInput, DArrayLite gradOffset, DArrayLite weight, Tensor gradOffset, Tensor weight, Tensor columns, int kW, int kH, int dW,
DArrayLite columns, int kW, int kH, int dW, int dH, int padW, int padH, int dH, int padW, int padH, int dilationW, int dilationH, int group,
int dilationW, int dilationH, int group, int deformable_group, int deformable_group, int im2col_step) {
int im2col_step, CudaContext& ctx, cudaStream_t stream) {
deform_conv_shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, deform_conv_shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW,
padH, padW, dilationH, dilationW, group, padH, padW, dilationH, dilationW, group,
deformable_group); deformable_group);
at::DeviceGuard guard(input.device());
int batch = 1; int batch = 1;
if (input.ndims() == 3) { if (input.ndimension() == 3) {
// Force batch // Force batch
batch = 0; batch = 0;
input = input.view({1, input.dim(0), input.dim(1), input.dim(2)}); input = input.view({1, input.size(0), input.size(1), input.size(2)});
offset = offset.view({1, offset.dim(0), offset.dim(1), offset.dim(2)}); offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
gradOutput = gradOutput.view( gradOutput = gradOutput.view(
{1, gradOutput.dim(0), gradOutput.dim(1), gradOutput.dim(2)}); {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
} }
size_t batchSize = input.dim(0); long batchSize = input.size(0);
size_t nInputPlane = input.dim(1); long nInputPlane = input.size(1);
size_t inputHeight = input.dim(2); long inputHeight = input.size(2);
size_t inputWidth = input.dim(3); long inputWidth = input.size(3);
size_t nOutputPlane = weight.dim(0); long nOutputPlane = weight.size(0);
size_t outputWidth = long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
size_t outputHeight = long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
PARROTS_CHECKARGS(offset.dim(0) == batchSize) TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
<< "invalid batch size of offset";
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
columns = ctx.createDArrayLite( columns = at::zeros(
input.elemType(), DArrayShape(nInputPlane * kW * kH, {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
im2col_step * outputHeight * outputWidth)); input.options());
columns.setZeros(ctx.getStream());
// change order of grad output // change order of grad output
gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
nOutputPlane, outputHeight, outputWidth}); nOutputPlane, outputHeight, outputWidth});
gradOutput = transpose(ctx, gradOutput, 1, 2); gradOutput.transpose_(1, 2);
if (!gradOutput.isContiguous()) {
gradOutput = ctx.cloneDArrayLite(gradOutput);
}
gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
inputHeight, inputWidth}); inputHeight, inputWidth});
...@@ -355,41 +356,37 @@ void DeformConvBackwardInputCUDAKernelLauncher( ...@@ -355,41 +356,37 @@ void DeformConvBackwardInputCUDAKernelLauncher(
offset.view({batchSize / im2col_step, im2col_step, offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth}); deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (size_t elt = 0; elt < batchSize / im2col_step; elt++) { for (int elt = 0; elt < batchSize / im2col_step; elt++) {
// divide into groups // divide into groups
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.dim(0) / group, weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.dim(1) * weight.dim(2) * weight.dim(3)}); weight.size(2), weight.size(3)});
gradOutput = gradOutput.view( gradOutput = gradOutput.view(
{gradOutput.dim(0), group, gradOutput.dim(1) / group, {gradOutput.size(0), group, gradOutput.size(1) / group,
gradOutput.dim(2) * gradOutput.dim(3) * gradOutput.dim(4)}); gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
for (size_t g = 0; g < group; g++) { for (int g = 0; g < group; g++) {
auto columns_g = columns[g]; columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
gemm(ctx, 1, true, weight[g], false, gradOutput[elt][g], 0, columns_g); gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
} }
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)}); columns =
gradOutput = gradOutput.view({gradOutput.dim(0), columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradOutput.dim(1) * gradOutput.dim(2), gradOutput = gradOutput.view(
im2col_step, outputHeight, outputWidth}); {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
weight = weight.view({nOutputPlane, nInputPlane, kH, kW}); gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
dilationH, dilationW, im2col_step, deformable_group, dilationH, dilationW, im2col_step, deformable_group,
gradOffset[elt], stream); gradOffset[elt]);
deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, gradInput[elt], dilationW, im2col_step, deformable_group, gradInput[elt]);
stream);
} }
gradOutput = transpose(ctx, gradOutput, 1, 2); gradOutput.transpose_(1, 2);
if (!gradOutput.isContiguous()) {
gradOutput = ctx.cloneDArrayLite(gradOutput);
}
gradOutput = gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
...@@ -404,17 +401,17 @@ void DeformConvBackwardInputCUDAKernelLauncher( ...@@ -404,17 +401,17 @@ void DeformConvBackwardInputCUDAKernelLauncher(
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth});
gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.dim(1), offset.dim(2), offset.dim(3)}); offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
gradOffset = gradOffset.view({offset.dim(1), offset.dim(2), offset.dim(3)}); gradOffset =
gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
} }
} }
void DeformConvBackwardParametersCUDAKernelLauncher( void DeformConvBackwardParametersCUDAKernelLauncher(
DArrayLite input, DArrayLite offset, DArrayLite gradOutput, Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
DArrayLite gradWeight, DArrayLite columns, DArrayLite ones, int kW, int kH, Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int padH, int dilationW, int dilationH, int group, int deformable_group,
int deformable_group, float scale, int im2col_step, CudaContext& ctx, float scale, int im2col_step) {
cudaStream_t stream) {
// todo: transpose and reshape outGrad // todo: transpose and reshape outGrad
// todo: reshape columns // todo: reshape columns
// todo: add im2col_step as input // todo: add im2col_step as input
...@@ -422,53 +419,52 @@ void DeformConvBackwardParametersCUDAKernelLauncher( ...@@ -422,53 +419,52 @@ void DeformConvBackwardParametersCUDAKernelLauncher(
deform_conv_shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, deform_conv_shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH,
dW, padH, padW, dilationH, dilationW, group, dW, padH, padW, dilationH, dilationW, group,
deformable_group); deformable_group);
at::DeviceGuard guard(input.device());
int batch = 1; int batch = 1;
if (input.ndims() == 3) { if (input.ndimension() == 3) {
// Force batch // Force batch
batch = 0; batch = 0;
input = input.view({1, input.dim(0), input.dim(1), input.dim(2)}); input = input.view(
at::IntList({1, input.size(0), input.size(1), input.size(2)}));
gradOutput = gradOutput.view( gradOutput = gradOutput.view(
{1, gradOutput.dim(0), gradOutput.dim(1), gradOutput.dim(2)}); {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
} }
size_t batchSize = input.dim(0); long batchSize = input.size(0);
size_t nInputPlane = input.dim(1); long nInputPlane = input.size(1);
size_t inputHeight = input.dim(2); long inputHeight = input.size(2);
size_t inputWidth = input.dim(3); long inputWidth = input.size(3);
size_t nOutputPlane = gradWeight.dim(0); long nOutputPlane = gradWeight.size(0);
size_t outputWidth = long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
size_t outputHeight = long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
PARROTS_CHECKARGS(offset.dim(0) == batchSize) TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
<< "invalid batch size of offset";
columns = ctx.createDArrayLite( columns = at::zeros(
input.elemType(), DArrayShape(nInputPlane * kW * kH, {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
im2col_step * outputHeight * outputWidth)); input.options());
columns.setZeros(ctx.getStream());
gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
nOutputPlane, outputHeight, outputWidth}); nOutputPlane, outputHeight, outputWidth});
gradOutput = transpose(ctx, gradOutput, 1, 2); gradOutput.transpose_(1, 2);
if (!gradOutput.isContiguous()) {
gradOutput = ctx.cloneDArrayLite(gradOutput);
}
auto gradOutputBuffer = ctx.cloneDArrayLite(gradOutput); Tensor gradOutputBuffer = at::zeros_like(gradOutput);
gradOutputBuffer =
gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
outputHeight, outputWidth});
gradOutputBuffer = gradOutputBuffer.contiguous();
gradOutputBuffer.copy_(gradOutput);
gradOutputBuffer = gradOutputBuffer =
gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
im2col_step * outputHeight, outputWidth}); im2col_step * outputHeight, outputWidth});
gradOutput = transpose(ctx, gradOutput, 1, 2); gradOutput.transpose_(1, 2);
if (!gradOutput.isContiguous()) {
gradOutput = ctx.cloneDArrayLite(gradOutput);
}
gradOutput = gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
...@@ -478,33 +474,36 @@ void DeformConvBackwardParametersCUDAKernelLauncher( ...@@ -478,33 +474,36 @@ void DeformConvBackwardParametersCUDAKernelLauncher(
offset.view({batchSize / im2col_step, im2col_step, offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth}); deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (size_t elt = 0; elt < batchSize / im2col_step; elt++) { for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns, dilationW, im2col_step, deformable_group, columns);
stream);
// divide into group // divide into group
gradOutputBuffer = gradOutputBuffer.view( gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.dim(0), group, gradOutputBuffer.dim(1) / group, {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
gradOutputBuffer.dim(2) * gradOutputBuffer.dim(3)}); gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)});
gradWeight = gradWeight.view( gradWeight =
{group, gradWeight.dim(0) / group, gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
gradWeight.dim(1) * gradWeight.dim(2) * gradWeight.dim(3)}); gradWeight.size(2), gradWeight.size(3)});
for (int g = 0; g < group; g++) { for (int g = 0; g < group; g++) {
auto gradWeight_g = gradWeight[g]; gradWeight[g] = gradWeight[g]
gemm(ctx, scale, false, gradOutputBuffer[elt][g], true, columns[g], 1, .flatten(1)
gradWeight_g); .addmm_(gradOutputBuffer[elt][g].flatten(1),
columns[g].transpose(1, 0), 1.0, scale)
.view_as(gradWeight[g]);
} }
gradOutputBuffer = gradOutputBuffer.view( gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.dim(0), {gradOutputBuffer.size(0),
gradOutputBuffer.dim(1) * gradOutputBuffer.dim(2), gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
im2col_step * outputHeight, outputWidth}); gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)}); columns =
gradWeight = gradWeight.view( columns.view({columns.size(0) * columns.size(1), columns.size(2)});
{gradWeight.dim(0) * gradWeight.dim(1), nInputPlane / group, kH, kW}); gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
gradWeight.size(2), gradWeight.size(3),
gradWeight.size(4)});
} }
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
......
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "deform_conv_pytorch.h"
using namespace parrots;
/*void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
* Tensor output, Tensor columns, Tensor ones,
* int kW, int kH, int dW, int dH, int padW,
* int padH, int dilationW, int dilationH, int
* group, int deformable_group, int im2col_step);
*/
// Parrots entry point for the deformable-convolution forward pass.
// Decodes the scalar attributes from `attr`, bridges the parrots arrays in
// `ins`/`outs` to ATen tensors via buildATensor, and dispatches to the shared
// CUDA implementation.
// ins:  input, weight, offset;  outs: output, columns, ones.
void deform_conv_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  // Convolution geometry and deformable-conv hyper-parameters.
  int kW = 0, kH = 0, dW = 0, dH = 0;
  int padW = 0, padH = 0;
  int dilationW = 0, dilationH = 0;
  int group = 0, deformable_group = 0, im2col_step = 0;
  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("im2col_step", im2col_step)
      .done();

  // Bridge parrots DArrays to ATen tensors.
  auto input = buildATensor(ctx, ins[0]);
  auto weight = buildATensor(ctx, ins[1]);
  auto offset = buildATensor(ctx, ins[2]);
  auto output = buildATensor(ctx, outs[0]);
  auto columns = buildATensor(ctx, outs[1]);
  auto ones = buildATensor(ctx, outs[2]);

  deform_conv_forward_cuda(input, weight, offset, output, columns, ones, kW, kH,
                           dW, dH, padW, padH, dilationW, dilationH, group,
                           deformable_group, im2col_step);
}
/*void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
* Tensor gradOutput, Tensor gradInput,
* Tensor gradOffset, Tensor weight,
* Tensor columns, int kW, int kH, int dW,
* int dH, int padW, int padH, int
* dilationW, int dilationH, int group, int deformable_group, int im2col_step);
*/
// Parrots entry point for the deformable-convolution input/offset backward
// pass.  Decodes scalar attributes, bridges parrots arrays to ATen tensors,
// and dispatches to the shared CUDA implementation.
// ins:  input, offset, gradOutput;
// outs: gradInput, gradOffset, weight, columns.
void deform_conv_backward_input_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  // Convolution geometry and deformable-conv hyper-parameters.
  int kW = 0, kH = 0, dW = 0, dH = 0;
  int padW = 0, padH = 0;
  int dilationW = 0, dilationH = 0;
  int group = 0, deformable_group = 0, im2col_step = 0;
  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("im2col_step", im2col_step)
      .done();

  // Bridge parrots DArrays to ATen tensors.
  auto input = buildATensor(ctx, ins[0]);
  auto offset = buildATensor(ctx, ins[1]);
  auto gradOutput = buildATensor(ctx, ins[2]);
  auto gradInput = buildATensor(ctx, outs[0]);
  auto gradOffset = buildATensor(ctx, outs[1]);
  // NOTE(review): weight and columns are wired as outputs here, mirroring the
  // op registration below; the kernel reads weight and scribbles in columns.
  auto weight = buildATensor(ctx, outs[2]);
  auto columns = buildATensor(ctx, outs[3]);

  deform_conv_backward_input_cuda(input, offset, gradOutput, gradInput,
                                  gradOffset, weight, columns, kW, kH, dW, dH,
                                  padW, padH, dilationW, dilationH, group,
                                  deformable_group, im2col_step);
}
/*void deform_conv_backward_parameters_cuda(
* Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
* Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
* int padH, int dilationW, int dilationH, int group, int deformable_group,
* float scale, int im2col_step);
*/
// Parrots entry point for the deformable-convolution weight-gradient backward
// pass.  Decodes scalar attributes (including the float `scale` applied to the
// accumulated gradient), bridges parrots arrays to ATen tensors, and
// dispatches to the shared CUDA implementation.
// ins:  input, offset, gradOutput;  outs: gradWeight, columns, ones.
void deform_conv_backward_parameters_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Convolution geometry and deformable-conv hyper-parameters.
  int kW = 0, kH = 0, dW = 0, dH = 0;
  int padW = 0, padH = 0;
  int dilationW = 0, dilationH = 0;
  int group = 0, deformable_group = 0, im2col_step = 0;
  float scale = 0.f;
  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<float>("scale", scale)
      .get<int>("im2col_step", im2col_step)
      .done();

  // Bridge parrots DArrays to ATen tensors.
  auto input = buildATensor(ctx, ins[0]);
  auto offset = buildATensor(ctx, ins[1]);
  auto gradOutput = buildATensor(ctx, ins[2]);
  auto gradWeight = buildATensor(ctx, outs[0]);
  auto columns = buildATensor(ctx, outs[1]);
  auto ones = buildATensor(ctx, outs[2]);

  deform_conv_backward_parameters_cuda(input, offset, gradOutput, gradWeight,
                                       columns, ones, kW, kH, dW, dH, padW,
                                       padH, dilationW, dilationH, group,
                                       deformable_group, scale, im2col_step);
}
// Register the forward op: 11 integer attributes, 3 inputs
// (input, weight, offset) and 3 outputs (output, columns, ones),
// handled by deform_conv_forward_cuda_parrots above.
PARROTS_EXTENSION_REGISTER(deform_conv_forward)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("im2col_step")
    .input(3)
    .output(3)
    .apply(deform_conv_forward_cuda_parrots)
    .done();
// Register the input/offset backward op: 11 integer attributes, 3 inputs
// (input, offset, gradOutput) and 4 outputs (gradInput, gradOffset, weight,
// columns), handled by deform_conv_backward_input_cuda_parrots above.
PARROTS_EXTENSION_REGISTER(deform_conv_backward_input)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("im2col_step")
    .input(3)
    .output(4)
    .apply(deform_conv_backward_input_cuda_parrots)
    .done();
// Register the weight-gradient backward op: 11 integer attributes plus the
// float "scale", 3 inputs (input, offset, gradOutput) and 3 outputs
// (gradWeight, columns, ones), handled by
// deform_conv_backward_parameters_cuda_parrots above.
PARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("scale")
    .attr("im2col_step")
    .input(3)
    .output(3)
    .apply(deform_conv_backward_parameters_cuda_parrots)
    .done();
#ifndef DEFORM_CONV_PYTORCH_H
#define DEFORM_CONV_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Deformable-convolution forward pass (CUDA).  `columns` and `ones` are
// caller-provided scratch buffers reused across im2col steps.
void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
                              Tensor output, Tensor columns, Tensor ones,
                              int kW, int kH, int dW, int dH, int padW,
                              int padH, int dilationW, int dilationH, int group,
                              int deformable_group, int im2col_step);

// Backward pass w.r.t. the input feature map and the sampling offsets (CUDA).
void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
                                     Tensor gradOutput, Tensor gradInput,
                                     Tensor gradOffset, Tensor weight,
                                     Tensor columns, int kW, int kH, int dW,
                                     int dH, int padW, int padH, int dilationW,
                                     int dilationH, int group,
                                     int deformable_group, int im2col_step);

// Backward pass w.r.t. the convolution weights (CUDA); the accumulated
// gradient is multiplied by `scale`.
void deform_conv_backward_parameters_cuda(
    Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
    Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
    int padH, int dilationW, int dilationH, int group, int deformable_group,
    float scale, int im2col_step);

#endif  // DEFORM_CONV_PYTORCH_H
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher( #ifdef MMCV_WITH_CUDA
const DArrayLite input, const DArrayLite rois, const DArrayLite offset, void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale, Tensor offset, Tensor output,
int sampling_ratio, float gamma, cudaStream_t stream); int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma);
void DeformRoIPoolBackwardCUDAKernelLauncher( void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois, Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset, Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
int pooled_height, int pooled_width, float spatial_scale, float spatial_scale, int sampling_ratio, float gamma);
int sampling_ratio, float gamma, cudaStream_t stream);
void deform_roi_pool_forward_cuda(CudaContext& ctx, const SSElement& attr, void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
const OperatorBase::in_list_t& ins, Tensor output, int pooled_height,
OperatorBase::out_list_t& outs) { int pooled_width, float spatial_scale,
int pooled_height; int sampling_ratio, float gamma) {
int pooled_width; DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output,
float spatial_scale; pooled_height, pooled_width,
int sampling_ratio; spatial_scale, sampling_ratio, gamma);
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
const auto& offset = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolForwardCUDAKernelLauncher(
input, rois, offset, output, pooled_height, pooled_width, spatial_scale,
sampling_ratio, gamma, stream);
} }
void deform_roi_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
const OperatorBase::in_list_t& ins, Tensor rois, Tensor offset,
OperatorBase::out_list_t& outs) { Tensor grad_input, Tensor grad_offset,
int pooled_height; int pooled_height, int pooled_width,
int pooled_width; float spatial_scale, int sampling_ratio,
float spatial_scale; float gamma) {
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& grad_output = ins[0];
const auto& input = ins[1];
const auto& rois = ins[2];
const auto& offset = ins[3];
auto& grad_input = outs[0];
auto& grad_offset = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolBackwardCUDAKernelLauncher( DeformRoIPoolBackwardCUDAKernelLauncher(
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height, grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma, stream); pooled_width, spatial_scale, sampling_ratio, gamma);
} }
#endif
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(output);
PARROTS_EXTENSION_REGISTER(deform_roi_pool_forward) deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
.attr("pooled_height") pooled_width, spatial_scale, sampling_ratio,
.attr("pooled_width") gamma);
.attr("spatial_scale") #else
.attr("sampling_ratio") AT_ERROR("DeformRoIPool is not compiled with GPU support");
.attr("gamma") #endif
.input(3) } else {
.output(1) AT_ERROR("DeformRoIPool is not implemented on CPU");
.apply(deform_roi_pool_forward_cuda) }
.done(); }
PARROTS_EXTENSION_REGISTER(deform_roi_pool_backward) void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
.attr("pooled_height") Tensor offset, Tensor grad_input,
.attr("pooled_width") Tensor grad_offset, int pooled_height,
.attr("spatial_scale") int pooled_width, float spatial_scale,
.attr("sampling_ratio") int sampling_ratio, float gamma) {
.attr("gamma") if (grad_output.device().is_cuda()) {
.input(4) #ifdef MMCV_WITH_CUDA
.output(2) CHECK_CUDA_INPUT(grad_output);
.apply(deform_roi_pool_backward_cuda) CHECK_CUDA_INPUT(input);
.done(); CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_offset);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
}
#include "deform_roi_pool_cuda_kernel.cuh" #include "deform_roi_pool_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher( void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
const DArrayLite input, const DArrayLite rois, const DArrayLite offset, Tensor offset, Tensor output,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale, int pooled_height, int pooled_width,
int sampling_ratio, float gamma, cudaStream_t stream) { float spatial_scale,
int output_size = output.size(); int sampling_ratio, float gamma) {
int channels = input.dim(1); int output_size = output.numel();
int height = input.dim(2); int channels = input.size(1);
int width = input.dim(3); int height = input.size(2);
int width = input.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "deform_roi_pool_forward_cuda_kernel", [&] {
deform_roi_pool_forward_cuda_kernel<scalar_t> deform_roi_pool_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(), output_size, input.data_ptr<scalar_t>(),
offset.ptr<scalar_t>(), output.ptr<scalar_t>(), pooled_height, rois.data_ptr<scalar_t>(), offset.data_ptr<scalar_t>(),
pooled_width, spatial_scale, sampling_ratio, gamma, channels, output.data_ptr<scalar_t>(), pooled_height, pooled_width,
height, width); static_cast<scalar_t>(spatial_scale), sampling_ratio,
})); static_cast<scalar_t>(gamma), channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void DeformRoIPoolBackwardCUDAKernelLauncher( void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois, Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset, Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
int pooled_height, int pooled_width, float spatial_scale, float spatial_scale, int sampling_ratio, float gamma) {
int sampling_ratio, float gamma, cudaStream_t stream) { int output_size = grad_output.numel();
int output_size = grad_output.size(); int channels = grad_input.size(1);
int channels = grad_input.dim(1); int height = grad_input.size(2);
int height = grad_input.dim(2); int width = grad_input.size(3);
int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(grad_output.device());
grad_output.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "deform_roi_pool_backward_cuda_kernel", [&] {
deform_roi_pool_backward_cuda_kernel<scalar_t> deform_roi_pool_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), input.ptr<scalar_t>(), output_size, grad_output.data_ptr<scalar_t>(),
rois.ptr<scalar_t>(), offset.ptr<scalar_t>(), input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), grad_offset.ptr<scalar_t>(), offset.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
pooled_height, pooled_width, spatial_scale, sampling_ratio, grad_offset.data_ptr<scalar_t>(), pooled_height, pooled_width,
gamma, channels, height, width); static_cast<scalar_t>(spatial_scale), sampling_ratio,
})); static_cast<scalar_t>(gamma), channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "deform_roi_pool_pytorch.h"
using namespace parrots;
/*void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
* Tensor output, int pooled_height,
* int pooled_width, float spatial_scale,
* int sampling_ratio, float gamma);
*/
// Parrots entry point for the deformable-RoI-pooling forward pass.
// Decodes scalar attributes, bridges parrots arrays to ATen tensors, and
// dispatches to the shared CUDA implementation.
// ins:  input, rois, offset;  outs: output.
void deform_roi_pool_forward_cuda_parrots(CudaContext& ctx,
                                          const SSElement& attr,
                                          const OperatorBase::in_list_t& ins,
                                          OperatorBase::out_list_t& outs) {
  // Pooling hyper-parameters.
  int pooled_height = 0, pooled_width = 0, sampling_ratio = 0;
  float spatial_scale = 0.f, gamma = 0.f;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_height)
      .get<int>("pooled_width", pooled_width)
      .get<float>("spatial_scale", spatial_scale)
      .get<int>("sampling_ratio", sampling_ratio)
      .get<float>("gamma", gamma)
      .done();

  // Bridge parrots DArrays to ATen tensors.
  auto input = buildATensor(ctx, ins[0]);
  auto rois = buildATensor(ctx, ins[1]);
  auto offset = buildATensor(ctx, ins[2]);
  auto output = buildATensor(ctx, outs[0]);

  deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
                               pooled_width, spatial_scale, sampling_ratio,
                               gamma);
}
/*void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
* Tensor rois, Tensor offset,
* Tensor grad_input, Tensor grad_offset,
* int pooled_height, int pooled_width,
* float spatial_scale, int sampling_ratio,
* float gamma);
*/
// Parrots entry point for the deformable-RoI-pooling backward pass.
// Decodes scalar attributes, bridges parrots arrays to ATen tensors, and
// dispatches to the shared CUDA implementation.
// ins:  grad_output, input, rois, offset;  outs: grad_input, grad_offset.
void deform_roi_pool_backward_cuda_parrots(CudaContext& ctx,
                                           const SSElement& attr,
                                           const OperatorBase::in_list_t& ins,
                                           OperatorBase::out_list_t& outs) {
  // Pooling hyper-parameters.
  int pooled_height = 0, pooled_width = 0, sampling_ratio = 0;
  float spatial_scale = 0.f, gamma = 0.f;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_height)
      .get<int>("pooled_width", pooled_width)
      .get<float>("spatial_scale", spatial_scale)
      .get<int>("sampling_ratio", sampling_ratio)
      .get<float>("gamma", gamma)
      .done();

  // Bridge parrots DArrays to ATen tensors.
  auto grad_output = buildATensor(ctx, ins[0]);
  auto input = buildATensor(ctx, ins[1]);
  auto rois = buildATensor(ctx, ins[2]);
  auto offset = buildATensor(ctx, ins[3]);
  auto grad_input = buildATensor(ctx, outs[0]);
  auto grad_offset = buildATensor(ctx, outs[1]);

  deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
                                grad_offset, pooled_height, pooled_width,
                                spatial_scale, sampling_ratio, gamma);
}
// Register the forward op: 5 scalar attributes, 3 inputs
// (input, rois, offset) and 1 output (output), handled by
// deform_roi_pool_forward_cuda_parrots above.
PARROTS_EXTENSION_REGISTER(deform_roi_pool_forward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("sampling_ratio")
    .attr("gamma")
    .input(3)
    .output(1)
    .apply(deform_roi_pool_forward_cuda_parrots)
    .done();
// Register the backward op: 5 scalar attributes, 4 inputs
// (grad_output, input, rois, offset) and 2 outputs (grad_input, grad_offset),
// handled by deform_roi_pool_backward_cuda_parrots above.
PARROTS_EXTENSION_REGISTER(deform_roi_pool_backward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("sampling_ratio")
    .attr("gamma")
    .input(4)
    .output(2)
    .apply(deform_roi_pool_backward_cuda_parrots)
    .done();
#ifndef DEFORM_ROI_POOL_PYTORCH_H
#define DEFORM_ROI_POOL_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Deformable-RoI-pooling forward pass (CUDA).
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma);

// Deformable-RoI-pooling backward pass (CUDA): gradients w.r.t. the input
// feature map and the offsets.
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma);
#endif  // DEFORM_ROI_POOL_PYTORCH_H
// Copyright (c) 2018, SenseTime. #include "pytorch_cpp_helper.hpp"
#include "parrots_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
void SigmoidFocalLossForwardCUDAKernelLauncher( void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor output,
DArrayLite output, float gamma, float alpha, cudaStream_t stream); const float gamma,
const float alpha);
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight, void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream); Tensor weight,
Tensor grad_input,
void SoftmaxFocalLossForwardCUDAKernelLauncher( const float gamma,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, const float alpha);
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
void SoftmaxFocalLossBackwardCUDAKernelLauncher( Tensor weight, Tensor output,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, const float gamma,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha, const float alpha);
cudaStream_t stream);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
void sigmoid_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr, Tensor weight, Tensor buff,
const OperatorBase::in_list_t& ins, Tensor grad_input,
OperatorBase::out_list_t& outs) { const float gamma,
float gamma; const float alpha);
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done(); void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output, SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream); gamma, alpha);
} }
void sigmoid_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr, void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
const OperatorBase::in_list_t& ins, Tensor weight, Tensor grad_input,
OperatorBase::out_list_t& outs) { float gamma, float alpha) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input, SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha, stream); gamma, alpha);
} }
void softmax_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr, void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
const OperatorBase::in_list_t& ins, Tensor output, float gamma, float alpha) {
OperatorBase::out_list_t& outs) { SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
float gamma; gamma, alpha);
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha, stream);
} }
void softmax_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr, void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
const OperatorBase::in_list_t& ins, Tensor weight, Tensor buff,
OperatorBase::out_list_t& outs) { Tensor grad_input, float gamma,
float gamma; float alpha) {
float alpha; SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done(); grad_input, gamma, alpha);
}
// get inputs and outputs #endif
const auto& input = ins[0];
const auto& target = ins[1]; void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
const auto& weight = ins[2]; Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
}
auto& buff = outs[0]; void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
auto& grad_input = outs[1]; Tensor grad_input, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_input);
sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
}
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream()); void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff, Tensor output, float gamma, float alpha) {
grad_input, gamma, alpha, stream); if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
softmax_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
} }
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward) void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
.attr("gamma") Tensor buff, Tensor grad_input, float gamma,
.attr("alpha") float alpha) {
.input(3) if (input.device().is_cuda()) {
.output(1) #ifdef MMCV_WITH_CUDA
.apply(sigmoid_focal_loss_forward_cuda) CHECK_CUDA_INPUT(input);
.done(); CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward) CHECK_CUDA_INPUT(buff);
.attr("gamma") CHECK_CUDA_INPUT(grad_input);
.attr("alpha")
.input(3) softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
.output(1) gamma, alpha);
.apply(sigmoid_focal_loss_backward_cuda) #else
.done(); AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward) } else {
.attr("gamma") AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
.attr("alpha") }
.input(3) }
.output(1)
.apply(softmax_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(2)
.apply(softmax_focal_loss_backward_cuda)
.done();
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#include "sigmoid_focal_loss_cuda_kernel.cuh" #include "sigmoid_focal_loss_cuda_kernel.cuh"
#include "softmax_focal_loss_cuda_kernel.cuh" #include "softmax_focal_loss_cuda_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher( void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor output,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) { const float gamma,
int output_size = output.size(); const float alpha) {
int num_classes = input.dim(1); int output_size = output.numel();
int num_classes = input.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,
input.elemType().prim(), ([&] { "target label should smaller or equal than num classes");
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sigmoid_focal_loss_forward_cuda_kernel", [&] {
sigmoid_focal_loss_forward_cuda_kernel<scalar_t> sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, input.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
num_classes); output.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void SigmoidFocalLossBackwardCUDAKernelLauncher( void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, Tensor weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream) { Tensor grad_input,
int output_size = grad_input.size(); const float gamma,
int num_classes = input.dim(1); const float alpha) {
int output_size = grad_input.numel();
int num_classes = input.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(grad_input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sigmoid_focal_loss_backward_cuda_kernel", [&] {
sigmoid_focal_loss_backward_cuda_kernel<scalar_t> sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, input.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), gamma, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
alpha, num_classes); grad_input.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void SoftmaxFocalLossForwardCUDAKernelLauncher( void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor output,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) { const float gamma,
int output_size = output.size(); const float alpha) {
int num_classes = softmax.dim(1); int output_size = output.numel();
int num_classes = softmax.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,
softmax.elemType().prim(), ([&] { "target label should smaller or equal than num classes");
at::cuda::CUDAGuard device_guard(softmax.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
softmax.scalar_type(), "softmax_focal_loss_forward_cuda_kernel", [&] {
softmax_focal_loss_forward_cuda_kernel<scalar_t> softmax_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, softmax.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
num_classes); output.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void SoftmaxFocalLossBackwardCUDAKernelLauncher( void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor buff,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha, Tensor grad_input,
cudaStream_t stream) { const float gamma,
int output_size = buff.size(); const float alpha) {
int num_classes = softmax.dim(1); int num_classes = softmax.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( int output_size = buff.numel();
grad_input.elemType().prim(), ([&] { at::cuda::CUDAGuard device_guard(grad_input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.scalar_type(),
"softmax_focal_loss_backward_cuda1_"
"kernel",
[&] {
softmax_focal_loss_backward_cuda1_kernel<scalar_t> softmax_focal_loss_backward_cuda1_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, softmax.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), buff.ptr<scalar_t>(), gamma, alpha, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
num_classes); buff.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError());
output_size = grad_input.size(); AT_CUDA_CHECK(cudaGetLastError());
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( output_size = grad_input.numel();
grad_input.elemType().prim(), ([&] { AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.scalar_type(),
"softmax_focal_loss_backward_cuda2_"
"kernel",
[&] {
softmax_focal_loss_backward_cuda2_kernel<scalar_t> softmax_focal_loss_backward_cuda2_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, softmax.data_ptr<scalar_t>(),
buff.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), num_classes); target.data_ptr<int64_t>(), buff.data_ptr<scalar_t>(),
})); grad_input.data_ptr<scalar_t>(), num_classes);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "focal_loss_pytorch.h"
using namespace parrots;
// Parrots adapter for the sigmoid focal-loss forward CUDA op.
// Unpacks the "gamma"/"alpha" scalar attributes, wraps the parrots arrays
// as ATen tensors (ins = {input, target, weight}, outs = {output}) and
// dispatches to sigmoid_focal_loss_forward_cuda.
void sigmoid_focal_loss_forward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& in_t = buildATensor(ctx, ins[0]);
  const auto& tgt_t = buildATensor(ctx, ins[1]);
  const auto& wgt_t = buildATensor(ctx, ins[2]);
  auto out_t = buildATensor(ctx, outs[0]);
  sigmoid_focal_loss_forward_cuda(in_t, tgt_t, wgt_t, out_t, gamma, alpha);
}
// Parrots adapter for the sigmoid focal-loss backward CUDA op.
// Unpacks the "gamma"/"alpha" scalar attributes, wraps the parrots arrays
// as ATen tensors (ins = {input, target, weight}, outs = {grad_input}) and
// dispatches to sigmoid_focal_loss_backward_cuda.
void sigmoid_focal_loss_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& in_t = buildATensor(ctx, ins[0]);
  const auto& tgt_t = buildATensor(ctx, ins[1]);
  const auto& wgt_t = buildATensor(ctx, ins[2]);
  auto grad_in_t = buildATensor(ctx, outs[0]);
  sigmoid_focal_loss_backward_cuda(in_t, tgt_t, wgt_t, grad_in_t, gamma, alpha);
}
// Parrots adapter for the softmax focal-loss forward CUDA op.
// Unpacks the "gamma"/"alpha" scalar attributes, wraps the parrots arrays
// as ATen tensors (ins = {input, target, weight}, outs = {output}) and
// dispatches to softmax_focal_loss_forward_cuda.
void softmax_focal_loss_forward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& in_t = buildATensor(ctx, ins[0]);
  const auto& tgt_t = buildATensor(ctx, ins[1]);
  const auto& wgt_t = buildATensor(ctx, ins[2]);
  auto out_t = buildATensor(ctx, outs[0]);
  softmax_focal_loss_forward_cuda(in_t, tgt_t, wgt_t, out_t, gamma, alpha);
}
// Parrots adapter for the softmax focal-loss backward CUDA op.
// Unpacks the "gamma"/"alpha" scalar attributes, wraps the parrots arrays
// as ATen tensors (ins = {input, target, weight},
// outs = {buff, grad_input}) and dispatches to
// softmax_focal_loss_backward_cuda.
void softmax_focal_loss_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& in_t = buildATensor(ctx, ins[0]);
  const auto& tgt_t = buildATensor(ctx, ins[1]);
  const auto& wgt_t = buildATensor(ctx, ins[2]);
  // Two outputs: a scratch buffer consumed by the backward kernels, then
  // the gradient w.r.t. the input.
  auto buff_t = buildATensor(ctx, outs[0]);
  auto grad_in_t = buildATensor(ctx, outs[1]);
  softmax_focal_loss_backward_cuda(in_t, tgt_t, wgt_t, buff_t, grad_in_t,
                                   gamma, alpha);
}
// Register the focal-loss ops with the parrots extension framework.
// Each registration declares the scalar attributes and the input/output
// arities that the matching *_parrots adapter above expects.

// Inputs: input, target, weight. Output: loss output tensor.
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(sigmoid_focal_loss_forward_cuda_parrots)
    .done();

// Inputs: input, target, weight. Output: grad_input.
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(sigmoid_focal_loss_backward_cuda_parrots)
    .done();

// Inputs: input, target, weight. Output: loss output tensor.
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(softmax_focal_loss_forward_cuda_parrots)
    .done();

// Inputs: input, target, weight. Outputs: buff (scratch), grad_input.
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(2)
    .apply(softmax_focal_loss_backward_cuda_parrots)
    .done();
#ifndef FOCAL_LOSS_PYTORCH_H
#define FOCAL_LOSS_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Forward declarations of the focal-loss CUDA ops implemented in the
// corresponding pytorch .cu translation units; the parrots adapters call
// these with ATen tensors built from parrots DArrays.

// Element-wise sigmoid focal loss; writes the result into `output`.
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

// Gradient of the sigmoid focal loss; writes into `grad_input`.
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha);

// Softmax focal loss; writes the result into `output`.
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

// Gradient of the softmax focal loss; `buff` is a scratch tensor filled by
// the first backward kernel, `grad_input` receives the final gradient.
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha);
#endif  // FOCAL_LOSS_PYTORCH_H
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher( #ifdef MMCV_WITH_CUDA
const DArrayLite bottom_data, const DArrayLite mask_h_idx, void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h, const Tensor mask_h_idx,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream); const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w);
void MaskedCol2imForwardCUDAKernelLaucher(const DArrayLite bottom_data, void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
const DArrayLite mask_h_idx, const Tensor mask_h_idx,
const DArrayLite mask_w_idx, const Tensor mask_w_idx,
DArrayLite top_data, const int height, Tensor top_data, const int height,
const int width, const int channels, const int width, const int channels);
cudaStream_t stream);
void masked_im2col_forward_cuda(CudaContext& ctx, const SSElement& attr, void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
const OperatorBase::in_list_t& ins, const Tensor mask_w_idx, Tensor col,
OperatorBase::out_list_t& outs) { const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
// im: (n, ic, h, w), kernel size (kh, kw) // im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh) // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& col = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col, MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w, kernel_h, kernel_w, pad_h, pad_w);
stream);
} }
void masked_col2im_forward_cuda(CudaContext& ctx, const SSElement& attr, void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
const OperatorBase::in_list_t& ins, const Tensor mask_w_idx, Tensor im, int height,
OperatorBase::out_list_t& outs) { int width, int channels) {
// im: (n, ic, h, w), kernel size (kh, kw) // im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh) // kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh)
int height, width, channels; MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
SSAttrs(attr) width, channels);
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& im = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedCol2imForwardCUDAKernelLaucher(col, mask_h_idx, mask_w_idx, im, height,
width, channels, stream);
} }
#endif
PARROTS_EXTENSION_REGISTER(masked_im2col_forward) void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
.attr("kernel_h") const Tensor mask_w_idx, Tensor col,
.attr("kernel_w") const int kernel_h, const int kernel_w,
.attr("pad_h") const int pad_h, const int pad_w) {
.attr("pad_w") if (im.device().is_cuda()) {
.input(3) #ifdef MMCV_WITH_CUDA
.output(1) CHECK_CUDA_INPUT(im);
.apply(masked_im2col_forward_cuda) CHECK_CUDA_INPUT(mask_h_idx);
.done(); CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(col);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(masked_col2im_forward) void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
.attr("height") const Tensor mask_w_idx, Tensor im, int height,
.attr("width") int width, int channels) {
.attr("channels") if (col.device().is_cuda()) {
.input(3) #ifdef MMCV_WITH_CUDA
.output(1) CHECK_CUDA_INPUT(col);
.apply(masked_col2im_forward_cuda) CHECK_CUDA_INPUT(mask_h_idx);
.done(); CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(im);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
}
#include "masked_conv2d_cuda_kernel.cuh" #include "masked_conv2d_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher( void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const DArrayLite bottom_data, const DArrayLite mask_h_idx, const Tensor mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h, const Tensor mask_w_idx,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream) { Tensor top_data, const int kernel_h,
int channels = bottom_data.dim(1); const int kernel_w, const int pad_h,
int height = bottom_data.dim(2); const int pad_w) {
int width = bottom_data.dim(3); int channels = bottom_data.size(1);
int mask_cnt = mask_h_idx.dim(0); int height = bottom_data.size(2);
int width = bottom_data.size(3);
int mask_cnt = mask_h_idx.size(0);
int output_size = mask_cnt * channels; int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(bottom_data.device());
bottom_data.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] {
const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
MaskedIm2colForward<scalar_t> MaskedIm2colForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width, output_size, bottom_data_, height, width, kernel_h, kernel_w,
kernel_h, kernel_w, pad_h, pad_w, mask_h_idx.ptr<int64_t>(), pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);
mask_w_idx.ptr<int64_t>(), mask_cnt, top_data.ptr<scalar_t>());
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void MaskedCol2imForwardCUDAKernelLaucher(const DArrayLite bottom_data, void MaskedCol2imForwardCUDAKernelLauncher(
const DArrayLite mask_h_idx, const Tensor bottom_data, const Tensor mask_h_idx, const Tensor mask_w_idx,
const DArrayLite mask_w_idx, Tensor top_data, const int height, const int width, const int channels) {
DArrayLite top_data, const int height, int mask_cnt = mask_h_idx.size(0);
const int width, const int channels,
cudaStream_t stream) {
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels; int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(bottom_data.device());
bottom_data.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] {
const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
MaskedCol2imForward<scalar_t> MaskedCol2imForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width, output_size, bottom_data_, height, width, channels, mask_h_idx_,
channels, mask_h_idx.ptr<int64_t>(), mask_w_idx.ptr<int64_t>(), mask_w_idx_, mask_cnt, top_data_);
mask_cnt, top_data.ptr<scalar_t>());
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "masked_conv2d_pytorch.h"
using namespace parrots;
// Parrots adapter for the masked im2col forward CUDA op.
// im: (n, ic, h, w), kernel size (kh, kw);
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh).
// Unpacks kernel/padding attributes, wraps the parrots arrays as ATen
// tensors (ins = {im, mask_h_idx, mask_w_idx}, outs = {col}) and
// dispatches to masked_im2col_forward_cuda.
void masked_im2col_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int kh, kw, ph, pw;
  SSAttrs(attr)
      .get<int>("kernel_h", kh)
      .get<int>("kernel_w", kw)
      .get<int>("pad_h", ph)
      .get<int>("pad_w", pw)
      .done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& im_t = buildATensor(ctx, ins[0]);
  const auto& mask_h_t = buildATensor(ctx, ins[1]);
  const auto& mask_w_t = buildATensor(ctx, ins[2]);
  auto col_t = buildATensor(ctx, outs[0]);
  masked_im2col_forward_cuda(im_t, mask_h_t, mask_w_t, col_t, kh, kw, ph, pw);
}
// Parrots adapter for the masked col2im forward CUDA op.
// im: (n, ic, h, w), kernel size (kh, kw);
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh).
// Unpacks the output-geometry attributes, wraps the parrots arrays as ATen
// tensors (ins = {col, mask_h_idx, mask_w_idx}, outs = {im}) and
// dispatches to masked_col2im_forward_cuda.
void masked_col2im_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int out_h, out_w, out_c;
  SSAttrs(attr)
      .get<int>("height", out_h)
      .get<int>("width", out_w)
      .get<int>("channels", out_c)
      .done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& col_t = buildATensor(ctx, ins[0]);
  const auto& mask_h_t = buildATensor(ctx, ins[1]);
  const auto& mask_w_t = buildATensor(ctx, ins[2]);
  auto im_t = buildATensor(ctx, outs[0]);
  masked_col2im_forward_cuda(col_t, mask_h_t, mask_w_t, im_t, out_h, out_w,
                             out_c);
}
// Register the masked-conv2d ops with the parrots extension framework.
// Each registration declares the scalar attributes and the input/output
// arities that the matching *_parrots adapter above expects.

// Inputs: im, mask_h_idx, mask_w_idx. Output: col.
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
    .attr("kernel_h")
    .attr("kernel_w")
    .attr("pad_h")
    .attr("pad_w")
    .input(3)
    .output(1)
    .apply(masked_im2col_forward_cuda_parrots)
    .done();

// Inputs: col, mask_h_idx, mask_w_idx. Output: im.
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
    .attr("height")
    .attr("width")
    .attr("channels")
    .input(3)
    .output(1)
    .apply(masked_col2im_forward_cuda_parrots)
    .done();
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment