Unverified Commit 2d73eafe authored by pc's avatar pc Committed by GitHub
Browse files

add mmdet3d op (#1425)


Co-authored-by: default avatarzhouzaida <zhouzaida@163.com>
parent 75cae78c
// Copyright (c) OpenMMLab. All rights reserved
#ifndef KNN_PYTORCH_H
#define KNN_PYTORCH_H
#include <torch/extension.h>
using namespace at;
// Dispatch entry point for the KNN op; implemented in the matching .cpp.
// Writes neighbour indices into idx_tensor and (presumably squared)
// distances into dist2_tensor — semantics live in the CUDA kernel, not here.
// b/n/m/nsample are the integer sizes forwarded to the kernel launcher.
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
Tensor dist2_tensor, int b, int n, int m, int nsample);
#endif  // KNN_PYTORCH_H
/*
Modified from
https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
// Defined in the CUDA compilation unit; performs the actual RoI point
// pooling on the GPU.
void RoIPointPool3dForwardCUDAKernelLauncher(
    int batch_size, int pts_num, int boxes_num, int feature_in_len,
    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
    const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);

// Thin dispatch wrapper: forwards every argument unchanged to the CUDA
// kernel launcher. All size arguments are derived by the caller from the
// tensors themselves.
void roipoint_pool3d_forward_cuda(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag) {
  RoIPointPool3dForwardCUDAKernelLauncher(
      batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz,
      boxes3d, pts_feature, pooled_features, pooled_empty_flag);
}  // fixed: removed stray ';' that followed the function definition
#endif
// Device-dispatching entry point for RoI point pooling. CUDA-only: any
// non-CUDA input raises via AT_ERROR, as does a CUDA input when the
// extension was built without MMCV_WITH_CUDA.
void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
Tensor pooled_features, Tensor pooled_empty_flag) {
// params xyz: (B, N, 3)
// params boxes3d: (B, M, 7)
// params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M)
if (xyz.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
// CHECK_CUDA_INPUT comes from pytorch_cpp_helper.hpp — presumably
// validates device/contiguity; confirm against that header.
CHECK_CUDA_INPUT(xyz);
CHECK_CUDA_INPUT(boxes3d);
CHECK_CUDA_INPUT(pts_feature);
CHECK_CUDA_INPUT(pooled_features);
CHECK_CUDA_INPUT(pooled_empty_flag);
// Sizes are taken from the tensors rather than passed in by the caller.
int batch_size = xyz.size(0);
int pts_num = xyz.size(1);
int boxes_num = boxes3d.size(1);
int feature_in_len = pts_feature.size(2);
int sampled_pts_num = pooled_features.size(2);
roipoint_pool3d_forward_cuda(batch_size, pts_num, boxes_num, feature_in_len,
sampled_pts_num, xyz, boxes3d, pts_feature,
pooled_features, pooled_empty_flag);
#else
AT_ERROR("roipoint_pool3d is not compiled with GPU support");
#endif
} else {
// No CPU implementation exists for this op.
AT_ERROR("roipoint_pool3d is not implemented on CPU");
}
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "roipoint_pool3d_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
// Parrots adapter: converts the framework's input/output buffers into ATen
// tensors and hands them to the shared dispatch entry point. The scalar
// attribute block is unused by this op.
void roipoint_pool3d_forward_cuda_parrots(CudaContext& ctx,
                                          const SSElement& attr,
                                          const OperatorBase::in_list_t& ins,
                                          OperatorBase::out_list_t& outs) {
  // Outputs first: pooled features and the per-box empty flag.
  auto pooled_feats = buildATensor(ctx, outs[0]);
  auto empty_flag = buildATensor(ctx, outs[1]);
  // Inputs: point coordinates, 3D boxes, per-point features.
  auto points_xyz = buildATensor(ctx, ins[0]);
  auto rois = buildATensor(ctx, ins[1]);
  auto point_feats = buildATensor(ctx, ins[2]);
  roipoint_pool3d_forward(points_xyz, rois, point_feats, pooled_feats,
                          empty_flag);
}
// Register the op with parrots: 3 inputs (xyz, boxes3d, pts_feature),
// 2 outputs (pooled_features, pooled_empty_flag); CUDA path only, since
// this whole file section is guarded by MMCV_WITH_CUDA.
PARROTS_EXTENSION_REGISTER(roipoint_pool3d_forward)
.input(3)
.output(2)
.apply(roipoint_pool3d_forward_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROIPOINT_POOL3D_PYTORCH_H
#define ROIPOINT_POOL3D_PYTORCH_H
#include <torch/extension.h>
using namespace at;
// Device-dispatching entry point declared for the parrots binding; the
// definition (and the tensor-shape contract) lives in the matching .cpp.
void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
Tensor pooled_features, Tensor pooled_empty_flag);
#endif  // ROIPOINT_POOL3D_PYTORCH_H
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
// Defined in the CUDA compilation unit; computes the forward pass of
// three-point feature interpolation on the GPU.
void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
                                               const Tensor points,
                                               const Tensor idx,
                                               const Tensor weight, Tensor out);

// Thin dispatch wrapper: forwards all arguments unchanged to the CUDA
// kernel launcher.
void three_interpolate_forward_cuda(int b, int c, int m, int n,
                                    const Tensor points, const Tensor idx,
                                    const Tensor weight, Tensor out) {
  ThreeInterpolateForwardCUDAKernelLauncher(b, c, m, n, points, idx, weight,
                                            out);
}  // fixed: removed stray ';' that followed the function definition
// Defined in the CUDA compilation unit; computes the backward pass of
// three-point feature interpolation on the GPU.
void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
                                                const Tensor grad_out,
                                                const Tensor idx,
                                                const Tensor weight,
                                                Tensor grad_points);

// Thin dispatch wrapper: forwards all arguments unchanged to the CUDA
// kernel launcher.
void three_interpolate_backward_cuda(int b, int c, int n, int m,
                                     const Tensor grad_out, const Tensor idx,
                                     const Tensor weight, Tensor grad_points) {
  ThreeInterpolateBackwardCUDAKernelLauncher(b, c, n, m, grad_out, idx, weight,
                                             grad_points);
}  // fixed: removed stray ';' that followed the function definition
#endif
// Device-dispatching entry point for the three_interpolate forward pass.
// CUDA-only: errors out on CPU tensors or in non-CUDA builds. Note the
// argument order differs from the _cuda wrapper (tensors first here,
// sizes first there).
void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor weight_tensor, Tensor out_tensor, int b,
int c, int m, int n) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
three_interpolate_forward_cuda(b, c, m, n, points_tensor, idx_tensor,
weight_tensor, out_tensor);
#else
AT_ERROR("three_interpolate is not compiled with GPU support");
#endif
} else {
// No CPU implementation exists for this op.
AT_ERROR("three_interpolate is not implemented on CPU");
}
}
// Device-dispatching entry point for the three_interpolate backward pass.
// CUDA-only: errors out on CPU tensors or in non-CUDA builds. Gradients
// are accumulated into grad_points_tensor by the CUDA kernel.
void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor weight_tensor, Tensor grad_points_tensor,
int b, int c, int n, int m) {
if (grad_out_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
three_interpolate_backward_cuda(b, c, n, m, grad_out_tensor, idx_tensor,
weight_tensor, grad_points_tensor);
#else
AT_ERROR("three_interpolate is not compiled with GPU support");
#endif
} else {
// No CPU implementation exists for this op.
AT_ERROR("three_interpolate is not implemented on CPU");
}
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "three_interpolate_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
// Parrots adapter for the three_interpolate forward pass: decode the scalar
// attributes, wrap the framework buffers as ATen tensors, and delegate to
// the shared dispatch entry point.
void three_interpolate_forward_cuda_parrots(CudaContext& ctx,
                                            const SSElement& attr,
                                            const OperatorBase::in_list_t& ins,
                                            OperatorBase::out_list_t& outs) {
  // Scalar sizes carried as op attributes.
  int b = 0;
  int c = 0;
  int m = 0;
  int n = 0;
  SSAttrs(attr)
      .get<int>("b", b)
      .get<int>("c", c)
      .get<int>("m", m)
      .get<int>("n", n)
      .done();
  // Bridge parrots buffers to ATen tensors (3 inputs, 1 output).
  auto points = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto weights = buildATensor(ctx, ins[2]);
  auto output = buildATensor(ctx, outs[0]);
  three_interpolate_forward(points, indices, weights, output, b, c, m, n);
}
// Parrots adapter for the three_interpolate backward pass: decode the
// scalar attributes, wrap the framework buffers as ATen tensors, and
// delegate to the shared dispatch entry point.
void three_interpolate_backward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  // Scalar sizes carried as op attributes.
  int b = 0;
  int c = 0;
  int n = 0;
  int m = 0;
  SSAttrs(attr)
      .get<int>("b", b)
      .get<int>("c", c)
      .get<int>("n", n)
      .get<int>("m", m)
      .done();
  // Bridge parrots buffers to ATen tensors (3 inputs, 1 output).
  auto upstream_grad = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto weights = buildATensor(ctx, ins[2]);
  auto points_grad = buildATensor(ctx, outs[0]);
  three_interpolate_backward(upstream_grad, indices, weights, points_grad, b,
                             c, n, m);
}
// Register the forward op: attrs b/c/m/n; 3 inputs (points, idx, weight),
// 1 output (out).
PARROTS_EXTENSION_REGISTER(three_interpolate_forward)
.attr("b")
.attr("c")
.attr("m")
.attr("n")
.input(3)
.output(1)
.apply(three_interpolate_forward_cuda_parrots)
.done();
// Register the backward op: attrs b/c/n/m; 3 inputs (grad_out, idx,
// weight), 1 output (grad_points).
PARROTS_EXTENSION_REGISTER(three_interpolate_backward)
.attr("b")
.attr("c")
.attr("n")
.attr("m")
.input(3)
.output(1)
.apply(three_interpolate_backward_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef THREE_INTERPOLATE_PYTORCH_H
#define THREE_INTERPOLATE_PYTORCH_H
#include <torch/extension.h>
using namespace at;
// Forward pass of three-point feature interpolation; defined in the
// matching .cpp. Note m precedes n here.
void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor weight_tensor, Tensor out_tensor, int b,
int c, int m, int n);
// Backward pass; defined in the matching .cpp. Note n precedes m here,
// mirroring the forward/backward CUDA launcher signatures.
void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor weight_tensor, Tensor grad_points_tensor,
int b, int c, int n, int m);
#endif  // THREE_INTERPOLATE_PYTORCH_H
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
// Defined in the CUDA compilation unit; finds, per query point, its three
// nearest reference points (writing dist2 and idx).
void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
                                      const Tensor known, Tensor dist2,
                                      Tensor idx);

// Thin dispatch wrapper: forwards all arguments unchanged to the CUDA
// kernel launcher.
void three_nn_forward_cuda(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx) {
  ThreeNNForwardCUDAKernelLauncher(b, n, m, unknown, known, dist2, idx);
}  // fixed: removed stray ';' that followed the function definition
#endif
// Device-dispatching entry point for the three-nearest-neighbour op.
// CUDA-only: errors out on CPU tensors or in non-CUDA builds. Results go
// into dist2_tensor / idx_tensor.
void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,
Tensor dist2_tensor, Tensor idx_tensor, int b, int n,
int m) {
if (unknown_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
three_nn_forward_cuda(b, n, m, unknown_tensor, known_tensor, dist2_tensor,
idx_tensor);
#else
AT_ERROR("three_nn is not compiled with GPU support");
#endif
} else {
// No CPU implementation exists for this op.
AT_ERROR("three_nn is not implemented on CPU");
}
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "three_nn_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
// Parrots adapter for the three_nn forward pass: decode the scalar
// attributes, wrap the framework buffers as ATen tensors, and delegate to
// the shared dispatch entry point.
void three_nn_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                   const OperatorBase::in_list_t& ins,
                                   OperatorBase::out_list_t& outs) {
  // Scalar sizes carried as op attributes.
  int b = 0;
  int n = 0;
  int m = 0;
  SSAttrs(attr).get<int>("b", b).get<int>("n", n).get<int>("m", m).done();
  // Bridge parrots buffers to ATen tensors (2 inputs, 2 outputs).
  auto queries = buildATensor(ctx, ins[0]);
  auto references = buildATensor(ctx, ins[1]);
  auto sq_dists = buildATensor(ctx, outs[0]);
  auto neighbor_idx = buildATensor(ctx, outs[1]);
  three_nn_forward(queries, references, sq_dists, neighbor_idx, b, n, m);
}
// Register the op with parrots: attrs b/n/m; 2 inputs (unknown, known),
// 2 outputs (dist2, idx); CUDA-only (guarded by MMCV_WITH_CUDA).
PARROTS_EXTENSION_REGISTER(three_nn_forward)
.attr("b")
.attr("n")
.attr("m")
.input(2)
.output(2)
.apply(three_nn_forward_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef THREE_NN_PYTORCH_H
#define THREE_NN_PYTORCH_H
#include <torch/extension.h>
using namespace at;
// Device-dispatching entry point for the three-nearest-neighbour op,
// declared for the parrots binding; defined in the matching .cpp.
void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,
Tensor dist2_tensor, Tensor idx_tensor, int b, int n,
int m);
#endif  // THREE_NN_PYTORCH_H
...@@ -34,10 +34,10 @@ void assign_score_withk_backward_cuda( ...@@ -34,10 +34,10 @@ void assign_score_withk_backward_cuda(
}; };
#endif #endif
void assign_score_withk_forward(int B, int N0, int N1, int M, int K, int O, void assign_score_withk_forward(const Tensor& points, const Tensor& centers,
int aggregate, const Tensor& points, const Tensor& scores, const Tensor& knn_idx,
const Tensor& centers, const Tensor& scores, Tensor& output, int B, int N0, int N1, int M,
const Tensor& knn_idx, Tensor& output) { int K, int O, int aggregate) {
if (points.device().is_cuda()) { if (points.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
CHECK_CONTIGUOUS(points); CHECK_CONTIGUOUS(points);
...@@ -56,12 +56,12 @@ void assign_score_withk_forward(int B, int N0, int N1, int M, int K, int O, ...@@ -56,12 +56,12 @@ void assign_score_withk_forward(int B, int N0, int N1, int M, int K, int O,
} }
} }
void assign_score_withk_backward(int B, int N0, int N1, int M, int K, int O, void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
int aggregate, const Tensor& grad_out, const Tensor& centers, const Tensor& scores,
const Tensor& points, const Tensor& centers, const Tensor& knn_idx, Tensor& grad_points,
const Tensor& scores, const Tensor& knn_idx, Tensor& grad_centers, Tensor& grad_scores,
Tensor& grad_points, Tensor& grad_centers, int B, int N0, int N1, int M, int K, int O,
Tensor& grad_scores) { int aggregate) {
if (grad_points.device().is_cuda()) { if (grad_points.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
CHECK_CONTIGUOUS(grad_out); CHECK_CONTIGUOUS(grad_out);
......
...@@ -18,9 +18,9 @@ void ball_query_forward_cuda(int b, int n, int m, float min_radius, ...@@ -18,9 +18,9 @@ void ball_query_forward_cuda(int b, int n, int m, float min_radius,
}; };
#endif #endif
void ball_query_forward(int b, int n, int m, float min_radius, float max_radius, void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
int nsample, Tensor new_xyz_tensor, Tensor xyz_tensor, Tensor idx_tensor, int b, int n, int m,
Tensor idx_tensor) { float min_radius, float max_radius, int nsample) {
if (new_xyz_tensor.device().is_cuda()) { if (new_xyz_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(new_xyz_tensor); CHECK_CUDA_INPUT(new_xyz_tensor);
......
...@@ -25,8 +25,8 @@ void furthest_point_sampling_with_dist_forward_cuda(int b, int n, int m, ...@@ -25,8 +25,8 @@ void furthest_point_sampling_with_dist_forward_cuda(int b, int n, int m,
} }
#endif #endif
void furthest_point_sampling_forward(int b, int n, int m, Tensor points_tensor, void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
Tensor temp_tensor, Tensor idx_tensor) { Tensor idx_tensor, int b, int n, int m) {
if (points_tensor.device().is_cuda()) { if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
const float *points = points_tensor.data_ptr<float>(); const float *points = points_tensor.data_ptr<float>();
...@@ -41,10 +41,10 @@ void furthest_point_sampling_forward(int b, int n, int m, Tensor points_tensor, ...@@ -41,10 +41,10 @@ void furthest_point_sampling_forward(int b, int n, int m, Tensor points_tensor,
} }
} }
void furthest_point_sampling_with_dist_forward(int b, int n, int m, void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
Tensor points_tensor,
Tensor temp_tensor, Tensor temp_tensor,
Tensor idx_tensor) { Tensor idx_tensor, int b, int n,
int m) {
if (points_tensor.device().is_cuda()) { if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
const float *points = points_tensor.data<float>(); const float *points = points_tensor.data<float>();
......
...@@ -24,9 +24,9 @@ void gather_points_backward_cuda(int b, int c, int n, int npoints, ...@@ -24,9 +24,9 @@ void gather_points_backward_cuda(int b, int c, int n, int npoints,
}; };
#endif #endif
void gather_points_forward(int b, int c, int n, int npoints, void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor points_tensor, Tensor idx_tensor, Tensor out_tensor, int b, int c, int n,
Tensor out_tensor) { int npoints) {
if (points_tensor.device().is_cuda()) { if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
gather_points_forward_cuda(b, c, n, npoints, points_tensor, idx_tensor, gather_points_forward_cuda(b, c, n, npoints, points_tensor, idx_tensor,
...@@ -39,9 +39,9 @@ void gather_points_forward(int b, int c, int n, int npoints, ...@@ -39,9 +39,9 @@ void gather_points_forward(int b, int c, int n, int npoints,
} }
} }
void gather_points_backward(int b, int c, int n, int npoints, void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_out_tensor, Tensor idx_tensor, Tensor grad_points_tensor, int b, int c, int n,
Tensor grad_points_tensor) { int npoints) {
if (grad_out_tensor.device().is_cuda()) { if (grad_out_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
gather_points_backward_cuda(b, c, n, npoints, grad_out_tensor, idx_tensor, gather_points_backward_cuda(b, c, n, npoints, grad_out_tensor, idx_tensor,
......
...@@ -14,9 +14,8 @@ void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz, ...@@ -14,9 +14,8 @@ void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
} }
#endif #endif
void knn_forward(int b, int n, int m, int nsample, Tensor xyz_tensor, void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
Tensor new_xyz_tensor, Tensor idx_tensor, Tensor dist2_tensor, int b, int n, int m, int nsample) {
Tensor dist2_tensor) {
if (new_xyz_tensor.device().is_cuda()) { if (new_xyz_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(new_xyz_tensor); CHECK_CUDA_INPUT(new_xyz_tensor);
......
...@@ -4,17 +4,17 @@ ...@@ -4,17 +4,17 @@
std::string get_compiler_version(); std::string get_compiler_version();
std::string get_compiling_cuda_version(); std::string get_compiling_cuda_version();
void assign_score_withk_forward(int B, int N0, int N1, int M, int K, int O, void assign_score_withk_forward(const Tensor &points, const Tensor &centers,
int aggregate, const Tensor &points, const Tensor &scores, const Tensor &knn_idx,
const Tensor &centers, const Tensor &scores, Tensor &output, int B, int N0, int N1, int M,
const Tensor &knn_idx, Tensor &output); int K, int O, int aggregate);
void assign_score_withk_backward(int B, int N0, int N1, int M, int K, int O, void assign_score_withk_backward(const Tensor &grad_out, const Tensor &points,
int aggregate, const Tensor &grad_out, const Tensor &centers, const Tensor &scores,
const Tensor &points, const Tensor &centers, const Tensor &knn_idx, Tensor &grad_points,
const Tensor &scores, const Tensor &knn_idx, Tensor &grad_centers, Tensor &grad_scores,
Tensor &grad_points, Tensor &grad_centers, int B, int N0, int N1, int M, int K, int O,
Tensor &grad_scores); int aggregate);
void carafe_naive_forward(Tensor features, Tensor masks, Tensor output, void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size, int scale_factor); int kernel_size, int group_size, int scale_factor);
...@@ -76,13 +76,12 @@ void group_points_backward(int b, int c, int n, int npoints, int nsample, ...@@ -76,13 +76,12 @@ void group_points_backward(int b, int c, int n, int npoints, int nsample,
void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature, void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
Tensor pooled_features, Tensor pooled_empty_flag); Tensor pooled_features, Tensor pooled_empty_flag);
void gather_points_forward(int b, int c, int n, int npoints, void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor points_tensor, Tensor idx_tensor, Tensor out_tensor, int b, int c, int n, int npoints);
Tensor out_tensor);
void gather_points_backward(int b, int c, int n, int npoints, void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_out_tensor, Tensor idx_tensor, Tensor grad_points_tensor, int b, int c, int n,
Tensor grad_points_tensor); int npoints);
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight, void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha); Tensor output, float gamma, float alpha);
...@@ -97,22 +96,23 @@ void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight, ...@@ -97,22 +96,23 @@ void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input, float gamma, Tensor buff, Tensor grad_input, float gamma,
float alpha); float alpha);
void three_interpolate_forward(int b, int c, int m, int n, Tensor points_tensor, void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor idx_tensor, Tensor weight_tensor, Tensor weight_tensor, Tensor out_tensor, int b,
Tensor out_tensor); int c, int m, int n);
void three_interpolate_backward(int b, int c, int n, int m, void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_out_tensor, Tensor idx_tensor, Tensor weight_tensor, Tensor grad_points_tensor,
Tensor weight_tensor, int b, int c, int n, int m);
Tensor grad_points_tensor);
void three_nn_forward(int b, int n, int m, Tensor unknown_tensor, void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,
Tensor known_tensor, Tensor dist2_tensor, Tensor dist2_tensor, Tensor idx_tensor, int b, int n,
Tensor idx_tensor); int m);
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset); const int mode, const bool aligned, const int offset);
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
Tensor dist2_tensor, int b, int n, int m, int nsample);
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b, void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap); Tensor ans_overlap);
...@@ -124,16 +124,13 @@ int iou3d_nms_forward(Tensor boxes, Tensor keep, float nms_overlap_thresh); ...@@ -124,16 +124,13 @@ int iou3d_nms_forward(Tensor boxes, Tensor keep, float nms_overlap_thresh);
int iou3d_nms_normal_forward(Tensor boxes, Tensor keep, int iou3d_nms_normal_forward(Tensor boxes, Tensor keep,
float nms_overlap_thresh); float nms_overlap_thresh);
void knn_forward(int b, int n, int m, int nsample, Tensor xyz_tensor, void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
Tensor new_xyz_tensor, Tensor idx_tensor, Tensor dist2_tensor); Tensor idx_tensor, int b, int n, int m);
void furthest_point_sampling_forward(int b, int n, int m, Tensor points_tensor, void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
Tensor temp_tensor, Tensor idx_tensor);
void furthest_point_sampling_with_dist_forward(int b, int n, int m,
Tensor points_tensor,
Tensor temp_tensor, Tensor temp_tensor,
Tensor idx_tensor); Tensor idx_tensor, int b, int n,
int m);
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx, void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col, const Tensor mask_w_idx, Tensor col,
...@@ -238,9 +235,9 @@ void tin_shift_forward(Tensor input, Tensor shift, Tensor output); ...@@ -238,9 +235,9 @@ void tin_shift_forward(Tensor input, Tensor shift, Tensor output);
void tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input); void tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input);
void ball_query_forward(int b, int n, int m, float min_radius, float max_radius, void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
int nsample, Tensor new_xyz_tensor, Tensor xyz_tensor, Tensor idx_tensor, int b, int n, int m,
Tensor idx_tensor); float min_radius, float max_radius, int nsample);
Tensor bottom_pool_forward(Tensor input); Tensor bottom_pool_forward(Tensor input);
...@@ -352,32 +349,31 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ...@@ -352,32 +349,31 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::arg("empty"), py::arg("act"), py::arg("grad"), py::arg("alpha"), py::arg("empty"), py::arg("act"), py::arg("grad"), py::arg("alpha"),
py::arg("scale")); py::arg("scale"));
m.def("gather_points_forward", &gather_points_forward, m.def("gather_points_forward", &gather_points_forward,
"gather_points_forward", py::arg("b"), py::arg("c"), py::arg("n"), "gather_points_forward", py::arg("points_tensor"),
py::arg("npoints"), py::arg("points_tensor"), py::arg("idx_tensor"), py::arg("idx_tensor"), py::arg("out_tensor"), py::arg("b"),
py::arg("out_tensor")); py::arg("c"), py::arg("n"), py::arg("npoints"));
m.def("gather_points_backward", &gather_points_backward, m.def("gather_points_backward", &gather_points_backward,
"gather_points_backward", py::arg("b"), py::arg("c"), py::arg("n"), "gather_points_backward", py::arg("grad_out_tensor"),
py::arg("npoints"), py::arg("grad_out_tensor"), py::arg("idx_tensor"), py::arg("idx_tensor"), py::arg("grad_points_tensor"), py::arg("b"),
py::arg("grad_points_tensor")); py::arg("c"), py::arg("n"), py::arg("npoints"));
m.def("get_compiler_version", &get_compiler_version, "get_compiler_version"); m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
m.def("get_compiling_cuda_version", &get_compiling_cuda_version, m.def("get_compiling_cuda_version", &get_compiling_cuda_version,
"get_compiling_cuda_version"); "get_compiling_cuda_version");
m.def("assign_score_withk_forward", &assign_score_withk_forward, m.def("assign_score_withk_forward", &assign_score_withk_forward,
"assign_score_withk_forward", py::arg("B"), py::arg("N0"), "assign_score_withk_forward", py::arg("points"), py::arg("centers"),
py::arg("N1"), py::arg("M"), py::arg("K"), py::arg("O"), py::arg("scores"), py::arg("knn_idx"), py::arg("output"), py::arg("B"),
py::arg("aggregate"), py::arg("points"), py::arg("centers"), py::arg("N0"), py::arg("N1"), py::arg("M"), py::arg("K"), py::arg("O"),
py::arg("scores"), py::arg("knn_idx"), py::arg("output")); py::arg("aggregate"));
m.def("assign_score_withk_backward", &assign_score_withk_backward, m.def("assign_score_withk_backward", &assign_score_withk_backward,
"assign_score_withk_backward", py::arg("B"), py::arg("N0"), "assign_score_withk_backward", py::arg("grad_out"), py::arg("points"),
py::arg("N1"), py::arg("M"), py::arg("K"), py::arg("O"),
py::arg("aggregate"), py::arg("grad_out"), py::arg("points"),
py::arg("centers"), py::arg("scores"), py::arg("knn_idx"), py::arg("centers"), py::arg("scores"), py::arg("knn_idx"),
py::arg("grad_points"), py::arg("grad_centers"), py::arg("grad_points"), py::arg("grad_centers"), py::arg("grad_scores"),
py::arg("grad_scores")); py::arg("B"), py::arg("N0"), py::arg("N1"), py::arg("M"), py::arg("K"),
m.def("knn_forward", &knn_forward, "knn_forward", py::arg("b"), py::arg("n"), py::arg("O"), py::arg("aggregate"));
py::arg("m"), py::arg("nsample"), py::arg("xyz_tensor"), m.def("knn_forward", &knn_forward, "knn_forward", py::arg("xyz_tensor"),
py::arg("new_xyz_tensor"), py::arg("idx_tensor"), py::arg("new_xyz_tensor"), py::arg("idx_tensor"),
py::arg("dist2_tensor")); py::arg("dist2_tensor"), py::arg("b"), py::arg("n"), py::arg("m"),
py::arg("nsample"));
m.def("carafe_naive_forward", &carafe_naive_forward, "carafe_naive_forward", m.def("carafe_naive_forward", &carafe_naive_forward, "carafe_naive_forward",
py::arg("features"), py::arg("masks"), py::arg("output"), py::arg("features"), py::arg("masks"), py::arg("output"),
py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor")); py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor"));
...@@ -447,17 +443,18 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ...@@ -447,17 +443,18 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::arg("weight"), py::arg("buff"), py::arg("grad_input"), py::arg("weight"), py::arg("buff"), py::arg("grad_input"),
py::arg("gamma"), py::arg("alpha")); py::arg("gamma"), py::arg("alpha"));
m.def("three_interpolate_forward", &three_interpolate_forward, m.def("three_interpolate_forward", &three_interpolate_forward,
"three_interpolate_forward", py::arg("b"), py::arg("c"), py::arg("m"), "three_interpolate_forward", py::arg("points_tensor"),
py::arg("n"), py::arg("points_tensor"), py::arg("idx_tensor"), py::arg("idx_tensor"), py::arg("weight_tensor"), py::arg("out_tensor"),
py::arg("weight_tensor"), py::arg("out_tensor")); py::arg("b"), py::arg("c"), py::arg("m"), py::arg("n"));
m.def("three_interpolate_backward", &three_interpolate_backward, m.def("three_interpolate_backward", &three_interpolate_backward,
"three_interpolate_backward", py::arg("b"), py::arg("c"), py::arg("n"), "three_interpolate_backward", py::arg("grad_out_tensor"),
py::arg("m"), py::arg("grad_out_tensor"), py::arg("idx_tensor"), py::arg("idx_tensor"), py::arg("weight_tensor"),
py::arg("weight_tensor"), py::arg("grad_points_tensor")); py::arg("grad_points_tensor"), py::arg("b"), py::arg("c"), py::arg("n"),
m.def("three_nn_forward", &three_nn_forward, "three_nn_forward", py::arg("b"), py::arg("m"));
py::arg("n"), py::arg("m"), py::arg("unknown_tensor"), m.def("three_nn_forward", &three_nn_forward, "three_nn_forward",
py::arg("known_tensor"), py::arg("dist2_tensor"), py::arg("unknown_tensor"), py::arg("known_tensor"),
py::arg("idx_tensor")); py::arg("dist2_tensor"), py::arg("idx_tensor"), py::arg("b"),
py::arg("n"), py::arg("m"));
m.def("bbox_overlaps", &bbox_overlaps, "bbox_overlaps", py::arg("bboxes1"), m.def("bbox_overlaps", &bbox_overlaps, "bbox_overlaps", py::arg("bboxes1"),
py::arg("bboxes2"), py::arg("ious"), py::arg("mode"), py::arg("bboxes2"), py::arg("ious"), py::arg("mode"),
py::arg("aligned"), py::arg("offset")); py::arg("aligned"), py::arg("offset"));
...@@ -485,14 +482,14 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ...@@ -485,14 +482,14 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"iou3d_nms_normal_forward", py::arg("boxes"), py::arg("keep"), "iou3d_nms_normal_forward", py::arg("boxes"), py::arg("keep"),
py::arg("nms_overlap_thresh")); py::arg("nms_overlap_thresh"));
m.def("furthest_point_sampling_forward", &furthest_point_sampling_forward, m.def("furthest_point_sampling_forward", &furthest_point_sampling_forward,
"furthest_point_sampling_forward", py::arg("b"), py::arg("n"), "furthest_point_sampling_forward", py::arg("points_tensor"),
py::arg("m"), py::arg("points_tensor"), py::arg("temp_tensor"), py::arg("temp_tensor"), py::arg("idx_tensor"), py::arg("b"),
py::arg("idx_tensor")); py::arg("n"), py::arg("m"));
m.def("furthest_point_sampling_with_dist_forward", m.def("furthest_point_sampling_with_dist_forward",
&furthest_point_sampling_with_dist_forward, &furthest_point_sampling_with_dist_forward,
"furthest_point_sampling_with_dist_forward", py::arg("b"), py::arg("n"), "furthest_point_sampling_with_dist_forward", py::arg("points_tensor"),
py::arg("m"), py::arg("points_tensor"), py::arg("temp_tensor"), py::arg("temp_tensor"), py::arg("idx_tensor"), py::arg("b"),
py::arg("idx_tensor")); py::arg("n"), py::arg("m"));
m.def("masked_im2col_forward", &masked_im2col_forward, m.def("masked_im2col_forward", &masked_im2col_forward,
"masked_im2col_forward", py::arg("im"), py::arg("mask_h_idx"), "masked_im2col_forward", py::arg("im"), py::arg("mask_h_idx"),
py::arg("mask_w_idx"), py::arg("col"), py::arg("kernel_h"), py::arg("mask_w_idx"), py::arg("col"), py::arg("kernel_h"),
...@@ -609,9 +606,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ...@@ -609,9 +606,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::arg("scores"), py::arg("order"), py::arg("dets_sorted"), py::arg("scores"), py::arg("order"), py::arg("dets_sorted"),
py::arg("iou_threshold"), py::arg("multi_label")); py::arg("iou_threshold"), py::arg("multi_label"));
m.def("ball_query_forward", &ball_query_forward, "ball_query_forward", m.def("ball_query_forward", &ball_query_forward, "ball_query_forward",
py::arg("new_xyz_tensor"), py::arg("xyz_tensor"), py::arg("idx_tensor"),
py::arg("b"), py::arg("n"), py::arg("m"), py::arg("min_radius"), py::arg("b"), py::arg("n"), py::arg("m"), py::arg("min_radius"),
py::arg("max_radius"), py::arg("nsample"), py::arg("new_xyz_tensor"), py::arg("max_radius"), py::arg("nsample"));
py::arg("xyz_tensor"), py::arg("idx_tensor"));
m.def("roi_align_rotated_forward", &roi_align_rotated_forward, m.def("roi_align_rotated_forward", &roi_align_rotated_forward,
"roi_align_rotated forward", py::arg("input"), py::arg("rois"), "roi_align_rotated forward", py::arg("input"), py::arg("rois"),
py::arg("output"), py::arg("pooled_height"), py::arg("pooled_width"), py::arg("output"), py::arg("pooled_height"), py::arg("pooled_width"),
...@@ -657,6 +654,19 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ...@@ -657,6 +654,19 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"backward function of border_align", py::arg("grad_output"), "backward function of border_align", py::arg("grad_output"),
py::arg("boxes"), py::arg("argmax_idx"), py::arg("grad_input"), py::arg("boxes"), py::arg("argmax_idx"), py::arg("grad_input"),
py::arg("pool_size")); py::arg("pool_size"));
m.def("correlation_forward", &correlation_forward, "Correlation forward",
py::arg("input1"), py::arg("input2"), py::arg("output"), py::arg("kH"),
py::arg("kW"), py::arg("patchH"), py::arg("patchW"), py::arg("padH"),
py::arg("padW"), py::arg("dilationH"), py::arg("dilationW"),
py::arg("dilation_patchH"), py::arg("dilation_patchW"), py::arg("dH"),
py::arg("dW"));
m.def("correlation_backward", &correlation_backward, "Correlation backward",
py::arg("grad_output"), py::arg("input1"), py::arg("input2"),
py::arg("grad_input1"), py::arg("grad_input2"), py::arg("kH"),
py::arg("kW"), py::arg("patchH"), py::arg("patchW"), py::arg("padH"),
py::arg("padW"), py::arg("dilationH"), py::arg("dilationW"),
py::arg("dilation_patchH"), py::arg("dilation_patchW"), py::arg("dH"),
py::arg("dW"));
m.def("points_in_boxes_cpu_forward", &points_in_boxes_cpu_forward, m.def("points_in_boxes_cpu_forward", &points_in_boxes_cpu_forward,
"points_in_boxes_cpu_forward", py::arg("boxes_tensor"), "points_in_boxes_cpu_forward", py::arg("boxes_tensor"),
py::arg("pts_tensor"), py::arg("pts_indices_tensor")); py::arg("pts_tensor"), py::arg("pts_indices_tensor"));
...@@ -674,6 +684,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ...@@ -674,6 +684,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"roiaware_pool3d_backward", py::arg("pts_idx_of_voxels"), "roiaware_pool3d_backward", py::arg("pts_idx_of_voxels"),
py::arg("argmax"), py::arg("grad_out"), py::arg("grad_in"), py::arg("argmax"), py::arg("grad_out"), py::arg("grad_in"),
py::arg("pool_method")); py::arg("pool_method"));
m.def("correlation_forward", &correlation_forward, "Correlation forward");
m.def("correlation_backward", &correlation_backward, "Correlation backward");
} }
...@@ -30,9 +30,9 @@ void three_interpolate_backward_cuda(int b, int c, int n, int m, ...@@ -30,9 +30,9 @@ void three_interpolate_backward_cuda(int b, int c, int n, int m,
}; };
#endif #endif
void three_interpolate_forward(int b, int c, int m, int n, Tensor points_tensor, void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor idx_tensor, Tensor weight_tensor, Tensor weight_tensor, Tensor out_tensor, int b,
Tensor out_tensor) { int c, int m, int n) {
if (points_tensor.device().is_cuda()) { if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
three_interpolate_forward_cuda(b, c, m, n, points_tensor, idx_tensor, three_interpolate_forward_cuda(b, c, m, n, points_tensor, idx_tensor,
...@@ -45,10 +45,9 @@ void three_interpolate_forward(int b, int c, int m, int n, Tensor points_tensor, ...@@ -45,10 +45,9 @@ void three_interpolate_forward(int b, int c, int m, int n, Tensor points_tensor,
} }
} }
void three_interpolate_backward(int b, int c, int n, int m, void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_out_tensor, Tensor idx_tensor, Tensor weight_tensor, Tensor grad_points_tensor,
Tensor weight_tensor, int b, int c, int n, int m) {
Tensor grad_points_tensor) {
if (grad_out_tensor.device().is_cuda()) { if (grad_out_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
three_interpolate_backward_cuda(b, c, n, m, grad_out_tensor, idx_tensor, three_interpolate_backward_cuda(b, c, n, m, grad_out_tensor, idx_tensor,
......
...@@ -14,9 +14,9 @@ void three_nn_forward_cuda(int b, int n, int m, const Tensor unknown, ...@@ -14,9 +14,9 @@ void three_nn_forward_cuda(int b, int n, int m, const Tensor unknown,
}; };
#endif #endif
void three_nn_forward(int b, int n, int m, Tensor unknown_tensor, void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,
Tensor known_tensor, Tensor dist2_tensor, Tensor dist2_tensor, Tensor idx_tensor, int b, int n,
Tensor idx_tensor) { int m) {
if (unknown_tensor.device().is_cuda()) { if (unknown_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA #ifdef MMCV_WITH_CUDA
three_nn_forward_cuda(b, n, m, unknown_tensor, known_tensor, dist2_tensor, three_nn_forward_cuda(b, n, m, unknown_tensor, known_tensor, dist2_tensor,
......
...@@ -30,9 +30,16 @@ class FurthestPointSampling(Function): ...@@ -30,9 +30,16 @@ class FurthestPointSampling(Function):
output = torch.cuda.IntTensor(B, num_points) output = torch.cuda.IntTensor(B, num_points)
temp = torch.cuda.FloatTensor(B, N).fill_(1e10) temp = torch.cuda.FloatTensor(B, N).fill_(1e10)
ext_module.furthest_point_sampling_forward(B, N, num_points, ext_module.furthest_point_sampling_forward(
points_xyz, temp, output) points_xyz,
ctx.mark_non_differentiable(output) temp,
output,
b=B,
n=N,
m=num_points,
)
if torch.__version__ != 'parrots':
ctx.mark_non_differentiable(output)
return output return output
@staticmethod @staticmethod
...@@ -62,8 +69,9 @@ class FurthestPointSamplingWithDist(Function): ...@@ -62,8 +69,9 @@ class FurthestPointSamplingWithDist(Function):
temp = points_dist.new_zeros([B, N]).fill_(1e10) temp = points_dist.new_zeros([B, N]).fill_(1e10)
ext_module.furthest_point_sampling_with_dist_forward( ext_module.furthest_point_sampling_with_dist_forward(
B, N, num_points, points_dist, temp, output) points_dist, temp, output, b=B, n=N, m=num_points)
ctx.mark_non_differentiable(output) if torch.__version__ != 'parrots':
ctx.mark_non_differentiable(output)
return output return output
@staticmethod @staticmethod
......
...@@ -28,11 +28,12 @@ class GatherPoints(Function): ...@@ -28,11 +28,12 @@ class GatherPoints(Function):
_, C, N = features.size() _, C, N = features.size()
output = torch.cuda.FloatTensor(B, C, npoint) output = torch.cuda.FloatTensor(B, C, npoint)
ext_module.gather_points_forward(B, C, N, npoint, features, indices, ext_module.gather_points_forward(
output) features, indices, output, b=B, c=C, n=N, npoints=npoint)
ctx.for_backwards = (indices, C, N) ctx.for_backwards = (indices, C, N)
ctx.mark_non_differentiable(indices) if torch.__version__ != 'parrots':
ctx.mark_non_differentiable(indices)
return output return output
@staticmethod @staticmethod
...@@ -42,8 +43,14 @@ class GatherPoints(Function): ...@@ -42,8 +43,14 @@ class GatherPoints(Function):
grad_features = torch.cuda.FloatTensor(B, C, N).zero_() grad_features = torch.cuda.FloatTensor(B, C, N).zero_()
grad_out_data = grad_out.data.contiguous() grad_out_data = grad_out.data.contiguous()
ext_module.gather_points_backward(B, C, N, npoint, grad_out_data, idx, ext_module.gather_points_backward(
grad_features.data) grad_out_data,
idx,
grad_features.data,
b=B,
c=C,
n=N,
npoints=npoint)
return grad_features, None return grad_features, None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment