Unverified Commit 2f88c124 authored by Wenhao Wu's avatar Wenhao Wu Committed by GitHub
Browse files

[Enhance] Replace mmdet3d ops with mmcv ops (#1240)

* import some ops from mmcv instead of mmdet3d

* use mmcv ops in primitive_head.py

* use mmcv ops in PAConv

* remove ops in mmdet3d & fix some bugs

* remove spconv & fix some bugs

* fix voxelization unittest

* remove spconv in ops/__init__.py

* refine ops/__init__.py

* recover sparse_block in ops/__init__

* fix parta2_bbox_head unittest

* remove remaining ops

* recover ops/__init__.py for bc breaking

* add source of ops from mmcv

* recover the unittest for voxelization

* remove unittest
parent 41d77dad
# Copyright (c) OpenMMLab. All rights reserved.
from .three_interpolate import three_interpolate
from .three_nn import three_nn
__all__ = ['three_nn', 'three_interpolate']
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>
// Legacy Torch/THC state handle; not used by the code below (THC is
// deprecated in modern PyTorch).
extern THCState *state;

// Host wrappers (defined later in this file, exported via pybind11).
void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor);

// Kernel launcher implemented in interpolate_gpu.cu.
void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream);

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor, at::Tensor idx_tensor,
                               at::Tensor weight_tensor, at::Tensor out_tensor);

// Kernel launcher implemented in interpolate_gpu.cu.
void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream);

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor);

// Kernel launcher implemented in interpolate_gpu.cu.
void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx, const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream);
void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor) {
  // For each of the n "unknown" points, find its 3 nearest neighbors among
  // the m "known" points (batch size b); results go to dist2/idx tensors.
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  const float *unknown_ptr = unknown_tensor.data_ptr<float>();
  const float *known_ptr = known_tensor.data_ptr<float>();
  float *dist2_ptr = dist2_tensor.data_ptr<float>();
  int *idx_ptr = idx_tensor.data_ptr<int>();

  three_nn_kernel_launcher(b, n, m, unknown_ptr, known_ptr, dist2_ptr, idx_ptr,
                           stream);
}
void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor, at::Tensor idx_tensor,
                               at::Tensor weight_tensor,
                               at::Tensor out_tensor) {
  // Weighted interpolation of (B, C, M) features onto n target points using
  // 3 neighbor indices and weights per point.
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  const float *points_ptr = points_tensor.data_ptr<float>();
  const int *idx_ptr = idx_tensor.data_ptr<int>();
  const float *weight_ptr = weight_tensor.data_ptr<float>();
  float *out_ptr = out_tensor.data_ptr<float>();

  three_interpolate_kernel_launcher(b, c, m, n, points_ptr, idx_ptr,
                                    weight_ptr, out_ptr, stream);
}
void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor) {
  // Backward of three_interpolate: scatter (B, C, N) output gradients back
  // to the (B, C, M) source features.
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  const float *grad_out_ptr = grad_out_tensor.data_ptr<float>();
  const int *idx_ptr = idx_tensor.data_ptr<int>();
  const float *weight_ptr = weight_tensor.data_ptr<float>();
  float *grad_points_ptr = grad_points_tensor.data_ptr<float>();

  three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out_ptr, idx_ptr,
                                         weight_ptr, grad_points_ptr, stream);
}
// Export the host wrappers to Python via the torch extension mechanism.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
  m.def("three_interpolate_wrapper", &three_interpolate_wrapper,
        "three_interpolate_wrapper");
  m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper,
        "three_interpolate_grad_wrapper");
}
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void three_interpolate_kernel(int b, int c, int m, int n,
                                         const float *__restrict__ points,
                                         const int *__restrict__ idx,
                                         const float *__restrict__ weight,
                                         float *__restrict__ out) {
  // points: (B, C, M)   idx: (B, N, 3)   weight: (B, N, 3)
  // out:    (B, C, N)
  // One thread per (batch, channel, target point): grid z = batch,
  // grid y = channel, grid x * blockDim.x covers target points.
  const int bs_idx = blockIdx.z;
  const int c_idx = blockIdx.y;
  const int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

  const int nbr_base = (bs_idx * n + pt_idx) * 3;
  const float *w = weight + nbr_base;
  const int *nbr = idx + nbr_base;
  const float *row = points + (bs_idx * c + c_idx) * m;

  // Weighted sum of the 3 neighbor features for this channel.
  out[(bs_idx * c + c_idx) * n + pt_idx] =
      w[0] * row[nbr[0]] + w[1] * row[nbr[1]] + w[2] * row[nbr[2]];
}
void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream) {
  // points: (B, C, M)   idx: (B, N, 3)   weight: (B, N, 3)   out: (B, C, N)
  // Grid layout: x covers target points, y channels, z batches.
  dim3 threads(THREADS_PER_BLOCK);
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);

  three_interpolate_kernel<<<blocks, threads, 0, stream>>>(b, c, m, n, points,
                                                           idx, weight, out);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
__global__ void three_interpolate_grad_kernel(
    int b, int c, int n, int m, const float *__restrict__ grad_out,
    const int *__restrict__ idx, const float *__restrict__ weight,
    float *__restrict__ grad_points) {
  // grad_out: (B, C, N)   idx/weight: (B, N, 3)   grad_points: (B, C, M)
  // Scatter each output gradient back to its 3 source points; atomicAdd is
  // required because several target points may share a source point.
  const int bs_idx = blockIdx.z;
  const int c_idx = blockIdx.y;
  const int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

  const float g = grad_out[(bs_idx * c + c_idx) * n + pt_idx];
  const int nbr_base = (bs_idx * n + pt_idx) * 3;
  const float *w = weight + nbr_base;
  const int *nbr = idx + nbr_base;
  float *dst = grad_points + (bs_idx * c + c_idx) * m;

  atomicAdd(dst + nbr[0], g * w[0]);
  atomicAdd(dst + nbr[1], g * w[1]);
  atomicAdd(dst + nbr[2], g * w[2]);
}
void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx, const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream) {
  // grad_out: (B, C, N)   weight: (B, N, 3)   grad_points: (B, C, M)
  // Grid layout mirrors the forward launcher: x = points, y = channels,
  // z = batches.
  dim3 threads(THREADS_PER_BLOCK);
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);

  three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, m, grad_out, idx, weight, grad_points);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void three_nn_kernel(int b, int n, int m,
                                const float *__restrict__ unknown,
                                const float *__restrict__ known,
                                float *__restrict__ dist2,
                                int *__restrict__ idx) {
  // For each "unknown" point, linearly scan all m "known" points of the
  // same batch and keep the 3 smallest squared distances plus their
  // indices. One thread per (batch, unknown point).
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  // dist2: (B, N, 3)  squared distances to the 3 nearest known points
  // idx: (B, N, 3)    indices of those points in `known`
  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || pt_idx >= n) return;
  // Advance pointers to this thread's point/batch slice.
  unknown += bs_idx * n * 3 + pt_idx * 3;
  known += bs_idx * m * 3;
  dist2 += bs_idx * n * 3 + pt_idx * 3;
  idx += bs_idx * n * 3 + pt_idx * 3;
  float ux = unknown[0];
  float uy = unknown[1];
  float uz = unknown[2];
  // Running top-3 minima; 1e40 sentinel exceeds any float distance so the
  // first 3 candidates always displace it.
  double best1 = 1e40, best2 = 1e40, best3 = 1e40;
  int besti1 = 0, besti2 = 0, besti3 = 0;
  for (int k = 0; k < m; ++k) {
    float x = known[k * 3 + 0];
    float y = known[k * 3 + 1];
    float z = known[k * 3 + 2];
    float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
    if (d < best1) {
      // New overall minimum: shift the previous best1/best2 down one slot.
      best3 = best2;
      besti3 = besti2;
      best2 = best1;
      besti2 = besti1;
      best1 = d;
      besti1 = k;
    } else if (d < best2) {
      // New second-best: shift the previous best2 down.
      best3 = best2;
      besti3 = besti2;
      best2 = d;
      besti2 = k;
    } else if (d < best3) {
      best3 = d;
      besti3 = k;
    }
  }
  dist2[0] = best1;
  dist2[1] = best2;
  dist2[2] = best3;
  idx[0] = besti1;
  idx[1] = besti2;
  idx[2] = besti3;
}
void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream) {
  // unknown: (B, N, 3)   known: (B, M, 3)
  // dist2:   (B, N, 3)   idx:   (B, N, 3)
  // Grid layout: x covers unknown points, y covers batches.
  dim3 threads(THREADS_PER_BLOCK);
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);

  three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known,
                                                  dist2, idx);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from torch.autograd import Function
from . import interpolate_ext
class ThreeInterpolate(Function):
    """Interpolate features as a weighted sum of their 3 nearest neighbors."""

    @staticmethod
    def forward(ctx, features: torch.Tensor, indices: torch.Tensor,
                weight: torch.Tensor) -> torch.Tensor:
        """Performs weighted linear interpolation on 3 features.

        Args:
            features (Tensor): (B, C, M) Features descriptors to be
                interpolated from
            indices (Tensor): (B, n, 3) index three nearest neighbors
                of the target features in features
            weight (Tensor): (B, n, 3) weights of interpolation

        Returns:
            Tensor: (B, C, N) tensor of the interpolated features
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()
        assert weight.is_contiguous()

        B, c, m = features.size()
        n = indices.size(1)
        ctx.three_interpolate_for_backward = (indices, weight, m)
        # `new_empty` keeps the device/dtype of `features`; the deprecated
        # `torch.cuda.FloatTensor` constructor always allocated on the
        # *current* CUDA device, which breaks multi-GPU use.
        output = features.new_empty((B, c, n))

        interpolate_ext.three_interpolate_wrapper(B, c, m, n, features,
                                                  indices, weight, output)
        return output

    @staticmethod
    def backward(
        ctx, grad_out: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Backward of three interpolate.

        Args:
            grad_out (Tensor): (B, C, N) tensor with gradients of outputs

        Returns:
            Tuple[Tensor, None, None]: (B, C, M) tensor with gradients of
                features; `indices` and `weight` receive no gradient.
        """
        idx, weight, m = ctx.three_interpolate_for_backward
        B, c, n = grad_out.size()
        # Zero-initialized because the CUDA kernel accumulates via atomicAdd.
        grad_features = grad_out.new_zeros((B, c, m))
        grad_out_data = grad_out.data.contiguous()

        interpolate_ext.three_interpolate_grad_wrapper(B, c, n, m,
                                                       grad_out_data, idx,
                                                       weight,
                                                       grad_features.data)
        return grad_features, None, None


three_interpolate = ThreeInterpolate.apply
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from torch.autograd import Function
from . import interpolate_ext
class ThreeNN(Function):
    """Find the top-3 nearest neighbors of a target set in a source set."""

    @staticmethod
    def forward(ctx, target: torch.Tensor,
                source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Find the top-3 nearest neighbors of the target set from the source
        set.

        Args:
            target (Tensor): shape (B, N, 3), points set that needs to
                find the nearest neighbors.
            source (Tensor): shape (B, M, 3), points set that is used
                to find the nearest neighbors of points in target set.

        Returns:
            Tuple[Tensor, Tensor]:
                - dist: shape (B, N, 3), L2 distance of each point in target
                  set to their corresponding nearest neighbors.
                - idx: shape (B, N, 3), int32 indices of those neighbors in
                  the source set.
        """
        assert target.is_contiguous()
        assert source.is_contiguous()

        B, N, _ = target.size()
        m = source.size(1)
        # Allocate on the inputs' device instead of using the deprecated
        # `torch.cuda.FloatTensor`/`torch.cuda.IntTensor` constructors,
        # which always used the *current* CUDA device.
        dist2 = target.new_empty((B, N, 3))
        idx = torch.empty((B, N, 3), dtype=torch.int32, device=target.device)

        interpolate_ext.three_nn_wrapper(B, N, m, target, source, dist2, idx)
        ctx.mark_non_differentiable(idx)
        return torch.sqrt(dist2), idx

    @staticmethod
    def backward(ctx, a=None, b=None):
        """Neither input is differentiable; gradients are always None."""
        return None, None


three_nn = ThreeNN.apply
# Copyright (c) OpenMMLab. All rights reserved.
from .iou3d_utils import boxes_iou_bev, nms_gpu, nms_normal_gpu
__all__ = ['boxes_iou_bev', 'nms_gpu', 'nms_normal_gpu']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from . import iou3d_cuda
def boxes_iou_bev(boxes_a, boxes_b):
    """Calculate boxes IoU in the Bird's Eye View.

    Args:
        boxes_a (torch.Tensor): Input boxes a with shape (M, 5).
        boxes_b (torch.Tensor): Input boxes b with shape (N, 5).

    Returns:
        torch.Tensor: IoU result with shape (M, N).
    """
    num_a, num_b = boxes_a.shape[0], boxes_b.shape[0]
    # Output buffer shares device/dtype with boxes_a.
    ans_iou = boxes_a.new_zeros(torch.Size((num_a, num_b)))
    iou3d_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(),
                                 ans_iou)
    return ans_iou
def nms_gpu(boxes, scores, thresh, pre_max_size=None, post_max_size=None):
    """NMS function GPU implementation (for BEV boxes).

    The overlap of two boxes for IoU calculation is defined as the exact
    overlapping area of the two boxes. One can also set ``pre_max_size``
    and ``post_max_size``.

    Args:
        boxes (torch.Tensor): Input boxes with the shape of [N, 5]
            ([x1, y1, x2, y2, ry]).
        scores (torch.Tensor): Scores of boxes with the shape of [N].
        thresh (float): IoU threshold for suppression.
        pre_max_size (int, optional): Max size of boxes before NMS.
            Default: None.
        post_max_size (int, optional): Max size of boxes after NMS.
            Default: None.

    Returns:
        torch.Tensor: Indexes after NMS.
    """
    # Process boxes in descending score order (the CUDA op assumes this).
    order = scores.sort(0, descending=True)[1]
    if pre_max_size is not None:
        order = order[:pre_max_size]
    sorted_boxes = boxes[order].contiguous()

    keep = torch.zeros(sorted_boxes.size(0), dtype=torch.long)
    num_out = iou3d_cuda.nms_gpu(sorted_boxes, keep, thresh,
                                 sorted_boxes.device.index)
    # Map kept positions back to original box indices.
    keep = order[keep[:num_out].cuda(sorted_boxes.device)].contiguous()
    if post_max_size is not None:
        keep = keep[:post_max_size]
    return keep
def nms_normal_gpu(boxes, scores, thresh):
    """Normal NMS function GPU implementation (for BEV boxes).

    The overlap of two boxes for IoU calculation is defined as the exact
    overlapping area of the two boxes WITH their yaw angle set to 0.

    Args:
        boxes (torch.Tensor): Input boxes with shape (N, 5).
        scores (torch.Tensor): Scores of predicted boxes with shape (N).
        thresh (float): Threshold of NMS.

    Returns:
        torch.Tensor: Remaining indices with scores in descending order.
    """
    # Process boxes in descending score order (the CUDA op assumes this).
    order = scores.sort(0, descending=True)[1]
    sorted_boxes = boxes[order].contiguous()

    keep = torch.zeros(sorted_boxes.size(0), dtype=torch.long)
    num_out = iou3d_cuda.nms_normal_gpu(sorted_boxes, keep, thresh,
                                        sorted_boxes.device.index)
    # Map kept positions back to original box indices.
    return order[keep[:num_out].cuda(sorted_boxes.device)].contiguous()
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp
/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <cstdint>
#include <vector>
// Input-validation helpers for tensors handed over from Python.
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)
// Ceiling division.
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// Wrap CUDA API calls; reports and (by default) exits on failure.
#define CHECK_ERROR(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
// Print a CUDA error with its source location; abort the process unless the
// caller passes abort=false.
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort) exit(code);
  }
}
// 64 boxes per NMS tile: one bit per box in an unsigned long long mask word.
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
// Kernel launchers implemented in iou3d_kernel.cu.
void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
                         const float *boxes_b, float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh);
int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                          at::Tensor ans_overlap) {
  // Exact pairwise overlap area of rotated BEV boxes.
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M) output
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_overlap);

  const int num_a = boxes_a.size(0);
  const int num_b = boxes_b.size(0);

  boxesoverlapLauncher(num_a, boxes_a.data_ptr<float>(), num_b,
                       boxes_b.data_ptr<float>(),
                       ans_overlap.data_ptr<float>());
  return 1;
}
int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                      at::Tensor ans_iou) {
  // Pairwise IoU of rotated BEV boxes.
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_iou: (N, M) output
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_iou);

  const int num_a = boxes_a.size(0);
  const int num_b = boxes_b.size(0);

  boxesioubevLauncher(num_a, boxes_a.data_ptr<float>(), num_b,
                      boxes_b.data_ptr<float>(), ans_iou.data_ptr<float>());
  return 1;
}
int nms_gpu(at::Tensor boxes, at::Tensor keep,
            float nms_overlap_thresh, int device_id) {
  // Rotated NMS on BEV boxes.
  // params boxes: (N, 5) [x1, y1, x2, y2, ry], CUDA, contiguous,
  //   expected pre-sorted by score (see the Python wrapper).
  // params keep: (N) int64 output buffer; the first `num_to_keep` entries
  //   receive the kept box indices.
  // returns: number of kept boxes.
  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);
  cudaSetDevice(device_id);

  const int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  int64_t *keep_data = keep.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
  // size_t byte count avoids int overflow for large N.
  const size_t mask_bytes = static_cast<size_t>(boxes_num) * col_blocks *
                            sizeof(unsigned long long);

  // Pairwise suppression bitmask computed on the GPU.
  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data, mask_bytes));
  nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);
  CHECK_ERROR(cudaMemcpy(mask_cpu.data(), mask_data, mask_bytes,
                         cudaMemcpyDeviceToHost));
  cudaFree(mask_data);

  // Greedy CPU scan in input (score) order: a box is kept when no
  // previously-kept box suppresses it; its mask row then marks everything
  // it suppresses. std::vector replaces the former raw new[]/delete[].
  std::vector<unsigned long long> remv_cpu(col_blocks, 0);
  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    const int nblock = i / THREADS_PER_BLOCK_NMS;
    const int inblock = i % THREADS_PER_BLOCK_NMS;
    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      const unsigned long long *p = mask_cpu.data() + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
  return num_to_keep;
}
int nms_normal_gpu(at::Tensor boxes, at::Tensor keep,
                   float nms_overlap_thresh, int device_id) {
  // Axis-aligned NMS on BEV boxes (yaw ignored by the kernel).
  // params boxes: (N, 5) [x1, y1, x2, y2, ry], CUDA, contiguous,
  //   expected pre-sorted by score (see the Python wrapper).
  // params keep: (N) int64 output buffer; the first `num_to_keep` entries
  //   receive the kept box indices.
  // returns: number of kept boxes.
  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);
  cudaSetDevice(device_id);

  const int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  int64_t *keep_data = keep.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
  // size_t byte count avoids int overflow for large N.
  const size_t mask_bytes = static_cast<size_t>(boxes_num) * col_blocks *
                            sizeof(unsigned long long);

  // Pairwise suppression bitmask computed on the GPU.
  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data, mask_bytes));
  nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);
  CHECK_ERROR(cudaMemcpy(mask_cpu.data(), mask_data, mask_bytes,
                         cudaMemcpyDeviceToHost));
  cudaFree(mask_data);

  // Greedy CPU scan in input (score) order, identical to nms_gpu.
  // std::vector replaces the former raw new[]/delete[].
  std::vector<unsigned long long> remv_cpu(col_blocks, 0);
  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    const int nblock = i / THREADS_PER_BLOCK_NMS;
    const int inblock = i % THREADS_PER_BLOCK_NMS;
    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      const unsigned long long *p = mask_cpu.data() + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
  return num_to_keep;
}
// Export the host wrappers to Python via the torch extension mechanism.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu,
        "oriented boxes overlap");
  m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
  m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
  m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
}
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu
/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#include <stdio.h>
#define THREADS_PER_BLOCK 16
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
//#define DEBUG
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
__device__ const float EPS = 1e-8;
// Minimal 2D point used by the BEV overlap geometry below.
struct Point {
  float x, y;
  __device__ Point() {}
  // NOTE(review): parameters are double but members are float — values are
  // narrowed on construction.
  __device__ Point(double _x, double _y) { x = _x, y = _y; }
  __device__ void set(float _x, float _y) {
    x = _x;
    y = _y;
  }
  __device__ Point operator+(const Point &b) const {
    return Point(x + b.x, y + b.y);
  }
  __device__ Point operator-(const Point &b) const {
    return Point(x - b.x, y - b.y);
  }
};
__device__ inline float cross(const Point &a, const Point &b) {
  // z-component of the 2D cross product a × b.
  const float z = a.x * b.y - a.y * b.x;
  return z;
}
__device__ inline float cross(const Point &p1, const Point &p2,
                              const Point &p0) {
  // Cross product of (p1 - p0) and (p2 - p0); the sign gives the turn
  // direction of p0 -> p1 -> p2.
  return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
}
__device__ int check_rect_cross(const Point &p1, const Point &p2,
                                const Point &q1, const Point &q2) {
  // Quick rejection: the axis-aligned bounding boxes of segments (p1, p2)
  // and (q1, q2) must overlap for the segments to possibly intersect.
  const bool overlap_x = min(p1.x, p2.x) <= max(q1.x, q2.x) &&
                         min(q1.x, q2.x) <= max(p1.x, p2.x);
  const bool overlap_y = min(p1.y, p2.y) <= max(q1.y, q2.y) &&
                         min(q1.y, q2.y) <= max(p1.y, p2.y);
  return overlap_x && overlap_y;
}
__device__ inline int check_in_box2d(const float *box, const Point &p) {
  // Test whether point p lies inside the rotated box, with a small MARGIN
  // of tolerance on every edge.
  // params: box (5) [x1, y1, x2, y2, angle]
  const float MARGIN = 1e-5;
  float center_x = (box[0] + box[2]) / 2;
  float center_y = (box[1] + box[3]) / 2;
  float angle_cos = cos(-box[4]),
        angle_sin =
            sin(-box[4]);  // rotate the point in the opposite direction of box
  // Un-rotate p into the box's axis-aligned frame, then do a plain AABB test.
  float rot_x =
      (p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x;
  float rot_y =
      (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y;
#ifdef DEBUG
  printf("box: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", box[0], box[1], box[2],
         box[3], box[4]);
  printf(
      "center: (%.3f, %.3f), cossin(%.3f, %.3f), src(%.3f, %.3f), rot(%.3f, "
      "%.3f)\n",
      center_x, center_y, angle_cos, angle_sin, p.x, p.y, rot_x, rot_y);
#endif
  return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN &&
          rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN);
}
// Segment-segment intersection: returns 1 and writes the intersection point
// to `ans` when segments (p0, p1) and (q0, q1) properly cross, 0 otherwise.
__device__ inline int intersection(const Point &p1, const Point &p0,
                                   const Point &q1, const Point &q0,
                                   Point &ans) {
  // fast exclusion: bounding boxes must overlap
  if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
  // check cross standing: each segment's endpoints must lie on opposite
  // sides of the other segment (signed areas have matching signs)
  float s1 = cross(q0, p1, p0);
  float s2 = cross(p1, q1, p0);
  float s3 = cross(p0, q1, q0);
  float s4 = cross(q1, p1, q0);
  if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
  // calculate intersection of two lines: interpolate along (q0, q1) by the
  // ratio of the signed areas s1 and s5
  float s5 = cross(q1, p1, p0);
  if (fabs(s5 - s1) > EPS) {
    ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
    ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
  } else {
    // Near-degenerate case: solve the two implicit line equations
    // a*x + b*y + c = 0 directly (Cramer's rule).
    float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
    float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
    float D = a0 * b1 - a1 * b0;
    ans.x = (b0 * c1 - b1 * c0) / D;
    ans.y = (a1 * c0 - a0 * c1) / D;
  }
  return 1;
}
__device__ inline void rotate_around_center(const Point &center,
                                            const float angle_cos,
                                            const float angle_sin, Point &p) {
  // Rotate p in place about `center` by the angle whose cos/sin are given.
  const float dx = p.x - center.x;
  const float dy = p.y - center.y;
  p.set(dx * angle_cos - dy * angle_sin + center.x,
        dx * angle_sin + dy * angle_cos + center.y);
}
// Compare two points by polar angle around `center`; used to order the
// intersection-polygon vertices in box_overlap's bubble sort.
__device__ inline int point_cmp(const Point &a, const Point &b,
                                const Point &center) {
  return atan2(a.y - center.y, a.x - center.x) >
         atan2(b.y - center.y, b.x - center.x);
}
// Exact overlap area of two rotated boxes via polygon clipping: gather all
// edge-edge intersection points plus each box's corners that lie inside the
// other, sort them by angle around their centroid, then integrate the
// polygon area with the shoelace formula.
__device__ inline float box_overlap(const float *box_a, const float *box_b) {
  // params: box_a (5) [x1, y1, x2, y2, angle]
  // params: box_b (5) [x1, y1, x2, y2, angle]
  float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3],
        a_angle = box_a[4];
  float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3],
        b_angle = box_b[4];

  Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2);
  Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2);
#ifdef DEBUG
  printf(
      "a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n",
      a_x1, a_y1, a_x2, a_y2, a_angle, b_x1, b_y1, b_x2, b_y2, b_angle);
  printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y,
         center_b.x, center_b.y);
#endif

  // Axis-aligned corners first; element [4] duplicates [0] below so the
  // edge loop can use corner[i] -> corner[i + 1].
  Point box_a_corners[5];
  box_a_corners[0].set(a_x1, a_y1);
  box_a_corners[1].set(a_x2, a_y1);
  box_a_corners[2].set(a_x2, a_y2);
  box_a_corners[3].set(a_x1, a_y2);

  Point box_b_corners[5];
  box_b_corners[0].set(b_x1, b_y1);
  box_b_corners[1].set(b_x2, b_y1);
  box_b_corners[2].set(b_x2, b_y2);
  box_b_corners[3].set(b_x1, b_y2);

  // get oriented corners: rotate each box's corners about its own center
  float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
  float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);

  for (int k = 0; k < 4; k++) {
#ifdef DEBUG
    printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k,
           box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x,
           box_b_corners[k].y);
#endif
    rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
    rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
#ifdef DEBUG
    printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x,
           box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
#endif
  }

  box_a_corners[4] = box_a_corners[0];
  box_b_corners[4] = box_b_corners[0];

  // get intersection of lines: every edge of a against every edge of b
  // (at most 16 intersection points)
  Point cross_points[16];
  Point poly_center;
  int cnt = 0, flag = 0;

  poly_center.set(0, 0);
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      flag = intersection(box_a_corners[i + 1], box_a_corners[i],
                          box_b_corners[j + 1], box_b_corners[j],
                          cross_points[cnt]);
      if (flag) {
        poly_center = poly_center + cross_points[cnt];
        cnt++;
      }
    }
  }

  // check corners: corners of one box contained in the other are also
  // vertices of the intersection polygon
  for (int k = 0; k < 4; k++) {
    if (check_in_box2d(box_a, box_b_corners[k])) {
      poly_center = poly_center + box_b_corners[k];
      cross_points[cnt] = box_b_corners[k];
      cnt++;
    }
    if (check_in_box2d(box_b, box_a_corners[k])) {
      poly_center = poly_center + box_a_corners[k];
      cross_points[cnt] = box_a_corners[k];
      cnt++;
    }
  }

  // NOTE(review): when cnt == 0 (disjoint boxes) these divisions produce
  // NaN, but poly_center is then unused and the area loop below does not
  // execute, so 0 is still returned.
  poly_center.x /= cnt;
  poly_center.y /= cnt;

  // sort the points of polygon by angle around the centroid (bubble sort)
  Point temp;
  for (int j = 0; j < cnt - 1; j++) {
    for (int i = 0; i < cnt - j - 1; i++) {
      if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
        temp = cross_points[i];
        cross_points[i] = cross_points[i + 1];
        cross_points[i + 1] = temp;
      }
    }
  }

#ifdef DEBUG
  printf("cnt=%d\n", cnt);
  for (int i = 0; i < cnt; i++) {
    printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x,
           cross_points[i].y);
  }
#endif

  // get the overlap areas: shoelace formula fanned out from vertex 0
  float area = 0;
  for (int k = 0; k < cnt - 1; k++) {
    area += cross(cross_points[k] - cross_points[0],
                  cross_points[k + 1] - cross_points[0]);
  }

  return fabs(area) / 2.0;
}
__device__ inline float iou_bev(const float *box_a, const float *box_b) {
  // IoU of two rotated BEV boxes, each (5): [x1, y1, x2, y2, angle].
  const float area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]);
  const float area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]);
  const float inter = box_overlap(box_a, box_b);
  // EPS floor guards against division by zero for degenerate boxes.
  return inter / fmaxf(area_a + area_b - inter, EPS);
}
__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a,
                                     const int num_b, const float *boxes_b,
                                     float *ans_overlap) {
  // One thread per (a, b) pair; boxes_a indexed along grid y, boxes_b
  // along grid x. ans_overlap is row-major (num_a, num_b).
  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
  if (a_idx >= num_a || b_idx >= num_b) {
    return;
  }

  ans_overlap[a_idx * num_b + b_idx] =
      box_overlap(boxes_a + a_idx * 5, boxes_b + b_idx * 5);
}
__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a,
                                     const int num_b, const float *boxes_b,
                                     float *ans_iou) {
  // One thread per (a, b) pair; boxes_a indexed along grid y, boxes_b
  // along grid x. ans_iou is row-major (num_a, num_b).
  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
  if (a_idx >= num_a || b_idx >= num_b) {
    return;
  }

  ans_iou[a_idx * num_b + b_idx] =
      iou_bev(boxes_a + a_idx * 5, boxes_b + b_idx * 5);
}
// Rotated-NMS suppression-mask kernel. Each block compares a 64-box row
// tile against a 64-box column tile; bit i of
// mask[box * col_blocks + col_start] is set when column box
// (col_start * 64 + i) overlaps `box` above the threshold.
__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
                           const float *boxes, unsigned long long *mask) {
  // params: boxes (N, 5) [x1, y1, x2, y2, ry]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  // Tile sizes, clipped at the grid tail.
  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);
  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);

  // Stage the column tile's boxes in shared memory (5 floats per box).
  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
    const float *cur_box = boxes + cur_box_idx * 5;

    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      // On the diagonal tile, only compare against later boxes to skip
      // self-comparison.
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
    mask[cur_box_idx * col_blocks + col_start] = t;
  }
}
__device__ inline float iou_normal(float const *const a, float const *const b) {
  // Axis-aligned IoU (yaw ignored); boxes are [x1, y1, x2, y2, ...].
  const float inter_w = fmaxf(fminf(a[2], b[2]) - fmaxf(a[0], b[0]), 0.f);
  const float inter_h = fmaxf(fminf(a[3], b[3]) - fmaxf(a[1], b[1]), 0.f);
  const float inter = inter_w * inter_h;
  const float area_a = (a[2] - a[0]) * (a[3] - a[1]);
  const float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  // EPS floor guards against division by zero for degenerate boxes.
  return inter / fmaxf(area_a + area_b - inter, EPS);
}
// Axis-aligned NMS suppression-mask kernel; identical tiling to nms_kernel
// but overlap is computed with iou_normal (yaw ignored).
__global__ void nms_normal_kernel(const int boxes_num,
                                  const float nms_overlap_thresh,
                                  const float *boxes,
                                  unsigned long long *mask) {
  // params: boxes (N, 5) [x1, y1, x2, y2, ry]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  // Tile sizes, clipped at the grid tail.
  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);
  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);

  // Stage the column tile's boxes in shared memory (5 floats per box).
  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
    const float *cur_box = boxes + cur_box_idx * 5;

    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      // On the diagonal tile, only compare against later boxes to skip
      // self-comparison.
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
    mask[cur_box_idx * col_blocks + col_start] = t;
  }
}
// Launches boxes_overlap_kernel to fill ans_overlap[num_a * num_b] with the
// pairwise BEV overlap areas of boxes_a against boxes_b.
// One thread per (a, b) pair: grid x tiles boxes_b (cols), grid y tiles
// boxes_a (rows). Synchronous w.r.t. error reporting only; runs on the
// default stream.
void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap) {
  dim3 blocks(
      DIVUP(num_b, THREADS_PER_BLOCK),
      DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
  boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
                                            ans_overlap);
#ifdef DEBUG
  cudaDeviceSynchronize();  // for using printf in kernel function
#endif
  // Fail fast on launch-configuration errors instead of letting them
  // surface silently at a later synchronizing call.
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
// Launches boxes_iou_bev_kernel to fill ans_iou[num_a * num_b] with pairwise
// BEV IoU values of boxes_a against boxes_b.
// One thread per (a, b) pair: grid x tiles boxes_b (cols), grid y tiles
// boxes_a (rows).
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
                         const float *boxes_b, float *ans_iou) {
  dim3 blocks(
      DIVUP(num_b, THREADS_PER_BLOCK),
      DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
  boxes_iou_bev_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
                                            ans_iou);
  // Fail fast on launch-configuration errors (previously unchecked).
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
// Launches nms_kernel over every (row tile, col tile) pair of the boxes_num
// boxes. For each box i, mask[i * col_blocks + c] receives the bitset of
// boxes in col tile c whose overlap with i exceeds nms_overlap_thresh.
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh) {
  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);
  nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
  // Fail fast on launch-configuration errors (previously unchecked).
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
// Same tiling scheme as nmsLauncher but dispatches nms_normal_kernel, which
// uses plain axis-aligned IoU instead of rotated-box IoU.
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh) {
  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);
  nms_normal_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes,
                                         mask);
  // Fail fast on launch-configuration errors (previously unchecked).
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
# Copyright (c) OpenMMLab. All rights reserved.
from .knn import knn
__all__ = ['knn']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch.autograd import Function
from . import knn_ext
class KNN(Function):
    r"""KNN (CUDA) based on heap data structure.

    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
    scene_seg/lib/pointops/src/knnquery_heap>`_.

    Find k-nearest points.
    """

    @staticmethod
    def forward(ctx,
                k: int,
                xyz: torch.Tensor,
                center_xyz: torch.Tensor = None,
                transposed: bool = False) -> torch.Tensor:
        """Forward.

        Args:
            k (int): number of nearest neighbors.
            xyz (Tensor): (B, N, 3) if transposed == False, else (B, 3, N).
                xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) if transposed == False,
                else (B, 3, npoint). centers of the knn query. Defaults to
                ``xyz`` itself (self-query) when not given.
            transposed (bool): whether the input tensors are transposed.
                defaults to False. Should not explicitly use this keyword
                when calling knn (=KNN.apply), just add the fourth param.

        Returns:
            Tensor: (B, k, npoint) tensor with the indices of
                the features that form k-nearest neighbours.
        """
        assert k > 0

        # Self-query: search the point cloud against itself when no
        # explicit centers are supplied.
        if center_xyz is None:
            center_xyz = xyz

        if transposed:
            # Bring both tensors to channel-last (B, N, 3) layout.
            xyz = xyz.transpose(2, 1).contiguous()
            center_xyz = center_xyz.transpose(2, 1).contiguous()

        # The CUDA kernel indexes raw memory, so both tensors must be
        # contiguous and on the same GPU.
        assert xyz.is_contiguous()  # [B, N, 3]
        assert center_xyz.is_contiguous()  # [B, npoint, 3]

        device_idx = center_xyz.get_device()
        assert device_idx == xyz.get_device(), \
            'center_xyz and xyz should be put on the same device'
        if torch.cuda.current_device() != device_idx:
            torch.cuda.set_device(device_idx)

        batch, num_query, _ = center_xyz.shape
        num_points = xyz.shape[1]

        # Output buffers, filled in-place by the extension.
        idx = center_xyz.new_zeros((batch, num_query, k)).int()
        dist2 = center_xyz.new_zeros((batch, num_query, k)).float()

        knn_ext.knn_wrapper(batch, num_points, num_query, k, xyz, center_xyz,
                            idx, dist2)
        # Return indices with shape (B, k, npoint).
        idx = idx.transpose(2, 1).contiguous()
        ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        # Neighbour indices are not differentiable; propagate no gradients.
        return None, None, None


knn = KNN.apply
// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
// Host-side binding for the heap-based kNN CUDA kernel: input-validation
// macros, the launcher declaration and the pybind entry point.
#include <torch/serialize/tensor.h>
#include <torch/extension.h>

#include <vector>
// NOTE(review): THC has been removed from recent PyTorch releases, and
// `state` below is never used anywhere in this file — both look like dead
// code that could be dropped in a follow-up.
#include <THC/THC.h>
#include <ATen/cuda/CUDAContext.h>

extern THCState *state;

// Sanity-check macros: tensors handed to the CUDA launcher must live on the
// GPU and be contiguous so raw-pointer indexing below is valid.
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)

// Implemented in the companion .cu file. For each of the m query points in
// new_xyz (b, m, 3), finds its nsample nearest points among xyz (b, n, 3)
// and writes idx (b, m, nsample) and squared distances dist2 on `stream`.
void knn_kernel_launcher(
    int b,
    int n,
    int m,
    int nsample,
    const float *xyz,
    const float *new_xyz,
    int *idx,
    float *dist2,
    cudaStream_t stream
);
// Validates the input tensors, unpacks raw device pointers and forwards to
// the CUDA launcher on PyTorch's current stream.
// b: batch size, n: #reference points, m: #query points, nsample: k.
void knn_wrapper(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
{
    CHECK_INPUT(new_xyz_tensor);
    CHECK_INPUT(xyz_tensor);

    knn_kernel_launcher(b, n, m, nsample,
                        xyz_tensor.data_ptr<float>(),
                        new_xyz_tensor.data_ptr<float>(),
                        idx_tensor.data_ptr<int>(),
                        dist2_tensor.data_ptr<float>(),
                        at::cuda::getCurrentCUDAStream());
}
// Expose the host wrapper to Python as `knn_ext.knn_wrapper`.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("knn_wrapper", &knn_wrapper, "knn_wrapper");
}
// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include <cmath>
#include <cstdio>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
// Exchange the float values pointed to by x and y.
__device__ void swap_float(float *x, float *y)
{
    const float held = *x;
    *x = *y;
    *y = held;
}
// Exchange the int values pointed to by x and y.
__device__ void swap_int(int *x, int *y)
{
    const int held = *x;
    *x = *y;
    *y = held;
}
// Restore the max-heap property over dist[0..k) after the root has been
// replaced: sift the new root value down, always swapping with the larger
// child, and mirror every swap in idx so distances and indices stay paired.
__device__ void reheap(float *dist, int *idx, int k)
{
    int root = 0;
    int child = root * 2 + 1;
    while (child < k)
    {
        // Pick the larger of the two children.
        if(child + 1 < k && dist[child+1] > dist[child])
            child++;
        // Heap property holds from here down; nothing left to fix.
        if(dist[root] > dist[child])
            return;
        swap_float(&dist[root], &dist[child]);
        swap_int(&idx[root], &idx[child]);
        root = child;
        child = root * 2 + 1;
    }
}
// In-place heapsort of the first k entries (ascending by dist): repeatedly
// move the current maximum (heap root) to the tail, then restore the heap
// property on the shrunken prefix. idx is permuted in lock-step with dist.
__device__ void heap_sort(float *dist, int *idx, int k)
{
    for (int last = k - 1; last > 0; last--)
    {
        swap_float(&dist[0], &dist[last]);
        swap_int(&idx[0], &idx[last]);
        reheap(dist, idx, last);
    }
}
// input: xyz (b, n, 3) new_xyz (b, m, 3)
// output: idx (b, m, nsample) dist2 (b, m, nsample)
// One thread per query point of new_xyz: it scans all n reference points,
// maintaining a max-heap of the nsample best squared distances in
// thread-local arrays, then heap-sorts the result ascending before writing
// it out.
__global__ void knn_kernel(int b, int n, int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, int *__restrict__ idx, float *__restrict__ dist2) {
    // Capacity of the fixed-size local candidate arrays below.
    constexpr int kMaxNSample = 100;

    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= m) return;
    // Bug fix: requesting more neighbours than the local-array capacity
    // previously overran best_dist/best_idx (undefined behaviour / stack
    // corruption). Bail out instead of writing out of bounds; the outputs
    // for such a call simply stay untouched.
    if (nsample > kMaxNSample) return;

    new_xyz += bs_idx * m * 3 + pt_idx * 3;
    xyz += bs_idx * n * 3;
    idx += bs_idx * m * nsample + pt_idx * nsample;
    dist2 += bs_idx * m * nsample + pt_idx * nsample;

    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];

    // Max-heap of current candidates: best_dist[0] holds the worst kept
    // distance, so a new point only enters when it beats the root.
    float best_dist[kMaxNSample];
    int best_idx[kMaxNSample];
    for (int i = 0; i < nsample; i++) {
        best_dist[i] = 1e10f;  // float literal: avoid a double round-trip
        best_idx[i] = 0;
    }
    for (int i = 0; i < n; i++) {
        float x = xyz[i * 3 + 0];
        float y = xyz[i * 3 + 1];
        float z = xyz[i * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
                   (new_z - z) * (new_z - z);
        if (d2 < best_dist[0]) {
            best_dist[0] = d2;
            best_idx[0] = i;
            reheap(best_dist, best_idx, nsample);
        }
    }
    // Emit candidates in ascending distance order.
    heap_sort(best_dist, best_idx, nsample);
    for (int i = 0; i < nsample; i++) {
        idx[i] = best_idx[i];
        dist2[i] = best_dist[i];
    }
}
// Launches knn_kernel on `stream`, one thread per query point:
// grid x tiles the m queries, grid y covers the batch dimension.
// param new_xyz: (B, m, 3)
// param xyz: (B, n, 3)
// param idx: (B, m, nsample)
void knn_kernel_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream) {
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    knn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, nsample, xyz, new_xyz, idx, dist2);
    // cudaDeviceSynchronize(); // for using printf in kernel function

    // Report launch-configuration errors immediately.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
# Copyright (c) OpenMMLab. All rights reserved.
from .assign_score import assign_score_withk
from .paconv import PAConv, PAConvCUDA

# Single export list: a duplicated ``__all__`` assignment previously
# overrode this one and silently dropped ``assign_score_withk`` even though
# it is imported above.
__all__ = ['assign_score_withk', 'PAConv', 'PAConvCUDA']
# Copyright (c) OpenMMLab. All rights reserved.
from torch.autograd import Function
from . import assign_score_withk_ext
class AssignScoreWithK(Function):
    r"""Perform weighted sum to generate output features according to scores.

    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
    scene_seg/lib/paconv_lib/src/gpu>`_.

    This is a memory-efficient CUDA implementation of assign_scores
    operation, which first transform all point feature with weight bank,
    then assemble neighbor features with `knn_idx` and perform weighted sum
    of `scores`.

    See the `paper <https://arxiv.org/pdf/2103.14635.pdf>`_ appendix Sec. D
    for more detailed descriptions.

    Note:
        This implementation assumes using ``neighbor`` kernel input, which is
        (point_features - center_features, point_features).
        See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/
        pointnet2/paconv.py#L128 for more details.
    """

    @staticmethod
    def forward(ctx,
                scores,
                point_features,
                center_features,
                knn_idx,
                aggregate='sum'):
        """Forward.

        Args:
            scores (torch.Tensor): (B, npoint, K, M), predicted scores to
                aggregate weight matrices in the weight bank.
                ``npoint`` is the number of sampled centers.
                ``K`` is the number of queried neighbors.
                ``M`` is the number of weight matrices in the weight bank.
            point_features (torch.Tensor): (B, N, M, out_dim)
                Pre-computed point features to be aggregated.
            center_features (torch.Tensor): (B, N, M, out_dim)
                Pre-computed center features to be aggregated.
            knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
                We assume the first idx in each row is the idx of the center.
            aggregate (str, optional): Aggregation method.
                Can be 'sum', 'avg' or 'max'. Defaults to 'sum'.

        Returns:
            torch.Tensor: (B, out_dim, npoint, K), the aggregated features.
        """
        # Map the aggregation name onto the integer code the CUDA
        # extension expects.
        agg_codes = {'sum': 0, 'avg': 1, 'max': 2}
        agg_code = agg_codes[aggregate]

        batch, num_all, num_weights, out_dim = point_features.size()
        num_center, num_neighbor = scores.size(1), scores.size(2)

        # The extension accumulates into this zero-initialised buffer.
        output = point_features.new_zeros(
            (batch, out_dim, num_center, num_neighbor))
        assign_score_withk_ext.assign_score_withk_forward_wrapper(
            batch, num_all, num_center, num_weights, num_neighbor, out_dim,
            agg_code, point_features.contiguous(),
            center_features.contiguous(), scores.contiguous(),
            knn_idx.contiguous(), output)

        ctx.save_for_backward(output, point_features, center_features,
                              scores, knn_idx)
        ctx.agg = agg_code
        return output

    @staticmethod
    def backward(ctx, grad_out):
        """Backward.

        Args:
            grad_out (torch.Tensor): (B, out_dim, npoint, K)

        Returns:
            grad_scores (torch.Tensor): (B, npoint, K, M)
            grad_point_features (torch.Tensor): (B, N, M, out_dim)
            grad_center_features (torch.Tensor): (B, N, M, out_dim)
        """
        _, point_features, center_features, scores, knn_idx = ctx.saved_tensors

        batch, num_all, num_weights, out_dim = point_features.size()
        num_center, num_neighbor = scores.size(1), scores.size(2)

        # Zero-initialised gradient buffers, accumulated by the extension.
        grad_point_features = point_features.new_zeros(point_features.shape)
        grad_center_features = center_features.new_zeros(center_features.shape)
        grad_scores = scores.new_zeros(scores.shape)

        assign_score_withk_ext.assign_score_withk_backward_wrapper(
            batch, num_all, num_center, num_weights, num_neighbor, out_dim,
            ctx.agg, grad_out.contiguous(), point_features.contiguous(),
            center_features.contiguous(), scores.contiguous(),
            knn_idx.contiguous(), grad_point_features, grad_center_features,
            grad_scores)
        # No gradients for knn_idx (integer indices) or the aggregate flag.
        return grad_scores, grad_point_features, \
            grad_center_features, None, None


assign_score_withk = AssignScoreWithK.apply
......@@ -4,10 +4,10 @@ import copy
import torch
from mmcv.cnn import (ConvModule, build_activation_layer, build_norm_layer,
constant_init)
from mmcv.ops import assign_score_withk as assign_score_cuda
from torch import nn as nn
from torch.nn import functional as F
from .assign_score import assign_score_withk as assign_score_cuda
from .utils import assign_kernel_withoutk, assign_score, calc_euclidian_dist
......
// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
// Pybind bindings for the memory-efficient assign_score_withk CUDA op.
#include <torch/torch.h>
#include <torch/extension.h>

// Implemented in the companion .cu file. Shapes (see kernel comments there):
// points/centers (B, N0, M, O), scores (B, N1, K, M), knn_idx (B, N1, K),
// output (B, O, N1, K). `output` is accumulated into, so the caller must
// zero-initialise it.
void assign_score_withk_forward_wrapper(
  int B, int N0, int N1, int M,
  int K, int O, int aggregate,
  const at::Tensor& points,
  const at::Tensor& centers,
  const at::Tensor& scores,
  const at::Tensor& knn_idx,
  at::Tensor& output
  );

// Backward counterpart: accumulates into the three zero-initialised grad
// tensors given grad_out (B, O, N1, K).
void assign_score_withk_backward_wrapper(
  int B, int N0, int N1, int M,
  int K, int O, int aggregate,
  const at::Tensor& grad_out,
  const at::Tensor& points,
  const at::Tensor& centers,
  const at::Tensor& scores,
  const at::Tensor& knn_idx,
  at::Tensor& grad_points,
  at::Tensor& grad_centers,
  at::Tensor& grad_scores
  );

// Expose both wrappers to Python (module name is set by the build system).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("assign_score_withk_forward_wrapper",
        &assign_score_withk_forward_wrapper,
        "Assign score kernel forward (GPU), save memory version");
  m.def("assign_score_withk_backward_wrapper",
        &assign_score_withk_backward_wrapper,
        "Assign score kernel backward (GPU), save memory version");
}
// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cmath>
#include <cstdint>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CONTIGUOUS(x) \
do { \
AT_ASSERT(x.is_contiguous(), #x " must be a contiguous tensor"); \
} while (0)
#define CUDA_CHECK_ERRORS() \
do { \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \
cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
__FILE__); \
exit(-1); \
} \
} while (0)
// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K)
// output: fout(B,O,N)
// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) = s(b,i,k,m)*p(b,i(k),m,j)
//       i(k) = idx(b,i,k)
//      sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j)
//      avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k
//      max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j)))
// One thread per flattened (b, o, n, k) output element; the M weight
// matrices are reduced in the serial loop below. The output buffer must be
// zero-initialised by the caller since accumulation is done with atomicAdd.
// NOTE(review): `aggregate` is accepted but never branched on here — the
// accumulation below is always a sum, so only the 'sum' mode is actually
// implemented in this kernel.
__global__ void assign_score_withk_forward_kernel(const int B, const int N0, const int N1,
                                                  const int M, const int K, const int O, const int aggregate,
                                                  const float* points,
                                                  const float* centers,
                                                  const float* scores,
                                                  const int64_t* knn_idx,
                                                  float* output) {
    // ----- parallel loop for B, N1, K and O ---------
    long i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= B*N1*K*O) return;
    // ------- loop for M ----------
    for (int m = 0; m < M; m++) {
        // Decompose the flat index into (b, o, n, k).
        int b = (int)(i / (O * N1 * K));
        int o = (int)(i % (O * N1 * K) / (N1 * K));
        int n = (int)(i % (N1 * K) / K);
        int k = (int)(i % K);
        int cn = (int) knn_idx[b*K*N1 + n*K + 0]; //The first neighbor is the center point
        int kn = (int) knn_idx[b*K*N1 + n*K + k];
        if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range
            continue;
        }
        assert (b < B);
        assert (kn < N0);
        assert (cn < N0);
        assert (o < O);
        assert (n < N1);
        // Accumulate score-weighted (neighbor - center) feature; atomicAdd
        // because multiple m iterations target the same output element.
        atomicAdd(output + b*N1*O*K + o*N1*K + n*K + k,
                  points[b*N0*M*O + kn*M*O + m*O + o] * scores[b*N1*K*M + n*K*M + k*M + m]
                  - centers[b*N0*M*O + cn*M*O + m*O + o] * scores[b*N1*K*M + n*K*M + k*M + m]);
    }
}
// Gradient of the forward pass w.r.t. points and centers.
// One thread per flattened (b, m, o) weight-bank slot; each thread walks all
// (n, k) neighbor pairs and scatters score-weighted grad_out contributions
// into grad_points (positive term) and grad_centers (negative term, since
// the forward computed points - centers). Both grad buffers must be
// zero-initialised by the caller; accumulation uses atomicAdd because many
// (n, k) pairs can map to the same point index kn/cn.
// NOTE(review): as in the forward kernel, `aggregate` is unused here.
__global__ void assign_score_withk_backward_points_kernel(const int B, const int N0, const int N, const int M,
                                                          const int K, const int O, const int aggregate,
                                                          const float* grad_out,
                                                          const float* scores,
                                                          const int64_t* knn_idx,
                                                          float* grad_points,
                                                          float* grad_centers) {
    // ----- parallel loop for B, M, O ---------
    long i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= B*M*O) return;
    // Decompose the flat index into (b, m, o).
    int b = (int)(i / (M * O));
    int m = (int)(i % (M * O) / O);
    int o = (int)(i % O);

    // ----- loop for N,K ---------
    for (int n = 0; n < N; n++) {
        for (int k = 0; k < K; k++) {
            int kn = knn_idx[b*N*K + n*K + k];
            int cn = knn_idx[b*N*K + n*K + 0];  // first neighbor is the center
            if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range
                continue;
            }
            atomicAdd(grad_points + b*N0*M*O + kn*M*O + m*O + o,
                      scores[b*N*K*M + n*K*M + k*M + m] * grad_out[b*O*N*K + o*N*K + n*K + k]);
            atomicAdd(grad_centers + b*N0*M*O + cn*M*O + m*O + o,
                      - scores[b*N*K*M + n*K*M + k*M + m] * grad_out[b*O*N*K + o*N*K + n*K + k]);
        }
    }
}
// Gradient of the forward pass w.r.t. scores.
// One thread per flattened (b, n, k, m) score element; it reduces over the
// O feature channels, accumulating (points[kn] - centers[cn]) * grad_out
// into grad_scores. grad_scores must be zero-initialised by the caller.
// NOTE(review): `aggregate` is unused here as well.
__global__ void assign_score_withk_backward_scores_kernel(const int B, const int N0, const int N, const int M,
                                                          const int K, const int O, const int aggregate,
                                                          const float* grad_out,
                                                          const float* points,
                                                          const float* centers,
                                                          const int64_t* knn_idx,
                                                          float* grad_scores) {
    // ----- parallel loop for B, N, K, M ---------
    long i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= B*N*K*M) return;
    // Decompose the flat index into (b, n, k, m).
    int b = (int)(i / (N * M * K));
    int n = (int)(i % (N * M * K) / M / K);
    int k = (int)(i % (M * K) / M);
    int m = (int)(i % M);

    int cn = knn_idx[b*N*K + n*K + 0];  // first neighbor is the center
    int kn = knn_idx[b*N*K + n*K + k];
    if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range
        return;
    }

    // -------------- loop for O ------------------------
    for(int o = 0; o < O; o++) {
        atomicAdd(grad_scores + b*N*K*M + n*K*M + k*M + m,
                  (points[b*N0*M*O + kn*M*O + m*O + o]
                   - centers[b*N0*M*O + cn*M*O + m*O + o])* grad_out[b*O*N*K + o*N*K + n*K + k]);
    }
}
// Host wrapper for assign_score_withk_forward_kernel.
// points/centers: (B, N0, M, O); scores: (B, N1, K, M); knn_idx: (B, N1, K);
// output: (B, O, N1, K), accumulated in place — the caller must
// zero-initialise it (the Python side uses new_zeros).
void assign_score_withk_forward_wrapper(int B, int N0, int N1, int M, int K, int O, int aggregate,
                                        const at::Tensor& points,
                                        const at::Tensor& centers,
                                        const at::Tensor& scores,
                                        const at::Tensor& knn_idx,
                                        at::Tensor& output) {
    CHECK_CONTIGUOUS(points);
    CHECK_CONTIGUOUS(centers);
    CHECK_CONTIGUOUS(scores);
    CHECK_CONTIGUOUS(knn_idx);
    CHECK_CONTIGUOUS(output);

    const float* points_data = points.data_ptr<float>();
    const float* centers_data = centers.data_ptr<float>();
    const float* scores_data = scores.data_ptr<float>();
    const int64_t* knn_idx_data = knn_idx.data_ptr<int64_t>();
    float* output_data = output.data_ptr<float>();

    // Bug fix: launch on PyTorch's current CUDA stream. Previously the
    // kernel went to the legacy default stream, which breaks ordering with
    // surrounding ATen ops when a non-default stream is active.
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    dim3 blocks(DIVUP(B*O*N1*K, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    assign_score_withk_forward_kernel<<<blocks, threads, 0, stream>>>(
        B, N0, N1, M, K, O, aggregate, points_data, centers_data, scores_data, knn_idx_data, output_data);

    CUDA_CHECK_ERRORS();
}
// Host wrapper for the two backward kernels. Gradients w.r.t. points/centers
// and w.r.t. scores are computed by separate kernels; all three grad tensors
// must be zero-initialised by the caller since the kernels accumulate with
// atomicAdd (the Python side uses new_zeros).
void assign_score_withk_backward_wrapper(int B, int N0, int N1, int M, int K, int O, int aggregate,
                                         const at::Tensor& grad_out,
                                         const at::Tensor& points,
                                         const at::Tensor& centers,
                                         const at::Tensor& scores,
                                         const at::Tensor& knn_idx,
                                         at::Tensor& grad_points,
                                         at::Tensor& grad_centers,
                                         at::Tensor& grad_scores) {
    CHECK_CONTIGUOUS(grad_out);
    CHECK_CONTIGUOUS(scores);
    CHECK_CONTIGUOUS(points);
    CHECK_CONTIGUOUS(centers);
    CHECK_CONTIGUOUS(knn_idx);
    CHECK_CONTIGUOUS(grad_scores);
    CHECK_CONTIGUOUS(grad_points);
    CHECK_CONTIGUOUS(grad_centers);

    const float* grad_out_data = grad_out.data_ptr<float>();
    const float* points_data = points.data_ptr<float>();
    const float* centers_data = centers.data_ptr<float>();
    const float* scores_data = scores.data_ptr<float>();
    const int64_t* knn_idx_data = knn_idx.data_ptr<int64_t>();
    float* grad_points_data = grad_points.data_ptr<float>();
    float* grad_centers_data = grad_centers.data_ptr<float>();
    float* grad_scores_data = grad_scores.data_ptr<float>();

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    dim3 blocks1(DIVUP(B*M*O, THREADS_PER_BLOCK));
    dim3 threads1(THREADS_PER_BLOCK);
    dim3 blocks2(DIVUP(B*N1*K*M, THREADS_PER_BLOCK));
    dim3 threads2(THREADS_PER_BLOCK);

    // Bug fix: `stream` was previously computed but never passed, so both
    // kernels ran on the legacy default stream. Pass it so the launches are
    // ordered on PyTorch's current stream.
    assign_score_withk_backward_points_kernel<<<blocks1, threads1, 0, stream>>>(
        B, N0, N1, M, K, O, aggregate, grad_out_data, scores_data, knn_idx_data, grad_points_data, grad_centers_data);
    assign_score_withk_backward_scores_kernel<<<blocks2, threads2, 0, stream>>>(
        B, N0, N1, M, K, O, aggregate, grad_out_data, points_data, centers_data, knn_idx_data, grad_scores_data);

    CUDA_CHECK_ERRORS();
}
......@@ -3,11 +3,10 @@ from typing import List
import torch
from mmcv.cnn import ConvModule
from mmcv.ops import three_interpolate, three_nn
from mmcv.runner import BaseModule, force_fp32
from torch import nn as nn
from mmdet3d.ops import three_interpolate, three_nn
class PointFPModule(BaseModule):
"""Point feature propagation module used in PointNets.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment