OpenDAS / mmdetection3d · Commits · 333536f6

Unverified commit 333536f6, authored Apr 06, 2022 by Wenwei Zhang, committed by GitHub on Apr 06, 2022.

Release v1.0.0rc1

Parents: 9c7270d0, f747daab

The commit changes 219 files in total; this page shows 20 changed files, with 0 additions and 1856 deletions (+0, −1856). Every file below was deleted (file mode 100644 → 0) relative to parent 9c7270d0:

    mmdet3d/ops/gather_points/__init__.py                    +0  −4
    mmdet3d/ops/gather_points/gather_points.py               +0  −53
    mmdet3d/ops/gather_points/src/gather_points.cpp          +0  −56
    mmdet3d/ops/gather_points/src/gather_points_cuda.cu      +0  −124
    mmdet3d/ops/group_points/__init__.py                     +0  −4
    mmdet3d/ops/group_points/group_points.py                 +0  −236
    mmdet3d/ops/group_points/src/group_points.cpp            +0  −62
    mmdet3d/ops/group_points/src/group_points_cuda.cu        +0  −101
    mmdet3d/ops/interpolate/__init__.py                      +0  −5
    mmdet3d/ops/interpolate/src/interpolate.cpp              +0  −93
    mmdet3d/ops/interpolate/src/three_interpolate_cuda.cu    +0  −108
    mmdet3d/ops/interpolate/src/three_nn_cuda.cu             +0  −89
    mmdet3d/ops/interpolate/three_interpolate.py             +0  −65
    mmdet3d/ops/interpolate/three_nn.py                      +0  −47
    mmdet3d/ops/iou3d/__init__.py                            +0  −4
    mmdet3d/ops/iou3d/iou3d_utils.py                         +0  −79
    mmdet3d/ops/iou3d/src/iou3d.cpp                          +0  −210
    mmdet3d/ops/iou3d/src/iou3d_kernel.cu                    +0  −439
    mmdet3d/ops/knn/__init__.py                              +0  −4
    mmdet3d/ops/knn/knn.py                                   +0  −73
mmdet3d/ops/gather_points/__init__.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .gather_points import gather_points

__all__ = ['gather_points']
```
mmdet3d/ops/gather_points/gather_points.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch.autograd import Function

from . import gather_points_ext


class GatherPoints(Function):
    """Gather Points.

    Gather points with given index.
    """

    @staticmethod
    def forward(ctx, features: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """forward.

        Args:
            features (Tensor): (B, C, N) features to gather.
            indices (Tensor): (B, M) where M is the number of points.

        Returns:
            Tensor: (B, C, M) where M is the number of points.
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()

        B, npoint = indices.size()
        _, C, N = features.size()
        output = features.new_zeros((B, C, npoint))

        gather_points_ext.gather_points_wrapper(B, C, N, npoint, features,
                                                indices, output)

        ctx.for_backwards = (indices, C, N)
        ctx.mark_non_differentiable(indices)
        return output

    @staticmethod
    def backward(ctx, grad_out):
        idx, C, N = ctx.for_backwards
        B, npoint = idx.size()

        grad_features = grad_out.new_zeros((B, C, N))
        grad_out_data = grad_out.data.contiguous()
        gather_points_ext.gather_points_grad_wrapper(B, C, N, npoint,
                                                     grad_out_data, idx,
                                                     grad_features.data)
        return grad_features, None


gather_points = GatherPoints.apply
```
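For context, a minimal usage sketch of the `gather_points` autograd function above. The tensors are made-up example data, and the import path is the pre-1.0.0rc1 layout that this commit removes:

```python
import torch
from mmdet3d.ops.gather_points import gather_points

# (B, C, N) feature map and (B, M) int32 indices; both must be CUDA
# tensors and contiguous, as the forward() asserts require.
features = torch.randn(2, 16, 1024, device='cuda', requires_grad=True)
indices = torch.randint(0, 1024, (2, 128), device='cuda', dtype=torch.int32)

gathered = gather_points(features, indices)  # (2, 16, 128)
gathered.sum().backward()                    # gradients flow back into `features`
print(gathered.shape, features.grad.shape)
```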
mmdet3d/ops/gather_points/src/gather_points.cpp (deleted)

```cpp
#include <ATen/cuda/CUDAContext.h>
#include <ATen/TensorUtils.h>
#include <THC/THC.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#include <vector>

extern THCState *state;

int gather_points_wrapper(int b, int c, int n, int npoints,
                          at::Tensor &points_tensor, at::Tensor &idx_tensor,
                          at::Tensor &out_tensor);

void gather_points_kernel_launcher(int b, int c, int n, int npoints,
                                   const at::Tensor &points_tensor,
                                   const at::Tensor &idx_tensor,
                                   at::Tensor &out_tensor);

int gather_points_grad_wrapper(int b, int c, int n, int npoints,
                               at::Tensor &grad_out_tensor,
                               at::Tensor &idx_tensor,
                               at::Tensor &grad_points_tensor);

void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                        const at::Tensor &grad_out_tensor,
                                        const at::Tensor &idx_tensor,
                                        at::Tensor &grad_points_tensor);

int gather_points_wrapper(int b, int c, int n, int npoints,
                          at::Tensor &points_tensor, at::Tensor &idx_tensor,
                          at::Tensor &out_tensor) {
  gather_points_kernel_launcher(b, c, n, npoints, points_tensor, idx_tensor,
                                out_tensor);
  return 1;
}

int gather_points_grad_wrapper(int b, int c, int n, int npoints,
                               at::Tensor &grad_out_tensor,
                               at::Tensor &idx_tensor,
                               at::Tensor &grad_points_tensor) {
  gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out_tensor,
                                     idx_tensor, grad_points_tensor);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("gather_points_wrapper", &gather_points_wrapper,
        "gather_points_wrapper");
  m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper,
        "gather_points_grad_wrapper");
}
```
mmdet3d/ops/gather_points/src/gather_points_cuda.cu (deleted)

```cuda
#include <stdio.h>
#include <stdlib.h>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/types.h>

#include <ATen/cuda/CUDAApplyUtils.cuh>

#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

template <typename scalar_t>
__global__ void gather_points_kernel(int b, int c, int n, int m,
                                     const scalar_t *__restrict__ points,
                                     const int *__restrict__ idx,
                                     scalar_t *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, M)
  // output:
  //      out: (B, C, M)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  out += bs_idx * c * m + c_idx * m + pt_idx;
  idx += bs_idx * m + pt_idx;
  points += bs_idx * c * n + c_idx * n;
  out[0] = points[idx[0]];
}

void gather_points_kernel_launcher(int b, int c, int n, int npoints,
                                   const at::Tensor &points_tensor,
                                   const at::Tensor &idx_tensor,
                                   at::Tensor &out_tensor) {
  // points: (B, C, N)
  // idx: (B, npoints)
  // output:
  //      out: (B, C, npoints)

  cudaError_t err;
  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      out_tensor.scalar_type(), "gather_points_kernel", [&] {
        const scalar_t *points = points_tensor.data_ptr<scalar_t>();
        const int *idx = idx_tensor.data_ptr<int>();
        scalar_t *out = out_tensor.data_ptr<scalar_t>();
        gather_points_kernel<<<blocks, threads, 0, stream>>>(
            b, c, n, npoints, points, idx, out);
      });

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

template <typename scalar_t>
__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
                                          const scalar_t *__restrict__ grad_out,
                                          const int *__restrict__ idx,
                                          scalar_t *__restrict__ grad_points) {
  // grad_out: (B, C, M)
  // idx: (B, M)
  // output:
  //      grad_points: (B, C, N)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  grad_out += bs_idx * c * m + c_idx * m + pt_idx;
  idx += bs_idx * m + pt_idx;
  grad_points += bs_idx * c * n + c_idx * n;

  atomicAdd(grad_points + idx[0], grad_out[0]);
}

void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                        const at::Tensor &grad_out_tensor,
                                        const at::Tensor &idx_tensor,
                                        at::Tensor &grad_points_tensor) {
  // grad_out: (B, C, npoints)
  // idx: (B, npoints)
  // output:
  //      grad_points: (B, C, N)

  cudaError_t err;
  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_points_tensor.scalar_type(), "gather_points_grad_kernel", [&] {
        const scalar_t *grad_out = grad_out_tensor.data_ptr<scalar_t>();
        const int *idx = idx_tensor.data_ptr<int>();
        scalar_t *grad_points = grad_points_tensor.data_ptr<scalar_t>();
        gather_points_grad_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            b, c, n, npoints, grad_out, idx, grad_points);
      });

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
```
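Functionally, the forward kernel above is an index-select along the last axis. As a mental model (and a CPU reference that is not part of the deleted code), a hypothetical pure-PyTorch equivalent would be:

```python
import torch

def gather_points_reference(features: torch.Tensor,
                            indices: torch.Tensor) -> torch.Tensor:
    """CPU reference for the kernel: out[b, c, j] = features[b, c, indices[b, j]]."""
    C = features.size(1)
    idx = indices.long().unsqueeze(1).expand(-1, C, -1)  # (B, M) -> (B, C, M)
    return features.gather(dim=2, index=idx)
```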
mmdet3d/ops/group_points/__init__.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .group_points import GroupAll, QueryAndGroup, grouping_operation

__all__ = ['QueryAndGroup', 'GroupAll', 'grouping_operation']
```
mmdet3d/ops/group_points/group_points.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch
from mmcv.runner import force_fp32
from torch import nn as nn
from torch.autograd import Function

from ..ball_query import ball_query
from ..knn import knn
from . import group_points_ext


class QueryAndGroup(nn.Module):
    """Query and Group.

    Groups with a ball query of radius

    Args:
        max_radius (float): The maximum radius of the balls.
            If None is given, we will use kNN sampling instead of ball query.
        sample_num (int): Maximum number of features to gather in the ball.
        min_radius (float, optional): The minimum radius of the balls.
            Default: 0.
        use_xyz (bool, optional): Whether to use xyz.
            Default: True.
        return_grouped_xyz (bool, optional): Whether to return grouped xyz.
            Default: False.
        normalize_xyz (bool, optional): Whether to normalize xyz.
            Default: False.
        uniform_sample (bool, optional): Whether to sample uniformly.
            Default: False
        return_unique_cnt (bool, optional): Whether to return the count of
            unique samples. Default: False.
        return_grouped_idx (bool, optional): Whether to return grouped idx.
            Default: False.
    """

    def __init__(self,
                 max_radius,
                 sample_num,
                 min_radius=0,
                 use_xyz=True,
                 return_grouped_xyz=False,
                 normalize_xyz=False,
                 uniform_sample=False,
                 return_unique_cnt=False,
                 return_grouped_idx=False):
        super(QueryAndGroup, self).__init__()
        self.max_radius = max_radius
        self.min_radius = min_radius
        self.sample_num = sample_num
        self.use_xyz = use_xyz
        self.return_grouped_xyz = return_grouped_xyz
        self.normalize_xyz = normalize_xyz
        self.uniform_sample = uniform_sample
        self.return_unique_cnt = return_unique_cnt
        self.return_grouped_idx = return_grouped_idx
        if self.return_unique_cnt:
            assert self.uniform_sample, \
                'uniform_sample should be True when ' \
                'returning the count of unique samples'
        if self.max_radius is None:
            assert not self.normalize_xyz, \
                'can not normalize grouped xyz when max_radius is None'
        self.fp16_enabled = False

    @force_fp32()
    def forward(self, points_xyz, center_xyz, features=None):
        """forward.

        Args:
            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) Centriods.
            features (Tensor): (B, C, N) Descriptors of the features.

        Return:
            Tensor: (B, 3 + C, npoint, sample_num) Grouped feature.
        """
        # if self.max_radius is None, we will perform kNN instead of ball query
        # idx is of shape [B, npoint, sample_num]
        if self.max_radius is None:
            idx = knn(self.sample_num, points_xyz, center_xyz, False)
            idx = idx.transpose(1, 2).contiguous()
        else:
            idx = ball_query(self.min_radius, self.max_radius,
                             self.sample_num, points_xyz, center_xyz)

        if self.uniform_sample:
            unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))
            for i_batch in range(idx.shape[0]):
                for i_region in range(idx.shape[1]):
                    unique_ind = torch.unique(idx[i_batch, i_region, :])
                    num_unique = unique_ind.shape[0]
                    unique_cnt[i_batch, i_region] = num_unique
                    sample_ind = torch.randint(
                        0,
                        num_unique, (self.sample_num - num_unique, ),
                        dtype=torch.long)
                    all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))
                    idx[i_batch, i_region, :] = all_ind

        xyz_trans = points_xyz.transpose(1, 2).contiguous()
        # (B, 3, npoint, sample_num)
        grouped_xyz = grouping_operation(xyz_trans, idx)
        grouped_xyz_diff = grouped_xyz - \
            center_xyz.transpose(1, 2).unsqueeze(-1)  # relative offsets

        if self.normalize_xyz:
            grouped_xyz_diff /= self.max_radius

        if features is not None:
            grouped_features = grouping_operation(features, idx)
            if self.use_xyz:
                # (B, C + 3, npoint, sample_num)
                new_features = torch.cat([grouped_xyz_diff, grouped_features],
                                         dim=1)
            else:
                new_features = grouped_features
        else:
            assert (self.use_xyz
                    ), 'Cannot have not features and not use xyz as a feature!'
            new_features = grouped_xyz_diff

        ret = [new_features]
        if self.return_grouped_xyz:
            ret.append(grouped_xyz)
        if self.return_unique_cnt:
            ret.append(unique_cnt)
        if self.return_grouped_idx:
            ret.append(idx)
        if len(ret) == 1:
            return ret[0]
        else:
            return tuple(ret)


class GroupAll(nn.Module):
    """Group All.

    Group xyz with feature.

    Args:
        use_xyz (bool): Whether to use xyz.
    """

    def __init__(self, use_xyz: bool = True):
        super().__init__()
        self.use_xyz = use_xyz
        self.fp16_enabled = False

    @force_fp32()
    def forward(self,
                xyz: torch.Tensor,
                new_xyz: torch.Tensor,
                features: torch.Tensor = None):
        """forward.

        Args:
            xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            new_xyz (Tensor): Ignored.
            features (Tensor): (B, C, N) features to group.

        Return:
            Tensor: (B, C + 3, 1, N) Grouped feature.
        """
        grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)
        if features is not None:
            grouped_features = features.unsqueeze(2)
            if self.use_xyz:
                # (B, 3 + C, 1, N)
                new_features = torch.cat([grouped_xyz, grouped_features],
                                         dim=1)
            else:
                new_features = grouped_features
        else:
            new_features = grouped_xyz

        return new_features


class GroupingOperation(Function):
    """Grouping Operation.

    Group feature with given index.
    """

    @staticmethod
    def forward(ctx, features: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """forward.

        Args:
            features (Tensor): (B, C, N) tensor of features to group.
            indices (Tensor): (B, npoint, nsample) the indices of
                features to group with.

        Returns:
            Tensor: (B, C, npoint, nsample) Grouped features.
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()

        B, nfeatures, nsample = indices.size()
        _, C, N = features.size()
        output = torch.cuda.FloatTensor(B, C, nfeatures, nsample)

        group_points_ext.forward(B, C, N, nfeatures, nsample, features,
                                 indices, output)

        ctx.for_backwards = (indices, N)
        return output

    @staticmethod
    def backward(ctx,
                 grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """backward.

        Args:
            grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients
                of the output from forward.

        Returns:
            Tensor: (B, C, N) gradient of the features.
        """
        idx, N = ctx.for_backwards
        B, C, npoint, nsample = grad_out.size()

        grad_features = torch.cuda.FloatTensor(B, C, N).zero_()

        grad_out_data = grad_out.data.contiguous()
        group_points_ext.backward(B, C, N, npoint, nsample, grad_out_data,
                                  idx, grad_features.data)
        return grad_features, None


grouping_operation = GroupingOperation.apply
```
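A minimal usage sketch of `QueryAndGroup` with the default `use_xyz=True`, so relative offsets are concatenated in front of the grouped features. The shapes and radii here are made-up example values, and the import path is the pre-1.0.0rc1 layout removed by this commit:

```python
import torch
from mmdet3d.ops.group_points import QueryAndGroup

grouper = QueryAndGroup(max_radius=0.2, sample_num=32, min_radius=0.05)

points_xyz = torch.rand(2, 1024, 3, device='cuda')  # (B, N, 3)
center_xyz = torch.rand(2, 128, 3, device='cuda')   # (B, npoint, 3)
features = torch.rand(2, 16, 1024, device='cuda')   # (B, C, N)

# Output is (B, 3 + C, npoint, sample_num): offsets stacked on features.
new_features = grouper(points_xyz, center_xyz, features)
print(new_features.shape)  # torch.Size([2, 19, 128, 32])
```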
mmdet3d/ops/group_points/src/group_points.cpp (deleted)

```cpp
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp

#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#include <vector>

extern THCState *state;

int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
                         at::Tensor points_tensor, at::Tensor idx_tensor,
                         at::Tensor out_tensor);

void group_points_kernel_launcher(int b, int c, int n, int npoints,
                                  int nsample, const float *points,
                                  const int *idx, float *out,
                                  cudaStream_t stream);

int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
                              at::Tensor grad_out_tensor,
                              at::Tensor idx_tensor,
                              at::Tensor grad_points_tensor);

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream);

int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
                              at::Tensor grad_out_tensor,
                              at::Tensor idx_tensor,
                              at::Tensor grad_points_tensor) {
  float *grad_points = grad_points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();
  const float *grad_out = grad_out_tensor.data_ptr<float>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx,
                                    grad_points, stream);
  return 1;
}

int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
                         at::Tensor points_tensor, at::Tensor idx_tensor,
                         at::Tensor out_tensor) {
  const float *points = points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();
  float *out = out_tensor.data_ptr<float>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out,
                               stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &group_points_wrapper, "group_points_wrapper");
  m.def("backward", &group_points_grad_wrapper, "group_points_grad_wrapper");
}
```
mmdet3d/ops/group_points/src/group_points_cuda.cu (deleted)

```cuda
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu

#include <stdio.h>
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
                                         int nsample,
                                         const float *__restrict__ grad_out,
                                         const int *__restrict__ idx,
                                         float *__restrict__ grad_points) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int pt_idx = index / nsample;
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;
  grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
              pt_idx * nsample + sample_idx;
  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;

  atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
}

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  group_points_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, npoints, nsample, grad_out, idx, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void group_points_kernel(int b, int c, int n, int npoints,
                                    int nsample,
                                    const float *__restrict__ points,
                                    const int *__restrict__ idx,
                                    float *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int pt_idx = index / nsample;
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;

  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
  int in_idx = bs_idx * c * n + c_idx * n + idx[0];
  int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
                pt_idx * nsample + sample_idx;

  out[out_idx] = points[in_idx];
}

void group_points_kernel_launcher(int b, int c, int n, int npoints,
                                  int nsample, const float *points,
                                  const int *idx, float *out,
                                  cudaStream_t stream) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints,
                                                      nsample, points, idx,
                                                      out);
  // cudaDeviceSynchronize();  // for using printf in kernel function
  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
```
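The forward kernel above is a batched fancy-index: each output element copies one input feature. A hypothetical PyTorch reference (not part of the deleted code) makes the semantics explicit:

```python
import torch

def grouping_reference(features: torch.Tensor,
                       indices: torch.Tensor) -> torch.Tensor:
    """out[b, c, p, s] = features[b, c, indices[b, p, s]]."""
    B, C, N = features.shape
    _, npoint, nsample = indices.shape
    idx = indices.long().reshape(B, 1, npoint * nsample).expand(-1, C, -1)
    return features.gather(2, idx).view(B, C, npoint, nsample)
```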
mmdet3d/ops/interpolate/__init__.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .three_interpolate import three_interpolate
from .three_nn import three_nn

__all__ = ['three_nn', 'three_interpolate']
```
mmdet3d/ops/interpolate/src/interpolate.cpp (deleted)

```cpp
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp

#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#include <vector>

extern THCState *state;

void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor);

void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream);

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor,
                               at::Tensor idx_tensor,
                               at::Tensor weight_tensor,
                               at::Tensor out_tensor);

void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream);

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor);

void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx,
                                            const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream);

void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor) {
  const float *unknown = unknown_tensor.data_ptr<float>();
  const float *known = known_tensor.data_ptr<float>();
  float *dist2 = dist2_tensor.data_ptr<float>();
  int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
}

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor,
                               at::Tensor idx_tensor,
                               at::Tensor weight_tensor,
                               at::Tensor out_tensor) {
  const float *points = points_tensor.data_ptr<float>();
  const float *weight = weight_tensor.data_ptr<float>();
  float *out = out_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out,
                                    stream);
}

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor) {
  const float *grad_out = grad_out_tensor.data_ptr<float>();
  const float *weight = weight_tensor.data_ptr<float>();
  float *grad_points = grad_points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight,
                                         grad_points, stream);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
  m.def("three_interpolate_wrapper", &three_interpolate_wrapper,
        "three_interpolate_wrapper");
  m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper,
        "three_interpolate_grad_wrapper");
}
```
mmdet3d/ops/interpolate/src/three_interpolate_cuda.cu (deleted)

```cuda
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void three_interpolate_kernel(int b, int c, int m, int n,
                                         const float *__restrict__ points,
                                         const int *__restrict__ idx,
                                         const float *__restrict__ weight,
                                         float *__restrict__ out) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  //      out: (B, C, N)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

  weight += bs_idx * n * 3 + pt_idx * 3;
  points += bs_idx * c * m + c_idx * m;
  idx += bs_idx * n * 3 + pt_idx * 3;
  out += bs_idx * c * n + c_idx * n;

  out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
                weight[2] * points[idx[2]];
}

void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  //      out: (B, C, N)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  three_interpolate_kernel<<<blocks, threads, 0, stream>>>(
      b, c, m, n, points, idx, weight, out);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void three_interpolate_grad_kernel(
    int b, int c, int n, int m, const float *__restrict__ grad_out,
    const int *__restrict__ idx, const float *__restrict__ weight,
    float *__restrict__ grad_points) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  //      grad_points: (B, C, M)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

  grad_out += bs_idx * c * n + c_idx * n + pt_idx;
  weight += bs_idx * n * 3 + pt_idx * 3;
  grad_points += bs_idx * c * m + c_idx * m;
  idx += bs_idx * n * 3 + pt_idx * 3;

  atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
  atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
  atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
}

void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx,
                                            const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  //      grad_points: (B, C, M)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, m, grad_out, idx, weight, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
```
mmdet3d/ops/interpolate/src/three_nn_cuda.cu (deleted)

```cuda
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void three_nn_kernel(int b, int n, int m,
                                const float *__restrict__ unknown,
                                const float *__restrict__ known,
                                float *__restrict__ dist2,
                                int *__restrict__ idx) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3)
  //      idx: (B, N, 3)

  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || pt_idx >= n) return;

  unknown += bs_idx * n * 3 + pt_idx * 3;
  known += bs_idx * m * 3;
  dist2 += bs_idx * n * 3 + pt_idx * 3;
  idx += bs_idx * n * 3 + pt_idx * 3;

  float ux = unknown[0];
  float uy = unknown[1];
  float uz = unknown[2];

  double best1 = 1e40, best2 = 1e40, best3 = 1e40;
  int besti1 = 0, besti2 = 0, besti3 = 0;
  for (int k = 0; k < m; ++k) {
    float x = known[k * 3 + 0];
    float y = known[k * 3 + 1];
    float z = known[k * 3 + 2];
    float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
    if (d < best1) {
      best3 = best2;
      besti3 = besti2;
      best2 = best1;
      besti2 = besti1;
      best1 = d;
      besti1 = k;
    } else if (d < best2) {
      best3 = best2;
      besti3 = besti2;
      best2 = d;
      besti2 = k;
    } else if (d < best3) {
      best3 = d;
      besti3 = k;
    }
  }
  dist2[0] = best1;
  dist2[1] = best2;
  dist2[2] = best3;
  idx[0] = besti1;
  idx[1] = besti2;
  idx[2] = besti3;
}

void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3)
  //      idx: (B, N, 3)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK),
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known,
                                                  dist2, idx);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
```
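For intuition, a hypothetical PyTorch reference for the kernel above (not part of the deleted code): each query point keeps a running top-3 of squared distances, which is equivalent to a full pairwise distance matrix followed by a smallest-3 `topk`. Note the kernel stores squared distances; the Python wrapper in `three_nn.py` applies the square root afterwards:

```python
import torch

def three_nn_kernel_reference(unknown: torch.Tensor, known: torch.Tensor):
    """unknown: (B, N, 3), known: (B, M, 3) -> (dist2, idx), each (B, N, 3)."""
    d2 = ((unknown.unsqueeze(2) - known.unsqueeze(1)) ** 2).sum(-1)  # (B, N, M)
    dist2, idx = d2.topk(3, dim=2, largest=False)  # ascending, like best1..best3
    return dist2, idx.int()
```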
mmdet3d/ops/interpolate/three_interpolate.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch
from torch.autograd import Function

from . import interpolate_ext


class ThreeInterpolate(Function):

    @staticmethod
    def forward(ctx, features: torch.Tensor, indices: torch.Tensor,
                weight: torch.Tensor) -> torch.Tensor:
        """Performs weighted linear interpolation on 3 features.

        Args:
            features (Tensor): (B, C, M) Features descriptors to be
                interpolated from
            indices (Tensor): (B, n, 3) index three nearest neighbors
                of the target features in features
            weight (Tensor): (B, n, 3) weights of interpolation

        Returns:
            Tensor: (B, C, N) tensor of the interpolated features
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()
        assert weight.is_contiguous()

        B, c, m = features.size()
        n = indices.size(1)
        ctx.three_interpolate_for_backward = (indices, weight, m)
        output = torch.cuda.FloatTensor(B, c, n)

        interpolate_ext.three_interpolate_wrapper(B, c, m, n, features,
                                                  indices, weight, output)
        return output

    @staticmethod
    def backward(
        ctx, grad_out: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Backward of three interpolate.

        Args:
            grad_out (Tensor): (B, C, N) tensor with gradients of outputs

        Returns:
            Tensor: (B, C, M) tensor with gradients of features
        """
        idx, weight, m = ctx.three_interpolate_for_backward
        B, c, n = grad_out.size()

        grad_features = torch.cuda.FloatTensor(B, c, m).zero_()
        grad_out_data = grad_out.data.contiguous()

        interpolate_ext.three_interpolate_grad_wrapper(B, c, n, m,
                                                       grad_out_data, idx,
                                                       weight,
                                                       grad_features.data)
        return grad_features, None, None


three_interpolate = ThreeInterpolate.apply
```
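The forward pass is a weighted gather over each point's three neighbors. A hypothetical CPU reference (not part of the deleted code) spells out the formula `out[b, c, j] = sum_k weight[b, j, k] * features[b, c, indices[b, j, k]]`:

```python
import torch

def three_interpolate_reference(features: torch.Tensor,
                                indices: torch.Tensor,
                                weight: torch.Tensor) -> torch.Tensor:
    """features: (B, C, M), indices/weight: (B, n, 3) -> (B, C, n)."""
    B, C, M = features.shape
    n = indices.shape[1]
    idx = indices.long().reshape(B, 1, n * 3).expand(-1, C, -1)
    gathered = features.gather(2, idx).view(B, C, n, 3)
    return (gathered * weight.unsqueeze(1)).sum(dim=3)
```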
mmdet3d/ops/interpolate/three_nn.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch
from torch.autograd import Function

from . import interpolate_ext


class ThreeNN(Function):

    @staticmethod
    def forward(ctx, target: torch.Tensor,
                source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Find the top-3 nearest neighbors of the target set from the source
        set.

        Args:
            target (Tensor): shape (B, N, 3), points set that needs to
                find the nearest neighbors.
            source (Tensor): shape (B, M, 3), points set that is used
                to find the nearest neighbors of points in target set.

        Returns:
            Tensor: shape (B, N, 3), L2 distance of each point in target
                set to their corresponding nearest neighbors.
        """
        assert target.is_contiguous()
        assert source.is_contiguous()

        B, N, _ = target.size()
        m = source.size(1)
        dist2 = torch.cuda.FloatTensor(B, N, 3)
        idx = torch.cuda.IntTensor(B, N, 3)

        interpolate_ext.three_nn_wrapper(B, N, m, target, source, dist2, idx)

        ctx.mark_non_differentiable(idx)

        return torch.sqrt(dist2), idx

    @staticmethod
    def backward(ctx, a=None, b=None):
        return None, None


three_nn = ThreeNN.apply
```
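Together, `three_nn` and `three_interpolate` implement the usual PointNet++ feature-propagation step. A usage sketch with made-up shapes, using the pre-1.0.0rc1 import path removed here; the inverse-distance weighting is the conventional recipe, not something the ops themselves mandate:

```python
import torch
from mmdet3d.ops.interpolate import three_interpolate, three_nn

known_xyz = torch.rand(2, 256, 3, device='cuda')     # (B, M, 3) sparse set
unknown_xyz = torch.rand(2, 1024, 3, device='cuda')  # (B, N, 3) dense set
known_feats = torch.rand(2, 64, 256, device='cuda')  # (B, C, M)

dist, idx = three_nn(unknown_xyz, known_xyz)         # both (B, N, 3)

# Inverse-distance weights, normalized over the 3 neighbors.
weight = 1.0 / (dist + 1e-8)
weight = weight / weight.sum(dim=2, keepdim=True)

interpolated = three_interpolate(known_feats, idx, weight)  # (B, 64, 1024)
```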
mmdet3d/ops/iou3d/__init__.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .iou3d_utils import boxes_iou_bev, nms_gpu, nms_normal_gpu

__all__ = ['boxes_iou_bev', 'nms_gpu', 'nms_normal_gpu']
```
mmdet3d/ops/iou3d/iou3d_utils.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from . import iou3d_cuda


def boxes_iou_bev(boxes_a, boxes_b):
    """Calculate boxes IoU in the Bird's Eye View.

    Args:
        boxes_a (torch.Tensor): Input boxes a with shape (M, 5).
        boxes_b (torch.Tensor): Input boxes b with shape (N, 5).

    Returns:
        ans_iou (torch.Tensor): IoU result with shape (M, N).
    """
    ans_iou = boxes_a.new_zeros(
        torch.Size((boxes_a.shape[0], boxes_b.shape[0])))

    iou3d_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(),
                                 ans_iou)

    return ans_iou


def nms_gpu(boxes, scores, thresh, pre_max_size=None, post_max_size=None):
    """NMS function GPU implementation (for BEV boxes). The overlap of two
    boxes for IoU calculation is defined as the exact overlapping area of the
    two boxes. In this function, one can also set `pre_max_size` and
    `post_max_size`.

    Args:
        boxes (torch.Tensor): Input boxes with the shape of [N, 5]
            ([x1, y1, x2, y2, ry]).
        scores (torch.Tensor): Scores of boxes with the shape of [N].
        thresh (int): Threshold.
        pre_max_size (int, optional): Max size of boxes before NMS.
            Default: None.
        post_max_size (int, optional): Max size of boxes after NMS.
            Default: None.

    Returns:
        torch.Tensor: Indexes after NMS.
    """
    order = scores.sort(0, descending=True)[1]

    if pre_max_size is not None:
        order = order[:pre_max_size]
    boxes = boxes[order].contiguous()

    keep = torch.zeros(boxes.size(0), dtype=torch.long)
    num_out = iou3d_cuda.nms_gpu(boxes, keep, thresh, boxes.device.index)
    keep = order[keep[:num_out].cuda(boxes.device)].contiguous()
    if post_max_size is not None:
        keep = keep[:post_max_size]
    return keep


def nms_normal_gpu(boxes, scores, thresh):
    """Normal NMS function GPU implementation (for BEV boxes). The overlap of
    two boxes for IoU calculation is defined as the exact overlapping area of
    the two boxes WITH their yaw angle set to 0.

    Args:
        boxes (torch.Tensor): Input boxes with shape (N, 5).
        scores (torch.Tensor): Scores of predicted boxes with shape (N).
        thresh (torch.Tensor): Threshold of NMS.

    Returns:
        torch.Tensor: Remaining indices with scores in descending order.
    """
    order = scores.sort(0, descending=True)[1]

    boxes = boxes[order].contiguous()

    keep = torch.zeros(boxes.size(0), dtype=torch.long)
    num_out = iou3d_cuda.nms_normal_gpu(boxes, keep, thresh,
                                        boxes.device.index)
    return order[keep[:num_out].cuda(boxes.device)].contiguous()
```
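A small usage sketch of `nms_gpu` on made-up boxes, with the pre-1.0.0rc1 import path removed by this commit. Boxes are BEV rectangles `[x1, y1, x2, y2, ry]` with one confidence score each:

```python
import torch
from mmdet3d.ops.iou3d import nms_gpu

boxes = torch.tensor([[0.0, 0.0, 2.0, 4.0, 0.0],
                      [0.1, 0.0, 2.1, 4.0, 0.0],   # heavy overlap with box 0
                      [5.0, 5.0, 7.0, 9.0, 0.3]], device='cuda')
scores = torch.tensor([0.9, 0.8, 0.7], device='cuda')

keep = nms_gpu(boxes, scores, thresh=0.5)
print(keep)  # indices of surviving boxes, highest score first (e.g. 0 and 2)
```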
mmdet3d/ops/iou3d/src/iou3d.cpp (deleted)

```cpp
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp

/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#include <cstdint>
#include <vector>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_ERROR(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort) exit(code);
  }
}

const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a,
                         const int num_b, const float *boxes_b,
                         float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh);

int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                          at::Tensor ans_overlap) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_overlap);

  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_overlap_data = ans_overlap.data_ptr<float>();

  boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data,
                       ans_overlap_data);

  return 1;
}

int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                      at::Tensor ans_iou) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_iou);

  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_iou_data = ans_iou.data_ptr<float>();

  boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);

  return 1;
}

int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh,
            int device_id) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);
  cudaSetDevice(device_id);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  int64_t *keep_data = keep.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  unsigned long long *remv_cpu = new unsigned long long[col_blocks]();

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  delete[] remv_cpu;
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}

int nms_normal_gpu(at::Tensor boxes, at::Tensor keep,
                   float nms_overlap_thresh, int device_id) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)

  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);
  cudaSetDevice(device_id);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  int64_t *keep_data = keep.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  unsigned long long *remv_cpu = new unsigned long long[col_blocks]();

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  delete[] remv_cpu;
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu,
        "oriented boxes overlap");
  m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
  m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
  m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
}
```
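The host-side loop in `nms_gpu` consumes a bitmask produced on the GPU: `mask` has one row per box, each row holding `col_blocks` 64-bit words, and bit `j` of word `w` is set when the box overlaps box `w*64 + j` above the threshold. A hypothetical Python transliteration of that scan (not part of the deleted code) may make it easier to follow:

```python
def suppress(mask, boxes_num, threads=64):
    """mask[i] is a list of col_blocks 64-bit words for box i (GPU output)."""
    col_blocks = (boxes_num + threads - 1) // threads
    remv = [0] * col_blocks  # accumulated suppression bits
    keep = []
    for i in range(boxes_num):  # boxes arrive sorted by score
        nblock, inblock = divmod(i, threads)
        if not (remv[nblock] >> inblock) & 1:  # box i not yet suppressed
            keep.append(i)
            for j in range(nblock, col_blocks):
                remv[j] |= mask[i][j]  # suppress everything box i overlaps
    return keep
```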
mmdet3d/ops/iou3d/src/iou3d_kernel.cu (deleted)

```cuda
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu

/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/

#include <stdio.h>

#define THREADS_PER_BLOCK 16
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

//#define DEBUG
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
__device__ const float EPS = 1e-8;

struct Point {
  float x, y;
  __device__ Point() {}
  __device__ Point(double _x, double _y) { x = _x, y = _y; }

  __device__ void set(float _x, float _y) {
    x = _x;
    y = _y;
  }

  __device__ Point operator+(const Point &b) const {
    return Point(x + b.x, y + b.y);
  }

  __device__ Point operator-(const Point &b) const {
    return Point(x - b.x, y - b.y);
  }
};

__device__ inline float cross(const Point &a, const Point &b) {
  return a.x * b.y - a.y * b.x;
}

__device__ inline float cross(const Point &p1, const Point &p2,
                              const Point &p0) {
  return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
}

__device__ int check_rect_cross(const Point &p1, const Point &p2,
                                const Point &q1, const Point &q2) {
  int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) &&
            min(q1.x, q2.x) <= max(p1.x, p2.x) &&
            min(p1.y, p2.y) <= max(q1.y, q2.y) &&
            min(q1.y, q2.y) <= max(p1.y, p2.y);
  return ret;
}

__device__ inline int check_in_box2d(const float *box, const Point &p) {
  // params: box (5) [x1, y1, x2, y2, angle]
  const float MARGIN = 1e-5;

  float center_x = (box[0] + box[2]) / 2;
  float center_y = (box[1] + box[3]) / 2;
  // rotate the point in the opposite direction of box
  float angle_cos = cos(-box[4]), angle_sin = sin(-box[4]);
  float rot_x =
      (p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x;
  float rot_y =
      (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y;
#ifdef DEBUG
  printf("box: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", box[0], box[1], box[2],
         box[3], box[4]);
  printf(
      "center: (%.3f, %.3f), cossin(%.3f, %.3f), src(%.3f, %.3f), rot(%.3f, "
      "%.3f)\n",
      center_x, center_y, angle_cos, angle_sin, p.x, p.y, rot_x, rot_y);
#endif
  return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN &&
          rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN);
}

__device__ inline int intersection(const Point &p1, const Point &p0,
                                   const Point &q1, const Point &q0,
                                   Point &ans) {
  // fast exclusion
  if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;

  // check cross standing
  float s1 = cross(q0, p1, p0);
  float s2 = cross(p1, q1, p0);
  float s3 = cross(p0, q1, q0);
  float s4 = cross(q1, p1, q0);

  if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;

  // calculate intersection of two lines
  float s5 = cross(q1, p1, p0);
  if (fabs(s5 - s1) > EPS) {
    ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
    ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
  } else {
    float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
    float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
    float D = a0 * b1 - a1 * b0;

    ans.x = (b0 * c1 - b1 * c0) / D;
    ans.y = (a1 * c0 - a0 * c1) / D;
  }

  return 1;
}

__device__ inline void rotate_around_center(const Point &center,
                                            const float angle_cos,
                                            const float angle_sin, Point &p) {
  float new_x =
      (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x;
  float new_y =
      (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
  p.set(new_x, new_y);
}

__device__ inline int point_cmp(const Point &a, const Point &b,
                                const Point &center) {
  return atan2(a.y - center.y, a.x - center.x) >
         atan2(b.y - center.y, b.x - center.x);
}

__device__ inline float box_overlap(const float *box_a, const float *box_b) {
  // params: box_a (5) [x1, y1, x2, y2, angle]
  // params: box_b (5) [x1, y1, x2, y2, angle]

  float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3],
        a_angle = box_a[4];
  float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3],
        b_angle = box_b[4];

  Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2);
  Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2);
#ifdef DEBUG
  printf(
      "a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n",
      a_x1, a_y1, a_x2, a_y2, a_angle, b_x1, b_y1, b_x2, b_y2, b_angle);
  printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y,
         center_b.x, center_b.y);
#endif

  Point box_a_corners[5];
  box_a_corners[0].set(a_x1, a_y1);
  box_a_corners[1].set(a_x2, a_y1);
  box_a_corners[2].set(a_x2, a_y2);
  box_a_corners[3].set(a_x1, a_y2);

  Point box_b_corners[5];
  box_b_corners[0].set(b_x1, b_y1);
  box_b_corners[1].set(b_x2, b_y1);
  box_b_corners[2].set(b_x2, b_y2);
  box_b_corners[3].set(b_x1, b_y2);

  // get oriented corners
  float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
  float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);

  for (int k = 0; k < 4; k++) {
#ifdef DEBUG
    printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f)\n", k,
           box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x,
           box_b_corners[k].y);
#endif
    rotate_around_center(center_a, a_angle_cos, a_angle_sin,
                         box_a_corners[k]);
    rotate_around_center(center_b, b_angle_cos, b_angle_sin,
                         box_b_corners[k]);
#ifdef DEBUG
    printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f)\n", k,
           box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x,
           box_b_corners[k].y);
#endif
  }

  box_a_corners[4] = box_a_corners[0];
  box_b_corners[4] = box_b_corners[0];

  // get intersection of lines
  Point cross_points[16];
  Point poly_center;
  int cnt = 0, flag = 0;

  poly_center.set(0, 0);
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      flag = intersection(box_a_corners[i + 1], box_a_corners[i],
                          box_b_corners[j + 1], box_b_corners[j],
                          cross_points[cnt]);
      if (flag) {
        poly_center = poly_center + cross_points[cnt];
        cnt++;
      }
    }
  }

  // check corners
  for (int k = 0; k < 4; k++) {
    if (check_in_box2d(box_a, box_b_corners[k])) {
      poly_center = poly_center + box_b_corners[k];
      cross_points[cnt] = box_b_corners[k];
      cnt++;
    }
    if (check_in_box2d(box_b, box_a_corners[k])) {
      poly_center = poly_center + box_a_corners[k];
      cross_points[cnt] = box_a_corners[k];
      cnt++;
    }
  }

  poly_center.x /= cnt;
  poly_center.y /= cnt;

  // sort the points of polygon
  Point temp;
  for (int j = 0; j < cnt - 1; j++) {
    for (int i = 0; i < cnt - j - 1; i++) {
      if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
        temp = cross_points[i];
        cross_points[i] = cross_points[i + 1];
        cross_points[i + 1] = temp;
      }
    }
  }

#ifdef DEBUG
  printf("cnt=%d\n", cnt);
  for (int i = 0; i < cnt; i++) {
    printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x,
           cross_points[i].y);
  }
#endif

  // get the overlap areas
  float area = 0;
  for (int k = 0; k < cnt - 1; k++) {
    area += cross(cross_points[k] - cross_points[0],
                  cross_points[k + 1] - cross_points[0]);
  }

  return fabs(area) / 2.0;
}

__device__ inline float iou_bev(const float *box_a, const float *box_b) {
  // params: box_a (5) [x1, y1, x2, y2, angle]
  // params: box_b (5) [x1, y1, x2, y2, angle]
  float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]);
  float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]);
  float s_overlap = box_overlap(box_a, box_b);
  return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
}

__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a,
                                     const int num_b, const float *boxes_b,
                                     float *ans_overlap) {
  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;

  if (a_idx >= num_a || b_idx >= num_b) {
    return;
  }
  const float *cur_box_a = boxes_a + a_idx * 5;
  const float *cur_box_b = boxes_b + b_idx * 5;
  float s_overlap = box_overlap(cur_box_a, cur_box_b);
  ans_overlap[a_idx * num_b + b_idx] = s_overlap;
}

__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a,
                                     const int num_b, const float *boxes_b,
                                     float *ans_iou) {
  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;

  if (a_idx >= num_a || b_idx >= num_b) {
    return;
  }

  const float *cur_box_a = boxes_a + a_idx * 5;
  const float *cur_box_b = boxes_b + b_idx * 5;
  float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
  ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
}

__global__ void nms_kernel(const int boxes_num,
                           const float nms_overlap_thresh, const float *boxes,
                           unsigned long long *mask) {
  // params: boxes (N, 5) [x1, y1, x2, y2, ry]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)

  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);
  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);

  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];

  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
    const float *cur_box = boxes + cur_box_idx * 5;

    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
    mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

__device__ inline float iou_normal(float const *const a,
                                   float const *const b) {
  float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
  float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
  float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0]) * (a[3] - a[1]);
  float Sb = (b[2] - b[0]) * (b[3] - b[1]);
  return interS / fmaxf(Sa + Sb - interS, EPS);
}

__global__ void nms_normal_kernel(const int boxes_num,
                                  const float nms_overlap_thresh,
                                  const float *boxes,
                                  unsigned long long *mask) {
  // params: boxes (N, 5) [x1, y1, x2, y2, ry]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)

  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);
  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);

  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];

  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
    const float *cur_box = boxes + cur_box_idx * 5;

    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
    mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap) {
  dim3 blocks(
      DIVUP(num_b, THREADS_PER_BLOCK),
      DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);

  boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
                                            ans_overlap);
#ifdef DEBUG
  cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}

void boxesioubevLauncher(const int num_a, const float *boxes_a,
                         const int num_b, const float *boxes_b,
                         float *ans_iou) {
  dim3 blocks(
      DIVUP(num_b, THREADS_PER_BLOCK),
      DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);

  boxes_iou_bev_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
                                            ans_iou);
}

void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh) {
  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);
  nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
}

void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh) {
  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);
  nms_normal_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes,
                                         mask);
}
```
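While `iou_bev` needs the full polygon-clipping machinery for rotated boxes, `iou_normal` is plain axis-aligned IoU with the yaw ignored. A hypothetical Python transliteration (not part of the deleted code) for comparison:

```python
def iou_normal(a, b, eps=1e-8):
    """a, b: [x1, y1, x2, y2, ry]; ry is ignored, matching iou_normal above."""
    left, right = max(a[0], b[0]), min(a[2], b[2])
    top, bottom = max(a[1], b[1]), min(a[3], b[3])
    inter = max(right - left, 0.0) * max(bottom - top, 0.0)
    sa = (a[2] - a[0]) * (a[3] - a[1])
    sb = (b[2] - b[0]) * (b[3] - b[1])
    return inter / max(sa + sb - inter, eps)
```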
mmdet3d/ops/knn/__init__.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .knn import knn

__all__ = ['knn']
```
mmdet3d/ops/knn/knn.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch.autograd import Function

from . import knn_ext


class KNN(Function):
    r"""KNN (CUDA) based on heap data structure.

    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
    scene_seg/lib/pointops/src/knnquery_heap>`_.

    Find k-nearest points.
    """

    @staticmethod
    def forward(ctx,
                k: int,
                xyz: torch.Tensor,
                center_xyz: torch.Tensor = None,
                transposed: bool = False) -> torch.Tensor:
        """Forward.

        Args:
            k (int): number of nearest neighbors.
            xyz (Tensor): (B, N, 3) if transposed == False, else (B, 3, N).
                xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) if transposed == False,
                else (B, 3, npoint). centers of the knn query.
            transposed (bool): whether the input tensors are transposed.
                defaults to False. Should not explicitly use this keyword
                when calling knn (=KNN.apply), just add the fourth param.

        Returns:
            Tensor: (B, k, npoint) tensor with the indices of
                the features that form k-nearest neighbours.
        """
        assert k > 0

        if center_xyz is None:
            center_xyz = xyz

        if transposed:
            xyz = xyz.transpose(2, 1).contiguous()
            center_xyz = center_xyz.transpose(2, 1).contiguous()

        assert xyz.is_contiguous()  # [B, N, 3]
        assert center_xyz.is_contiguous()  # [B, npoint, 3]

        center_xyz_device = center_xyz.get_device()
        assert center_xyz_device == xyz.get_device(), \
            'center_xyz and xyz should be put on the same device'
        if torch.cuda.current_device() != center_xyz_device:
            torch.cuda.set_device(center_xyz_device)

        B, npoint, _ = center_xyz.shape
        N = xyz.shape[1]

        idx = center_xyz.new_zeros((B, npoint, k)).int()
        dist2 = center_xyz.new_zeros((B, npoint, k)).float()

        knn_ext.knn_wrapper(B, N, npoint, k, xyz, center_xyz, idx, dist2)
        # idx shape to [B, k, npoint]
        idx = idx.transpose(2, 1).contiguous()
        ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        return None, None, None


knn = KNN.apply
```
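A short usage sketch of `knn` on made-up inputs, via the pre-1.0.0rc1 import path removed by this commit. Note the output layout is (B, k, npoint), not (B, npoint, k):

```python
import torch
from mmdet3d.ops.knn import knn

xyz = torch.rand(2, 1024, 3, device='cuda')     # (B, N, 3) candidate points
centers = torch.rand(2, 128, 3, device='cuda')  # (B, npoint, 3) query centers

# Indices of the 16 nearest neighbors of each center.
idx = knn(16, xyz, centers)
print(idx.shape)  # torch.Size([2, 16, 128])
```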