Unverified commit 333536f6, authored by Wenwei Zhang, committed by GitHub

Release v1.0.0rc1

parents 9c7270d0 f747daab
// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <vector>
// the legacy THC header and THCState extern are not needed here:
// the CUDA stream is obtained from ATen below
#include <ATen/cuda/CUDAContext.h>
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDA tensor ")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)
void knn_kernel_launcher(
int b,
int n,
int m,
int nsample,
const float *xyz,
const float *new_xyz,
int *idx,
float *dist2,
cudaStream_t stream
);
void knn_wrapper(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
{
CHECK_INPUT(new_xyz_tensor);
CHECK_INPUT(xyz_tensor);
const float *new_xyz = new_xyz_tensor.data_ptr<float>();
const float *xyz = xyz_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
float *dist2 = dist2_tensor.data_ptr<float>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
knn_kernel_launcher(b, n, m, nsample, xyz, new_xyz, idx, dist2, stream);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("knn_wrapper", &knn_wrapper, "knn_wrapper");
}
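For reference, a minimal Python-side sketch of how this binding can be driven. The module name `knn_ext` and its import path are assumptions; the caller allocates the output buffers, since the wrapper writes into `idx` and `dist2` in place.

import torch
# from . import knn_ext  # assumed name of the module built from the binding above

def knn(nsample, xyz, new_xyz):
    # xyz: (B, n, 3) reference points, new_xyz: (B, m, 3) query centers
    assert nsample <= 100, 'the CUDA kernel uses fixed 100-slot buffers'
    b, n, _ = xyz.shape
    m = new_xyz.shape[1]
    idx = xyz.new_zeros((b, m, nsample), dtype=torch.int32)
    dist2 = xyz.new_zeros((b, m, nsample))
    knn_ext.knn_wrapper(b, n, m, nsample, xyz.contiguous(),
                        new_xyz.contiguous(), idx, dist2)
    return idx, dist2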
// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include <cmath>
#include <cstdio>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
__device__ void swap_float(float *x, float *y)
{
float tmp = *x;
*x = *y;
*y = tmp;
}
__device__ void swap_int(int *x, int *y)
{
int tmp = *x;
*x = *y;
*y = tmp;
}
// sift the root down to restore the max-heap property over the first k slots
__device__ void reheap(float *dist, int *idx, int k)
{
int root = 0;
int child = root * 2 + 1;
while (child < k)
{
if(child + 1 < k && dist[child+1] > dist[child])
child++;
if(dist[root] > dist[child])
return;
swap_float(&dist[root], &dist[child]);
swap_int(&idx[root], &idx[child]);
root = child;
child = root * 2 + 1;
}
}
// ascending heap sort: repeatedly swap the current max to the back and re-heapify
__device__ void heap_sort(float *dist, int *idx, int k)
{
int i;
for (i = k - 1; i > 0; i--)
{
swap_float(&dist[0], &dist[i]);
swap_int(&idx[0], &idx[i]);
reheap(dist, idx, i);
}
}
// input: xyz (b, n, 3) new_xyz (b, m, 3)
// output: idx (b, m, nsample) dist2 (b, m, nsample)
__global__ void knn_kernel(int b, int n, int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, int *__restrict__ idx, float *__restrict__ dist2) {
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || pt_idx >= m) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample;
dist2 += bs_idx * m * nsample + pt_idx * nsample;
float new_x = new_xyz[0];
float new_y = new_xyz[1];
float new_z = new_xyz[2];
// fixed-size per-thread scratch buffers: nsample must not exceed 100
float best_dist[100];
int best_idx[100];
for(int i = 0; i < nsample; i++){
best_dist[i] = 1e10;
best_idx[i] = 0;
}
for(int i = 0; i < n; i++){
float x = xyz[i * 3 + 0];
float y = xyz[i * 3 + 1];
float z = xyz[i * 3 + 2];
float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
if (d2 < best_dist[0]){
best_dist[0] = d2;
best_idx[0] = i;
reheap(best_dist, best_idx, nsample);
}
}
heap_sort(best_dist, best_idx, nsample);
for(int i = 0; i < nsample; i++){
idx[i] = best_idx[i];
dist2[i] = best_dist[i];
}
}
void knn_kernel_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream) {
// param new_xyz: (B, m, 3)
// param xyz: (B, n, 3)
// param idx: (B, m, nsample)
cudaError_t err;
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
knn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, nsample, xyz, new_xyz, idx, dist2);
// cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
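A dense PyTorch reference of the same computation, useful for sanity-checking the heap-based kernel above on small inputs (pure PyTorch, no extension required; tie-breaking among equidistant points may differ):

import torch

def knn_reference(nsample, xyz, new_xyz):
    # xyz: (B, n, 3), new_xyz: (B, m, 3); returns the nsample nearest
    # reference points per query, sorted by ascending squared distance.
    d2 = torch.cdist(new_xyz, xyz).pow(2)                  # (B, m, n)
    dist2, idx = torch.topk(d2, nsample, dim=-1, largest=False)
    return idx.int(), dist2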
 # Copyright (c) OpenMMLab. All rights reserved.
-from .assign_score import assign_score_withk
 from .paconv import PAConv, PAConvCUDA
-__all__ = ['assign_score_withk', 'PAConv', 'PAConvCUDA']
+__all__ = ['PAConv', 'PAConvCUDA']
# Copyright (c) OpenMMLab. All rights reserved.
from torch.autograd import Function
from . import assign_score_withk_ext
class AssignScoreWithK(Function):
r"""Perform weighted sum to generate output features according to scores.
Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
scene_seg/lib/paconv_lib/src/gpu>`_.
This is a memory-efficient CUDA implementation of assign_scores operation,
which first transform all point feature with weight bank, then assemble
neighbor features with `knn_idx` and perform weighted sum of `scores`.
See the `paper <https://arxiv.org/pdf/2103.14635.pdf>`_ appendix Sec. D for
more detailed descriptions.
Note:
This implementation assumes using ``neighbor`` kernel input, which is
(point_features - center_features, point_features).
See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/
pointnet2/paconv.py#L128 for more details.
"""
@staticmethod
def forward(ctx,
scores,
point_features,
center_features,
knn_idx,
aggregate='sum'):
"""Forward.
Args:
scores (torch.Tensor): (B, npoint, K, M), predicted scores to
aggregate weight matrices in the weight bank.
``npoint`` is the number of sampled centers.
``K`` is the number of queried neighbors.
``M`` is the number of weight matrices in the weight bank.
point_features (torch.Tensor): (B, N, M, out_dim)
Pre-computed point features to be aggregated.
center_features (torch.Tensor): (B, N, M, out_dim)
Pre-computed center features to be aggregated.
knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
We assume the first idx in each row is the idx of the center.
aggregate (str, optional): Aggregation method.
Can be 'sum', 'avg' or 'max'. Defaults to 'sum'.
Returns:
torch.Tensor: (B, out_dim, npoint, K), the aggregated features.
"""
agg = {'sum': 0, 'avg': 1, 'max': 2}
B, N, M, out_dim = point_features.size()
_, npoint, K, _ = scores.size()
output = point_features.new_zeros((B, out_dim, npoint, K))
assign_score_withk_ext.assign_score_withk_forward_wrapper(
B, N, npoint, M, K, out_dim, agg[aggregate],
point_features.contiguous(), center_features.contiguous(),
scores.contiguous(), knn_idx.contiguous(), output)
ctx.save_for_backward(output, point_features, center_features, scores,
knn_idx)
ctx.agg = agg[aggregate]
return output
@staticmethod
def backward(ctx, grad_out):
"""Backward.
Args:
grad_out (torch.Tensor): (B, out_dim, npoint, K)
Returns:
grad_scores (torch.Tensor): (B, npoint, K, M)
grad_point_features (torch.Tensor): (B, N, M, out_dim)
grad_center_features (torch.Tensor): (B, N, M, out_dim)
"""
_, point_features, center_features, scores, knn_idx = ctx.saved_tensors
agg = ctx.agg
B, N, M, out_dim = point_features.size()
_, npoint, K, _ = scores.size()
grad_point_features = point_features.new_zeros(point_features.shape)
grad_center_features = center_features.new_zeros(center_features.shape)
grad_scores = scores.new_zeros(scores.shape)
assign_score_withk_ext.assign_score_withk_backward_wrapper(
B, N, npoint, M, K, out_dim, agg, grad_out.contiguous(),
point_features.contiguous(), center_features.contiguous(),
scores.contiguous(), knn_idx.contiguous(), grad_point_features,
grad_center_features, grad_scores)
return grad_scores, grad_point_features, \
grad_center_features, None, None
assign_score_withk = AssignScoreWithK.apply
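A hedged shape-and-gradient smoke test for the Function above (random values; a CUDA device and a built `assign_score_withk_ext` are assumed, and shapes follow the docstring):

import torch

B, N, npoint, K, M, out_dim = 2, 64, 16, 8, 4, 32
scores = torch.rand(B, npoint, K, M, device='cuda', requires_grad=True)
points = torch.rand(B, N, M, out_dim, device='cuda', requires_grad=True)
centers = torch.rand(B, N, M, out_dim, device='cuda', requires_grad=True)
knn_idx = torch.randint(0, N, (B, npoint, K), device='cuda')

out = assign_score_withk(scores, points, centers, knn_idx, 'sum')
assert out.shape == (B, out_dim, npoint, K)
out.sum().backward()                    # exercises the custom backward
assert scores.grad.shape == scores.shape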
@@ -4,10 +4,10 @@ import copy
 import torch
 from mmcv.cnn import (ConvModule, build_activation_layer, build_norm_layer,
                       constant_init)
+from mmcv.ops import assign_score_withk as assign_score_cuda
 from torch import nn as nn
 from torch.nn import functional as F
-from .assign_score import assign_score_withk as assign_score_cuda
 from .utils import assign_kernel_withoutk, assign_score, calc_euclidian_dist
// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include <torch/torch.h>
#include <torch/extension.h>
void assign_score_withk_forward_wrapper(
int B, int N0, int N1, int M,
int K, int O, int aggregate,
const at::Tensor& points,
const at::Tensor& centers,
const at::Tensor& scores,
const at::Tensor& knn_idx,
at::Tensor& output
);
void assign_score_withk_backward_wrapper(
int B, int N0, int N1, int M,
int K, int O, int aggregate,
const at::Tensor& grad_out,
const at::Tensor& points,
const at::Tensor& centers,
const at::Tensor& scores,
const at::Tensor& knn_idx,
at::Tensor& grad_points,
at::Tensor& grad_centers,
at::Tensor& grad_scores
);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("assign_score_withk_forward_wrapper",
&assign_score_withk_forward_wrapper,
"Assign score kernel forward (GPU), save memory version");
m.def("assign_score_withk_backward_wrapper",
&assign_score_withk_backward_wrapper,
"Assign score kernel backward (GPU), save memory version");
}
// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cmath>
#include <cstdint>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CONTIGUOUS(x) \
do { \
AT_ASSERT(x.is_contiguous(), #x " must be a contiguous tensor"); \
} while (0)
#define CUDA_CHECK_ERRORS() \
do { \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \
cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
__FILE__); \
exit(-1); \
} \
} while (0)
// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K)
// output: output(B,O,N1,K)
// algo: output(b,o,n,k) = sum_m scores(b,n,k,m) *
//           (points(b,knn_idx(b,n,k),m,o) - centers(b,knn_idx(b,n,0),m,o))
// note: only the 'sum' aggregation is implemented by the kernel below;
//       the `aggregate` argument is currently unused
__global__ void assign_score_withk_forward_kernel(const int B, const int N0, const int N1,
const int M, const int K, const int O, const int aggregate,
const float* points,
const float* centers,
const float* scores,
const int64_t* knn_idx,
float* output) {
// ----- parallel loop for B, N1, K and O ---------
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B*N1*K*O) return;
// ------- loop for M ----------
for (int m = 0; m < M; m++) {
int b = (int)(i / (O * N1 * K));
int o = (int)(i % (O * N1 * K) / (N1 * K));
int n = (int)(i % (N1 * K) / K);
int k = (int)(i % K);
int cn = (int) knn_idx[b*K*N1 + n*K + 0]; //The first neighbor is the center point
int kn = (int) knn_idx[b*K*N1 + n*K + k];
if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range
continue;
}
assert (b < B);
assert (kn < N0);
assert (cn < N0);
assert (o < O);
assert (n < N1);
atomicAdd(output + b*N1*O*K + o*N1*K + n*K + k,
points[b*N0*M*O + kn*M*O + m*O + o] * scores[b*N1*K*M + n*K*M + k*M + m]
- centers[b*N0*M*O + cn*M*O + m*O + o] * scores[b*N1*K*M + n*K*M + k*M + m]);
}
}
__global__ void assign_score_withk_backward_points_kernel(const int B, const int N0, const int N, const int M,
const int K, const int O, const int aggregate,
const float* grad_out,
const float* scores,
const int64_t* knn_idx,
float* grad_points,
float* grad_centers) {
// ----- parallel loop for B, M, O ---------
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B*M*O) return;
int b = (int)(i / (M * O));
int m = (int)(i % (M * O) / O);
int o = (int)(i % O);
// ----- loop for N,K ---------
for (int n = 0; n < N; n++) {
for (int k = 0; k < K; k++) {
int kn = knn_idx[b*N*K + n*K + k];
int cn = knn_idx[b*N*K + n*K + 0];
if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range
continue;
}
atomicAdd(grad_points + b*N0*M*O + kn*M*O + m*O + o,
scores[b*N*K*M + n*K*M + k*M + m] * grad_out[b*O*N*K + o*N*K + n*K + k]);
atomicAdd(grad_centers + b*N0*M*O + cn*M*O + m*O + o,
- scores[b*N*K*M + n*K*M + k*M + m] * grad_out[b*O*N*K + o*N*K + n*K + k]);
}
}
}
__global__ void assign_score_withk_backward_scores_kernel(const int B, const int N0, const int N, const int M,
const int K, const int O, const int aggregate,
const float* grad_out,
const float* points,
const float* centers,
const int64_t* knn_idx,
float* grad_scores) {
// ----- parallel loop for B, N, K, M ---------
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B*N*K*M) return;
int b = (int)(i / (N * M * K));
int n = (int)(i % (N * M * K) / M / K);
int k = (int)(i % (M * K) / M);
int m = (int)(i % M);
int cn = knn_idx[b*N*K + n*K + 0];
int kn = knn_idx[b*N*K + n*K + k];
if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range
return;
}
// -------------- loop for O ------------------------
for(int o = 0; o < O; o++) {
atomicAdd(grad_scores + b*N*K*M + n*K*M + k*M + m,
(points[b*N0*M*O + kn*M*O + m*O + o]
- centers[b*N0*M*O + cn*M*O + m*O + o])* grad_out[b*O*N*K + o*N*K + n*K + k]);
}
}
void assign_score_withk_forward_wrapper(int B, int N0, int N1, int M, int K, int O, int aggregate,
const at::Tensor& points,
const at::Tensor& centers,
const at::Tensor& scores,
const at::Tensor& knn_idx,
at::Tensor& output) {
CHECK_CONTIGUOUS(points);
CHECK_CONTIGUOUS(centers);
CHECK_CONTIGUOUS(scores);
CHECK_CONTIGUOUS(knn_idx);
CHECK_CONTIGUOUS(output);
const float* points_data = points.data_ptr<float>();
const float* centers_data = centers.data_ptr<float>();
const float* scores_data = scores.data_ptr<float>();
const int64_t* knn_idx_data = knn_idx.data_ptr<int64_t>();
float* output_data = output.data_ptr<float>();
dim3 blocks(DIVUP(B*O*N1*K, THREADS_PER_BLOCK));
dim3 threads(THREADS_PER_BLOCK);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
assign_score_withk_forward_kernel<<<blocks, threads, 0, stream>>>(
    B, N0, N1, M, K, O, aggregate, points_data, centers_data, scores_data, knn_idx_data, output_data);
CUDA_CHECK_ERRORS();
}
void assign_score_withk_backward_wrapper(int B, int N0, int N1, int M, int K, int O, int aggregate,
const at::Tensor& grad_out,
const at::Tensor& points,
const at::Tensor& centers,
const at::Tensor& scores,
const at::Tensor& knn_idx,
at::Tensor& grad_points,
at::Tensor& grad_centers,
at::Tensor& grad_scores) {
CHECK_CONTIGUOUS(grad_out);
CHECK_CONTIGUOUS(scores);
CHECK_CONTIGUOUS(points);
CHECK_CONTIGUOUS(centers);
CHECK_CONTIGUOUS(knn_idx);
CHECK_CONTIGUOUS(grad_scores);
CHECK_CONTIGUOUS(grad_points);
CHECK_CONTIGUOUS(grad_centers);
const float* grad_out_data = grad_out.data_ptr<float>();
const float* points_data = points.data_ptr<float>();
const float* centers_data = centers.data_ptr<float>();
const float* scores_data = scores.data_ptr<float>();
const int64_t* knn_idx_data = knn_idx.data_ptr<int64_t>();
float* grad_points_data = grad_points.data_ptr<float>();
float* grad_centers_data = grad_centers.data_ptr<float>();
float* grad_scores_data = grad_scores.data_ptr<float>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(DIVUP(B*M*O, THREADS_PER_BLOCK));
dim3 threads1(THREADS_PER_BLOCK);
dim3 blocks2(DIVUP(B*N1*K*M, THREADS_PER_BLOCK));
dim3 threads2(THREADS_PER_BLOCK);
assign_score_withk_backward_points_kernel<<<blocks1, threads1, 0, stream>>>(
    B, N0, N1, M, K, O, aggregate, grad_out_data, scores_data, knn_idx_data, grad_points_data, grad_centers_data);
assign_score_withk_backward_scores_kernel<<<blocks2, threads2, 0, stream>>>(
    B, N0, N1, M, K, O, aggregate, grad_out_data, points_data, centers_data, knn_idx_data, grad_scores_data);
CUDA_CHECK_ERRORS();
}
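For verification, a dense PyTorch re-implementation of the 'sum' path of the forward kernel above; it materializes the gathered neighbor features, which is exactly the memory cost the CUDA op avoids. This is a sketch, not part of the package.

import torch

def assign_score_withk_reference(scores, points, centers, knn_idx):
    # scores: (B, npoint, K, M); points/centers: (B, N, M, O)
    # knn_idx: (B, npoint, K), int64, first neighbor is the center
    B, npoint, K, M = scores.shape
    O = points.shape[-1]
    flat = knn_idx.reshape(B, npoint * K)
    p = points.gather(1, flat[:, :, None, None].expand(-1, -1, M, O))
    p = p.view(B, npoint, K, M, O)
    cidx = knn_idx[:, :, 0]
    c = centers.gather(1, cidx[:, :, None, None].expand(-1, -1, M, O))
    c = c.view(B, npoint, 1, M, O)
    # weighted sum over the M weight matrices -> (B, O, npoint, K)
    return torch.einsum('bnkm,bnkmo->bonk', scores, p - c)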
@@ -3,11 +3,10 @@ from typing import List
 import torch
 from mmcv.cnn import ConvModule
+from mmcv.ops import three_interpolate, three_nn
 from mmcv.runner import BaseModule, force_fp32
 from torch import nn as nn
-from mmdet3d.ops import three_interpolate, three_nn
 class PointFPModule(BaseModule):
     """Point feature propagation module used in PointNets.
 # Copyright (c) OpenMMLab. All rights reserved.
 import torch
 from mmcv.cnn import ConvModule
+from mmcv.ops import GroupAll
+from mmcv.ops import PointsSampler as Points_Sampler
+from mmcv.ops import QueryAndGroup, gather_points
 from torch import nn as nn
 from torch.nn import functional as F
-from mmdet3d.ops import (GroupAll, PAConv, Points_Sampler, QueryAndGroup,
-                         gather_points)
+from mmdet3d.ops import PAConv
 from .builder import SA_MODULES
# Copyright (c) OpenMMLab. All rights reserved.
from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
points_in_boxes_part)
from .roiaware_pool3d import RoIAwarePool3d
__all__ = [
'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu',
'points_in_boxes_all'
]
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from . import roiaware_pool3d_ext
def points_in_boxes_part(points, boxes):
"""Find the box in which each point is (CUDA).
Args:
points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
boxes (torch.Tensor): [B, T, 7],
num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in
LiDAR/DEPTH coordinate, (x, y, z) is the bottom center
Returns:
box_idxs_of_pts (torch.Tensor): (B, M), default background = -1
"""
assert points.shape[0] == boxes.shape[0], \
f'Points and boxes should have the same batch size, ' \
f'got {points.shape[0]} and {boxes.shape[0]}'
assert boxes.shape[2] == 7, \
f'boxes dimension should be 7, ' \
f'got unexpected shape {boxes.shape[2]}'
assert points.shape[2] == 3, \
f'points dimension should be 3, ' \
f'got unexpected shape {points.shape[2]}'
batch_size, num_points, _ = points.shape
box_idxs_of_pts = points.new_zeros((batch_size, num_points),
dtype=torch.int).fill_(-1)
# If manually put the tensor 'points' or 'boxes' on a device
# which is not the current device, some temporary variables
# will be created on the current device in the cuda op,
# and the output will be incorrect.
# Therefore, we force the current device to be the same
# as the device of the tensors if it was not.
# Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305
# for the incorrect output before the fix.
points_device = points.get_device()
assert points_device == boxes.get_device(), \
'Points and boxes should be put on the same device'
if torch.cuda.current_device() != points_device:
torch.cuda.set_device(points_device)
roiaware_pool3d_ext.points_in_boxes_part(boxes.contiguous(),
points.contiguous(),
box_idxs_of_pts)
return box_idxs_of_pts
def points_in_boxes_cpu(points, boxes):
"""Find all boxes in which each point is (CPU). The CPU version of
:meth:`points_in_boxes_all`.
Args:
points (torch.Tensor): [B, M, 3], [x, y, z] in
LiDAR/DEPTH coordinate
boxes (torch.Tensor): [B, T, 7],
num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
(x, y, z) is the bottom center.
Returns:
box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
"""
assert points.shape[0] == boxes.shape[0], \
f'Points and boxes should have the same batch size, ' \
f'got {points.shape[0]} and {boxes.shape[0]}'
assert boxes.shape[2] == 7, \
f'boxes dimension should be 7, ' \
f'got unexpected shape {boxes.shape[2]}'
assert points.shape[2] == 3, \
f'points dimension should be 3, ' \
f'got unexpected shape {points.shape[2]}'
batch_size, num_points, _ = points.shape
num_boxes = boxes.shape[1]
point_indices = points.new_zeros((batch_size, num_boxes, num_points),
dtype=torch.int)
for b in range(batch_size):
roiaware_pool3d_ext.points_in_boxes_cpu(boxes[b].float().contiguous(),
points[b].float().contiguous(),
point_indices[b])
point_indices = point_indices.transpose(1, 2)
return point_indices
def points_in_boxes_all(points, boxes):
"""Find all boxes in which each point is (CUDA).
Args:
points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
boxes (torch.Tensor): [B, T, 7],
num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
(x, y, z) is the bottom center.
Returns:
box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
"""
    assert boxes.shape[0] == points.shape[0], \
        f'Points and boxes should have the same batch size, ' \
        f'got {points.shape[0]} and {boxes.shape[0]}'
assert boxes.shape[2] == 7, \
f'boxes dimension should be 7, ' \
f'got unexpected shape {boxes.shape[2]}'
assert points.shape[2] == 3, \
f'points dimension should be 3, ' \
f'got unexpected shape {points.shape[2]}'
batch_size, num_points, _ = points.shape
num_boxes = boxes.shape[1]
    box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
                                       dtype=torch.int)
# Same reason as line 25-32
points_device = points.get_device()
assert points_device == boxes.get_device(), \
'Points and boxes should be put on the same device'
if torch.cuda.current_device() != points_device:
torch.cuda.set_device(points_device)
roiaware_pool3d_ext.points_in_boxes_all(boxes.contiguous(),
points.contiguous(),
box_idxs_of_pts)
return box_idxs_of_pts
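A small, hedged usage sketch of the three wrappers above (random values; a CUDA device is assumed for the two GPU variants):

import torch

points = torch.rand(2, 128, 3, device='cuda') * 20                # (B, M, 3)
boxes = torch.tensor([[[10., 10., 0., 4., 2., 1.6, 0.3]]],
                     device='cuda').repeat(2, 1, 1)               # (B, T, 7)

part = points_in_boxes_part(points, boxes)  # (B, M), box index or -1
full = points_in_boxes_all(points, boxes)   # (B, M, T), 0/1 per box
cpu = points_in_boxes_cpu(points.cpu(), boxes.cpu())  # (B, M, T) on CPU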
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import torch
from torch import nn as nn
from torch.autograd import Function
from . import roiaware_pool3d_ext
class RoIAwarePool3d(nn.Module):
    """RoIAwarePool3d module.

    Args:
        out_size (int or tuple): The output voxel size, n or [n1, n2, n3].
        max_pts_per_voxel (int): The maximum number of points per voxel, m.
        mode (str): Pooling method, 'max' or 'avg'.
    """

    def __init__(self, out_size, max_pts_per_voxel=128, mode='max'):
        super().__init__()
        self.out_size = out_size
        self.max_pts_per_voxel = max_pts_per_voxel
        assert mode in ['max', 'avg']
        pool_method_map = {'max': 0, 'avg': 1}
        self.mode = pool_method_map[mode]
def forward(self, rois, pts, pts_feature):
"""RoIAwarePool3d module forward.
Args:
rois (torch.Tensor): [N, 7],in LiDAR coordinate,
(x, y, z) is the bottom center of rois
pts (torch.Tensor): [npoints, 3]
pts_feature (torch.Tensor): [npoints, C]
Returns:
pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
"""
return RoIAwarePool3dFunction.apply(rois, pts, pts_feature,
self.out_size,
self.max_pts_per_voxel, self.mode)
class RoIAwarePool3dFunction(Function):
@staticmethod
def forward(ctx, rois, pts, pts_feature, out_size, max_pts_per_voxel,
mode):
"""RoIAwarePool3d function forward.
Args:
rois (torch.Tensor): [N, 7], in LiDAR coordinate,
(x, y, z) is the bottom center of rois
pts (torch.Tensor): [npoints, 3]
pts_feature (torch.Tensor): [npoints, C]
out_size (int or tuple): n or [n1, n2, n3]
max_pts_per_voxel (int): m
mode (int): 0 (max pool) or 1 (average pool)
Returns:
pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
"""
if isinstance(out_size, int):
out_x = out_y = out_z = out_size
else:
assert len(out_size) == 3
assert mmcv.is_tuple_of(out_size, int)
out_x, out_y, out_z = out_size
num_rois = rois.shape[0]
num_channels = pts_feature.shape[-1]
num_pts = pts.shape[0]
pooled_features = pts_feature.new_zeros(
(num_rois, out_x, out_y, out_z, num_channels))
argmax = pts_feature.new_zeros(
(num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int)
pts_idx_of_voxels = pts_feature.new_zeros(
(num_rois, out_x, out_y, out_z, max_pts_per_voxel),
dtype=torch.int)
roiaware_pool3d_ext.forward(rois, pts, pts_feature, argmax,
pts_idx_of_voxels, pooled_features, mode)
ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode,
num_pts, num_channels)
return pooled_features
@staticmethod
def backward(ctx, grad_out):
"""RoIAwarePool3d function forward.
Args:
grad_out (torch.Tensor): [N, out_x, out_y, out_z, C]
Returns:
grad_in (torch.Tensor): [npoints, C]
"""
ret = ctx.roiaware_pool3d_for_backward
pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret
grad_in = grad_out.new_zeros((num_pts, num_channels))
roiaware_pool3d_ext.backward(pts_idx_of_voxels, argmax,
grad_out.contiguous(), grad_in, mode)
return None, None, grad_in, None, None, None
if __name__ == '__main__':
pass
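A minimal usage sketch of the module above, assuming CUDA tensors (values are illustrative):

import torch

pool = RoIAwarePool3d(out_size=4, max_pts_per_voxel=128, mode='max')
rois = torch.tensor([[0., 0., -1., 3., 3., 2., 0.]], device='cuda')  # (N, 7)
pts = torch.rand(256, 3, device='cuda') * 3 - 1.5                    # (npoints, 3)
pts_feature = torch.rand(256, 16, device='cuda')                     # (npoints, C)

pooled = pool(rois, pts, pts_feature)
assert pooled.shape == (1, 4, 4, 4, 16)  # (N, out_x, out_y, out_z, C)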
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
// #define DEBUG
inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
float &local_x, float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
  // cz is the z of the bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor pts_indices_tensor) {
  // params boxes: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, boxes should not overlap
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_indices: (N, npoints)
CHECK_CONTIGUOUS(boxes_tensor);
CHECK_CONTIGUOUS(pts_tensor);
CHECK_CONTIGUOUS(pts_indices_tensor);
int boxes_num = boxes_tensor.size(0);
int pts_num = pts_tensor.size(0);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *pts_indices = pts_indices_tensor.data_ptr<int>();
float local_x = 0, local_y = 0;
for (int i = 0; i < boxes_num; i++) {
for (int j = 0; j < pts_num; j++) {
int cur_in_flag =
check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
pts_indices[i * pts_num + j] = cur_in_flag;
}
}
return 1;
}
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
float rz, float &local_x,
float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
  // cz is the z of the bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
__global__ void points_in_boxes_part_kernel(int batch_size, int boxes_num,
int pts_num, const float *boxes,
const float *pts,
int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, boxes should not overlap
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params box_idx_of_points: (B, npoints), default -1
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= batch_size || pt_idx >= pts_num) return;
boxes += bs_idx * boxes_num * 7;
pts += bs_idx * pts_num * 3 + pt_idx * 3;
box_idx_of_points += bs_idx * pts_num + pt_idx;
float local_x = 0, local_y = 0;
int cur_in_flag = 0;
for (int k = 0; k < boxes_num; k++) {
cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
if (cur_in_flag) {
box_idx_of_points[0] = k;
break;
}
}
}
__global__ void points_in_boxes_all_kernel(int batch_size, int boxes_num,
int pts_num, const float *boxes,
const float *pts,
int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params box_idx_of_points: (B, npoints, N), default 0
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= batch_size || pt_idx >= pts_num) return;
boxes += bs_idx * boxes_num * 7;
pts += bs_idx * pts_num * 3 + pt_idx * 3;
box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
float local_x = 0, local_y = 0;
int cur_in_flag = 0;
for (int k = 0; k < boxes_num; k++) {
cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
if (cur_in_flag) {
box_idx_of_points[k] = 1;
}
cur_in_flag = 0;
}
}
void points_in_boxes_part_launcher(int batch_size, int boxes_num, int pts_num,
const float *boxes, const float *pts,
int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, boxes should not overlap
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params box_idx_of_points: (B, npoints), default -1
cudaError_t err;
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
points_in_boxes_part_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
boxes, pts, box_idx_of_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
void points_in_boxes_all_launcher(int batch_size, int boxes_num, int pts_num,
const float *boxes, const float *pts,
int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params box_idx_of_points: (B, npoints, N), default 0
cudaError_t err;
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
points_in_boxes_all_kernel<<<blocks, threads>>>(
batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, boxes should not overlap
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params box_idx_of_points: (B, npoints), default -1
CHECK_INPUT(boxes_tensor);
CHECK_INPUT(pts_tensor);
CHECK_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_part_launcher(batch_size, boxes_num, pts_num, boxes, pts,
box_idx_of_points);
return 1;
}
int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params box_idx_of_points: (B, npoints, N), default 0
CHECK_INPUT(boxes_tensor);
CHECK_INPUT(pts_tensor);
CHECK_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_all_launcher(batch_size, boxes_num, pts_num, boxes, pts,
box_idx_of_points);
return 1;
}
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *rois, const float *pts,
const float *pts_feature, int *argmax,
int *pts_idx_of_voxels, float *pooled_features,
int pool_method);
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
int out_z, int channels,
int max_pts_each_voxel,
const int *pts_idx_of_voxels,
const int *argmax, const float *grad_out,
float *grad_in, int pool_method);
int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
at::Tensor argmax, at::Tensor pts_idx_of_voxels,
at::Tensor pooled_features, int pool_method);
int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
at::Tensor argmax, at::Tensor grad_out,
at::Tensor grad_in, int pool_method);
int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor pts_indices_tensor);
int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor);
int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor);
int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
at::Tensor argmax, at::Tensor pts_idx_of_voxels,
at::Tensor pooled_features, int pool_method) {
  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
CHECK_INPUT(rois);
CHECK_INPUT(pts);
CHECK_INPUT(pts_feature);
CHECK_INPUT(argmax);
CHECK_INPUT(pts_idx_of_voxels);
CHECK_INPUT(pooled_features);
int boxes_num = rois.size(0);
int pts_num = pts.size(0);
int channels = pts_feature.size(1);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
  assert((out_x < 256) && (out_y < 256) &&
         (out_z < 256));  // voxel indices are encoded with 8 bits each
const float *rois_data = rois.data_ptr<float>();
const float *pts_data = pts.data_ptr<float>();
const float *pts_feature_data = pts_feature.data_ptr<float>();
int *argmax_data = argmax.data_ptr<int>();
int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
float *pooled_features_data = pooled_features.data_ptr<float>();
roiaware_pool3d_launcher(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
rois_data, pts_data, pts_feature_data, argmax_data,
pts_idx_of_voxels_data, pooled_features_data, pool_method);
return 1;
}
int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
at::Tensor argmax, at::Tensor grad_out,
at::Tensor grad_in, int pool_method) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool 1: avg_pool
CHECK_INPUT(pts_idx_of_voxels);
CHECK_INPUT(argmax);
CHECK_INPUT(grad_out);
CHECK_INPUT(grad_in);
int boxes_num = pts_idx_of_voxels.size(0);
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int channels = grad_out.size(4);
const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
const int *argmax_data = argmax.data_ptr<int>();
const float *grad_out_data = grad_out.data_ptr<float>();
float *grad_in_data = grad_in.data_ptr<float>();
roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels,
max_pts_each_voxel, pts_idx_of_voxels_data,
argmax_data, grad_out_data, grad_in_data,
pool_method);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
m.def("backward", &roiaware_pool3d_gpu_backward,
"roiaware pool3d backward (CUDA)");
m.def("points_in_boxes_part", &points_in_boxes_part,
"points_in_boxes_part forward (CUDA)");
m.def("points_in_boxes_all", &points_in_boxes_all,
"points_in_boxes_all forward (CUDA)");
m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
"points_in_boxes_cpu forward (CPU)");
}
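These bindings are normally compiled through the package's setup.py; as a hedged alternative for quick experiments, PyTorch's JIT extension loader can build them ad hoc (the file names below are assumptions based on the sources shown here):

from torch.utils.cpp_extension import load

roiaware_pool3d_ext = load(
    name='roiaware_pool3d_ext',
    sources=[
        'roiaware_pool3d.cpp',        # the pybind11 bindings above
        'points_in_boxes_cpu.cpp',
        'points_in_boxes_cuda.cu',
        'roiaware_pool3d_kernel.cu',
    ],
    verbose=True)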
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
float rz, float &local_x,
float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
  // cz is the z of the bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
int out_x, int out_y, int out_z,
const float *rois, const float *pts,
int *pts_mask) {
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z]
  // params pts_mask: (N, npoints): -1 means the point is not in this box;
  // otherwise its (x_idx, y_idx, z_idx) voxel coordinates are bit-packed
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
if (pt_idx >= pts_num || box_idx >= boxes_num) return;
pts += pt_idx * 3;
rois += box_idx * 7;
pts_mask += box_idx * pts_num + pt_idx;
float local_x = 0, local_y = 0;
int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
pts_mask[0] = -1;
if (cur_in_flag > 0) {
float local_z = pts[2] - rois[2];
float x_size = rois[3], y_size = rois[4], z_size = rois[5];
float x_res = x_size / out_x;
float y_res = y_size / out_y;
float z_res = z_size / out_z;
unsigned int x_idx = int((local_x + x_size / 2) / x_res);
unsigned int y_idx = int((local_y + y_size / 2) / y_res);
unsigned int z_idx = int(local_z / z_res);
x_idx = min(max(x_idx, 0), out_x - 1);
y_idx = min(max(y_idx, 0), out_y - 1);
z_idx = min(max(z_idx, 0), out_z - 1);
unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
#ifdef DEBUG
printf(
"mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, "
"%d), res(%.3f, %.3f, %.3f), idx_encoding=%x\n",
pt_idx, pts[0], pts[1], pts[2], local_x, local_y, local_z, x_idx, y_idx,
z_idx, x_res, y_res, z_res, idx_encoding);
#endif
pts_mask[0] = idx_encoding;
}
}
__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
int max_pts_each_voxel, int out_x,
int out_y, int out_z,
const int *pts_mask,
int *pts_idx_of_voxels) {
  // params pts_mask: (N, npoints), -1 or a bit-packed voxel index
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (box_idx >= boxes_num) return;
int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
for (int k = 0; k < pts_num; k++) {
if (pts_mask[box_idx * pts_num + k] != -1) {
unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
unsigned int z_idx = idx_encoding & 0xFF;
unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
y_idx * out_z * max_pts_each_voxel +
z_idx * max_pts_each_voxel;
unsigned int cnt = pts_idx_of_voxels[base_offset];
if (cnt < max_num_pts) {
pts_idx_of_voxels[base_offset + cnt + 1] = k;
pts_idx_of_voxels[base_offset]++;
}
#ifdef DEBUG
printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n", k, x_idx,
y_idx, z_idx, idx_encoding);
#endif
}
}
}
__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *pts_feature,
const int *pts_idx_of_voxels,
float *pooled_features, int *argmax) {
  // params pts_feature: (npoints, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  //   index 0 is the counter
  // params pooled_features: (N, out_x, out_y, out_z, C)
  // params argmax: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
#ifdef DEBUG
printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels,
argmax);
#endif
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
pooled_features += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
argmax += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
  int argmax_idx = -1;
  float max_val = -3.4e38f;  // effectively -FLT_MAX; -1e50 would overflow float
int total_pts = pts_idx_of_voxels[0];
for (int k = 1; k <= total_pts; k++) {
if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) {
max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
argmax_idx = pts_idx_of_voxels[k];
}
}
if (argmax_idx != -1) {
pooled_features[0] = max_val;
}
argmax[0] = argmax_idx;
#ifdef DEBUG
printf(
"channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after "
"pts_idx: %p, argmax: (%p, %d)\n",
channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts,
pts_idx_of_voxels, argmax, argmax_idx);
#endif
}
__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *pts_feature,
const int *pts_idx_of_voxels,
float *pooled_features) {
  // params pts_feature: (npoints, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  //   index 0 is the counter
  // params pooled_features: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
pooled_features += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
float sum_val = 0;
int total_pts = pts_idx_of_voxels[0];
for (int k = 1; k <= total_pts; k++) {
sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
}
if (total_pts > 0) {
pooled_features[0] = sum_val / total_pts;
}
}
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *rois, const float *pts,
const float *pts_feature, int *argmax,
int *pts_idx_of_voxels, float *pooled_features,
int pool_method) {
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
int *pts_mask = NULL;
cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int)); // (N, M)
cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int));
dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
dim3 threads(THREADS_PER_BLOCK);
generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(
boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask);
// TODO: Merge the collect and pool functions, SS
dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(
boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z, pts_mask,
pts_idx_of_voxels);
dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
boxes_num);
if (pool_method == 0) {
roiaware_maxpool3d<<<blocks_pool, threads>>>(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
pts_feature, pts_idx_of_voxels, pooled_features, argmax);
} else if (pool_method == 1) {
roiaware_avgpool3d<<<blocks_pool, threads>>>(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
pts_feature, pts_idx_of_voxels, pooled_features);
}
cudaFree(pts_mask);
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
int out_x, int out_y, int out_z,
const int *argmax,
const float *grad_out,
float *grad_in) {
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
argmax += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
grad_out += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
if (argmax[0] == -1) return;
  atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0]);
}
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
int out_x, int out_y, int out_z,
int max_pts_each_voxel,
const int *pts_idx_of_voxels,
const float *grad_out,
float *grad_in) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
grad_out += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
int total_pts = pts_idx_of_voxels[0];
float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
for (int k = 1; k <= total_pts; k++) {
atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
grad_out[0] * cur_grad);
}
}
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
int out_z, int channels,
int max_pts_each_voxel,
const int *pts_idx_of_voxels,
const int *argmax, const float *grad_out,
float *grad_in, int pool_method) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool, 1: avg_pool
dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
boxes_num);
dim3 threads(THREADS_PER_BLOCK);
if (pool_method == 0) {
roiaware_maxpool3d_backward<<<blocks, threads>>>(
boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in);
} else if (pool_method == 1) {
roiaware_avgpool3d_backward<<<blocks, threads>>>(
boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,
pts_idx_of_voxels, grad_out, grad_in);
}
}
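The mask and collect kernels above communicate voxel coordinates through a single packed integer (x in bits 16 and up, y in bits 8 to 15, z in bits 0 to 7, which is why each output dimension must stay below 256). A tiny Python round-trip of that encoding:

def encode_voxel(x_idx, y_idx, z_idx):
    # mirrors: idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx
    return (x_idx << 16) + (y_idx << 8) + z_idx

def decode_voxel(enc):
    # mirrors the unpacking in collect_inside_pts_for_box3d
    return (enc >> 16) & 0xFF, (enc >> 8) & 0xFF, enc & 0xFF

assert decode_voxel(encode_voxel(5, 7, 3)) == (5, 7, 3)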
# Copyright (c) OpenMMLab. All rights reserved.
from .roipoint_pool3d import RoIPointPool3d
__all__ = ['RoIPointPool3d']
# Copyright (c) OpenMMLab. All rights reserved.
from torch import nn as nn
from torch.autograd import Function
from . import roipoint_pool3d_ext
class RoIPointPool3d(nn.Module):
    """Pool features of the points inside RoIs.

    Args:
        num_sampled_points (int): Number of points sampled in each roi.
    """

    def __init__(self, num_sampled_points=512):
        super().__init__()
        self.num_sampled_points = num_sampled_points
def forward(self, points, point_features, boxes3d):
"""
Args:
points (torch.Tensor): Input points whose shape is BxNx3
point_features: (B, N, C)
boxes3d: (B, M, 7), [x, y, z, dx, dy, dz, heading]
Returns:
torch.Tensor: (B, M, 512, 3 + C) pooled_features
torch.Tensor: (B, M) pooled_empty_flag
"""
return RoIPointPool3dFunction.apply(points, point_features, boxes3d,
self.num_sampled_points)
class RoIPointPool3dFunction(Function):
@staticmethod
def forward(ctx, points, point_features, boxes3d, num_sampled_points=512):
"""
Args:
points (torch.Tensor): Input points whose shape is (B, N, 3)
point_features (torch.Tensor): Input points features shape is \
(B, N, C)
boxes3d (torch.Tensor): Input bounding boxes whose shape is \
(B, M, 7)
num_sampled_points (int): the num of sampled points
Returns:
torch.Tensor: (B, M, 512, 3 + C) pooled_features
torch.Tensor: (B, M) pooled_empty_flag
"""
        assert len(points.shape) == 3 and points.shape[2] == 3
        batch_size = points.shape[0]
        boxes_num = boxes3d.shape[1]
        feature_len = point_features.shape[2]
pooled_boxes3d = boxes3d.view(batch_size, -1, 7)
pooled_features = point_features.new_zeros(
(batch_size, boxes_num, num_sampled_points, 3 + feature_len))
pooled_empty_flag = point_features.new_zeros(
(batch_size, boxes_num)).int()
roipoint_pool3d_ext.forward(points.contiguous(),
pooled_boxes3d.contiguous(),
point_features.contiguous(),
pooled_features, pooled_empty_flag)
return pooled_features, pooled_empty_flag
@staticmethod
def backward(ctx, grad_out):
raise NotImplementedError
if __name__ == '__main__':
pass
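A hedged usage sketch for the module above (random CUDA tensors; shapes follow the forward docstring):

import torch

pool = RoIPointPool3d(num_sampled_points=512)
points = torch.rand(1, 1024, 3, device='cuda') * 10           # (B, N, 3)
point_features = torch.rand(1, 1024, 8, device='cuda')        # (B, N, C)
boxes3d = torch.tensor([[[5., 5., 0., 4., 2., 1.6, 0.]]], device='cuda')

pooled, empty_flag = pool(points, point_features, boxes3d)
# pooled: (1, 1, 512, 3 + 8); empty_flag: (1, 1), 1 where a box caught no points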
/*
Modified from
https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#define CHECK_CUDA(x) do { \
    if (!x.is_cuda()) { \
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
if (!x.is_contiguous()) { \
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag);
int roipool3d_gpu(at::Tensor xyz, at::Tensor boxes3d, at::Tensor pts_feature, at::Tensor pooled_features, at::Tensor pooled_empty_flag){
// params xyz: (B, N, 3)
// params boxes3d: (B, M, 7)
// params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M)
CHECK_INPUT(xyz);
CHECK_INPUT(boxes3d);
CHECK_INPUT(pts_feature);
CHECK_INPUT(pooled_features);
CHECK_INPUT(pooled_empty_flag);
int batch_size = xyz.size(0);
int pts_num = xyz.size(1);
int boxes_num = boxes3d.size(1);
int feature_in_len = pts_feature.size(2);
int sampled_pts_num = pooled_features.size(2);
    const float * xyz_data = xyz.data_ptr<float>();
    const float * boxes3d_data = boxes3d.data_ptr<float>();
    const float * pts_feature_data = pts_feature.data_ptr<float>();
    float * pooled_features_data = pooled_features.data_ptr<float>();
    int * pooled_empty_flag_data = pooled_empty_flag.data_ptr<int>();
roipool3dLauncher(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
xyz_data, boxes3d_data, pts_feature_data, pooled_features_data, pooled_empty_flag_data);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roipool3d_gpu, "roipool3d forward (CUDA)");
}
/*
Modified from
https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <math.h>
#include <stdio.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
float rz, float &local_x,
float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the
// bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];
cz += dz / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > dz / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) &
(local_y > -dy / 2.0) & (local_y < dy / 2.0);
return in_flag;
}
__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, const float *xyz, const float *boxes3d, int *pts_assign){
// params xyz: (B, N, 3)
// params boxes3d: (B, M, 7)
  // params pts_assign: (B, N, M), 1 if the point lies in the box else 0
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
int bs_idx = blockIdx.z;
if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
return;
}
int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
pts_assign[assign_idx] = 0;
int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
float local_x = 0, local_y = 0;
int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, local_x, local_y);
pts_assign[assign_idx] = cur_in_flag;
// printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx, pts_assign[bs_idx * pts_num + pt_idx]);
}
__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, int sampled_pts_num,
const int *pts_assign, int *pts_idx, int *pooled_empty_flag){
  // params pts_assign: (B, N, M)
  // params pts_idx: (B, M, sampled_pts_num)
  // params pooled_empty_flag: (B, M)
int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (boxes_idx >= boxes_num){
return;
}
int bs_idx = blockIdx.y;
int cnt = 0;
for (int k = 0; k < pts_num; k++){
if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]){
if (cnt < sampled_pts_num){
pts_idx[bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num + cnt] = k;
cnt++;
}
else break;
}
}
if (cnt == 0){
pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
}
else if (cnt < sampled_pts_num){
// duplicate same points for sampling
for (int k = cnt; k < sampled_pts_num; k++){
int duplicate_idx = k % cnt;
int base_offset = bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
}
}
}
__global__ void roipool3d_forward(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
const float *xyz, const int *pts_idx, const float *pts_feature,
float *pooled_features, int *pooled_empty_flag){
// params xyz: (B, N, 3)
// params pts_idx: (B, M, 512)
// params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M)
int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
int bs_idx = blockIdx.z;
if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
return;
}
if (pooled_empty_flag[bs_idx * boxes_num + box_idx]){
return;
}
int temp_idx = bs_idx * boxes_num * sampled_pts_num + box_idx * sampled_pts_num + sample_pt_idx;
int src_pt_idx = pts_idx[temp_idx];
int dst_feature_offset = temp_idx * (3 + feature_in_len);
for (int j = 0; j < 3; j++)
pooled_features[dst_feature_offset + j] = xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
int src_feature_offset = bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
for (int j = 0; j < feature_in_len; j++)
pooled_features[dst_feature_offset + 3 + j] = pts_feature[src_feature_offset + j];
}
void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag){
// printf("batch_size=%d, pts_num=%d, boxes_num=%d\n", batch_size, pts_num, boxes_num);
int *pts_assign = NULL;
cudaMalloc(&pts_assign, batch_size * pts_num * boxes_num * sizeof(int)); // (batch_size, N, M)
// cudaMemset(&pts_assign, -1, batch_size * pts_num * boxes_num * sizeof(int));
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
assign_pts_to_box3d<<<blocks, threads>>>(batch_size, pts_num, boxes_num, xyz, boxes3d, pts_assign);
int *pts_idx = NULL;
cudaMalloc(&pts_idx, batch_size * boxes_num * sampled_pts_num * sizeof(int)); // (batch_size, M, sampled_pts_num)
dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size); // blockIdx.x(col), blockIdx.y(row)
get_pooled_idx<<<blocks2, threads>>>(batch_size, pts_num, boxes_num, sampled_pts_num, pts_assign, pts_idx, pooled_empty_flag);
dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
roipool3d_forward<<<blocks_pool, threads>>>(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
xyz, pts_idx, pts_feature, pooled_features, pooled_empty_flag);
cudaFree(pts_assign);
cudaFree(pts_idx);
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
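For intuition, a rough PyTorch rendering of what the three kernels above compute together for one batch element. This is a sketch under the stated shapes, not the package's API; like the CUDA path, it duplicates indices to pad each box up to num_sampled points.

import torch

def roipoint_pool3d_reference(points, feats, mask, num_sampled):
    # points: (N, 3), feats: (N, C), mask: (M, N) bool point-in-box flags
    pooled, empty = [], []
    for box_mask in mask:                       # one box at a time
        idx = box_mask.nonzero(as_tuple=False).flatten()
        if idx.numel() == 0:
            pooled.append(points.new_zeros(num_sampled, 3 + feats.shape[1]))
            empty.append(1)
            continue
        sel = torch.arange(num_sampled, device=idx.device) % idx.numel()
        idx = idx[sel]                          # truncate or duplicate to pad
        pooled.append(torch.cat([points[idx], feats[idx]], dim=1))
        empty.append(0)
    return torch.stack(pooled), torch.tensor(empty)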