"vscode:/vscode.git/clone" did not exist on "d0ec0fbeeb83670ad8c7183ebfef6282259daf6f"
Commit 006b7ccf authored by wuyuefeng, committed by zhangwenwei

Feat pointnet ops

parent 28e511cd
mmdet3d/models/roi_heads/__init__.py

@@ -2,9 +2,9 @@ from .base_3droi_head import Base3DRoIHead
 from .bbox_heads import PartA2BboxHead
 from .mask_heads import PointwiseSemanticHead
 from .part_aggregation_roi_head import PartAggregationROIHead
-from .roi_extractors import Single3DRoIAwareExtractor
+from .roi_extractors import Single3DRoIAwareExtractor, SingleRoIExtractor

 __all__ = [
     'Base3DRoIHead', 'PartAggregationROIHead', 'PointwiseSemanticHead',
-    'Single3DRoIAwareExtractor', 'PartA2BboxHead'
+    'Single3DRoIAwareExtractor', 'PartA2BboxHead', 'SingleRoIExtractor'
 ]
mmdet3d/ops/__init__.py

@@ -1,6 +1,11 @@
 from mmdet.ops import (RoIAlign, SigmoidFocalLoss, get_compiler_version,
                        get_compiling_cuda_version, nms, roi_align,
                        sigmoid_focal_loss)
+from .ball_query import ball_query
+from .furthest_point_sample import furthest_point_sample
+from .gather_points import gather_points
+from .group_points import group_points, grouping_operation
+from .interpolate import three_interpolate, three_nn
 from .norm import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d
 from .roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_cpu,
                               points_in_boxes_gpu)
@@ -15,5 +20,7 @@ __all__ = [
     'dynamic_scatter', 'DynamicScatter', 'sigmoid_focal_loss',
     'SigmoidFocalLoss', 'SparseBasicBlock', 'SparseBottleneck',
     'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu',
-    'make_sparse_convmodule'
+    'make_sparse_convmodule', 'ball_query', 'furthest_point_sample',
+    'three_interpolate', 'three_nn', 'gather_points', 'grouping_operation',
+    'group_points'
 ]
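With these exports, the new ops become importable from mmdet3d.ops. A minimal smoke-test sketch (not part of the commit; assumes the CUDA extensions are built and a GPU is available):

import torch
from mmdet3d.ops import ball_query, furthest_point_sample, gather_points

xyz = torch.rand(2, 1024, 3).cuda()                    # (B, N, 3) input points
idx = furthest_point_sample(xyz, 128)                  # (B, 128) int32 indices
flipped = xyz.transpose(1, 2).contiguous()             # (B, 3, N)
centers = gather_points(flipped, idx)                  # (B, 3, 128)
ball_idx = ball_query(0.4, 32, xyz,
                      centers.transpose(1, 2).contiguous())  # (B, 128, 32)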
mmdet3d/ops/ball_query/__init__.py (new file)

from .ball_query import ball_query

__all__ = ['ball_query']
mmdet3d/ops/ball_query/ball_query.py (new file)

import torch
from torch.autograd import Function
from . import ball_query_ext
class BallQuery(Function):
"""Ball Query
Find nearby points in spherical space.
"""
@staticmethod
def forward(ctx, radius: float, sample_num: int, xyz: torch.Tensor,
center_xyz: torch.Tensor) -> torch.Tensor:
"""forward.
Args:
radius (float): radius of the balls.
sample_num (int): maximum number of features in the balls.
xyz (Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (Tensor): (B, npoint, 3) centers of the ball query.
Returns:
Tensor: (B, npoint, nsample) tensor with the indicies of
the features that form the query balls.
"""
assert center_xyz.is_contiguous()
assert xyz.is_contiguous()
B, N, _ = xyz.size()
npoint = center_xyz.size(1)
idx = torch.cuda.IntTensor(B, npoint, sample_num).zero_()
ball_query_ext.ball_query_wrapper(B, N, npoint, radius, sample_num,
center_xyz, xyz, idx)
ctx.mark_non_differentiable(idx)
return idx
@staticmethod
def backward(ctx, a=None):
return None, None, None, None
ball_query = BallQuery.apply
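A hedged usage sketch of the autograd wrapper above (all inputs must be contiguous CUDA tensors; the radius and sample count below are arbitrary):

import torch
from mmdet3d.ops import ball_query

B, N, npoint = 2, 1024, 128
xyz = torch.rand(B, N, 3).cuda()
center_xyz = xyz[:, :npoint, :].contiguous()   # in practice, e.g. FPS centers
idx = ball_query(0.4, 32, xyz, center_xyz)     # (B, npoint, 32), torch.int32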
mmdet3d/ops/ball_query/src/ball_query.cpp (new file)

#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
extern THCState *state;
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDA tensor ")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
int ball_query_wrapper(int b, int n, int m, float radius, int nsample,
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
void ball_query_kernel_launcher(int b, int n, int m, float radius, int nsample,
const float *xyz, const float *new_xyz, int *idx, cudaStream_t stream);
int ball_query_wrapper(int b, int n, int m, float radius, int nsample,
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
CHECK_INPUT(new_xyz_tensor);
CHECK_INPUT(xyz_tensor);
const float *new_xyz = new_xyz_tensor.data<float>();
const float *xyz = xyz_tensor.data<float>();
int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state);
ball_query_kernel_launcher(b, n, m, radius, nsample, new_xyz, xyz, idx, stream);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("ball_query_wrapper", &ball_query_wrapper, "ball_query_wrapper");
}
mmdet3d/ops/ball_query/src/ball_query_cuda.cu (new file)

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
__global__ void ball_query_kernel(int b, int n, int m, float radius, int nsample,
const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) {
// new_xyz: (B, M, 3)
// xyz: (B, N, 3)
// output:
// idx: (B, M, nsample)
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || pt_idx >= m) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample;
float radius2 = radius * radius;
float new_x = new_xyz[0];
float new_y = new_xyz[1];
float new_z = new_xyz[2];
int cnt = 0;
for (int k = 0; k < n; ++k) {
float x = xyz[k * 3 + 0];
float y = xyz[k * 3 + 1];
float z = xyz[k * 3 + 2];
float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
if (d2 < radius2){
if (cnt == 0){
for (int l = 0; l < nsample; ++l) {
idx[l] = k;
}
}
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
}
}
}
void ball_query_kernel_launcher(int b, int n, int m, float radius, int nsample,
const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) {
// new_xyz: (B, M, 3)
// xyz: (B, N, 3)
// output:
// idx: (B, M, nsample)
cudaError_t err;
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
ball_query_kernel<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);
// cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
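The kernel's padding rule is worth spelling out: indices of points inside the radius are written in encounter order, the remaining slots are filled with the first in-ball index, and a row stays all zeros when no point falls inside the ball. A slow pure-PyTorch reference with the same semantics (a testing sketch, not part of the commit):

import torch

def ball_query_ref(radius, nsample, xyz, center_xyz):
    """xyz: (B, N, 3); center_xyz: (B, M, 3) -> (B, M, nsample) int64."""
    B, M, _ = center_xyz.shape
    out = torch.zeros(B, M, nsample, dtype=torch.int64)
    r2 = radius * radius
    for b in range(B):
        for i in range(M):
            d2 = ((xyz[b] - center_xyz[b, i]) ** 2).sum(-1)    # (N,)
            hits = (d2 < r2).nonzero(as_tuple=False).flatten()
            if hits.numel() == 0:
                continue                      # row stays zero, like the kernel
            k = min(nsample, hits.numel())
            out[b, i, :k] = hits[:k]
            out[b, i, k:] = hits[0]           # pad with the first in-ball index
    return out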
mmdet3d/ops/furthest_point_sample/__init__.py (new file)

from .furthest_point_sample import furthest_point_sample

__all__ = ['furthest_point_sample']
mmdet3d/ops/furthest_point_sample/furthest_point_sample.py (new file)

import torch
from torch.autograd import Function
from . import furthest_point_sample_ext
class FurthestPointSampling(Function):
"""Furthest Point Sampling.
Uses iterative furthest point sampling to select a set of
features whose corresponding points have the furthest distance.
"""
@staticmethod
def forward(ctx, points_xyz: torch.Tensor,
num_points: int) -> torch.Tensor:
"""forward.
Args:
points_xyz (Tensor): (B, N, 3) where N > num_points.
num_points (int): Number of points in the sampled set.
Returns:
Tensor: (B, num_points) indices of the sampled points.
"""
assert points_xyz.is_contiguous()
B, N, _ = points_xyz.size()
output = torch.cuda.IntTensor(B, num_points)
temp = torch.cuda.FloatTensor(B, N).fill_(1e10)
furthest_point_sample_ext.furthest_point_sampling_wrapper(
B, N, num_points, points_xyz, temp, output)
ctx.mark_non_differentiable(output)
return output
@staticmethod
    def backward(ctx, a=None):
        return None, None
furthest_point_sample = FurthestPointSampling.apply
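A usage sketch (the input must be a contiguous CUDA tensor; the sampled indices are int32 and can be fed to gather_points):

import torch
from mmdet3d.ops import furthest_point_sample

points = torch.rand(2, 4096, 3).cuda()        # (B, N, 3)
idx = furthest_point_sample(points, 1024)     # (B, 1024) indices of the subset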
mmdet3d/ops/furthest_point_sample/src/furthest_point_sample.cpp (new file)

#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>
extern THCState *state;
int furthest_point_sampling_wrapper(int b, int n, int m,
at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
void furthest_point_sampling_kernel_launcher(int b, int n, int m,
const float *dataset, float *temp, int *idxs, cudaStream_t stream);
int furthest_point_sampling_wrapper(int b, int n, int m,
at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) {
const float *points = points_tensor.data<float>();
float *temp = temp_tensor.data<float>();
int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state);
furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx, stream);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper");
}
mmdet3d/ops/furthest_point_sample/src/furthest_point_sample_cuda.cu (new file)

#include <cmath>  // std::log, used by opt_n_threads below
#include <stdio.h>
#include <stdlib.h>
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
inline int opt_n_threads(int work_size) {
const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
return max(min(1 << pow_2, TOTAL_THREADS), 1);
}
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2){
const float v1 = dists[idx1], v2 = dists[idx2];
const int i1 = dists_i[idx1], i2 = dists_i[idx2];
dists[idx1] = max(v1, v2);
dists_i[idx1] = v2 > v1 ? i2 : i1;
}
template <unsigned int block_size>
__global__ void furthest_point_sampling_kernel(int b, int n, int m,
const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) {
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
// idx: (B, M)
if (m <= 0) return;
__shared__ float dists[block_size];
__shared__ int dists_i[block_size];
int batch_index = blockIdx.x;
dataset += batch_index * n * 3;
temp += batch_index * n;
idxs += batch_index * m;
int tid = threadIdx.x;
const int stride = block_size;
int old = 0;
if (threadIdx.x == 0)
idxs[0] = old;
__syncthreads();
for (int j = 1; j < m; j++) {
int besti = 0;
float best = -1;
float x1 = dataset[old * 3 + 0];
float y1 = dataset[old * 3 + 1];
float z1 = dataset[old * 3 + 2];
for (int k = tid; k < n; k += stride) {
float x2, y2, z2;
x2 = dataset[k * 3 + 0];
y2 = dataset[k * 3 + 1];
z2 = dataset[k * 3 + 2];
// float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
// if (mag <= 1e-3)
// continue;
float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
float d2 = min(d, temp[k]);
temp[k] = d2;
besti = d2 > best ? k : besti;
best = d2 > best ? d2 : best;
}
dists[tid] = best;
dists_i[tid] = besti;
__syncthreads();
if (block_size >= 1024) {
if (tid < 512) {
__update(dists, dists_i, tid, tid + 512);
}
__syncthreads();
}
if (block_size >= 512) {
if (tid < 256) {
__update(dists, dists_i, tid, tid + 256);
}
__syncthreads();
}
if (block_size >= 256) {
if (tid < 128) {
__update(dists, dists_i, tid, tid + 128);
}
__syncthreads();
}
if (block_size >= 128) {
if (tid < 64) {
__update(dists, dists_i, tid, tid + 64);
}
__syncthreads();
}
if (block_size >= 64) {
if (tid < 32) {
__update(dists, dists_i, tid, tid + 32);
}
__syncthreads();
}
if (block_size >= 32) {
if (tid < 16) {
__update(dists, dists_i, tid, tid + 16);
}
__syncthreads();
}
if (block_size >= 16) {
if (tid < 8) {
__update(dists, dists_i, tid, tid + 8);
}
__syncthreads();
}
if (block_size >= 8) {
if (tid < 4) {
__update(dists, dists_i, tid, tid + 4);
}
__syncthreads();
}
if (block_size >= 4) {
if (tid < 2) {
__update(dists, dists_i, tid, tid + 2);
}
__syncthreads();
}
if (block_size >= 2) {
if (tid < 1) {
__update(dists, dists_i, tid, tid + 1);
}
__syncthreads();
}
old = dists_i[0];
if (tid == 0)
idxs[j] = old;
}
}
void furthest_point_sampling_kernel_launcher(int b, int n, int m,
const float *dataset, float *temp, int *idxs, cudaStream_t stream) {
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
// idx: (B, M)
cudaError_t err;
unsigned int n_threads = opt_n_threads(n);
switch (n_threads) {
case 1024:
furthest_point_sampling_kernel<1024><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
case 512:
furthest_point_sampling_kernel<512><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
case 256:
furthest_point_sampling_kernel<256><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
case 128:
furthest_point_sampling_kernel<128><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
case 64:
furthest_point_sampling_kernel<64><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
case 32:
furthest_point_sampling_kernel<32><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
case 16:
furthest_point_sampling_kernel<16><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
case 8:
furthest_point_sampling_kernel<8><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
case 4:
furthest_point_sampling_kernel<4><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
case 2:
furthest_point_sampling_kernel<2><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
case 1:
furthest_point_sampling_kernel<1><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
default:
furthest_point_sampling_kernel<512><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
}
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
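Each outer iteration of the kernel above performs one greedy step: temp holds, per point, the squared distance to the nearest already-selected point, and the block-wide reduction finds the argmax, which becomes the next sample. The same algorithm in plain PyTorch (a readable reference sketch, not part of the commit; like the kernel, it always starts from index 0):

import torch

def fps_ref(points_xyz, num_points):
    """points_xyz: (B, N, 3) -> (B, num_points) int64 indices."""
    B, N, _ = points_xyz.shape
    device = points_xyz.device
    idx = torch.zeros(B, num_points, dtype=torch.int64, device=device)
    min_dist = torch.full((B, N), 1e10, device=device)   # mirrors the temp buffer
    farthest = torch.zeros(B, dtype=torch.int64, device=device)
    batch = torch.arange(B, device=device)
    for j in range(num_points):
        idx[:, j] = farthest
        centroid = points_xyz[batch, farthest].unsqueeze(1)   # (B, 1, 3)
        d = ((points_xyz - centroid) ** 2).sum(-1)            # (B, N)
        min_dist = torch.minimum(min_dist, d)
        farthest = min_dist.argmax(dim=-1)                    # next greedy pick
    return idx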
mmdet3d/ops/gather_points/__init__.py (new file)

from .gather_points import gather_points

__all__ = ['gather_points']
mmdet3d/ops/gather_points/gather_points.py (new file)

import torch
from torch.autograd import Function
from . import gather_points_ext
class GatherPoints(Function):
"""Gather Points
Gather points with given index.
"""
    @staticmethod
    def forward(ctx, features: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            features (Tensor): (B, C, N) features to gather.
            indices (Tensor): (B, M) indices, where M is the number of
                points to gather.

        Returns:
            Tensor: (B, C, M) gathered features.
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()

        B, npoint = indices.size()
        _, C, N = features.size()
        output = torch.cuda.FloatTensor(B, C, npoint)

        gather_points_ext.gather_points_wrapper(B, C, N, npoint, features,
                                                indices, output)

        ctx.for_backwards = (indices, C, N)
        ctx.mark_non_differentiable(indices)
return output
@staticmethod
def backward(ctx, grad_out):
idx, C, N = ctx.for_backwards
B, npoint = idx.size()
grad_features = torch.cuda.FloatTensor(B, C, N).zero_()
grad_out_data = grad_out.data.contiguous()
gather_points_ext.gather_points_grad_wrapper(B, C, N, npoint,
grad_out_data, idx,
grad_features.data)
return grad_features, None
gather_points = GatherPoints.apply
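Forward-wise this op is an index select along the last axis; the custom kernel mainly pairs it with the scatter-add backward defined in the CUDA file below. An equivalent torch.gather sketch (not part of the commit) that states the semantics:

import torch

def gather_points_ref(features, indices):
    """features: (B, C, N); indices: (B, M) -> (B, C, M)."""
    B, C, N = features.shape
    idx = indices.long().unsqueeze(1).expand(-1, C, -1)   # (B, C, M)
    return torch.gather(features, 2, idx)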
mmdet3d/ops/gather_points/src/gather_points.cpp (new file)

#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>
extern THCState *state;
int gather_points_wrapper(int b, int c, int n, int npoints,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
void gather_points_kernel_launcher(int b, int c, int n, int npoints,
const float *points, const int *idx, float *out, cudaStream_t stream);
int gather_points_grad_wrapper(int b, int c, int n, int npoints,
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);
int gather_points_wrapper(int b, int c, int n, int npoints,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor){
const float *points = points_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
float *out = out_tensor.data<float>();
cudaStream_t stream = THCState_getCurrentStream(state);
gather_points_kernel_launcher(b, c, n, npoints, points, idx, out, stream);
return 1;
}
int gather_points_grad_wrapper(int b, int c, int n, int npoints,
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
const float *grad_out = grad_out_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
float *grad_points = grad_points_tensor.data<float>();
cudaStream_t stream = THCState_getCurrentStream(state);
gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out, idx, grad_points, stream);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("gather_points_wrapper", &gather_points_wrapper, "gather_points_wrapper");
m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper, "gather_points_grad_wrapper");
}
mmdet3d/ops/gather_points/src/gather_points_cuda.cu (new file)

#include <stdio.h>
#include <stdlib.h>
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
__global__ void gather_points_kernel(int b, int c, int n, int m,
const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
// points: (B, C, N)
// idx: (B, M)
// output:
// out: (B, C, M)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
out += bs_idx * c * m + c_idx * m + pt_idx;
idx += bs_idx * m + pt_idx;
points += bs_idx * c * n + c_idx * n;
out[0] = points[idx[0]];
}
void gather_points_kernel_launcher(int b, int c, int n, int npoints,
const float *points, const int *idx, float *out, cudaStream_t stream) {
// points: (B, C, N)
// idx: (B, npoints)
// output:
// out: (B, C, npoints)
cudaError_t err;
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points, idx, out);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
__global__ void gather_points_grad_kernel(int b, int c, int n, int m, const float *__restrict__ grad_out,
const int *__restrict__ idx, float *__restrict__ grad_points) {
// grad_out: (B, C, M)
// idx: (B, M)
// output:
// grad_points: (B, C, N)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
grad_out += bs_idx * c * m + c_idx * m + pt_idx;
idx += bs_idx * m + pt_idx;
grad_points += bs_idx * c * n + c_idx * n;
atomicAdd(grad_points + idx[0], grad_out[0]);
}
void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
// grad_out: (B, C, npoints)
// idx: (B, npoints)
// output:
// grad_points: (B, C, N)
cudaError_t err;
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, grad_out, idx, grad_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
mmdet3d/ops/group_points/__init__.py (new file)

from .group_points import GroupAll, QueryAndGroup, grouping_operation

__all__ = ['QueryAndGroup', 'GroupAll', 'grouping_operation']
mmdet3d/ops/group_points/group_points.py (new file)

from typing import Tuple
import torch
import torch.nn as nn
from torch.autograd import Function
from ..ball_query import ball_query
from . import group_points_ext
class QueryAndGroup(nn.Module):
"""Query and Group.
Groups with a ball query of radius
Args:
radius (float): radius of the balls.
sample_num (int): Maximum number of features to gather in the ball.
use_xyz (bool): Whether to use xyz.
Default: True.
return_grouped_xyz (bool): Whether to return grouped xyz.
Default: False.
normalize_xyz (bool): Whether to normalize xyz.
Default: False.
uniform_sample (bool): Whether to sample uniformly.
Default: False
return_unique_cnt (bool): Whether to return the count of
unique samples.
Default: False.
"""
def __init__(self,
radius,
sample_num,
use_xyz=True,
return_grouped_xyz=False,
normalize_xyz=False,
uniform_sample=False,
return_unique_cnt=False):
super(QueryAndGroup, self).__init__()
self.radius = radius
self.sample_num = sample_num
self.use_xyz = use_xyz
self.return_grouped_xyz = return_grouped_xyz
self.normalize_xyz = normalize_xyz
self.uniform_sample = uniform_sample
self.return_unique_cnt = return_unique_cnt
if self.return_unique_cnt:
assert self.uniform_sample
def forward(self, points_xyz, center_xyz, features=None):
"""forward
Args:
points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (Tensor): (B, npoint, 3) Centriods.
features (Tensor): (B, C, N) Descriptors of the features.
Return:
Tensor: (B, 3 + C, npoint, sample_num) Grouped feature.
"""
idx = ball_query(self.radius, self.sample_num, points_xyz, center_xyz)
if self.uniform_sample:
unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))
for i_batch in range(idx.shape[0]):
for i_region in range(idx.shape[1]):
unique_ind = torch.unique(idx[i_batch, i_region, :])
num_unique = unique_ind.shape[0]
unique_cnt[i_batch, i_region] = num_unique
sample_ind = torch.randint(
0,
num_unique, (self.sample_num - num_unique, ),
dtype=torch.long)
all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))
idx[i_batch, i_region, :] = all_ind
xyz_trans = points_xyz.transpose(1, 2).contiguous()
# (B, 3, npoint, sample_num)
grouped_xyz = grouping_operation(xyz_trans, idx)
grouped_xyz -= center_xyz.transpose(1, 2).unsqueeze(-1)
if self.normalize_xyz:
grouped_xyz /= self.radius
if features is not None:
grouped_features = grouping_operation(features, idx)
if self.use_xyz:
# (B, C + 3, npoint, sample_num)
new_features = torch.cat([grouped_xyz, grouped_features],
dim=1)
else:
new_features = grouped_features
else:
            assert self.use_xyz, \
                'use_xyz must be True when no features are given'
new_features = grouped_xyz
ret = [new_features]
if self.return_grouped_xyz:
ret.append(grouped_xyz)
if self.return_unique_cnt:
ret.append(unique_cnt)
if len(ret) == 1:
return ret[0]
else:
return tuple(ret)
class GroupAll(nn.Module):
"""Group All.
Group xyz with feature.
Args:
use_xyz (bool): Whether to use xyz.
"""
def __init__(self, use_xyz: bool = True):
super().__init__()
self.use_xyz = use_xyz
def forward(self,
xyz: torch.Tensor,
new_xyz: torch.Tensor,
features: torch.Tensor = None):
"""forward.
Args:
xyz (Tensor): (B, N, 3) xyz coordinates of the features.
new_xyz (Tensor): Ignored.
features (Tensor): (B, C, N) features to group.
Return:
Tensor: (B, C + 3, 1, N) Grouped feature.
"""
grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)
if features is not None:
grouped_features = features.unsqueeze(2)
if self.use_xyz:
new_features = torch.cat([grouped_xyz, grouped_features],
dim=1) # (B, 3 + C, 1, N)
else:
new_features = grouped_features
else:
new_features = grouped_xyz
return new_features
class GroupingOperation(Function):
"""Grouping Operation.
Group feature with given index.
"""
@staticmethod
def forward(ctx, features: torch.Tensor,
indices: torch.Tensor) -> torch.Tensor:
"""forward.
Args:
features (Tensor): (B, C, N) tensor of features to group.
indices (Tensor): (B, npoint, nsample) the indicies of
features to group with.
Returns:
Tensor: (B, C, npoint, nsample) Grouped features.
"""
assert features.is_contiguous()
assert indices.is_contiguous()
B, nfeatures, nsample = indices.size()
_, C, N = features.size()
output = torch.cuda.FloatTensor(B, C, nfeatures, nsample)
group_points_ext.forward(B, C, N, nfeatures, nsample, features,
indices, output)
ctx.for_backwards = (indices, N)
return output
@staticmethod
def backward(ctx,
grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""backward.
Args:
grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients
of the output from forward.
Returns:
Tensor: (B, C, N) gradient of the features.
"""
idx, N = ctx.for_backwards
B, C, npoint, nsample = grad_out.size()
grad_features = torch.cuda.FloatTensor(B, C, N).zero_()
grad_out_data = grad_out.data.contiguous()
group_points_ext.backward(B, C, N, npoint, nsample, grad_out_data, idx,
grad_features.data)
return grad_features, None
grouping_operation = GroupingOperation.apply
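A hedged end-to-end sketch of the grouping path above (contiguous CUDA tensors assumed; the radius and sample_num values are arbitrary):

import torch
from mmdet3d.ops.group_points import QueryAndGroup

B, N, C, npoint = 2, 1024, 16, 128
xyz = torch.rand(B, N, 3).cuda()                 # (B, N, 3) points
centers = xyz[:, :npoint, :].contiguous()        # in practice, e.g. FPS centers
features = torch.rand(B, C, N).cuda()            # (B, C, N) descriptors

grouper = QueryAndGroup(radius=0.4, sample_num=32, use_xyz=True)
new_features = grouper(xyz, centers, features)   # (B, C + 3, npoint, 32)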
mmdet3d/ops/group_points/src/group_points.cpp (new file)

#include <torch/serialize/tensor.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>
extern THCState *state;
int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
const float *points, const int *idx, float *out, cudaStream_t stream);
int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsample,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);
int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
float *grad_points = grad_points_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
const float *grad_out = grad_out_tensor.data<float>();
cudaStream_t stream = THCState_getCurrentStream(state);
group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx, grad_points, stream);
return 1;
}
int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
const float *points = points_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
float *out = out_tensor.data<float>();
cudaStream_t stream = THCState_getCurrentStream(state);
group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out, stream);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &group_points_wrapper, "group_points_wrapper");
m.def("backward", &group_points_grad_wrapper, "group_points_grad_wrapper");
}
mmdet3d/ops/group_points/src/group_points_cuda.cu (new file)

#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
__global__ void group_points_grad_kernel(int b, int c, int n, int npoints, int nsample,
const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
// grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample)
// output:
// grad_points: (B, C, N)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int pt_idx = index / nsample;
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
int sample_idx = index % nsample;
grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
}
void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsample,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
// grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample)
// output:
// grad_points: (B, C, N)
cudaError_t err;
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
group_points_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
__global__ void group_points_kernel(int b, int c, int n, int npoints, int nsample,
const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
// points: (B, C, N)
// idx: (B, npoints, nsample)
// output:
// out: (B, C, npoints, nsample)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int pt_idx = index / nsample;
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
int sample_idx = index % nsample;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
int in_idx = bs_idx * c * n + c_idx * n + idx[0];
int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
out[out_idx] = points[in_idx];
}
void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
const float *points, const int *idx, float *out, cudaStream_t stream) {
// points: (B, C, N)
// idx: (B, npoints, nsample)
// output:
// out: (B, C, npoints, nsample)
cudaError_t err;
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample, points, idx, out);
// cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
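As with gather_points, the forward kernel here is expressible with torch.gather; the sketch below (not part of the commit) pins down the indexing the kernels implement:

import torch

def grouping_ref(features, indices):
    """features: (B, C, N); indices: (B, npoint, nsample) -> (B, C, npoint, nsample)."""
    B, C, N = features.shape
    _, npoint, nsample = indices.shape
    idx = indices.long().view(B, 1, npoint * nsample).expand(-1, C, -1)
    return torch.gather(features, 2, idx).view(B, C, npoint, nsample)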
mmdet3d/ops/interpolate/__init__.py (new file)

from .three_interpolate import three_interpolate
from .three_nn import three_nn

__all__ = ['three_nn', 'three_interpolate']
mmdet3d/ops/interpolate/src/interpolate.cpp (new file)

#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
extern THCState *state;
void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
const float *known, float *dist2, int *idx, cudaStream_t stream);
void three_interpolate_wrapper(int b, int c, int m, int n, at::Tensor points_tensor,
at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
void three_interpolate_kernel_launcher(int b, int c, int m, int n,
const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream);
void three_interpolate_grad_wrapper(int b, int c, int n, int m, at::Tensor grad_out_tensor,
at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m, const float *grad_out,
const int *idx, const float *weight, float *grad_points, cudaStream_t stream);
void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
const float *unknown = unknown_tensor.data<float>();
const float *known = known_tensor.data<float>();
float *dist2 = dist2_tensor.data<float>();
int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state);
three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
}
void three_interpolate_wrapper(int b, int c, int m, int n,
at::Tensor points_tensor,
at::Tensor idx_tensor,
at::Tensor weight_tensor,
at::Tensor out_tensor) {
const float *points = points_tensor.data<float>();
const float *weight = weight_tensor.data<float>();
float *out = out_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state);
three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out, stream);
}
void three_interpolate_grad_wrapper(int b, int c, int n, int m,
at::Tensor grad_out_tensor,
at::Tensor idx_tensor,
at::Tensor weight_tensor,
at::Tensor grad_points_tensor) {
const float *grad_out = grad_out_tensor.data<float>();
const float *weight = weight_tensor.data<float>();
float *grad_points = grad_points_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state);
three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight, grad_points, stream);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
m.def("three_interpolate_wrapper", &three_interpolate_wrapper, "three_interpolate_wrapper");
m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper, "three_interpolate_grad_wrapper");
}
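The Python wrappers for these two ops are not part of this excerpt, but the semantics follow from the signatures: three_nn returns, for each "unknown" point, the squared distances to and indices of its three nearest "known" points, and three_interpolate mixes the features of those three neighbors with per-point weights. A pure-PyTorch sketch under those assumptions (an illustration, not the commit's code):

import torch

def three_nn_ref(unknown, known):
    """unknown: (B, n, 3); known: (B, m, 3) -> dist2 (B, n, 3), idx (B, n, 3)."""
    d2 = torch.cdist(unknown, known) ** 2            # (B, n, m) squared distances
    dist2, idx = d2.topk(3, dim=-1, largest=False)   # three smallest per row
    return dist2, idx.int()

def three_interpolate_ref(features, idx, weight):
    """features: (B, C, m); idx, weight: (B, n, 3) -> (B, C, n)."""
    B, C, m = features.shape
    n = idx.shape[1]
    flat = idx.long().view(B, 1, n * 3).expand(-1, C, -1)
    neighbors = torch.gather(features, 2, flat).view(B, C, n, 3)
    return (neighbors * weight.unsqueeze(1)).sum(-1)  # weighted sum over the 3 NNs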