merge master

f27d308f · yinchimaoliang · c66ae813 · 27ebcfac · f27d308f · f27d308f
Commit f27d308f authored Jun 07, 2020 by yinchimaoliang
20 changed files
--- a/mmdet3d/ops/gather_points/src/gather_points_cuda.cu
+++ b/mmdet3d/ops/gather_points/src/gather_points_cuda.cu
@@ -3,82 +3,92 @@
 #define TOTAL_THREADS 1024
 #define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 __global__ void gather_points_kernel(int b, int c, int n, int m,
-    const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
+                                     const float *__restrict__ points,
-    // points: (B, C, N)
+                                     const int *__restrict__ idx,
-    // idx: (B, M)
+                                     float *__restrict__ out) {
-    // output:
+  // points: (B, C, N)
-    //      out: (B, C, M)
+  // idx: (B, M)
+  // output:
-    int bs_idx = blockIdx.z;
+  //      out: (B, C, M)
-    int c_idx = blockIdx.y;
-    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int bs_idx = blockIdx.z;
-    if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
+  int c_idx = blockIdx.y;
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    out += bs_idx * c * m + c_idx * m + pt_idx;
+  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
-    idx += bs_idx * m + pt_idx;
-    points += bs_idx * c * n + c_idx * n;
+  out += bs_idx * c * m + c_idx * m + pt_idx;
-    out[0] = points[idx[0]];
+  idx += bs_idx * m + pt_idx;
+  points += bs_idx * c * n + c_idx * n;
+  out[0] = points[idx[0]];
 }
 void gather_points_kernel_launcher(int b, int c, int n, int npoints,
-    const float *points, const int *idx, float *out, cudaStream_t stream) {
+                                   const float *points, const int *idx,
-    // points: (B, C, N)
+                                   float *out, cudaStream_t stream) {
-    // idx: (B, npoints)
+  // points: (B, C, N)
-    // output:
+  // idx: (B, npoints)
-    //      out: (B, C, npoints)
+  // output:
+  //      out: (B, C, npoints)
-    cudaError_t err;
-    dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+  cudaError_t err;
-    dim3 threads(THREADS_PER_BLOCK);
+  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
+              b);  // blockIdx.x(col), blockIdx.y(row)
-    gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points, idx, out);
+  dim3 threads(THREADS_PER_BLOCK);
-    err = cudaGetLastError();
+  gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points,
-    if (cudaSuccess != err) {
+                                                       idx, out);
-        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
-        exit(-1);
+  err = cudaGetLastError();
-    }
+  if (cudaSuccess != err) {
+    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+    exit(-1);
+  }
 }
-__global__ void gather_points_grad_kernel(int b, int c, int n, int m, const float *__restrict__ grad_out,
+__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
-    const int *__restrict__ idx, float *__restrict__ grad_points) {
+                                          const float *__restrict__ grad_out,
-    // grad_out: (B, C, M)
+                                          const int *__restrict__ idx,
-    // idx: (B, M)
+                                          float *__restrict__ grad_points) {
-    // output:
+  // grad_out: (B, C, M)
-    //      grad_points: (B, C, N)
+  // idx: (B, M)
+  // output:
-    int bs_idx = blockIdx.z;
+  //      grad_points: (B, C, N)
-    int c_idx = blockIdx.y;
-    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int bs_idx = blockIdx.z;
-    if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
+  int c_idx = blockIdx.y;
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    grad_out += bs_idx * c * m + c_idx * m + pt_idx;
+  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
-    idx += bs_idx * m + pt_idx;
-    grad_points += bs_idx * c * n + c_idx * n;
+  grad_out += bs_idx * c * m + c_idx * m + pt_idx;
+  idx += bs_idx * m + pt_idx;
-    atomicAdd(grad_points + idx[0], grad_out[0]);
+  grad_points += bs_idx * c * n + c_idx * n;
+  atomicAdd(grad_points + idx[0], grad_out[0]);
 }
 void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
-    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
+                                        const float *grad_out, const int *idx,
-    // grad_out: (B, C, npoints)
+                                        float *grad_points,
-    // idx: (B, npoints)
+                                        cudaStream_t stream) {
-    // output:
+  // grad_out: (B, C, npoints)
-    //      grad_points: (B, C, N)
+  // idx: (B, npoints)
+  // output:
-    cudaError_t err;
+  //      grad_points: (B, C, N)
-    dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
-    dim3 threads(THREADS_PER_BLOCK);
+  cudaError_t err;
+  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
-    gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, grad_out, idx, grad_points);
+              b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 threads(THREADS_PER_BLOCK);
-    err = cudaGetLastError();
-    if (cudaSuccess != err) {
+  gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(
-        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+      b, c, n, npoints, grad_out, idx, grad_points);
-        exit(-1);
-    }
+  err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+    exit(-1);
+  }
 }
--- a/mmdet3d/ops/group_points/src/group_points.cpp
+++ b/mmdet3d/ops/group_points/src/group_points.cpp
-#include <torch/serialize/tensor.h>
+#include <THC/THC.h>
 #include <cuda.h>
 #include <cuda_runtime_api.h>
-#include <vector>
-#include <THC/THC.h>
 #include <torch/extension.h>
+#include <torch/serialize/tensor.h>
+#include <vector>
 extern THCState *state;
 int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
-    at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
+                         at::Tensor points_tensor, at::Tensor idx_tensor,
+                         at::Tensor out_tensor);
 void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
-    const float *points, const int *idx, float *out, cudaStream_t stream);
+                                  const float *points, const int *idx,
+                                  float *out, cudaStream_t stream);
 int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
-    at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
+                              at::Tensor grad_out_tensor, at::Tensor idx_tensor,
+                              at::Tensor grad_points_tensor);
-void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsample,
-    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);
+void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
+                                       int nsample, const float *grad_out,
+                                       const int *idx, float *grad_points,
+                                       cudaStream_t stream);
 int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
-    at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
+                              at::Tensor grad_out_tensor, at::Tensor idx_tensor,
+                              at::Tensor grad_points_tensor) {
-    float *grad_points = grad_points_tensor.data<float>();
+  float *grad_points = grad_points_tensor.data_ptr<float>();
-    const int *idx = idx_tensor.data<int>();
+  const int *idx = idx_tensor.data_ptr<int>();
-    const float *grad_out = grad_out_tensor.data<float>();
+  const float *grad_out = grad_out_tensor.data_ptr<float>();
-    cudaStream_t stream = THCState_getCurrentStream(state);
+  cudaStream_t stream = THCState_getCurrentStream(state);
-    group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx, grad_points, stream);
+  group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx,
-    return 1;
+                                    grad_points, stream);
+  return 1;
 }
 int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
-    at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
+                         at::Tensor points_tensor, at::Tensor idx_tensor,
+                         at::Tensor out_tensor) {
-    const float *points = points_tensor.data<float>();
+  const float *points = points_tensor.data_ptr<float>();
-    const int *idx = idx_tensor.data<int>();
+  const int *idx = idx_tensor.data_ptr<int>();
-    float *out = out_tensor.data<float>();
+  float *out = out_tensor.data_ptr<float>();
-    cudaStream_t stream = THCState_getCurrentStream(state);
+  cudaStream_t stream = THCState_getCurrentStream(state);
-    group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out, stream);
+  group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out,
-    return 1;
+                               stream);
+  return 1;
 }
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-    m.def("forward", &group_points_wrapper, "group_points_wrapper");
+  m.def("forward", &group_points_wrapper, "group_points_wrapper");
-    m.def("backward", &group_points_grad_wrapper, "group_points_grad_wrapper");
+  m.def("backward", &group_points_grad_wrapper, "group_points_grad_wrapper");
 }
--- a/mmdet3d/ops/group_points/src/group_points_cuda.cu
+++ b/mmdet3d/ops/group_points/src/group_points_cuda.cu
@@ -2,84 +2,97 @@
 #include <stdlib.h>
 #define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-__global__ void group_points_grad_kernel(int b, int c, int n, int npoints, int nsample,
+__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
-    const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
+                                         int nsample,
-    // grad_out: (B, C, npoints, nsample)
+                                         const float *__restrict__ grad_out,
-    // idx: (B, npoints, nsample)
+                                         const int *__restrict__ idx,
-    // output:
+                                         float *__restrict__ grad_points) {
-    //      grad_points: (B, C, N)
+  // grad_out: (B, C, npoints, nsample)
-    int bs_idx = blockIdx.z;
+  // idx: (B, npoints, nsample)
-    int c_idx = blockIdx.y;
+  // output:
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
+  //      grad_points: (B, C, N)
-    int pt_idx = index / nsample;
+  int bs_idx = blockIdx.z;
-    if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
+  int c_idx = blockIdx.y;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int pt_idx = index / nsample;
+  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
-    int sample_idx = index % nsample;
+  int sample_idx = index % nsample;
-    grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+  grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
-    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+              pt_idx * nsample + sample_idx;
+  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
-    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0] , grad_out[0]);
+  atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
 }
-void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsample,
+void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
-    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
+                                       int nsample, const float *grad_out,
-    // grad_out: (B, C, npoints, nsample)
+                                       const int *idx, float *grad_points,
-    // idx: (B, npoints, nsample)
+                                       cudaStream_t stream) {
-    // output:
+  // grad_out: (B, C, npoints, nsample)
-    //      grad_points: (B, C, N)
+  // idx: (B, npoints, nsample)
-    cudaError_t err;
+  // output:
-    dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+  //      grad_points: (B, C, N)
-    dim3 threads(THREADS_PER_BLOCK);
+  cudaError_t err;
+  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
+              b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 threads(THREADS_PER_BLOCK);
-    group_points_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);
+  group_points_grad_kernel<<<blocks, threads, 0, stream>>>(
+      b, c, n, npoints, nsample, grad_out, idx, grad_points);
-    err = cudaGetLastError();
+  err = cudaGetLastError();
-    if (cudaSuccess != err) {
+  if (cudaSuccess != err) {
-        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
-        exit(-1);
+    exit(-1);
-    }
+  }
 }
+__global__ void group_points_kernel(int b, int c, int n, int npoints,
+                                    int nsample,
+                                    const float *__restrict__ points,
+                                    const int *__restrict__ idx,
+                                    float *__restrict__ out) {
+  // points: (B, C, N)
+  // idx: (B, npoints, nsample)
+  // output:
+  //      out: (B, C, npoints, nsample)
+  int bs_idx = blockIdx.z;
+  int c_idx = blockIdx.y;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int pt_idx = index / nsample;
+  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
-__global__ void group_points_kernel(int b, int c, int n, int npoints, int nsample,
+  int sample_idx = index % nsample;
-    const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
-    // points: (B, C, N)
-    // idx: (B, npoints, nsample)
-    // output:
-    //      out: (B, C, npoints, nsample)
-    int bs_idx = blockIdx.z;
-    int c_idx = blockIdx.y;
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    int pt_idx = index / nsample;
-    if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
-    int sample_idx = index % nsample;
+  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+  int in_idx = bs_idx * c * n + c_idx * n + idx[0];
+  int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+                pt_idx * nsample + sample_idx;
-    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+  out[out_idx] = points[in_idx];
-    int in_idx = bs_idx * c * n + c_idx * n + idx[0];
-    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
-    out[out_idx] = points[in_idx];
 }
 void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
-    const float *points, const int *idx, float *out, cudaStream_t stream) {
+                                  const float *points, const int *idx,
-    // points: (B, C, N)
+                                  float *out, cudaStream_t stream) {
-    // idx: (B, npoints, nsample)
+  // points: (B, C, N)
-    // output:
+  // idx: (B, npoints, nsample)
-    //      out: (B, C, npoints, nsample)
+  // output:
-    cudaError_t err;
+  //      out: (B, C, npoints, nsample)
-    dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+  cudaError_t err;
-    dim3 threads(THREADS_PER_BLOCK);
+  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
+              b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 threads(THREADS_PER_BLOCK);
-    group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample, points, idx, out);
+  group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample,
-    // cudaDeviceSynchronize();  // for using printf in kernel function
+                                                      points, idx, out);
-    err = cudaGetLastError();
+  // cudaDeviceSynchronize();  // for using printf in kernel function
-    if (cudaSuccess != err) {
+  err = cudaGetLastError();
-        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+  if (cudaSuccess != err) {
-        exit(-1);
+    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
-    }
+    exit(-1);
+  }
 }
--- a/mmdet3d/ops/interpolate/src/interpolate.cpp
+++ b/mmdet3d/ops/interpolate/src/interpolate.cpp
-#include <torch/serialize/tensor.h>
-#include <vector>
 #include <THC/THC.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <cuda.h>
-#include <cuda_runtime_api.h>
 #include <torch/extension.h>
+#include <torch/serialize/tensor.h>
+#include <vector>
 extern THCState *state;
 void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
-  at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
+                      at::Tensor known_tensor, at::Tensor dist2_tensor,
+                      at::Tensor idx_tensor);
 void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
-	const float *known, float *dist2, int *idx, cudaStream_t stream);
+                              const float *known, float *dist2, int *idx,
+                              cudaStream_t stream);
+void three_interpolate_wrapper(int b, int c, int m, int n,
-void three_interpolate_wrapper(int b, int c, int m, int n, at::Tensor points_tensor,
+                               at::Tensor points_tensor, at::Tensor idx_tensor,
-    at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
+                               at::Tensor weight_tensor, at::Tensor out_tensor);
 void three_interpolate_kernel_launcher(int b, int c, int m, int n,
-    const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream);
+                                       const float *points, const int *idx,
+                                       const float *weight, float *out,
+                                       cudaStream_t stream);
+void three_interpolate_grad_wrapper(int b, int c, int n, int m,
+                                    at::Tensor grad_out_tensor,
+                                    at::Tensor idx_tensor,
+                                    at::Tensor weight_tensor,
+                                    at::Tensor grad_points_tensor);
-void three_interpolate_grad_wrapper(int b, int c, int n, int m, at::Tensor grad_out_tensor,
+void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
-    at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
+                                            const float *grad_out,
+                                            const int *idx, const float *weight,
-void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m, const float *grad_out,
+                                            float *grad_points,
-    const int *idx, const float *weight, float *grad_points, cudaStream_t stream);
+                                            cudaStream_t stream);
 void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
-    at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
+                      at::Tensor known_tensor, at::Tensor dist2_tensor,
-    const float *unknown = unknown_tensor.data<float>();
+                      at::Tensor idx_tensor) {
-    const float *known = known_tensor.data<float>();
+  const float *unknown = unknown_tensor.data_ptr<float>();
-    float *dist2 = dist2_tensor.data<float>();
+  const float *known = known_tensor.data_ptr<float>();
-    int *idx = idx_tensor.data<int>();
+  float *dist2 = dist2_tensor.data_ptr<float>();
+  int *idx = idx_tensor.data_ptr<int>();
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
 }
 void three_interpolate_wrapper(int b, int c, int m, int n,
-                         at::Tensor points_tensor,
+                               at::Tensor points_tensor, at::Tensor idx_tensor,
-                         at::Tensor idx_tensor,
+                               at::Tensor weight_tensor,
-                         at::Tensor weight_tensor,
+                               at::Tensor out_tensor) {
-                         at::Tensor out_tensor) {
+  const float *points = points_tensor.data_ptr<float>();
+  const float *weight = weight_tensor.data_ptr<float>();
-    const float *points = points_tensor.data<float>();
+  float *out = out_tensor.data_ptr<float>();
-    const float *weight = weight_tensor.data<float>();
+  const int *idx = idx_tensor.data_ptr<int>();
-    float *out = out_tensor.data<float>();
-    const int *idx = idx_tensor.data<int>();
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out,
-    cudaStream_t stream = THCState_getCurrentStream(state);
+                                    stream);
-    three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out, stream);
 }
 void three_interpolate_grad_wrapper(int b, int c, int n, int m,
-                            at::Tensor grad_out_tensor,
+                                    at::Tensor grad_out_tensor,
-                            at::Tensor idx_tensor,
+                                    at::Tensor idx_tensor,
-                            at::Tensor weight_tensor,
+                                    at::Tensor weight_tensor,
-                            at::Tensor grad_points_tensor) {
+                                    at::Tensor grad_points_tensor) {
+  const float *grad_out = grad_out_tensor.data_ptr<float>();
-    const float *grad_out = grad_out_tensor.data<float>();
+  const float *weight = weight_tensor.data_ptr<float>();
-    const float *weight = weight_tensor.data<float>();
+  float *grad_points = grad_points_tensor.data_ptr<float>();
-    float *grad_points = grad_points_tensor.data<float>();
+  const int *idx = idx_tensor.data_ptr<int>();
-    const int *idx = idx_tensor.data<int>();
+  cudaStream_t stream = THCState_getCurrentStream(state);
-    cudaStream_t stream = THCState_getCurrentStream(state);
+  three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight,
-    three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight, grad_points, stream);
+                                         grad_points, stream);
 }
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-    m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
+  m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
-    m.def("three_interpolate_wrapper", &three_interpolate_wrapper, "three_interpolate_wrapper");
+  m.def("three_interpolate_wrapper", &three_interpolate_wrapper,
-    m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper, "three_interpolate_grad_wrapper");
+        "three_interpolate_wrapper");
+  m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper,
+        "three_interpolate_grad_wrapper");
 }
--- a/mmdet3d/ops/interpolate/src/three_interpolate_cuda.cu
+++ b/mmdet3d/ops/interpolate/src/three_interpolate_cuda.cu
@@ -3,91 +3,103 @@
 #include <stdlib.h>
 #define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+__global__ void three_interpolate_kernel(int b, int c, int m, int n,
-__global__ void three_interpolate_kernel(int b, int c, int m, int n, const float *__restrict__ points,
+                                         const float *__restrict__ points,
-    const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
+                                         const int *__restrict__ idx,
-    // points: (B, C, M)
+                                         const float *__restrict__ weight,
-    // idx: (B, N, 3)
+                                         float *__restrict__ out) {
-    // weight: (B, N, 3)
+  // points: (B, C, M)
-    // output:
+  // idx: (B, N, 3)
-    //      out: (B, C, N)
+  // weight: (B, N, 3)
+  // output:
-    int bs_idx = blockIdx.z;
+  //      out: (B, C, N)
-    int c_idx = blockIdx.y;
-    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int bs_idx = blockIdx.z;
+  int c_idx = blockIdx.y;
-    if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    weight += bs_idx * n * 3 + pt_idx * 3;
+  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
-    points += bs_idx * c * m + c_idx * m;
-    idx += bs_idx * n * 3 + pt_idx * 3;
+  weight += bs_idx * n * 3 + pt_idx * 3;
-    out += bs_idx * c * n + c_idx * n;
+  points += bs_idx * c * m + c_idx * m;
+  idx += bs_idx * n * 3 + pt_idx * 3;
-    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]];
+  out += bs_idx * c * n + c_idx * n;
+  out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
+                weight[2] * points[idx[2]];
 }
 void three_interpolate_kernel_launcher(int b, int c, int m, int n,
-    const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream) {
+                                       const float *points, const int *idx,
-    // points: (B, C, M)
+                                       const float *weight, float *out,
-    // idx: (B, N, 3)
+                                       cudaStream_t stream) {
-    // weight: (B, N, 3)
+  // points: (B, C, M)
-    // output:
+  // idx: (B, N, 3)
-    //      out: (B, C, N)
+  // weight: (B, N, 3)
+  // output:
-    cudaError_t err;
+  //      out: (B, C, N)
-    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
-    dim3 threads(THREADS_PER_BLOCK);
+  cudaError_t err;
-    three_interpolate_kernel<<<blocks, threads, 0, stream>>>(b, c, m, n, points, idx, weight, out);
+  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
+              b);  // blockIdx.x(col), blockIdx.y(row)
-    err = cudaGetLastError();
+  dim3 threads(THREADS_PER_BLOCK);
-    if (cudaSuccess != err) {
+  three_interpolate_kernel<<<blocks, threads, 0, stream>>>(b, c, m, n, points,
-        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+                                                           idx, weight, out);
-        exit(-1);
-    }
+  err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+    exit(-1);
+  }
 }
+__global__ void three_interpolate_grad_kernel(
-__global__ void three_interpolate_grad_kernel(int b, int c, int n, int m, const float *__restrict__ grad_out,
+    int b, int c, int n, int m, const float *__restrict__ grad_out,
-    const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) {
+    const int *__restrict__ idx, const float *__restrict__ weight,
-    // grad_out: (B, C, N)
+    float *__restrict__ grad_points) {
-    // weight: (B, N, 3)
+  // grad_out: (B, C, N)
-    // output:
+  // weight: (B, N, 3)
-    //      grad_points: (B, C, M)
+  // output:
+  //      grad_points: (B, C, M)
-    int bs_idx = blockIdx.z;
-    int c_idx = blockIdx.y;
+  int bs_idx = blockIdx.z;
-    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int c_idx = blockIdx.y;
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
+  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
-    grad_out += bs_idx * c * n + c_idx * n + pt_idx;
-    weight += bs_idx * n * 3 + pt_idx * 3;
+  grad_out += bs_idx * c * n + c_idx * n + pt_idx;
-    grad_points += bs_idx * c * m + c_idx * m;
+  weight += bs_idx * n * 3 + pt_idx * 3;
-    idx += bs_idx * n * 3 + pt_idx * 3;
+  grad_points += bs_idx * c * m + c_idx * m;
+  idx += bs_idx * n * 3 + pt_idx * 3;
-    atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
+  atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
-    atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
+  atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
-    atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
+  atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
 }
-void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m, const float *grad_out,
+void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
-    const int *idx, const float *weight, float *grad_points, cudaStream_t stream) {
+                                            const float *grad_out,
-    // grad_out: (B, C, N)
+                                            const int *idx, const float *weight,
-    // weight: (B, N, 3)
+                                            float *grad_points,
-    // output:
+                                            cudaStream_t stream) {
-    //      grad_points: (B, C, M)
+  // grad_out: (B, C, N)
+  // weight: (B, N, 3)
-    cudaError_t err;
+  // output:
-    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+  //      grad_points: (B, C, M)
-    dim3 threads(THREADS_PER_BLOCK);
-    three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, m, grad_out, idx, weight, grad_points);
+  cudaError_t err;
+  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
-    err = cudaGetLastError();
+              b);  // blockIdx.x(col), blockIdx.y(row)
-    if (cudaSuccess != err) {
+  dim3 threads(THREADS_PER_BLOCK);
-        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+  three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(
-        exit(-1);
+      b, c, n, m, grad_out, idx, weight, grad_points);
-    }
+  err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+    exit(-1);
+  }
 }
--- a/mmdet3d/ops/interpolate/src/three_nn_cuda.cu
+++ b/mmdet3d/ops/interpolate/src/three_nn_cuda.cu
@@ -3,72 +3,84 @@
 #include <stdlib.h>
 #define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+__global__ void three_nn_kernel(int b, int n, int m,
+                                const float *__restrict__ unknown,
+                                const float *__restrict__ known,
+                                float *__restrict__ dist2,
+                                int *__restrict__ idx) {
+  // unknown: (B, N, 3)
+  // known: (B, M, 3)
+  // output:
+  //      dist2: (B, N, 3)
+  //      idx: (B, N, 3)
-__global__ void three_nn_kernel(int b, int n, int m, const float *__restrict__ unknown,
+  int bs_idx = blockIdx.y;
-    const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    // unknown: (B, N, 3)
+  if (bs_idx >= b || pt_idx >= n) return;
-    // known: (B, M, 3)
-    // output:
-    //      dist2: (B, N, 3)
-    //      idx: (B, N, 3)
-    int bs_idx = blockIdx.y;
+  unknown += bs_idx * n * 3 + pt_idx * 3;
-    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  known += bs_idx * m * 3;
-    if (bs_idx >= b || pt_idx >= n) return;
+  dist2 += bs_idx * n * 3 + pt_idx * 3;
+  idx += bs_idx * n * 3 + pt_idx * 3;
-    unknown += bs_idx * n * 3 + pt_idx * 3;
+  float ux = unknown[0];
-    known += bs_idx * m * 3;
+  float uy = unknown[1];
-    dist2 += bs_idx * n * 3 + pt_idx * 3;
+  float uz = unknown[2];
-    idx += bs_idx * n * 3 + pt_idx * 3;
-    float ux = unknown[0];
+  double best1 = 1e40, best2 = 1e40, best3 = 1e40;
-    float uy = unknown[1];
+  int besti1 = 0, besti2 = 0, besti3 = 0;
-    float uz = unknown[2];
+  for (int k = 0; k < m; ++k) {
+    float x = known[k * 3 + 0];
-    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
+    float y = known[k * 3 + 1];
-    int besti1 = 0, besti2 = 0, besti3 = 0;
+    float z = known[k * 3 + 2];
-    for (int k = 0; k < m; ++k) {
+    float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
-        float x = known[k * 3 + 0];
+    if (d < best1) {
-        float y = known[k * 3 + 1];
+      best3 = best2;
-        float z = known[k * 3 + 2];
+      besti3 = besti2;
-        float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
+      best2 = best1;
-        if (d < best1) {
+      besti2 = besti1;
-            best3 = best2; besti3 = besti2;
+      best1 = d;
-            best2 = best1; besti2 = besti1;
+      besti1 = k;
-            best1 = d; besti1 = k;
+    } else if (d < best2) {
-        }
+      best3 = best2;
-        else if (d < best2) {
+      besti3 = besti2;
-            best3 = best2; besti3 = besti2;
+      best2 = d;
-            best2 = d; besti2 = k;
+      besti2 = k;
-        }
+    } else if (d < best3) {
-        else if (d < best3) {
+      best3 = d;
-            best3 = d; besti3 = k;
+      besti3 = k;
-        }
    }
-    dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
+  }
-    idx[0] = besti1; idx[1] = besti2; idx[2] = besti3;
+  dist2[0] = best1;
+  dist2[1] = best2;
+  dist2[2] = best3;
+  idx[0] = besti1;
+  idx[1] = besti2;
+  idx[2] = besti3;
 }
 void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
-    const float *known, float *dist2, int *idx, cudaStream_t stream) {
+                              const float *known, float *dist2, int *idx,
-    // unknown: (B, N, 3)
+                              cudaStream_t stream) {
-    // known: (B, M, 3)
+  // unknown: (B, N, 3)
-    // output:
+  // known: (B, M, 3)
-    //      dist2: (B, N, 3)
+  // output:
-    //      idx: (B, N, 3)
+  //      dist2: (B, N, 3)
+  //      idx: (B, N, 3)
-    cudaError_t err;
+  cudaError_t err;
-    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK),
-    dim3 threads(THREADS_PER_BLOCK);
+              b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 threads(THREADS_PER_BLOCK);
-    three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known, dist2, idx);
+  three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known,
+                                                  dist2, idx);
-    err = cudaGetLastError();
+  err = cudaGetLastError();
-    if (cudaSuccess != err) {
+  if (cudaSuccess != err) {
-        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
-        exit(-1);
+    exit(-1);
-    }
+  }
 }
--- a/mmdet3d/ops/iou3d/src/iou3d_kernel.cu
+++ b/mmdet3d/ops/iou3d/src/iou3d_kernel.cu
--- a/mmdet3d/ops/roiaware_pool3d/__init__.py
+++ b/mmdet3d/ops/roiaware_pool3d/__init__.py
-from .points_in_boxes import points_in_boxes_cpu, points_in_boxes_gpu
+from .points_in_boxes import (points_in_boxes_batch, points_in_boxes_cpu,
+                              points_in_boxes_gpu)
 from .roiaware_pool3d import RoIAwarePool3d
-__all__ = ['RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu']
+__all__ = [
+    'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu',
+    'points_in_boxes_batch'
+]
--- a/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py
+++ b/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py
@@ -53,3 +53,29 @@ def points_in_boxes_cpu(points, boxes):
                                            point_indices)
    return point_indices
+def points_in_boxes_batch(points, boxes):
+    """Find points that are in boxes (CUDA)
+    Args:
+        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate
+        boxes (torch.Tensor): [B, T, 7],
+            num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate,
+            (x, y, z) is the bottom center
+    Returns:
+        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0
+    """
+    assert boxes.shape[0] == points.shape[0]
+    assert boxes.shape[2] == 7
+    batch_size, num_points, _ = points.shape
+    num_boxes = boxes.shape[1]
+    box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
+                                       dtype=torch.int).fill_(0)
+    roiaware_pool3d_ext.points_in_boxes_batch(boxes.contiguous(),
+                                              points.contiguous(),
+                                              box_idxs_of_pts)
+    return box_idxs_of_pts
--- a/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu
+++ b/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu
--- a/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp
+++ b/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp
--- a/mmdet3d/ops/sparse_block.py
+++ b/mmdet3d/ops/sparse_block.py
--- a/mmdet3d/ops/spconv/include/paramsgrid.h
+++ b/mmdet3d/ops/spconv/include/paramsgrid.h
--- a/mmdet3d/ops/spconv/include/prettyprint.h
+++ b/mmdet3d/ops/spconv/include/prettyprint.h
--- a/mmdet3d/ops/spconv/include/spconv/box_iou.h
+++ b/mmdet3d/ops/spconv/include/spconv/box_iou.h
--- a/mmdet3d/ops/spconv/include/spconv/geometry.h
+++ b/mmdet3d/ops/spconv/include/spconv/geometry.h
--- a/mmdet3d/ops/spconv/include/spconv/indice.cu.h
+++ b/mmdet3d/ops/spconv/include/spconv/indice.cu.h
--- a/mmdet3d/ops/spconv/include/spconv/indice.h
+++ b/mmdet3d/ops/spconv/include/spconv/indice.h
--- a/mmdet3d/ops/spconv/include/spconv/maxpool.h
+++ b/mmdet3d/ops/spconv/include/spconv/maxpool.h
--- a/mmdet3d/ops/spconv/include/spconv/mp_helper.h
+++ b/mmdet3d/ops/spconv/include/spconv/mp_helper.h