merge master

f27d308f · yinchimaoliang · c66ae813 · 27ebcfac · f27d308f · f27d308f
Commit f27d308f authored Jun 07, 2020 by yinchimaoliang
20 changed files
--- a/mmdet3d/ops/gather_points/src/gather_points_cuda.cu
+++ b/mmdet3d/ops/gather_points/src/gather_points_cuda.cu
@@ -3,11 +3,12 @@

 #define TOTAL_THREADS 1024
 #define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
-
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

 __global__ void gather_points_kernel(int b, int c, int n, int m,
-    const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
+                                     const float *__restrict__ points,
+                                     const int *__restrict__ idx,
+                                     float *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, M)
  // output:
@@ -25,17 +26,20 @@ __global__ void gather_points_kernel(int b, int c, int n, int m,
 }

 void gather_points_kernel_launcher(int b, int c, int n, int npoints,
-    const float *points, const int *idx, float *out, cudaStream_t stream) {
+                                   const float *points, const int *idx,
+                                   float *out, cudaStream_t stream) {
  // points: (B, C, N)
  // idx: (B, npoints)
  // output:
  //      out: (B, C, npoints)

  cudaError_t err;
-    dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
+              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

-    gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points, idx, out);
+  gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points,
+                                                       idx, out);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
@@ -44,8 +48,10 @@ void gather_points_kernel_launcher(int b, int c, int n, int npoints,
  }
 }

-__global__ void gather_points_grad_kernel(int b, int c, int n, int m, const float *__restrict__ grad_out,
-    const int *__restrict__ idx, float *__restrict__ grad_points) {
+__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
+                                          const float *__restrict__ grad_out,
+                                          const int *__restrict__ idx,
+                                          float *__restrict__ grad_points) {
  // grad_out: (B, C, M)
  // idx: (B, M)
  // output:
@@ -64,17 +70,21 @@ __global__ void gather_points_grad_kernel(int b, int c, int n, int m, const floa
 }

 void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
-    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
+                                        const float *grad_out, const int *idx,
+                                        float *grad_points,
+                                        cudaStream_t stream) {
  // grad_out: (B, C, npoints)
  // idx: (B, npoints)
  // output:
  //      grad_points: (B, C, N)

  cudaError_t err;
-    dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
+              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

-    gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, grad_out, idx, grad_points);
+  gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(
+      b, c, n, npoints, grad_out, idx, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {

--- a/mmdet3d/ops/group_points/src/group_points.cpp
+++ b/mmdet3d/ops/group_points/src/group_points.cpp
-#include <torch/serialize/tensor.h>
+#include <THC/THC.h>
 #include <cuda.h>
 #include <cuda_runtime_api.h>
-#include <vector>
-#include <THC/THC.h>
 #include <torch/extension.h>
+#include <torch/serialize/tensor.h>
+
+#include <vector>

 extern THCState *state;

 int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
-    at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
+                         at::Tensor points_tensor, at::Tensor idx_tensor,
+                         at::Tensor out_tensor);

 void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
-    const float *points, const int *idx, float *out, cudaStream_t stream);
+                                  const float *points, const int *idx,
+                                  float *out, cudaStream_t stream);

 int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
-    at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
-
-void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsample,
-    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);
+                              at::Tensor grad_out_tensor, at::Tensor idx_tensor,
+                              at::Tensor grad_points_tensor);

+void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
+                                       int nsample, const float *grad_out,
+                                       const int *idx, float *grad_points,
+                                       cudaStream_t stream);

 int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
-    at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
-
-    float *grad_points = grad_points_tensor.data<float>();
-    const int *idx = idx_tensor.data<int>();
-    const float *grad_out = grad_out_tensor.data<float>();
+                              at::Tensor grad_out_tensor, at::Tensor idx_tensor,
+                              at::Tensor grad_points_tensor) {
+  float *grad_points = grad_points_tensor.data_ptr<float>();
+  const int *idx = idx_tensor.data_ptr<int>();
+  const float *grad_out = grad_out_tensor.data_ptr<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);

-    group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx, grad_points, stream);
+  group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx,
+                                    grad_points, stream);
  return 1;
 }

-
 int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
-    at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
-
-    const float *points = points_tensor.data<float>();
-    const int *idx = idx_tensor.data<int>();
-    float *out = out_tensor.data<float>();
+                         at::Tensor points_tensor, at::Tensor idx_tensor,
+                         at::Tensor out_tensor) {
+  const float *points = points_tensor.data_ptr<float>();
+  const int *idx = idx_tensor.data_ptr<int>();
+  float *out = out_tensor.data_ptr<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);

-    group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out, stream);
+  group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out,
+                               stream);
  return 1;
 }


--- a/mmdet3d/ops/group_points/src/group_points_cuda.cu
+++ b/mmdet3d/ops/group_points/src/group_points_cuda.cu
@@ -2,10 +2,13 @@
 #include <stdlib.h>

 #define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

-__global__ void group_points_grad_kernel(int b, int c, int n, int npoints, int nsample,
-    const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
+__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
+                                         int nsample,
+                                         const float *__restrict__ grad_out,
+                                         const int *__restrict__ idx,
+                                         float *__restrict__ grad_points) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
@@ -17,23 +20,28 @@ __global__ void group_points_grad_kernel(int b, int c, int n, int npoints, int n
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;
-    grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+  grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+              pt_idx * nsample + sample_idx;
  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;

-    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0] , grad_out[0]);
+  atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
 }

-void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsample,
-    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
+void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
+                                       int nsample, const float *grad_out,
+                                       const int *idx, float *grad_points,
+                                       cudaStream_t stream) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  cudaError_t err;
-    dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
+              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

-    group_points_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);
+  group_points_grad_kernel<<<blocks, threads, 0, stream>>>(
+      b, c, n, npoints, nsample, grad_out, idx, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
@@ -42,9 +50,11 @@ void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsa
  }
 }

-
-__global__ void group_points_kernel(int b, int c, int n, int npoints, int nsample,
-    const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
+__global__ void group_points_kernel(int b, int c, int n, int npoints,
+                                    int nsample,
+                                    const float *__restrict__ points,
+                                    const int *__restrict__ idx,
+                                    float *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
@@ -59,23 +69,26 @@ __global__ void group_points_kernel(int b, int c, int n, int npoints, int nsampl

  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
  int in_idx = bs_idx * c * n + c_idx * n + idx[0];
-    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+  int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+                pt_idx * nsample + sample_idx;

  out[out_idx] = points[in_idx];
 }

-
 void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
-    const float *points, const int *idx, float *out, cudaStream_t stream) {
+                                  const float *points, const int *idx,
+                                  float *out, cudaStream_t stream) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  cudaError_t err;
-    dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
+              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

-    group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample, points, idx, out);
+  group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample,
+                                                      points, idx, out);
  // cudaDeviceSynchronize();  // for using printf in kernel function
  err = cudaGetLastError();
  if (cudaSuccess != err) {

--- a/mmdet3d/ops/interpolate/src/interpolate.cpp
+++ b/mmdet3d/ops/interpolate/src/interpolate.cpp
-#include <torch/serialize/tensor.h>
-#include <vector>
 #include <THC/THC.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <cuda.h>
-#include <cuda_runtime_api.h>
 #include <torch/extension.h>
+#include <torch/serialize/tensor.h>
+
+#include <vector>

 extern THCState *state;

 void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
-  at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
+                      at::Tensor known_tensor, at::Tensor dist2_tensor,
+                      at::Tensor idx_tensor);

 void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
-	const float *known, float *dist2, int *idx, cudaStream_t stream);
+                              const float *known, float *dist2, int *idx,
+                              cudaStream_t stream);

-
-void three_interpolate_wrapper(int b, int c, int m, int n, at::Tensor points_tensor,
-    at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
+void three_interpolate_wrapper(int b, int c, int m, int n,
+                               at::Tensor points_tensor, at::Tensor idx_tensor,
+                               at::Tensor weight_tensor, at::Tensor out_tensor);

 void three_interpolate_kernel_launcher(int b, int c, int m, int n,
-    const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream);
+                                       const float *points, const int *idx,
+                                       const float *weight, float *out,
+                                       cudaStream_t stream);

+void three_interpolate_grad_wrapper(int b, int c, int n, int m,
+                                    at::Tensor grad_out_tensor,
+                                    at::Tensor idx_tensor,
+                                    at::Tensor weight_tensor,
+                                    at::Tensor grad_points_tensor);

-void three_interpolate_grad_wrapper(int b, int c, int n, int m, at::Tensor grad_out_tensor,
-    at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
-
-void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m, const float *grad_out,
-    const int *idx, const float *weight, float *grad_points, cudaStream_t stream);
-
+void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
+                                            const float *grad_out,
+                                            const int *idx, const float *weight,
+                                            float *grad_points,
+                                            cudaStream_t stream);

 void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
-    at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
-    const float *unknown = unknown_tensor.data<float>();
-    const float *known = known_tensor.data<float>();
-    float *dist2 = dist2_tensor.data<float>();
-    int *idx = idx_tensor.data<int>();
+                      at::Tensor known_tensor, at::Tensor dist2_tensor,
+                      at::Tensor idx_tensor) {
+  const float *unknown = unknown_tensor.data_ptr<float>();
+  const float *known = known_tensor.data_ptr<float>();
+  float *dist2 = dist2_tensor.data_ptr<float>();
+  int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
 }

-
 void three_interpolate_wrapper(int b, int c, int m, int n,
-                         at::Tensor points_tensor,
-                         at::Tensor idx_tensor,
+                               at::Tensor points_tensor, at::Tensor idx_tensor,
                               at::Tensor weight_tensor,
                               at::Tensor out_tensor) {
-
-    const float *points = points_tensor.data<float>();
-    const float *weight = weight_tensor.data<float>();
-    float *out = out_tensor.data<float>();
-    const int *idx = idx_tensor.data<int>();
+  const float *points = points_tensor.data_ptr<float>();
+  const float *weight = weight_tensor.data_ptr<float>();
+  float *out = out_tensor.data_ptr<float>();
+  const int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
-    three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out, stream);
+  three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out,
+                                    stream);
 }

 void three_interpolate_grad_wrapper(int b, int c, int n, int m,
@@ -63,19 +71,20 @@ void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor) {
-
-    const float *grad_out = grad_out_tensor.data<float>();
-    const float *weight = weight_tensor.data<float>();
-    float *grad_points = grad_points_tensor.data<float>();
-    const int *idx = idx_tensor.data<int>();
+  const float *grad_out = grad_out_tensor.data_ptr<float>();
+  const float *weight = weight_tensor.data_ptr<float>();
+  float *grad_points = grad_points_tensor.data_ptr<float>();
+  const int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
-    three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight, grad_points, stream);
+  three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight,
+                                         grad_points, stream);
 }

-
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
-    m.def("three_interpolate_wrapper", &three_interpolate_wrapper, "three_interpolate_wrapper");
-    m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper, "three_interpolate_grad_wrapper");
+  m.def("three_interpolate_wrapper", &three_interpolate_wrapper,
+        "three_interpolate_wrapper");
+  m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper,
+        "three_interpolate_grad_wrapper");
 }
--- a/mmdet3d/ops/interpolate/src/three_interpolate_cuda.cu
+++ b/mmdet3d/ops/interpolate/src/three_interpolate_cuda.cu
@@ -3,11 +3,13 @@
 #include <stdlib.h>

 #define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

-
-__global__ void three_interpolate_kernel(int b, int c, int m, int n, const float *__restrict__ points,
-    const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
+__global__ void three_interpolate_kernel(int b, int c, int m, int n,
+                                         const float *__restrict__ points,
+                                         const int *__restrict__ idx,
+                                         const float *__restrict__ weight,
+                                         float *__restrict__ out) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
@@ -25,11 +27,14 @@ __global__ void three_interpolate_kernel(int b, int c, int m, int n, const float
  idx += bs_idx * n * 3 + pt_idx * 3;
  out += bs_idx * c * n + c_idx * n;

-    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]];
+  out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
+                weight[2] * points[idx[2]];
 }

 void three_interpolate_kernel_launcher(int b, int c, int m, int n,
-    const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream) {
+                                       const float *points, const int *idx,
+                                       const float *weight, float *out,
+                                       cudaStream_t stream) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
@@ -37,9 +42,11 @@ void three_interpolate_kernel_launcher(int b, int c, int m, int n,
  //      out: (B, C, N)

  cudaError_t err;
-    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
+              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
-    three_interpolate_kernel<<<blocks, threads, 0, stream>>>(b, c, m, n, points, idx, weight, out);
+  three_interpolate_kernel<<<blocks, threads, 0, stream>>>(b, c, m, n, points,
+                                                           idx, weight, out);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
@@ -48,9 +55,10 @@ void three_interpolate_kernel_launcher(int b, int c, int m, int n,
  }
 }

-
-__global__ void three_interpolate_grad_kernel(int b, int c, int n, int m, const float *__restrict__ grad_out,
-    const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) {
+__global__ void three_interpolate_grad_kernel(
+    int b, int c, int n, int m, const float *__restrict__ grad_out,
+    const int *__restrict__ idx, const float *__restrict__ weight,
+    float *__restrict__ grad_points) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
@@ -67,23 +75,27 @@ __global__ void three_interpolate_grad_kernel(int b, int c, int n, int m, const
  grad_points += bs_idx * c * m + c_idx * m;
  idx += bs_idx * n * 3 + pt_idx * 3;

-
  atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
  atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
  atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
 }

-void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m, const float *grad_out,
-    const int *idx, const float *weight, float *grad_points, cudaStream_t stream) {
+void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
+                                            const float *grad_out,
+                                            const int *idx, const float *weight,
+                                            float *grad_points,
+                                            cudaStream_t stream) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  //      grad_points: (B, C, M)

  cudaError_t err;
-    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
+              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
-    three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, m, grad_out, idx, weight, grad_points);
+  three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(
+      b, c, n, m, grad_out, idx, weight, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {

--- a/mmdet3d/ops/interpolate/src/three_nn_cuda.cu
+++ b/mmdet3d/ops/interpolate/src/three_nn_cuda.cu
@@ -3,11 +3,13 @@
 #include <stdlib.h>

 #define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

-
-__global__ void three_nn_kernel(int b, int n, int m, const float *__restrict__ unknown,
-    const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
+__global__ void three_nn_kernel(int b, int n, int m,
+                                const float *__restrict__ unknown,
+                                const float *__restrict__ known,
+                                float *__restrict__ dist2,
+                                int *__restrict__ idx) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
@@ -35,25 +37,33 @@ __global__ void three_nn_kernel(int b, int n, int m, const float *__restrict__ u
    float z = known[k * 3 + 2];
    float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
    if (d < best1) {
-            best3 = best2; besti3 = besti2;
-            best2 = best1; besti2 = besti1;
-            best1 = d; besti1 = k;
-        }
-        else if (d < best2) {
-            best3 = best2; besti3 = besti2;
-            best2 = d; besti2 = k;
+      best3 = best2;
+      besti3 = besti2;
+      best2 = best1;
+      besti2 = besti1;
+      best1 = d;
+      besti1 = k;
+    } else if (d < best2) {
+      best3 = best2;
+      besti3 = besti2;
+      best2 = d;
+      besti2 = k;
+    } else if (d < best3) {
+      best3 = d;
+      besti3 = k;
    }
-        else if (d < best3) {
-            best3 = d; besti3 = k;
  }
-    }
-    dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
-    idx[0] = besti1; idx[1] = besti2; idx[2] = besti3;
+  dist2[0] = best1;
+  dist2[1] = best2;
+  dist2[2] = best3;
+  idx[0] = besti1;
+  idx[1] = besti2;
+  idx[2] = besti3;
 }

-
 void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
-    const float *known, float *dist2, int *idx, cudaStream_t stream) {
+                              const float *known, float *dist2, int *idx,
+                              cudaStream_t stream) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
@@ -61,10 +71,12 @@ void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
  //      idx: (B, N, 3)

  cudaError_t err;
-    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
+  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK),
+              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

-    three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known, dist2, idx);
+  three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known,
+                                                  dist2, idx);

  err = cudaGetLastError();
  if (cudaSuccess != err) {

--- a/mmdet3d/ops/iou3d/src/iou3d_kernel.cu
+++ b/mmdet3d/ops/iou3d/src/iou3d_kernel.cu
--- a/mmdet3d/ops/roiaware_pool3d/__init__.py
+++ b/mmdet3d/ops/roiaware_pool3d/__init__.py
-from .points_in_boxes import points_in_boxes_cpu, points_in_boxes_gpu
+from .points_in_boxes import (points_in_boxes_batch, points_in_boxes_cpu,
+                              points_in_boxes_gpu)
 from .roiaware_pool3d import RoIAwarePool3d

-__all__ = ['RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu']
+__all__ = [
+    'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu',
+    'points_in_boxes_batch'
+]
--- a/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py
+++ b/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py
@@ -53,3 +53,29 @@ def points_in_boxes_cpu(points, boxes):
                                            point_indices)

    return point_indices
+
+
+def points_in_boxes_batch(points, boxes):
+    """Find points that are in boxes (CUDA)
+
+    Args:
+        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate
+        boxes (torch.Tensor): [B, T, 7],
+            num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate,
+            (x, y, z) is the bottom center
+
+    Returns:
+        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0
+    """
+    assert boxes.shape[0] == points.shape[0]  # same batch size B
+    assert boxes.shape[2] == 7  # 7 parameters per box
+    batch_size, num_points, _ = points.shape
+    num_boxes = boxes.shape[1]
+
+    box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
+                                       dtype=torch.int).fill_(0)  # redundant
+    roiaware_pool3d_ext.points_in_boxes_batch(boxes.contiguous(),  # in-place
+                                              points.contiguous(),
+                                              box_idxs_of_pts)
+
+    return box_idxs_of_pts
--- a/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu
+++ b/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu
@@ -77,6 +77,34 @@ __global__ void points_in_boxes_kernel(int batch_size, int boxes_num,
  }
 }

+__global__ void points_in_boxes_batch_kernel(int batch_size, int boxes_num,
+                                             int pts_num, const float *boxes,
+                                             const float *pts,
+                                             int *box_idx_of_points) {
+  // boxes: (B, N, 7) [x, y, z, w, l, h, rz], LiDAR coords, z = bottom center.
+  // pts: (B, npoints, 3) [x, y, z], LiDAR coords. One thread per point.
+  // box_idx_of_points: (B, npoints, N); entry k is set to 1 when the point is
+  // in box k, else left as is (caller pre-fills; Python wrapper uses zeros).
+
+  int bs_idx = blockIdx.y;  // batch index comes from grid y
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;  // point index
+  if (bs_idx >= batch_size || pt_idx >= pts_num) return;  // out-of-range exit
+
+  boxes += bs_idx * boxes_num * 7;  // first box of this batch item
+  pts += bs_idx * pts_num * 3 + pt_idx * 3;  // this thread's point
+  box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
+
+  float local_x = 0, local_y = 0;  // passed to the box test; not read here
+  int cur_in_flag = 0;
+  for (int k = 0; k < boxes_num; k++) {
+    cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
+    if (cur_in_flag) {
+      box_idx_of_points[k] = 1;  // each box has its own slot
+    }
+    cur_in_flag = 0;  // reset; it is reassigned at the top of the next pass
+  }
+}
+
 void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
                              const float *boxes, const float *pts,
                              int *box_idx_of_points) {
@@ -102,6 +130,30 @@ void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
 #endif
 }

+void points_in_boxes_batch_launcher(int batch_size, int boxes_num, int pts_num,
+                                    const float *boxes, const float *pts,
+                                    int *box_idx_of_points) {
+  // boxes: (B, N, 7) [x, y, z, w, l, h, rz], LiDAR coords, z = bottom center.
+  // pts: (B, npoints, 3) [x, y, z], LiDAR coords.
+  // box_idx_of_points: (B, npoints, N); written by the kernel (1 = inside).
+  cudaError_t err;
+
+  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);  // x: pts, y: B
+  dim3 threads(THREADS_PER_BLOCK);
+  points_in_boxes_batch_kernel<<<blocks, threads>>>(  // no stream argument
+      batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
+
+  err = cudaGetLastError();  // reports launch errors
+  if (cudaSuccess != err) {
+    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+    exit(-1);  // terminates the whole process on failure
+  }
+
+#ifdef DEBUG
+  cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}
+
 int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
@@ -126,3 +178,27 @@ int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,

  return 1;
 }
+
+int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor,
+                          at::Tensor box_idx_of_points_tensor) {
+  // boxes: (B, N, 7) [x, y, z, w, l, h, rz], LiDAR coords, z = bottom center.
+  // pts: (B, npoints, 3) [x, y, z], LiDAR coords.
+  // box_idx_of_points: (B, npoints, N) int tensor, updated in place.
+
+  CHECK_INPUT(boxes_tensor);
+  CHECK_INPUT(pts_tensor);
+  CHECK_INPUT(box_idx_of_points_tensor);
+
+  int batch_size = boxes_tensor.size(0);  // B
+  int boxes_num = boxes_tensor.size(1);   // N
+  int pts_num = pts_tensor.size(1);       // npoints
+
+  const float *boxes = boxes_tensor.data_ptr<float>();
+  const float *pts = pts_tensor.data_ptr<float>();
+  int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
+
+  points_in_boxes_batch_launcher(batch_size, boxes_num, pts_num, boxes, pts,
+                                 box_idx_of_points);
+
+  return 1;  // always 1; the launcher exits the process on a CUDA error
+}
--- a/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp
+++ b/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp
@@ -44,6 +44,9 @@ int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
 int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor);

+int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor,
+                          at::Tensor box_idx_of_points_tensor);
+
 int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method) {
@@ -127,6 +130,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
        "roiaware pool3d backward (CUDA)");
  m.def("points_in_boxes_gpu", &points_in_boxes_gpu,
        "points_in_boxes_gpu forward (CUDA)");
+  m.def("points_in_boxes_batch", &points_in_boxes_batch,
+        "points_in_boxes_batch forward (CUDA)");
  m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
        "points_in_boxes_cpu forward (CPU)");
 }
--- a/mmdet3d/ops/sparse_block.py
+++ b/mmdet3d/ops/sparse_block.py
@@ -6,6 +6,21 @@ from mmdet.models.backbones.resnet import BasicBlock, Bottleneck


 class SparseBottleneck(Bottleneck, spconv.SparseModule):
+    """Sparse bottleneck block for PartA^2.
+
+    Bottleneck block implemented with submanifold sparse convolution.
+
+    Args:
+        inplanes (int): inplanes of block.
+        planes (int): planes of block.
+        stride (int): stride of the first block. Default: 1
+        downsample (None | Module): down sample module for block.
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Default: None
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: dict(type='BN')
+    """
+
    expansion = 4

    def __init__(self,
@@ -15,10 +30,7 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):
                 downsample=None,
                 conv_cfg=None,
                 norm_cfg=None):
-        """Sparse bottleneck block for PartA^2.

-        Bottleneck block implemented with submanifold sparse convolution.
-        """
        spconv.SparseModule.__init__(self)
        Bottleneck.__init__(
            self,
@@ -53,6 +65,21 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):


 class SparseBasicBlock(BasicBlock, spconv.SparseModule):
+    """Sparse basic block for PartA^2.
+
+    Sparse basic block implemented with submanifold sparse convolution.
+
+    Args:
+        inplanes (int): inplanes of block.
+        planes (int): planes of block.
+        stride (int): stride of the first block. Default: 1
+        downsample (None | Module): down sample module for block.
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Default: None
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: dict(type='BN')
+    """
+
    expansion = 1

    def __init__(self,
@@ -62,10 +89,6 @@ class SparseBasicBlock(BasicBlock, spconv.SparseModule):
                 downsample=None,
                 conv_cfg=None,
                 norm_cfg=None):
-        """Sparse basic block for PartA^2.
-
-        Sparse basic block implemented with submanifold sparse convolution.
-        """
        spconv.SparseModule.__init__(self)
        BasicBlock.__init__(
            self,
@@ -125,6 +148,7 @@ def make_sparse_convmodule(in_channels,
        spconv.SparseSequential: sparse convolution module.
    """
    assert isinstance(order, tuple) and len(order) <= 3
+    assert set(order) | {'conv', 'norm', 'act'} == {'conv', 'norm', 'act'}

    conv_cfg = dict(type=conv_type, indice_key=indice_key)


--- a/mmdet3d/ops/spconv/include/paramsgrid.h
+++ b/mmdet3d/ops/spconv/include/paramsgrid.h
@@ -18,13 +18,19 @@
 #include <vector>

 namespace detail {
-template <class T> int getTotalSize(std::vector<T> arg) { return arg.size(); }
+template <class T>
+int getTotalSize(std::vector<T> arg) {
+  return arg.size();
+}

 template <class T, class... TArgs>
 int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
  return arg.size() * getTotalSize(args...);
 }
-template <typename T> int getSize(std::vector<T> arg) { return arg.size(); }
+template <typename T>
+int getSize(std::vector<T> arg) {
+  return arg.size();
+}

 template <int Idx, class TT, class T>
 void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {

--- a/mmdet3d/ops/spconv/include/prettyprint.h
+++ b/mmdet3d/ops/spconv/include/prettyprint.h
--- a/mmdet3d/ops/spconv/include/spconv/box_iou.h
+++ b/mmdet3d/ops/spconv/include/spconv/box_iou.h
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-
 #ifndef BOX_IOU_H
 #define BOX_IOU_H

 #include <pybind11/pybind11.h>
 // must include pybind11/eigen.h if using eigen matrix as arguments.
+#include <pybind11/numpy.h>
+
 #include <algorithm>
 #include <boost/geometry.hpp>
-#include <pybind11/numpy.h>

 namespace spconv {
 // #include "voxelnet/core/cc/pybind11_helper.h"
@@ -40,9 +40,10 @@ inline py::array_t<DType> zeros(std::vector<long int> shape) {
 }

 template <typename DType>
-py::array_t<DType>
-rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
-          py::array_t<DType> standup_iou, DType standup_thresh) {
+py::array_t<DType> rbbox_iou(py::array_t<DType> box_corners,
+                             py::array_t<DType> qbox_corners,
+                             py::array_t<DType> standup_iou,
+                             DType standup_thresh) {
  namespace bg = boost::geometry;
  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
  typedef bg::model::polygon<point_t> polygon_t;
@@ -61,8 +62,7 @@ rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
  }
  for (int k = 0; k < K; ++k) {
    for (int n = 0; n < N; ++n) {
-      if (standup_iou_r(n, k) <= standup_thresh)
-        continue;
+      if (standup_iou_r(n, k) <= standup_thresh) continue;
      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
      bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
      bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
@@ -99,9 +99,10 @@ rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
 }

 template <typename DType>
-py::array_t<DType>
-rbbox_intersection(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
-          py::array_t<DType> standup_iou, DType standup_thresh) {
+py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
+                                      py::array_t<DType> qbox_corners,
+                                      py::array_t<DType> standup_iou,
+                                      DType standup_thresh) {
  namespace bg = boost::geometry;
  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
  typedef bg::model::polygon<point_t> polygon_t;
@@ -120,8 +121,7 @@ rbbox_intersection(py::array_t<DType> box_corners, py::array_t<DType> qbox_corne
  }
  for (int k = 0; k < K; ++k) {
    for (int n = 0; n < N; ++n) {
-      if (standup_iou_r(n, k) <= standup_thresh)
-        continue;
+      if (standup_iou_r(n, k) <= standup_thresh) continue;
      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
      bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
      bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
@@ -152,6 +152,5 @@ rbbox_intersection(py::array_t<DType> box_corners, py::array_t<DType> qbox_corne
  return overlaps;
 }

-
 }  // namespace spconv
 #endif
--- a/mmdet3d/ops/spconv/include/spconv/geometry.h
+++ b/mmdet3d/ops/spconv/include/spconv/geometry.h
@@ -15,9 +15,10 @@
 #ifndef SPCONV_GEOMETRY_H_
 #define SPCONV_GEOMETRY_H_

+#include <tensorview/tensorview.h>
+
 #include <iostream>
 #include <limits>
-#include <tensorview/tensorview.h>

 namespace spconv {
 template <typename Index, unsigned NDim>
@@ -70,8 +71,7 @@ TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
    }

    out[pointCounter * (NDim + 1) + NDim] = offset;
-    if (valid)
-      ++pointCounter;
+    if (valid) ++pointCounter;
    counter[NDim - 1] += 1;
 #pragma unroll
    for (int c = NDim - 1; c >= 0; --c) {
@@ -128,8 +128,7 @@ TV_HOST_DEVICE Index getValidOutPosTranspose(
      m *= kernelSize[j];
    }
    out[pointCounter * (NDim + 1) + NDim] = offset;
-    if (valid)
-      ++pointCounter;
+    if (valid) ++pointCounter;
    counter[NDim - 1] += 1;
 #pragma unroll
    for (int c = NDim - 1; c >= 0; --c) {
@@ -167,7 +166,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
  }
  Index numValidPoints = 0;
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index* validPoints = validPoints_.data();
+  Index *validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
  for (int j = 0; j < numActIn; ++j) {
    batchIdx = indicesIn(j, 0);
@@ -218,7 +217,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
  }
  Index numValidPoints = 0;
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index* validPoints = validPoints_.data();
+  Index *validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
  for (int j = 0; j < numActIn; ++j) {
    batchIdx = indicesIn(j, 0);
@@ -252,7 +251,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
                         tv::TensorView<Index> indiceNum,
                         const Index *const kernelSize,
                         const Index *const stride, const Index *const padding,
-                         const Index *dilation, const Index *const outSpatialShape) {
+                         const Index *dilation,
+                         const Index *const outSpatialShape) {
  Index numAct = 0;
  auto numActIn = indicesIn.dim(0);
  Index batchIdx = 0;
@@ -269,7 +269,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
  Index numValidPoints = 0;
  // Index validPoints[kernelVolume * (NDim + 1)];
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index* validPoints = validPoints_.data();
+  Index *validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
  Index index = 0;
  for (int j = 0; j < numActIn; ++j) {

--- a/mmdet3d/ops/spconv/include/spconv/indice.cu.h
+++ b/mmdet3d/ops/spconv/include/spconv/indice.cu.h
@@ -14,9 +14,9 @@

 #ifndef INDICE_CU_H_
 #define INDICE_CU_H_
-#include <tensorview/tensorview.h>
-#include <tensorview/helper_kernel.cu.h>
 #include <spconv/geometry.h>
+#include <tensorview/helper_kernel.cu.h>
+#include <tensorview/tensorview.h>

 namespace spconv {
 template <typename Index, typename IndexGrid, unsigned NDim,
@@ -115,7 +115,6 @@ __global__ void assignGridAndIndiceOutKernel(
    int numAct, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
-
  Index index;
  auto indicesOutPtr = indicesOut.data();
  for (int ix : tv::KernelLoopX<int>(numAct)) {
@@ -128,13 +127,11 @@ __global__ void assignGridAndIndiceOutKernel(
 }

 template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
-assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
-                        tv::TensorView<IndexGrid> gridsOut, int numActIn,
-                        tv::TensorView<Index> indicePairs,
+__global__ void assignIndicePairsKernel(
+    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
+    int numActIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-
  Index index;
  int kernelVolume = indicePairs.dim(0);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
@@ -148,9 +145,8 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
 }

 template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
-prepareSubMGridKernel(tv::TensorView<const Index> indicesIn,
-                  tv::TensorView<IndexGrid> gridsOut,
+__global__ void prepareSubMGridKernel(
+    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
@@ -216,10 +212,9 @@ __global__ void resetGridKernel(const Index *indicePairUnique,
 }

 template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
-resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
-                    const tv::SimpleVector<Index, NDim> outSpatialShape,
-                    int numAct) {
+__global__ void resetGridSubMKernel(
+    const Index *indices, tv::TensorView<IndexGrid> gridsOut,
+    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
  int outSpatialShapeReg[NDim];
  for (int i = 0; i < NDim; ++i) {
    outSpatialShapeReg[i] = outSpatialShape[i];

--- a/mmdet3d/ops/spconv/include/spconv/indice.h
+++ b/mmdet3d/ops/spconv/include/spconv/indice.h
@@ -16,62 +16,63 @@
 #define SPARSE_CONV_INDICE_FUNCTOR_H_
 #include <tensorview/tensorview.h>

-namespace spconv
-{
-namespace functor
-{
+namespace spconv {
+namespace functor {
 template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctorP1
-{
-    Index operator()(
-        const Device& d, tv::TensorView<const Index> indicesIn,
-        tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
-        tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
+struct CreateConvIndicePairFunctorP1 {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
-        const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose);
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose);
 };

 template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctorP2
-{
-    Index operator()(
-        const Device& d, tv::TensorView<const Index> indicesIn,
-        tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
-        tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
+struct CreateConvIndicePairFunctorP2 {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
-        const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose,
-        bool resetGrid=false);
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
 };

 template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctor
-{
-    Index operator()(
-        const Device& d, tv::TensorView<const Index> indicesIn,
-        tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
-        tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
+struct CreateConvIndicePairFunctor {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
-        const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false);
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
 };

 template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateSubMIndicePairFunctor
-{
-    Index operator()(
-        const Device& d, tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-        tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
+struct CreateSubMIndicePairFunctor {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
-        const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false);
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
 };
 }  // namespace functor
 }  // namespace spconv

--- a/mmdet3d/ops/spconv/include/spconv/maxpool.h
+++ b/mmdet3d/ops/spconv/include/spconv/maxpool.h
@@ -16,25 +16,20 @@
 #define SPARSE_MAXPOOL_FUNCTOR_H_
 #include <tensorview/tensorview.h>

-namespace spconv
-{
-namespace functor
-{
+namespace spconv {
+namespace functor {
 template <typename Device, typename T, typename Index>
-struct SparseMaxPoolForwardFunctor
-{
+struct SparseMaxPoolForwardFunctor {
  void operator()(const Device& d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const Index> indices, int size);
 };

 template <typename Device, typename T, typename Index>
-struct SparseMaxPoolBackwardFunctor
-{
+struct SparseMaxPoolBackwardFunctor {
  void operator()(const Device& d, tv::TensorView<const T> outFeatures,
                  tv::TensorView<const T> inFeatures,
-                  tv::TensorView<const T> dout,
-                  tv::TensorView<T> din,
+                  tv::TensorView<const T> dout, tv::TensorView<T> din,
                  tv::TensorView<const Index> indices, int size);
 };


--- a/mmdet3d/ops/spconv/include/spconv/mp_helper.h
+++ b/mmdet3d/ops/spconv/include/spconv/mp_helper.h
@@ -4,7 +4,8 @@
 #include <utility>

 namespace spconv {
-template <class... T> struct mp_list {};
+template <class... T>
+struct mp_list {};

 template <class T, T... I>
 using mp_list_c = mp_list<std::integral_constant<T, I>...>;
@@ -16,7 +17,8 @@ constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
  return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
 }

-template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {
+template <class F>
+constexpr F mp_for_each_impl(mp_list<>, F &&f) {
  return std::forward<F>(f);
 }

@@ -24,7 +26,8 @@ template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {

 namespace detail {

-template <class A, template <class...> class B> struct mp_rename_impl {
+template <class A, template <class...> class B>
+struct mp_rename_impl {
  // An error "no type named 'type'" here means that the first argument to
  // mp_rename is not a list
 };
@@ -39,7 +42,8 @@ struct mp_rename_impl<A<T...>, B> {
 template <class A, template <class...> class B>
 using mp_rename = typename detail::mp_rename_impl<A, B>::type;

-template <class L, class F> constexpr F mp_for_each(F &&f) {
+template <class L, class F>
+constexpr F mp_for_each(F &&f) {
  return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
 }
 }  // namespace spconv