"git@developer.sourcefind.cn:OpenDAS/dlib.git" did not exist on "f9d1f4d71e456d7c36f43ab4c9e1bade1940fd93"
Commit 3fff6789 authored by zhangwenwei

Merge branch 'clean_data-ptr' into 'master'

clean c files

See merge request open-mmlab/mmdet.3d!53
parents 16c3f6e1 d1b9ae40
@@ -10,11 +10,12 @@ repos:
  - repo: https://github.com/timothycrosley/isort
    rev: 4.3.21
    hooks:
      - id: isort
  - repo: https://github.com/pre-commit/mirrors-yapf
    rev: v0.30.0
    hooks:
      - id: yapf
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.5.0
    hooks:
...
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>

extern THCState *state;

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

int ball_query_wrapper(int b, int n, int m, float radius, int nsample,
                       at::Tensor new_xyz_tensor, at::Tensor xyz_tensor,
                       at::Tensor idx_tensor);

void ball_query_kernel_launcher(int b, int n, int m, float radius, int nsample,
                                const float *xyz, const float *new_xyz,
                                int *idx, cudaStream_t stream);

int ball_query_wrapper(int b, int n, int m, float radius, int nsample,
                       at::Tensor new_xyz_tensor, at::Tensor xyz_tensor,
                       at::Tensor idx_tensor) {
  CHECK_INPUT(new_xyz_tensor);
  CHECK_INPUT(xyz_tensor);
  const float *new_xyz = new_xyz_tensor.data_ptr<float>();
  const float *xyz = xyz_tensor.data_ptr<float>();
  int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  ball_query_kernel_launcher(b, n, m, radius, nsample, new_xyz, xyz, idx,
                             stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("ball_query_wrapper", &ball_query_wrapper, "ball_query_wrapper");
}
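A side note on the CHECK_INPUT macro above: it expands to two statements, so `if (cond) CHECK_INPUT(x);` would guard only the CUDA check. A common hardening, sketched here as a hypothetical variant rather than anything in this merge request, wraps the pair in do { ... } while (0):

// Hypothetical hardened variant of CHECK_INPUT; not part of this commit.
// do { ... } while (0) makes the macro behave as a single statement, so it
// stays correct inside unbraced if/else bodies.
#define CHECK_INPUT_SAFE(x) \
  do {                      \
    CHECK_CUDA(x);          \
    CHECK_CONTIGUOUS(x);    \
  } while (0)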
@@ -3,65 +3,70 @@
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void ball_query_kernel(int b, int n, int m, float radius,
                                  int nsample,
                                  const float *__restrict__ new_xyz,
                                  const float *__restrict__ xyz,
                                  int *__restrict__ idx) {
  // new_xyz: (B, M, 3)
  // xyz: (B, N, 3)
  // output:
  // idx: (B, M, nsample)
  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || pt_idx >= m) return;

  new_xyz += bs_idx * m * 3 + pt_idx * 3;
  xyz += bs_idx * n * 3;
  idx += bs_idx * m * nsample + pt_idx * nsample;

  float radius2 = radius * radius;
  float new_x = new_xyz[0];
  float new_y = new_xyz[1];
  float new_z = new_xyz[2];

  int cnt = 0;
  for (int k = 0; k < n; ++k) {
    float x = xyz[k * 3 + 0];
    float y = xyz[k * 3 + 1];
    float z = xyz[k * 3 + 2];
    float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
               (new_z - z) * (new_z - z);
    if (d2 < radius2) {
      if (cnt == 0) {
        for (int l = 0; l < nsample; ++l) {
          idx[l] = k;
        }
      }
      idx[cnt] = k;
      ++cnt;
      if (cnt >= nsample) break;
    }
  }
}

void ball_query_kernel_launcher(int b, int n, int m, float radius, int nsample,
                                const float *new_xyz, const float *xyz,
                                int *idx, cudaStream_t stream) {
  // new_xyz: (B, M, 3)
  // xyz: (B, N, 3)
  // output:
  // idx: (B, M, nsample)

  cudaError_t err;

  dim3 blocks(DIVUP(m, THREADS_PER_BLOCK),
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  ball_query_kernel<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample,
                                                    new_xyz, xyz, idx);
  // cudaDeviceSynchronize();  // for using printf in kernel function
  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
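For readers following what ball_query_kernel computes: each query point gathers up to nsample neighbor indices within radius, pre-filling the output with the first hit so unused slots stay valid. A minimal single-batch CPU sketch of the same semantics (a hypothetical helper, not part of this diff):

// Hypothetical CPU reference for ball_query_kernel, one batch element.
// new_xyz: (M, 3) query centers; xyz: (N, 3) points; idx: (M, nsample) output.
void ball_query_cpu(int n, int m, float radius, int nsample,
                    const float *new_xyz, const float *xyz, int *idx) {
  float radius2 = radius * radius;
  for (int j = 0; j < m; ++j) {
    const float *q = new_xyz + j * 3;
    int *out = idx + j * nsample;
    int cnt = 0;
    for (int k = 0; k < n && cnt < nsample; ++k) {
      float dx = q[0] - xyz[k * 3 + 0];
      float dy = q[1] - xyz[k * 3 + 1];
      float dz = q[2] - xyz[k * 3 + 2];
      if (dx * dx + dy * dy + dz * dz < radius2) {
        if (cnt == 0)  // pad every slot with the first hit, as the kernel does
          for (int l = 0; l < nsample; ++l) out[l] = k;
        out[cnt++] = k;
      }
    }
  }
}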
#include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>

extern THCState *state;

int furthest_point_sampling_wrapper(int b, int n, int m,
                                    at::Tensor points_tensor,
                                    at::Tensor temp_tensor,
                                    at::Tensor idx_tensor);

void furthest_point_sampling_kernel_launcher(int b, int n, int m,
                                             const float *dataset, float *temp,
                                             int *idxs, cudaStream_t stream);

int furthest_point_sampling_wrapper(int b, int n, int m,
                                    at::Tensor points_tensor,
                                    at::Tensor temp_tensor,
                                    at::Tensor idx_tensor) {
  const float *points = points_tensor.data_ptr<float>();
  float *temp = temp_tensor.data_ptr<float>();
  int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx, stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper,
        "furthest_point_sampling_wrapper");
}
@@ -3,179 +3,204 @@

#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

inline int opt_n_threads(int work_size) {
  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);

  return max(min(1 << pow_2, TOTAL_THREADS), 1);
}

__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
                         int idx1, int idx2) {
  const float v1 = dists[idx1], v2 = dists[idx2];
  const int i1 = dists_i[idx1], i2 = dists_i[idx2];
  dists[idx1] = max(v1, v2);
  dists_i[idx1] = v2 > v1 ? i2 : i1;
}

template <unsigned int block_size>
__global__ void furthest_point_sampling_kernel(
    int b, int n, int m, const float *__restrict__ dataset,
    float *__restrict__ temp, int *__restrict__ idxs) {
  // dataset: (B, N, 3)
  // tmp: (B, N)
  // output:
  // idx: (B, M)

  if (m <= 0) return;
  __shared__ float dists[block_size];
  __shared__ int dists_i[block_size];

  int batch_index = blockIdx.x;
  dataset += batch_index * n * 3;
  temp += batch_index * n;
  idxs += batch_index * m;

  int tid = threadIdx.x;
  const int stride = block_size;

  int old = 0;
  if (threadIdx.x == 0) idxs[0] = old;

  __syncthreads();
  for (int j = 1; j < m; j++) {
    int besti = 0;
    float best = -1;
    float x1 = dataset[old * 3 + 0];
    float y1 = dataset[old * 3 + 1];
    float z1 = dataset[old * 3 + 2];
    for (int k = tid; k < n; k += stride) {
      float x2, y2, z2;
      x2 = dataset[k * 3 + 0];
      y2 = dataset[k * 3 + 1];
      z2 = dataset[k * 3 + 2];
      // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
      // if (mag <= 1e-3)
      // continue;

      float d =
          (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
      float d2 = min(d, temp[k]);
      temp[k] = d2;
      besti = d2 > best ? k : besti;
      best = d2 > best ? d2 : best;
    }
    dists[tid] = best;
    dists_i[tid] = besti;
    __syncthreads();

    if (block_size >= 1024) {
      if (tid < 512) {
        __update(dists, dists_i, tid, tid + 512);
      }
      __syncthreads();
    }

    if (block_size >= 512) {
      if (tid < 256) {
        __update(dists, dists_i, tid, tid + 256);
      }
      __syncthreads();
    }
    if (block_size >= 256) {
      if (tid < 128) {
        __update(dists, dists_i, tid, tid + 128);
      }
      __syncthreads();
    }
    if (block_size >= 128) {
      if (tid < 64) {
        __update(dists, dists_i, tid, tid + 64);
      }
      __syncthreads();
    }
    if (block_size >= 64) {
      if (tid < 32) {
        __update(dists, dists_i, tid, tid + 32);
      }
      __syncthreads();
    }
    if (block_size >= 32) {
      if (tid < 16) {
        __update(dists, dists_i, tid, tid + 16);
      }
      __syncthreads();
    }
    if (block_size >= 16) {
      if (tid < 8) {
        __update(dists, dists_i, tid, tid + 8);
      }
      __syncthreads();
    }
    if (block_size >= 8) {
      if (tid < 4) {
        __update(dists, dists_i, tid, tid + 4);
      }
      __syncthreads();
    }
    if (block_size >= 4) {
      if (tid < 2) {
        __update(dists, dists_i, tid, tid + 2);
      }
      __syncthreads();
    }
    if (block_size >= 2) {
      if (tid < 1) {
        __update(dists, dists_i, tid, tid + 1);
      }
      __syncthreads();
    }

    old = dists_i[0];
    if (tid == 0) idxs[j] = old;
  }
}

void furthest_point_sampling_kernel_launcher(int b, int n, int m,
                                             const float *dataset, float *temp,
                                             int *idxs, cudaStream_t stream) {
  // dataset: (B, N, 3)
  // tmp: (B, N)
  // output:
  // idx: (B, M)

  cudaError_t err;
  unsigned int n_threads = opt_n_threads(n);

  switch (n_threads) {
    case 1024:
      furthest_point_sampling_kernel<1024>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 512:
      furthest_point_sampling_kernel<512>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 256:
      furthest_point_sampling_kernel<256>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 128:
      furthest_point_sampling_kernel<128>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 64:
      furthest_point_sampling_kernel<64>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 32:
      furthest_point_sampling_kernel<32>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 16:
      furthest_point_sampling_kernel<16>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 8:
      furthest_point_sampling_kernel<8>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 4:
      furthest_point_sampling_kernel<4>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 2:
      furthest_point_sampling_kernel<2>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 1:
      furthest_point_sampling_kernel<1>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    default:
      furthest_point_sampling_kernel<512>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
  }

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
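The kernel above runs one block per batch element and keeps, in temp, each point's squared distance to the nearest already-chosen sample; the shared-memory ladder of __update calls is an argmax reduction over that array. A single-threaded C++ sketch of the same recurrence (hypothetical, for illustration only; the CUDA version expects temp pre-filled by the caller):

#include <algorithm>
#include <vector>

// Hypothetical CPU reference for furthest_point_sampling_kernel (one batch).
// dataset: (N, 3). Returns the m sampled indices.
std::vector<int> fps_cpu(int n, int m, const float *dataset) {
  std::vector<float> temp(n, 1e10f);  // distance to nearest selected sample
  std::vector<int> idxs(m);
  int old = 0;
  idxs[0] = old;
  for (int j = 1; j < m; ++j) {
    int besti = 0;
    float best = -1.f;
    float x1 = dataset[old * 3 + 0];
    float y1 = dataset[old * 3 + 1];
    float z1 = dataset[old * 3 + 2];
    for (int k = 0; k < n; ++k) {
      float dx = dataset[k * 3 + 0] - x1;
      float dy = dataset[k * 3 + 1] - y1;
      float dz = dataset[k * 3 + 2] - z1;
      float d2 = std::min(dx * dx + dy * dy + dz * dz, temp[k]);
      temp[k] = d2;
      if (d2 > best) {  // argmax of min-distances = farthest point
        best = d2;
        besti = k;
      }
    }
    old = besti;
    idxs[j] = old;
  }
  return idxs;
}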
#include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>

extern THCState *state;

int gather_points_wrapper(int b, int c, int n, int npoints,
                          at::Tensor points_tensor, at::Tensor idx_tensor,
                          at::Tensor out_tensor);

void gather_points_kernel_launcher(int b, int c, int n, int npoints,
                                   const float *points, const int *idx,
                                   float *out, cudaStream_t stream);

int gather_points_grad_wrapper(int b, int c, int n, int npoints,
                               at::Tensor grad_out_tensor,
                               at::Tensor idx_tensor,
                               at::Tensor grad_points_tensor);

void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                        const float *grad_out, const int *idx,
                                        float *grad_points,
                                        cudaStream_t stream);

int gather_points_wrapper(int b, int c, int n, int npoints,
                          at::Tensor points_tensor, at::Tensor idx_tensor,
                          at::Tensor out_tensor) {
  const float *points = points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();
  float *out = out_tensor.data_ptr<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  gather_points_kernel_launcher(b, c, n, npoints, points, idx, out, stream);
  return 1;
}

int gather_points_grad_wrapper(int b, int c, int n, int npoints,
                               at::Tensor grad_out_tensor,
                               at::Tensor idx_tensor,
                               at::Tensor grad_points_tensor) {
  const float *grad_out = grad_out_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();
  float *grad_points = grad_points_tensor.data_ptr<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out, idx,
                                     grad_points, stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("gather_points_wrapper", &gather_points_wrapper,
        "gather_points_wrapper");
  m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper,
        "gather_points_grad_wrapper");
}
@@ -3,82 +3,92 @@

#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void gather_points_kernel(int b, int c, int n, int m,
                                     const float *__restrict__ points,
                                     const int *__restrict__ idx,
                                     float *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, M)
  // output:
  // out: (B, C, M)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  out += bs_idx * c * m + c_idx * m + pt_idx;
  idx += bs_idx * m + pt_idx;
  points += bs_idx * c * n + c_idx * n;
  out[0] = points[idx[0]];
}

void gather_points_kernel_launcher(int b, int c, int n, int npoints,
                                   const float *points, const int *idx,
                                   float *out, cudaStream_t stream) {
  // points: (B, C, N)
  // idx: (B, npoints)
  // output:
  // out: (B, C, npoints)

  cudaError_t err;
  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points,
                                                       idx, out);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
                                          const float *__restrict__ grad_out,
                                          const int *__restrict__ idx,
                                          float *__restrict__ grad_points) {
  // grad_out: (B, C, M)
  // idx: (B, M)
  // output:
  // grad_points: (B, C, N)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  grad_out += bs_idx * c * m + c_idx * m + pt_idx;
  idx += bs_idx * m + pt_idx;
  grad_points += bs_idx * c * n + c_idx * n;

  atomicAdd(grad_points + idx[0], grad_out[0]);
}

void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                        const float *grad_out, const int *idx,
                                        float *grad_points,
                                        cudaStream_t stream) {
  // grad_out: (B, C, npoints)
  // idx: (B, npoints)
  // output:
  // grad_points: (B, C, N)

  cudaError_t err;
  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, npoints, grad_out, idx, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
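The forward/backward pair above is a plain gather and its transpose: the forward reads points[b][c][idx[b][j]] into out[b][c][j], and the backward scatters grad_out back through the same indices, which is why it needs atomicAdd when indices repeat. A hypothetical CPU reference of the forward, for illustration:

// Hypothetical CPU reference for gather_points_kernel.
// points: (B, C, N); idx: (B, M); out: (B, C, M).
void gather_points_cpu(int b, int c, int n, int m, const float *points,
                       const int *idx, float *out) {
  for (int bi = 0; bi < b; ++bi)
    for (int ci = 0; ci < c; ++ci)
      for (int j = 0; j < m; ++j)
        out[(bi * c + ci) * m + j] =
            points[(bi * c + ci) * n + idx[bi * m + j]];
}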
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>

extern THCState *state;

int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
                         at::Tensor points_tensor, at::Tensor idx_tensor,
                         at::Tensor out_tensor);

void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
                                  const float *points, const int *idx,
                                  float *out, cudaStream_t stream);

int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
                              at::Tensor grad_out_tensor, at::Tensor idx_tensor,
                              at::Tensor grad_points_tensor);

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream);

int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
                              at::Tensor grad_out_tensor, at::Tensor idx_tensor,
                              at::Tensor grad_points_tensor) {
  float *grad_points = grad_points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();
  const float *grad_out = grad_out_tensor.data_ptr<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx,
                                    grad_points, stream);
  return 1;
}

int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
                         at::Tensor points_tensor, at::Tensor idx_tensor,
                         at::Tensor out_tensor) {
  const float *points = points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();
  float *out = out_tensor.data_ptr<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out,
                               stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &group_points_wrapper, "group_points_wrapper");
  m.def("backward", &group_points_grad_wrapper, "group_points_grad_wrapper");
}
@@ -2,84 +2,97 @@
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
                                         int nsample,
                                         const float *__restrict__ grad_out,
                                         const int *__restrict__ idx,
                                         float *__restrict__ grad_points) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  // grad_points: (B, C, N)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int pt_idx = index / nsample;
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;
  grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
              pt_idx * nsample + sample_idx;
  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;

  atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
}

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  // grad_points: (B, C, N)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  group_points_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, npoints, nsample, grad_out, idx, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void group_points_kernel(int b, int c, int n, int npoints,
                                    int nsample,
                                    const float *__restrict__ points,
                                    const int *__restrict__ idx,
                                    float *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  // out: (B, C, npoints, nsample)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int pt_idx = index / nsample;
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;

  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
  int in_idx = bs_idx * c * n + c_idx * n + idx[0];
  int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
                pt_idx * nsample + sample_idx;

  out[out_idx] = points[in_idx];
}

void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
                                  const float *points, const int *idx,
                                  float *out, cudaStream_t stream) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  // out: (B, C, npoints, nsample)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample,
                                                      points, idx, out);
  // cudaDeviceSynchronize();  // for using printf in kernel function
  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
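group_points generalizes the gather above to a two-level index table: every (npoints, nsample) entry selects one source point, producing (B, C, npoints, nsample) features. A hypothetical CPU reference of the forward:

// Hypothetical CPU reference for group_points_kernel.
// points: (B, C, N); idx: (B, npoints, nsample); out: (B, C, npoints, nsample).
void group_points_cpu(int b, int c, int n, int npoints, int nsample,
                      const float *points, const int *idx, float *out) {
  for (int bi = 0; bi < b; ++bi)
    for (int ci = 0; ci < c; ++ci)
      for (int p = 0; p < npoints; ++p)
        for (int s = 0; s < nsample; ++s) {
          int k = idx[(bi * npoints + p) * nsample + s];
          out[((bi * c + ci) * npoints + p) * nsample + s] =
              points[(bi * c + ci) * n + k];
        }
}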
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>

extern THCState *state;

void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor);

void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream);

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor, at::Tensor idx_tensor,
                               at::Tensor weight_tensor, at::Tensor out_tensor);

void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream);

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor);

void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx, const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream);

void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor) {
  const float *unknown = unknown_tensor.data_ptr<float>();
  const float *known = known_tensor.data_ptr<float>();
  float *dist2 = dist2_tensor.data_ptr<float>();
  int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
}

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor, at::Tensor idx_tensor,
                               at::Tensor weight_tensor,
                               at::Tensor out_tensor) {
  const float *points = points_tensor.data_ptr<float>();
  const float *weight = weight_tensor.data_ptr<float>();
  float *out = out_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out,
                                    stream);
}

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor) {
  const float *grad_out = grad_out_tensor.data_ptr<float>();
  const float *weight = weight_tensor.data_ptr<float>();
  float *grad_points = grad_points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight,
                                         grad_points, stream);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
  m.def("three_interpolate_wrapper", &three_interpolate_wrapper,
        "three_interpolate_wrapper");
  m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper,
        "three_interpolate_grad_wrapper");
}
@@ -3,91 +3,103 @@
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void three_interpolate_kernel(int b, int c, int m, int n,
                                         const float *__restrict__ points,
                                         const int *__restrict__ idx,
                                         const float *__restrict__ weight,
                                         float *__restrict__ out) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  // out: (B, C, N)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

  weight += bs_idx * n * 3 + pt_idx * 3;
  points += bs_idx * c * m + c_idx * m;
  idx += bs_idx * n * 3 + pt_idx * 3;
  out += bs_idx * c * n + c_idx * n;

  out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
                weight[2] * points[idx[2]];
}

void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  // out: (B, C, N)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  three_interpolate_kernel<<<blocks, threads, 0, stream>>>(b, c, m, n, points,
                                                           idx, weight, out);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void three_interpolate_grad_kernel(
    int b, int c, int n, int m, const float *__restrict__ grad_out,
    const int *__restrict__ idx, const float *__restrict__ weight,
    float *__restrict__ grad_points) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  // grad_points: (B, C, M)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

  grad_out += bs_idx * c * n + c_idx * n + pt_idx;
  weight += bs_idx * n * 3 + pt_idx * 3;
  grad_points += bs_idx * c * m + c_idx * m;
  idx += bs_idx * n * 3 + pt_idx * 3;

  atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
  atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
  atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
}

void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx, const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  // grad_points: (B, C, M)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, m, grad_out, idx, weight, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
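three_interpolate is a weighted gather: each of the n output points mixes the three source features picked by idx with the weights normally derived from three_nn distances, and the grad kernel scatters grad_out * weight back through the same three indices. A hypothetical CPU reference of the forward:

// Hypothetical CPU reference for three_interpolate_kernel.
// points: (B, C, M); idx, weight: (B, N, 3); out: (B, C, N).
void three_interpolate_cpu(int b, int c, int m, int n, const float *points,
                           const int *idx, const float *weight, float *out) {
  for (int bi = 0; bi < b; ++bi)
    for (int ci = 0; ci < c; ++ci)
      for (int j = 0; j < n; ++j) {
        const int *i3 = idx + (bi * n + j) * 3;
        const float *w3 = weight + (bi * n + j) * 3;
        const float *src = points + (bi * c + ci) * m;
        out[(bi * c + ci) * n + j] =
            w3[0] * src[i3[0]] + w3[1] * src[i3[1]] + w3[2] * src[i3[2]];
      }
}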
@@ -3,72 +3,84 @@
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void three_nn_kernel(int b, int n, int m,
                                const float *__restrict__ unknown,
                                const float *__restrict__ known,
                                float *__restrict__ dist2,
                                int *__restrict__ idx) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  // dist2: (B, N, 3)
  // idx: (B, N, 3)

  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || pt_idx >= n) return;

  unknown += bs_idx * n * 3 + pt_idx * 3;
  known += bs_idx * m * 3;
  dist2 += bs_idx * n * 3 + pt_idx * 3;
  idx += bs_idx * n * 3 + pt_idx * 3;

  float ux = unknown[0];
  float uy = unknown[1];
  float uz = unknown[2];

  double best1 = 1e40, best2 = 1e40, best3 = 1e40;
  int besti1 = 0, besti2 = 0, besti3 = 0;
  for (int k = 0; k < m; ++k) {
    float x = known[k * 3 + 0];
    float y = known[k * 3 + 1];
    float z = known[k * 3 + 2];
    float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
    if (d < best1) {
      best3 = best2;
      besti3 = besti2;
      best2 = best1;
      besti2 = besti1;
      best1 = d;
      besti1 = k;
    } else if (d < best2) {
      best3 = best2;
      besti3 = besti2;
      best2 = d;
      besti2 = k;
    } else if (d < best3) {
      best3 = d;
      besti3 = k;
    }
  }
  dist2[0] = best1;
  dist2[1] = best2;
  dist2[2] = best3;
  idx[0] = besti1;
  idx[1] = besti2;
  idx[2] = besti3;
}

void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  // dist2: (B, N, 3)
  // idx: (B, N, 3)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK),
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known,
                                                  dist2, idx);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
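The kernel keeps the three running minima in registers with unrolled if/else chains; a hypothetical single-batch CPU sketch of the same search, written with a small insertion loop instead of the unrolled comparisons:

// Hypothetical CPU reference for three_nn_kernel (one batch element).
// unknown: (N, 3); known: (M, 3); dist2, idx: (N, 3).
void three_nn_cpu(int n, int m, const float *unknown, const float *known,
                  float *dist2, int *idx) {
  for (int j = 0; j < n; ++j) {
    float ux = unknown[j * 3 + 0];
    float uy = unknown[j * 3 + 1];
    float uz = unknown[j * 3 + 2];
    double best[3] = {1e40, 1e40, 1e40};
    int besti[3] = {0, 0, 0};
    for (int k = 0; k < m; ++k) {
      float dx = ux - known[k * 3 + 0];
      float dy = uy - known[k * 3 + 1];
      float dz = uz - known[k * 3 + 2];
      double d = dx * dx + dy * dy + dz * dz;
      for (int t = 0; t < 3; ++t) {
        if (d < best[t]) {  // insert at slot t, shifting worse entries down
          for (int s = 2; s > t; --s) {
            best[s] = best[s - 1];
            besti[s] = besti[s - 1];
          }
          best[t] = d;
          besti[t] = k;
          break;
        }
      }
    }
    for (int t = 0; t < 3; ++t) {
      dist2[j * 3 + t] = static_cast<float>(best[t]);
      idx[j * 3 + t] = besti[t];
    }
  }
}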
@@ -78,9 +78,9 @@ __global__ void points_in_boxes_kernel(int batch_size, int boxes_num,
}

__global__ void points_in_boxes_batch_kernel(int batch_size, int boxes_num,
                                             int pts_num, const float *boxes,
                                             const float *pts,
                                             int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default

@@ -131,17 +131,17 @@ void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
}

void points_in_boxes_batch_launcher(int batch_size, int boxes_num, int pts_num,
                                    const float *boxes, const float *pts,
                                    int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box params pts: (B, npoints, 3) [x, y, z] in
  // LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1

  cudaError_t err;

  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);
  points_in_boxes_batch_kernel<<<blocks, threads>>>(
      batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {

@@ -180,7 +180,7 @@ int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
}

int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                          at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center. params pts: (B, npoints, 3) [x, y, z] in LiDAR
  // coordinate params boxes_idx_of_points: (B, npoints), default -1
...
@@ -18,13 +18,19 @@
#include <vector>

namespace detail {

template <class T>
int getTotalSize(std::vector<T> arg) {
  return arg.size();
}

template <class T, class... TArgs>
int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
  return arg.size() * getTotalSize(args...);
}

template <typename T>
int getSize(std::vector<T> arg) {
  return arg.size();
}

template <int Idx, class TT, class T>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {

@@ -37,7 +43,7 @@ void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
  std::get<Idx>(src) = arg[counter[Idx]];
  assigner<Idx + 1>(src, counter, args...);
}
}  // namespace detail

template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
  int length = detail::getTotalSize(args...);
...
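paramsGrid builds the Cartesian product of its argument vectors as a vector of tuples, with getTotalSize multiplying the sizes to determine the result length. A minimal usage sketch, assuming the header above is included:

#include <iostream>
#include <tuple>
#include <vector>

int main() {
  // 3 x 2 = 6 (int, float) combinations.
  auto grid = paramsGrid(std::vector<int>{1, 2, 3},
                         std::vector<float>{0.1f, 0.2f});
  std::cout << grid.size() << "\n";  // 6
  for (auto &t : grid)
    std::cout << std::get<0>(t) << " " << std::get<1>(t) << "\n";
}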
@@ -12,15 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef BOX_IOU_H
#define BOX_IOU_H
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>

namespace spconv {
// #include "voxelnet/core/cc/pybind11_helper.h"
@@ -40,9 +40,10 @@ inline py::array_t<DType> zeros(std::vector<long int> shape) {
}

template <typename DType>
py::array_t<DType> rbbox_iou(py::array_t<DType> box_corners,
                             py::array_t<DType> qbox_corners,
                             py::array_t<DType> standup_iou,
                             DType standup_thresh) {
  namespace bg = boost::geometry;
  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
  typedef bg::model::polygon<point_t> polygon_t;
@@ -61,8 +62,7 @@ rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
  }
  for (int k = 0; k < K; ++k) {
    for (int n = 0; n < N; ++n) {
      if (standup_iou_r(n, k) <= standup_thresh) continue;
      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
      bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
      bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
@@ -99,9 +99,10 @@ rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
}

template <typename DType>
py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
                                      py::array_t<DType> qbox_corners,
                                      py::array_t<DType> standup_iou,
                                      DType standup_thresh) {
  namespace bg = boost::geometry;
  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
  typedef bg::model::polygon<point_t> polygon_t;
@@ -120,8 +121,7 @@ rbbox_intersection(py::array_t<DType> box_corners, py::array_t<DType> qbox_corne
  }
  for (int k = 0; k < K; ++k) {
    for (int n = 0; n < N; ++n) {
      if (standup_iou_r(n, k) <= standup_thresh) continue;
      bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
      bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
      bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
@@ -152,6 +152,5 @@ rbbox_intersection(py::array_t<DType> box_corners, py::array_t<DType> qbox_corne
  return overlaps;
}
}  // namespace spconv
#endif
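The IoU computation in rbbox_iou reduces to a small boost::geometry pattern: build two polygons from corner points, intersect them, and divide by the union area. A self-contained sketch with made-up axis-aligned corners; all values are illustrative.

#include <boost/geometry.hpp>
#include <iostream>
#include <vector>

int main() {
  namespace bg = boost::geometry;
  typedef bg::model::point<float, 2, bg::cs::cartesian> point_t;
  typedef bg::model::polygon<point_t> polygon_t;

  // Two overlapping boxes, corners appended clockwise and closed, mirroring
  // how rbbox_iou builds poly/qpoly from the corner arrays.
  polygon_t a, b;
  for (auto p : {point_t(0, 0), point_t(0, 2), point_t(2, 2), point_t(2, 0),
                 point_t(0, 0)})
    bg::append(a, p);
  for (auto p : {point_t(1, 1), point_t(1, 3), point_t(3, 3), point_t(3, 1),
                 point_t(1, 1)})
    bg::append(b, p);

  std::vector<polygon_t> inter;
  bg::intersection(a, b, inter);  // overlap region(s)
  float ia = inter.empty() ? 0.f : bg::area(inter.front());
  float iou = ia / (bg::area(a) + bg::area(b) - ia);
  std::cout << iou << std::endl;  // 1 / 7 for these two boxes
}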
@@ -15,9 +15,10 @@
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_

#include <iostream>
#include <limits>
#include <tensorview/tensorview.h>

namespace spconv {
template <typename Index, unsigned NDim>
@@ -70,8 +71,7 @@ TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
      }
      out[pointCounter * (NDim + 1) + NDim] = offset;
      if (valid) ++pointCounter;
      counter[NDim - 1] += 1;
#pragma unroll
      for (int c = NDim - 1; c >= 0; --c) {
@@ -128,8 +128,7 @@ TV_HOST_DEVICE Index getValidOutPosTranspose(
        m *= kernelSize[j];
      }
      out[pointCounter * (NDim + 1) + NDim] = offset;
      if (valid) ++pointCounter;
      counter[NDim - 1] += 1;
#pragma unroll
      for (int c = NDim - 1; c >= 0; --c) {
@@ -167,7 +166,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
  }
  Index numValidPoints = 0;
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
  Index *validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
  for (int j = 0; j < numActIn; ++j) {
    batchIdx = indicesIn(j, 0);
@@ -218,7 +217,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
  }
  Index numValidPoints = 0;
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
  Index *validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
  for (int j = 0; j < numActIn; ++j) {
    batchIdx = indicesIn(j, 0);
@@ -252,7 +251,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
                         tv::TensorView<Index> indiceNum,
                         const Index *const kernelSize,
                         const Index *const stride, const Index *const padding,
                         const Index *dilation,
                         const Index *const outSpatialShape) {
  Index numAct = 0;
  auto numActIn = indicesIn.dim(0);
  Index batchIdx = 0;
@@ -269,7 +269,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
  Index numValidPoints = 0;
  // Index validPoints[kernelVolume * (NDim + 1)];
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
  Index *validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
  Index index = 0;
  for (int j = 0; j < numActIn; ++j) {
@@ -296,6 +296,6 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
  return numActIn;
}
}  // namespace spconv
#endif
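The counter rollover visible in both getValidOutPos variants is an N-digit odometer over the kernel volume: the last dimension increments first and carries into the next, like digits of a mixed-radix number. A tiny standalone sketch of the same loop shape; NDim and kernelSize are illustrative.

#include <cstdio>

int main() {
  const int NDim = 2;
  int kernelSize[NDim] = {3, 3};
  int counter[NDim] = {0, 0};
  for (int offset = 0; offset < 3 * 3; ++offset) {
    printf("offset %d -> (%d, %d)\n", offset, counter[0], counter[1]);
    counter[NDim - 1] += 1;                // advance fastest-varying digit
    for (int c = NDim - 1; c >= 0; --c) {  // then propagate carries
      if (counter[c] == kernelSize[c] && c > 0) {
        counter[c - 1] += 1;
        counter[c] = 0;
      }
    }
  }
  return 0;
}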
@@ -14,9 +14,9 @@
#ifndef INDICE_CU_H_
#define INDICE_CU_H_

#include <spconv/geometry.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/tensorview.h>

namespace spconv {
template <typename Index, typename IndexGrid, unsigned NDim,
@@ -115,7 +115,6 @@ __global__ void assignGridAndIndiceOutKernel(
    int numAct, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
  auto indicesOutPtr = indicesOut.data();
  for (int ix : tv::KernelLoopX<int>(numAct)) {
@@ -128,13 +127,11 @@ __global__ void assignGridAndIndiceOutKernel(
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignIndicePairsKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numActIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  Index index;
  int kernelVolume = indicePairs.dim(0);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
@@ -148,10 +145,9 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
@@ -216,10 +212,9 @@ __global__ void resetGridKernel(const Index *indicePairUnique,
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridSubMKernel(
    const Index *indices, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
  int outSpatialShapeReg[NDim];
  for (int i = 0; i < NDim; ++i) {
    outSpatialShapeReg[i] = outSpatialShape[i];
@@ -238,6 +233,6 @@ resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
  }
}
}  // namespace spconv
#endif
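All of these kernels iterate through tv::KernelLoopX, which, judging by its use here, is a grid-stride loop. An equivalent hand-rolled sketch, with the tensorview helper's internals assumed:

__global__ void kernelLoopXSketch(int n, int *out) {
  // Same coverage as tv::KernelLoopX<int>(n): each thread starts at its
  // global id and strides by the total thread count until n is exhausted.
  for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < n;
       ix += gridDim.x * blockDim.x) {
    out[ix] = ix;
  }
}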
@@ -16,64 +16,65 @@
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <tensorview/tensorview.h>

namespace spconv {
namespace functor {
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1 {
  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose);
};

template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2 {
  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid = false);
};

template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor {
  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid = false);
};

template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor {
  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid = false);
};
}  // namespace functor
}  // namespace spconv
#endif
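These functors follow the usual device-tag dispatch layout: the header declares a template parameterized on Device, and each backend defines its specialization in its own translation unit. A minimal sketch of that pattern; the tag names and the functor itself are illustrative, not from this header.

struct CPUDevice {};
struct GPUDevice {};

template <typename Device, typename Index>
struct FillFunctor;  // declared here; defined once per device elsewhere

// CPU specialization, typically in a .cc file.
template <typename Index>
struct FillFunctor<CPUDevice, Index> {
  void operator()(const CPUDevice& /*d*/, Index* dst, Index n, Index v) {
    for (Index i = 0; i < n; ++i) dst[i] = v;
  }
};

// A .cu file would define FillFunctor<GPUDevice, Index> the same way,
// keeping CUDA-only code out of the shared header.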