Commit f27d308f authored by yinchimaoliang

merge master

parents c66ae813 27ebcfac
......@@ -16,11 +16,13 @@
#define NMS_CPU_H
#include <pybind11/pybind11.h>
// must include pybind11/stl.h if using containers in STL in arguments.
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <algorithm>
#include <boost/geometry.hpp>
#include <vector>
#include "box_iou.h"
#include "nms_gpu.h"
namespace spconv {
......@@ -48,13 +50,11 @@ std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
DType xx1, xx2, w, h, inter, ovr;
for (int _i = 0; _i < ndets; ++_i) {
i = order_r(_i);
if (suppressed_rw(i) == 1)
continue;
if (suppressed_rw(i) == 1) continue;
keep.push_back(i);
for (int _j = _i + 1; _j < ndets; ++_j) {
j = order_r(_j);
if (suppressed_rw(j) == 1)
continue;
if (suppressed_rw(j) == 1) continue;
xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
w = xx2 - xx1 + eps;
......@@ -65,8 +65,7 @@ std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
if (h > 0) {
inter = w * h;
ovr = inter / (area_rw(i) + area_rw(j) - inter);
if (ovr >= thresh)
suppressed_rw(j) = 1;
if (ovr >= thresh) suppressed_rw(j) = 1;
}
}
}
......@@ -97,15 +96,12 @@ std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
for (int _i = 0; _i < ndets; ++_i) {
i = order_r(_i);
if (suppressed_rw(i) == 1)
continue;
if (suppressed_rw(i) == 1) continue;
keep.push_back(i);
for (int _j = _i + 1; _j < ndets; ++_j) {
j = order_r(_j);
if (suppressed_rw(j) == 1)
continue;
if (standup_iou_r(i, j) <= 0.0)
continue;
if (suppressed_rw(j) == 1) continue;
if (standup_iou_r(i, j) <= 0.0) continue;
// std::cout << "pre_poly" << std::endl;
try {
bg::append(poly,
......@@ -164,13 +160,12 @@ std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
}
}*/
// std::cout << "post_union" << poly_union.empty() << std::endl;
if (!poly_union.empty()) { // ignore invalid box
if (!poly_union.empty()) { // ignore invalid box
union_area = bg::area(poly_union.front());
// std::cout << "post union area" << std::endl;
// std::cout << union_area << "debug" << std::endl;
overlap = inter_area / union_area;
if (overlap >= thresh)
suppressed_rw(j) = 1;
if (overlap >= thresh) suppressed_rw(j) = 1;
poly_union.clear();
}
}
......@@ -197,5 +192,5 @@ int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
nms_overlap_thresh, device_id);
}
} // namespace spconv
} // namespace spconv
#endif
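
For reference, a minimal standalone sketch (not part of this commit) of the greedy IoU-threshold loop that non_max_suppression_cpu implements in the hunks above; the [x1, y1, x2, y2] box layout, the std::vector inputs, and the function name are illustrative assumptions:

// Sketch only: greedy NMS over axis-aligned boxes. `order` is assumed to be
// sorted by descending score and `eps` plays the same role as above.
#include <algorithm>
#include <array>
#include <vector>

std::vector<int> nms_cpu_sketch(const std::vector<std::array<float, 4>>& boxes,
                                const std::vector<int>& order, float thresh,
                                float eps) {
  std::vector<int> keep;
  std::vector<char> suppressed(boxes.size(), 0);
  auto area = [&](int k) {
    return (boxes[k][2] - boxes[k][0] + eps) * (boxes[k][3] - boxes[k][1] + eps);
  };
  for (size_t _i = 0; _i < order.size(); ++_i) {
    const int i = order[_i];
    if (suppressed[i]) continue;
    keep.push_back(i);
    for (size_t _j = _i + 1; _j < order.size(); ++_j) {
      const int j = order[_j];
      if (suppressed[j]) continue;
      // overlap extents in x and y, then IoU against the kept box
      const float w = std::min(boxes[i][2], boxes[j][2]) -
                      std::max(boxes[i][0], boxes[j][0]) + eps;
      const float h = std::min(boxes[i][3], boxes[j][3]) -
                      std::max(boxes[i][1], boxes[j][1]) + eps;
      if (w > 0 && h > 0) {
        const float inter = w * h;
        const float ovr = inter / (area(i) + area(j) - inter);
        if (ovr >= thresh) suppressed[j] = 1;
      }
    }
  }
  return keep;
}
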
......@@ -16,27 +16,22 @@
#define NMS_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv
{
namespace functor
{
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct NonMaxSupressionFunctor
{
Index operator()(const Device& d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxes,
T threshold, T eps);
struct NonMaxSupressionFunctor {
Index operator()(const Device& d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxes, T threshold, T eps);
};
template <typename Device, typename T, typename Index>
struct rotateNonMaxSupressionFunctor
{
Index operator()(const Device& d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxCorners,
tv::TensorView<const T> standupIoU, T threshold);
struct rotateNonMaxSupressionFunctor {
Index operator()(const Device& d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxCorners,
tv::TensorView<const T> standupIoU, T threshold);
};
} // namespace functor
} // namespace spconv
} // namespace functor
} // namespace spconv
#endif
......@@ -16,13 +16,15 @@
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
// must include pybind11/stl.h if using containers in STL in arguments.
#include <algorithm>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <algorithm>
// #include <vector>
#include <iostream>
#include <math.h>
#include <iostream>
namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
......@@ -64,13 +66,11 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
if (voxel_num >= max_voxels) break;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
......@@ -87,20 +87,19 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
}
return voxel_num;
}
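
The points_to_voxel_3d_np* variants above and below share the same per-point grid lookup; a minimal sketch of that step (assuming the usual floor((p - range_min) / voxel_size) binning, with names chosen for illustration):

// Sketch only: map one point to integer voxel coordinates, stored in
// reversed (z, y, x) order as in the kernels above; returns false when the
// point falls outside the grid (the "failed" case).
#include <array>
#include <cmath>

bool point_to_voxel_coor(const std::array<float, 3>& p,
                         const std::array<float, 3>& voxel_size,
                         const std::array<float, 6>& coors_range,  // {min_xyz, max_xyz}
                         std::array<int, 3>& coor) {
  for (int j = 0; j < 3; ++j) {
    const int grid_size = static_cast<int>(
        std::round((coors_range[3 + j] - coors_range[j]) / voxel_size[j]));
    const int c = static_cast<int>(
        std::floor((p[j] - coors_range[j]) / voxel_size[j]));
    if (c < 0 || c >= grid_size) return false;  // point outside the voxel grid
    coor[2 - j] = c;  // coor[ndim_minus_1 - j] in the code above
  }
  return true;
}
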
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> means,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
py::array_t<DType> voxels,
py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
......@@ -131,13 +130,11 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
if (voxel_num >= max_voxels) break;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
......@@ -151,14 +148,15 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
}
num_points_per_voxel_rw(voxelidx) += 1;
for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) += (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
}
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j){
for (int j = num; j < max_points; ++j) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k);
}
......@@ -168,15 +166,12 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> height,
py::array_t<DType> maxs,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
int points_to_voxel_3d_np_height(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto height_rw = height.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
......@@ -208,13 +203,11 @@ int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> v
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
if (voxel_num >= max_voxels) break;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
......@@ -225,7 +218,8 @@ int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> v
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
height_rw(voxelidx, k) = std::min(points_rw(i, k), height_rw(voxelidx, k));
height_rw(voxelidx, k) =
std::min(points_rw(i, k), height_rw(voxelidx, k));
maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
}
num_points_per_voxel_rw(voxelidx) += 1;
......@@ -241,15 +235,11 @@ int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> v
}
template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points,
py::array_t<int> mask,
py::array_t<DType> height,
py::array_t<DType> maxs,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range,
int max_voxels,
DType eps) {
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
py::array_t<DType> height, py::array_t<DType> maxs,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_voxels, DType eps) {
auto points_rw = points.template mutable_unchecked<2>();
auto height_rw = height.template mutable_unchecked<1>();
auto maxs_rw = maxs.template mutable_unchecked<1>();
......@@ -278,8 +268,7 @@ int block_filtering(py::array_t<DType> points,
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
......@@ -299,30 +288,23 @@ int block_filtering(py::array_t<DType> points,
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if ((maxs_rw(voxelidx) - height_rw(voxelidx, 2)) < eps){
if ((maxs_rw(voxelidx) - height_rw(voxelidx, 2)) < eps) {
mask(i) = 0;
}
}
}
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> voxel_mask,
py::array_t<DType> mins,
py::array_t<DType> maxs,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range,
int max_points,
int max_voxels,
int block_factor,
int block_size,
DType height_threshold) {
int points_to_voxel_3d_with_filtering(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> voxel_mask, py::array_t<DType> mins,
py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels, int block_factor, int block_size,
DType height_threshold) {
auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
......@@ -361,13 +343,11 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
if (voxel_num >= max_voxels) break;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
......@@ -381,8 +361,10 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
}
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) = std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) = std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
mins_rw(block_coor[0], block_coor[1]) =
std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) =
std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1;
}
}
......@@ -394,13 +376,15 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0]-block_size/2);
stopx = std::min(block_shape_H, block_coor[0]+block_size-block_size/2);
starty = std::max(0, block_coor[1]-block_size/2);
stopy = std::min(block_shape_W, block_coor[1]+block_size-block_size/2);
startx = std::max(0, block_coor[0] - block_size / 2);
stopx =
std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
starty = std::max(0, block_coor[1] - block_size / 2);
stopy =
std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
for (int j = startx; j < stopx; ++j){
for (int k = starty; k < stopy; ++k){
for (int j = startx; j < stopx; ++j) {
for (int k = starty; k < stopy; ++k) {
min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k));
}
......@@ -410,5 +394,4 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
return voxel_num;
}
} // namespace spconv
} // namespace spconv
......@@ -156,6 +156,6 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
}
}
} // namespace spconv
} // namespace spconv
#endif
......@@ -16,25 +16,23 @@
#define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv
{
namespace functor
{
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct SparseGatherFunctor
{
void operator()(const Device& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size);
struct SparseGatherFunctor {
void operator()(const Device& d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename T, typename Index>
struct SparseScatterAddFunctor
{
void operator()(const Device& d, tv::TensorView<T> out_features,
tv::TensorView<const T> buffer, tv::TensorView<const Index> indices,
int size, bool stable=false);
struct SparseScatterAddFunctor {
void operator()(const Device& d, tv::TensorView<T> out_features,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size,
bool stable = false);
};
} // namespace functor
} // namespace spconv
} // namespace functor
} // namespace spconv
#endif
#pragma once
// from tensorflow
namespace tv
{
namespace detail
{
namespace tv {
namespace detail {
template <typename T>
class KernelLoop
{
struct Iterator
{
__forceinline__ __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
class KernelLoop {
struct Iterator {
__forceinline__ __device__ Iterator(T index, T delta)
: index_(index), delta_(delta) {}
__forceinline__ __device__ T operator*() const { return index_; }
__forceinline__ __device__ Iterator &operator++()
{
__forceinline__ __device__ Iterator &operator++() {
index_ += delta_;
return *this;
}
__forceinline__ __device__ bool operator!=(const Iterator &other) const
{
__forceinline__ __device__ bool operator!=(const Iterator &other) const {
bool greater = index_ > other.index_;
bool less = index_ < other.index_;
// Anything past an end iterator (delta_ == 0) is equal.
// In range-based for loops, this optimizes to 'return less'.
if (!other.delta_)
{
if (!other.delta_) {
return less;
}
if (!delta_)
{
if (!delta_) {
return greater;
}
return less || greater;
}
private:
private:
T index_;
const T delta_;
};
public:
public:
__forceinline__ __device__ KernelLoop(T begin, T delta, T end)
: begin_(begin), delta_(delta), end_(end) {}
__forceinline__ __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
__forceinline__ __device__ Iterator begin() const {
return Iterator{begin_, delta_};
}
__forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
private:
private:
T begin_;
T delta_;
T end_;
};
} // namespace detail
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count)
{
} // namespace detail
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
gridDim.x * blockDim.x * NumILP, count);
gridDim.x * blockDim.x * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count)
{
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
return detail::KernelLoop<T>(blockIdx.y * blockDim.y + threadIdx.y,
gridDim.y * blockDim.y * NumILP, count);
gridDim.y * blockDim.y * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count)
{
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
return detail::KernelLoop<T>(blockIdx.z * blockDim.z + threadIdx.z,
gridDim.z * blockDim.z * NumILP, count);
gridDim.z * blockDim.z * NumILP, count);
}
} // namespace tv
} // namespace tv
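
A minimal usage sketch (not part of this commit) of the KernelLoop helpers defined above, in the grid-stride style the pooling and reordering kernels below rely on; the kernel itself is illustrative and assumes this header is included as <tensorview/helper_kernel.cu.h>:

// Sketch only: iterate 0 <= i < count across the whole grid in the
// x-dimension. KernelLoopX starts at blockIdx.x * blockDim.x + threadIdx.x
// and advances by gridDim.x * blockDim.x (* NumILP) each step.
#include <tensorview/helper_kernel.cu.h>

__global__ void scaleKernel(float* data, float alpha, int count) {
  for (int i : tv::KernelLoopX<int>(count)) {
    data[i] *= alpha;
  }
}
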
......@@ -13,10 +13,11 @@
// limitations under the License.
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <iostream>
#include <memory>
// #include <prettyprint.h>
......@@ -42,22 +43,22 @@ namespace tv {
#define TV_HOST_DEVICE
#endif
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) \
printf(__VA_ARGS__); \
assert(expr); \
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__); \
assert(expr); \
}
template <class SStream, class T> void sstream_print(SStream &ss, T val) {
template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
ss << val;
}
......@@ -67,37 +68,37 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
sstream_print(ss, args...);
}
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR() \
{ \
auto err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \
throw std::runtime_error(__macro_s.str()); \
} \
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR() \
{ \
auto err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \
throw std::runtime_error(__macro_s.str()); \
} \
}
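
A small host-side sketch of how the assertion macros above are meant to be used; the surrounding function and message are illustrative assumptions:

// Sketch only: TV_ASSERT_RT_ERR throws std::runtime_error carrying the file,
// line and message; TV_CHECK_CUDA_ERR turns the last CUDA error into an
// exception, typically right after a kernel launch.
void demo_checks(int num_planes) {
  TV_ASSERT_RT_ERR(num_planes > 0, "num_planes must be positive, got ", num_planes);
  // ... launch a CUDA kernel here ...
  TV_CHECK_CUDA_ERR();
}
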
struct GPU {
......@@ -130,7 +131,7 @@ constexpr size_t calc_align(size_t ndim)
*/
template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
public:
public:
TV_HOST_DEVICE_INLINE SimpleVector(){};
TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
TV_ASSERT(q.size() <= MaxDim);
......@@ -187,7 +188,7 @@ public:
typedef size_t size_type;
class iterator {
public:
public:
typedef iterator self_type;
typedef T value_type;
typedef T &reference;
......@@ -213,12 +214,12 @@ public:
return ptr_ != rhs.ptr_;
}
private:
private:
pointer ptr_;
};
class const_iterator {
public:
public:
typedef const_iterator self_type;
typedef T value_type;
typedef const T &reference;
......@@ -244,7 +245,7 @@ public:
return ptr_ != rhs.ptr_;
}
private:
private:
pointer ptr_;
};
......@@ -267,7 +268,7 @@ public:
return const_iterator(mArray + mSize);
}
protected:
protected:
T mArray[MaxDim];
size_t mSize = 0;
};
......@@ -275,11 +276,9 @@ protected:
template <typename T, size_t MaxDim>
bool operator==(const SimpleVector<T, MaxDim> &lfs,
const SimpleVector<T, MaxDim> &rfs) {
if (lfs.size() != rfs.size())
return false;
if (lfs.size() != rfs.size()) return false;
for (size_t i = 0; i < lfs.size(); ++i) {
if (lfs[i] != rfs[i])
return false;
if (lfs[i] != rfs[i]) return false;
}
return true;
}
......@@ -287,12 +286,12 @@ bool operator==(const SimpleVector<T, MaxDim> &lfs,
template <typename T, size_t MaxDim>
bool operator!=(const SimpleVector<T, MaxDim> &lfs,
const SimpleVector<T, MaxDim> &rfs) {
return !(lfs == rfs);
}
struct Slice {
template <class... Integers> TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
template <class... Integers>
TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
SimpleVector<int, 3> slices{int(ints)...};
mSlices[0] = -1;
......@@ -333,7 +332,7 @@ struct Slice {
return mSlices[idx];
}
protected:
protected:
int mSlices[3];
};
......@@ -372,8 +371,7 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
}
TV_HOST_DEVICE_INLINE size_t size() const {
if (this->mSize == 0)
return 0;
if (this->mSize == 0) return 0;
size_t s = 1;
for (int i = 0; i < int(this->mSize); ++i) {
s *= this->mArray[i];
......@@ -384,16 +382,14 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (this->mArray[i] != 1)
shape.push_back(this->mArray[i]);
if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (i != dim || this->mArray[i] != 1)
shape.push_back(this->mArray[i]);
if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
......@@ -479,7 +475,8 @@ TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
return index;
}
template <int N> struct ArrayIndexRowMajor {
template <int N>
struct ArrayIndexRowMajor {
// mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
......@@ -488,7 +485,8 @@ template <int N> struct ArrayIndexRowMajor {
}
};
template <> struct ArrayIndexRowMajor<0> {
template <>
struct ArrayIndexRowMajor<0> {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return 0;
......@@ -496,24 +494,36 @@ template <> struct ArrayIndexRowMajor<0> {
};
namespace detail {
template <typename T> constexpr const char *simpleTypeName(T val = T());
template <> constexpr const char *simpleTypeName(float val) {
template <typename T>
constexpr const char *simpleTypeName(T val = T());
template <>
constexpr const char *simpleTypeName(float val) {
return "float32";
}
template <> constexpr const char *simpleTypeName(double val) {
template <>
constexpr const char *simpleTypeName(double val) {
return "float64";
}
template <> constexpr const char *simpleTypeName(int val) { return "int32"; }
template <> constexpr const char *simpleTypeName(unsigned val) {
template <>
constexpr const char *simpleTypeName(int val) {
return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned val) {
return "uint32";
}
template <> constexpr const char *simpleTypeName(long val) { return "int64"; }
template <> constexpr const char *simpleTypeName(unsigned long val) {
template <>
constexpr const char *simpleTypeName(long val) {
return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long val) {
return "uint64";
}
}; // namespace detail
}; // namespace detail
template <typename T, int Rank = -1> struct TensorView {
template <typename T, int Rank = -1>
struct TensorView {
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Shape shape)
: mPtr(ptr), mShape(shape) {}
......@@ -526,29 +536,28 @@ template <typename T, int Rank = -1> struct TensorView {
mShape = {int(shapes)...};
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
assign(const TensorView<T, Rank> &tensor) {
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &assign(
const TensorView<T, Rank> &tensor) {
TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
"\n");
T *ptr = mPtr;
const T *other_ptr = tensor.data();
for (size_t i = 0; i < size(); ++i)
*(ptr++) = *(other_ptr++);
for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);
return *this;
}
template <typename T1>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
assign(std::initializer_list<T1> seq) {
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &assign(
std::initializer_list<T1> seq) {
TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
"\n");
T *ptr = mPtr;
for (const T1 &s : seq)
*(ptr++) = T(s);
for (const T1 &s : seq) *(ptr++) = T(s);
return *this;
}
template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
template <class... Inds>
TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
......@@ -610,7 +619,8 @@ template <typename T, int Rank = -1> struct TensorView {
return mPtr[0];
}
template <class T1> TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
template <class T1>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
......@@ -711,7 +721,8 @@ template <typename T, int Rank = -1> struct TensorView {
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
template <class T1> TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
template <class T1>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
......@@ -843,12 +854,12 @@ template <typename T, int Rank = -1> struct TensorView {
#endif
return mPtr[idx];
}*/
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
operator[](SimpleVector<Slice> slice_vec) {
TV_HOST_DEVICE_INLINE TensorView<T, Rank> operator[](
SimpleVector<Slice> slice_vec) {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE const TensorView<T, Rank>
operator[](SimpleVector<Slice> slice_vec) const {
TV_HOST_DEVICE_INLINE const TensorView<T, Rank> operator[](
SimpleVector<Slice> slice_vec) const {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
......@@ -917,7 +928,7 @@ template <typename T, int Rank = -1> struct TensorView {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
......@@ -952,8 +963,7 @@ template <typename T, int Rank = -1> struct TensorView {
std::string repr() const {
std::ostringstream ss;
if (empty())
return "";
if (empty()) return "";
if (mShape.ndim() == 0) {
ss << *mPtr;
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
......@@ -980,14 +990,12 @@ template <typename T, int Rank = -1> struct TensorView {
print_comma = false;
}
}
if (print_comma && i != this->size() - 1)
ss << ", ";
if (print_comma && i != this->size() - 1) ss << ", ";
for (int j = 0; j < inc_count; ++j) {
ss << "]";
}
if (i != this->size() - 1) {
if (inc_count != 0)
ss << "\n";
if (inc_count != 0) ss << "\n";
for (int j = 0; j < inc_count; ++j) {
ss << "[";
}
......@@ -1000,11 +1008,11 @@ template <typename T, int Rank = -1> struct TensorView {
return ss.str();
}
protected:
protected:
// TODO: make this function public.
// currently this function is called unexpectedly when using subview({0, 0}).
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
_subview(SimpleVector<Slice> slice_vec) {
TV_HOST_DEVICE_INLINE TensorView<T, Rank> _subview(
SimpleVector<Slice> slice_vec) {
Shape new_shape;
for (int i = 0; i < slice_vec.size(); ++i) {
new_shape.push_back(slice_vec[i][0]);
......@@ -1022,7 +1030,7 @@ protected:
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
......@@ -1041,7 +1049,8 @@ protected:
}
return TensorView<T, Rank>(mPtr + offset, reduced_shape);
}
template <typename T1> TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
template <typename T1>
TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
return Slice{int(s), -1, -1};
}
......@@ -1064,26 +1073,38 @@ Os &operator<<(Os &os, const TensorView<const T, Rank> &dt) {
}
namespace detail {
template <typename T> constexpr const char *printfTypeFormat(T val = T());
template <> constexpr const char *printfTypeFormat(float val) { return "%.2f"; }
template <> constexpr const char *printfTypeFormat(double val) {
template <typename T>
constexpr const char *printfTypeFormat(T val = T());
template <>
constexpr const char *printfTypeFormat(float val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(double val) {
return "%.2f";
}
template <> constexpr const char *printfTypeFormat(int val) { return "%d"; }
template <> constexpr const char *printfTypeFormat(unsigned val) {
template <>
constexpr const char *printfTypeFormat(int val) {
return "%d";
}
template <>
constexpr const char *printfTypeFormat(unsigned val) {
return "%u";
}
template <> constexpr const char *printfTypeFormat(long val) { return "%ld"; }
template <> constexpr const char *printfTypeFormat(unsigned long val) {
template <>
constexpr const char *printfTypeFormat(long val) {
return "%ld";
}
template <>
constexpr const char *printfTypeFormat(unsigned long val) {
return "%lu";
}
}; // namespace detail
}; // namespace detail
template <typename T>
TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
const char *format) {
if (tensor.empty())
return;
if (tensor.empty()) return;
if (tensor.ndim() == 0) {
printf(format, tensor());
printf("\n");
......@@ -1108,14 +1129,12 @@ TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
print_comma = false;
}
}
if (print_comma && i != tensor.size() - 1)
printf(", ");
if (print_comma && i != tensor.size() - 1) printf(", ");
for (int j = 0; j < inc_count; ++j) {
printf("]");
}
if (i != tensor.size() - 1) {
if (inc_count != 0)
printf("\n");
if (inc_count != 0) printf("\n");
for (int j = 0; j < inc_count; ++j) {
printf("[");
}
......@@ -1141,4 +1160,4 @@ TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
return printTensorView(TensorView<const T>(ptr, shape), format);
}
} // namespace tv
} // namespace tv
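
A minimal usage sketch of the TensorView wrapper defined in this header: a non-owning view over a caller-owned buffer, indexed with operator(). The shape, values, and function name are illustrative, and the brace-initialized Shape argument is assumed to be accepted by the constructors above:

// Sketch only: no copy is made; the view just records the pointer and shape.
#include <tensorview/tensorview.h>
#include <vector>

int tensorview_demo() {
  std::vector<float> buf(2 * 3, 0.f);
  tv::TensorView<float> view(buf.data(), {2, 3});  // shape (2, 3)
  view(1, 2) = 5.f;                       // row-major: offset 1 * 3 + 2
  return static_cast<int>(view.size());   // 6
}
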
......@@ -23,61 +23,57 @@ namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
if (transpose)
return getIndicePairsDeConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut,
gridsOut, indicePairs, indiceNum,
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
else
return getIndicePairsConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut,
gridsOut, indicePairs, indiceNum,
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
return getIndicePairsSubM<Index, IndexGrid, NDim>(
indicesIn,
gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
} // namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_CPU_INDEX(int);
......@@ -86,4 +82,4 @@ DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM
} // namespace spconv
} // namespace spconv
......@@ -13,16 +13,17 @@
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/indice.h>
#include <spconv/indice.cu.h>
#include <spconv/indice.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
......@@ -41,21 +42,20 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
bool transpose) {
Index batchSize = gridsOut.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
if (numActIn == 0) return 0;
// auto timer = spconv::CudaContextTimer<>();
if (transpose)
prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
else
prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
return 1;
......@@ -75,18 +75,17 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
Index batchSize = gridsOut.dim(0);
auto kernelVolume = indicePairs.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
if (numActIn == 0) return 0;
Index numAct = indicePairUnique.dim(0) - 1;
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR();
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR();
if (resetGrid) {
resetGridKernel<Index, IndexGrid, NDim>
......@@ -111,8 +110,7 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
if (numActIn == 0) return 0;
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
......@@ -121,38 +119,40 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation, outSpatialShape);
kernelSize, stride, padding, dilation,
outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if (resetGrid) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape, numActIn);
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
numActIn);
TV_CHECK_CUDA_ERR();
}
return numActIn;
}
};
} // namespace functor
} // namespace functor
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int, \
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int, \
NDIM>;
#define DECLARE_GPU_INDEX(Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
#define DECLARE_GPU_INDEX(Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_GPU_INDEX(int);
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
} // namespace spconv
} // namespace spconv
......@@ -62,14 +62,14 @@ struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> {
}
}
};
} // namespace functor
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
......@@ -79,4 +79,4 @@ DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
} // namespace spconv
......@@ -13,13 +13,14 @@
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/maxpool.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
......@@ -54,10 +55,11 @@ __global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolFwdGenericBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn, const Index *indicesOut,
int numHot, int numPlanes) {
__global__ void maxPoolFwdGenericBlockKernel(T *outFeatures,
const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
......@@ -160,10 +162,11 @@ __global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures, const T *dout,
T *din, const Index *indicesIn, const Index *indicesOut,
int numHot, int numPlanes) {
__global__ void maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
T in, out;
Index idxo, idxi;
......@@ -226,10 +229,11 @@ __global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void
maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
__global__ void maxPoolBwdVecBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
......@@ -255,7 +259,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] = reinterpret_cast<VecType *>(din)[idxi];
reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
......@@ -263,16 +268,18 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
bufdi[i] += bufdo[i];
}
}
reinterpret_cast<VecType *>(din)[idxi] = reinterpret_cast<VecType *>(bufdi)[0];
reinterpret_cast<VecType *>(din)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolBwdGenericKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
__global__ void maxPoolBwdGenericKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
......@@ -313,8 +320,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
if (size <= 0)
return;
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
......@@ -326,13 +332,14 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
......@@ -340,9 +347,9 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
......@@ -387,8 +394,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) {
if (size <= 0)
return;
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
......@@ -400,14 +406,15 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
dout.data(), din.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
......@@ -415,10 +422,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
......@@ -454,10 +461,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
}
};
} // namespace functor
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>; \
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
......@@ -468,4 +475,4 @@ DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
} // namespace spconv
......@@ -19,7 +19,8 @@ namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
void operator()(const tv::CPU& d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
......@@ -33,30 +34,29 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer, tv::TensorView<const Index> indices,
int size, bool stable) {
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data();
T* out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j){
for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j];
}
}
}
};
} // namespace functor
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
......@@ -66,4 +66,4 @@ DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
} // namespace spconv
......@@ -13,17 +13,18 @@
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
namespace functor {
template <typename T, typename Index>
......@@ -34,8 +35,7 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
if (size <= 0)
return;
if (size <= 0) return;
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
......@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(), indices.data(),
nHotBlock, numPlanes / vecloadFactor);
d.getStream()>>>(buffer.data(), features.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
......@@ -60,8 +61,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor);
features.data(), indices.data() + nHotBlock,
size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
notFound = false;
......@@ -89,12 +91,11 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0)
return;
if (size <= 0) return;
int numPlanes = outFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half.
sizeof(vecload_type_t) / sizeof(T); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
......@@ -108,8 +109,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
......@@ -137,11 +138,10 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
}
}
};
} // namespace functor
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
......@@ -152,4 +152,4 @@ DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
} // namespace spconv
......@@ -170,3 +170,87 @@ def test_parta2_rpnhead_getboxes():
assert result_list[0]['labels_3d'].shape == torch.Size([512])
assert result_list[0]['cls_preds'].shape == torch.Size([512, 3])
assert result_list[0]['boxes_3d'].shape == torch.Size([512, 7])
def test_vote_head():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
from mmdet3d.models.dense_heads import VoteHead
bbox_head_cfg = dict(
num_classes=10,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=10,
num_dir_bins=5,
with_rot=True,
mean_sizes=[[2.114256, 1.620300, 0.927272],
[0.791118, 1.279516, 0.718182],
[0.923508, 1.867419, 0.845495],
[0.591958, 0.552978, 0.827272],
[0.699104, 0.454178, 0.75625],
[0.69519, 1.346299, 0.736364],
[0.528526, 1.002642, 1.172878],
[0.500618, 0.632163, 0.683424],
[0.404671, 1.071108, 1.688889],
[0.76584, 1.398258, 0.472728]]),
vote_moudule_cfg=dict(
in_channels=64,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(64, 64),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[64, 32, 32, 32],
use_xyz=True,
normalize_xyz=True),
feat_channels=(64, 64),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0))
train_cfg = dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')
self = VoteHead(train_cfg=train_cfg, **bbox_head_cfg).cuda()
fp_xyz = [torch.rand([2, 64, 3], dtype=torch.float32).cuda()]
fp_features = [torch.rand([2, 64, 64], dtype=torch.float32).cuda()]
fp_indices = [torch.randint(0, 128, [2, 64]).cuda()]
input_dict = dict(
fp_xyz=fp_xyz, fp_features=fp_features, fp_indices=fp_indices)
# test forward
ret_dict = self(input_dict, 'vote')
assert ret_dict['center'].shape == torch.Size([2, 256, 3])
assert ret_dict['obj_scores'].shape == torch.Size([2, 256, 2])
assert ret_dict['size_res'].shape == torch.Size([2, 256, 10, 3])
assert ret_dict['dir_res'].shape == torch.Size([2, 256, 5])
import pytest
import torch
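# ChamferDistance should reject unsupported modes and reduction types, and for
# two point sets it returns the source->target and target->source losses plus
# the nearest-neighbour indices when return_indices=True; the functional
# chamfer_distance interface is expected to match the module's results.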
def test_chamfer_distance():
from mmdet3d.models.losses import ChamferDistance, chamfer_distance
with pytest.raises(AssertionError):
# test invalid mode
ChamferDistance(mode='smoothl1')
# test invalid type of reduction
ChamferDistance(mode='l2', reduction=None)
self = ChamferDistance(
mode='l2', reduction='sum', loss_src_weight=1.0, loss_dst_weight=1.0)
    source = torch.tensor(
        [[[-0.9888, 0.9683, -0.8494],
          [-6.4536, 4.5146, 1.6861],
          [2.0482, 5.6936, -1.4701],
          [-0.5173, 5.6472, 2.1748],
          [-2.8010, 5.4423, -1.2158],
          [2.4018, 2.4389, -0.2403],
          [-2.8811, 3.8486, 1.4750],
          [-0.2031, 3.8969, -1.5245],
          [1.3827, 4.9295, 1.1537],
          [-2.6961, 2.2621, -1.0976]],
         [[0.3692, 1.8409, -1.4983],
          [1.9995, 6.3602, 0.1798],
          [-2.1317, 4.6011, -0.7028],
          [2.4158, 3.1482, 0.3169],
          [-0.5836, 3.6250, -1.2650],
          [-1.9862, 1.6182, -1.4901],
          [2.5992, 1.2847, -0.8471],
          [-0.3467, 5.3681, -1.4755],
          [-0.8576, 3.3400, -1.7399],
          [2.7447, 4.6349, 0.1994]]])
    target = torch.tensor(
        [[[-0.4758, 1.0094, -0.8645],
          [-0.3130, 0.8564, -0.9061],
          [-0.1560, 2.0394, -0.8936],
          [-0.3685, 1.6467, -0.8271],
          [-0.2740, 2.2212, -0.7980]],
         [[1.4856, 2.5299, -1.0047],
          [2.3262, 3.3065, -0.9475],
          [2.4593, 2.5870, -0.9423],
          [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000]]])
loss_source, loss_target, indices1, indices2 = self(
source, target, return_indices=True)
assert torch.allclose(loss_source, torch.tensor(219.5936))
assert torch.allclose(loss_target, torch.tensor(22.3705))
    assert (indices1 == indices1.new_tensor(
        [[0, 4, 4, 4, 4, 2, 4, 4, 4, 3],
         [0, 1, 0, 1, 0, 4, 2, 0, 0, 1]])).all()
    assert (indices2 == indices2.new_tensor(
        [[0, 0, 0, 0, 0],
         [0, 3, 6, 0, 0]])).all()
loss_source, loss_target, indices1, indices2 = chamfer_distance(
source, target, reduction='sum')
assert torch.allclose(loss_source, torch.tensor(219.5936))
assert torch.allclose(loss_target, torch.tensor(22.3705))
    assert (indices1 == indices1.new_tensor(
        [[0, 4, 4, 4, 4, 2, 4, 4, 4, 3],
         [0, 1, 0, 1, 0, 4, 2, 0, 0, 1]])).all()
    assert (indices2 == indices2.new_tensor(
        [[0, 0, 0, 0, 0],
         [0, 3, 6, 0, 0]])).all()
import torch
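# aligned_3d_nms operates on axis-aligned boxes given as
# (x1, y1, z1, x2, y2, z2); with per-box scores, class labels and an IoU
# threshold of 0.25 it should keep exactly the indices in expected_pick.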
def test_aligned_3d_nms():
from mmdet3d.core.post_processing import aligned_3d_nms
boxes = torch.tensor([[1.2261, 0.6679, -1.2678, 2.6547, 1.0428, 0.1000],
[5.0919, 0.6512, 0.7238, 5.4821, 1.2451, 2.1095],
[6.8392, -1.2205, 0.8570, 7.6920, 0.3220, 3.2223],
[3.6900, -0.4235, -1.0380, 4.4415, 0.2671, -0.1442],
[4.8071, -1.4311, 0.7004, 5.5788, -0.6837, 1.2487],
[2.1807, -1.5811, -1.1289, 3.0151, -0.1346, -0.5351],
[4.4631, -4.2588, -1.1403, 5.3012, -3.4463, -0.3212],
[4.7607, -3.3311, 0.5993, 5.2976, -2.7874, 1.2273],
[3.1265, 0.7113, -0.0296, 3.8944, 1.3532, 0.9785],
[5.5828, -3.5350, 1.0105, 8.2841, -0.0405, 3.3614],
[3.0003, -2.1099, -1.0608, 5.3423, 0.0328, 0.6252],
[2.7148, 0.6082, -1.1738, 3.6995, 1.2375, -0.0209],
[4.9263, -0.2152, 0.2889, 5.6963, 0.3416, 1.3471],
[5.0713, 1.3459, -0.2598, 5.6278, 1.9300, 1.2835],
[4.5985, -2.3996, -0.3393, 5.2705, -1.7306, 0.5698],
[4.1386, 0.5658, 0.0422, 4.8937, 1.1983, 0.9911],
[2.7694, -1.9822, -1.0637, 4.0691, 0.3575, -0.1393],
[4.6464, -3.0123, -1.0694, 5.1421, -2.4450, -0.3758],
[3.4754, 0.4443, -1.1282, 4.6727, 1.3786, 0.2550],
[2.5905, -0.3504, -1.1202, 3.1599, 0.1153, -0.3036],
[4.1336, -3.4813, 1.1477, 6.2091, -0.8776, 2.6757],
[3.9966, 0.2069, -1.1148, 5.0841, 1.0525, -0.0648],
[4.3216, -1.8647, 0.4733, 6.2069, 0.6671, 3.3363],
[4.7683, 0.4286, -0.0500, 5.5642, 1.2906, 0.8902],
[1.7337, 0.7625, -1.0058, 3.0675, 1.3617, 0.3849],
[4.7193, -3.3687, -0.9635, 5.1633, -2.7656, 1.1001],
[4.4704, -2.7744, -1.1127, 5.0971, -2.0228, -0.3150],
[2.7027, 0.6122, -0.9169, 3.3083, 1.2117, 0.6129],
[4.8789, -2.0025, 0.8385, 5.5214, -1.3668, 1.3552],
[3.7856, -1.7582, -0.1738, 5.3373, -0.6300, 0.5558]])
scores = torch.tensor([
3.6414e-03, 2.2901e-02, 2.7576e-04, 1.2238e-02, 5.9310e-04, 1.2659e-01,
2.4104e-02, 5.0742e-03, 2.3581e-03, 2.0946e-07, 8.8039e-01, 1.9127e-01,
5.0469e-05, 9.3638e-03, 3.0663e-03, 9.4350e-03, 5.3380e-02, 1.7895e-01,
2.0048e-01, 1.1294e-03, 3.0304e-08, 2.0237e-01, 1.0894e-08, 6.7972e-02,
6.7156e-01, 9.3986e-04, 7.9470e-01, 3.9736e-01, 1.8000e-04, 7.9151e-04
])
cls = torch.tensor([
8, 8, 8, 3, 3, 1, 3, 3, 7, 8, 0, 6, 7, 8, 3, 7, 2, 7, 6, 3, 8, 6, 6, 7,
6, 8, 7, 6, 3, 1
])
pick = aligned_3d_nms(boxes, scores, cls, 0.25)
expected_pick = torch.tensor([
10, 26, 24, 27, 21, 18, 17, 5, 23, 16, 6, 1, 3, 15, 13, 7, 0, 14, 8,
19, 25, 29, 4, 2, 28, 12, 9, 20, 22
])
assert torch.all(pick == expected_pick)
import pytest
import torch
from mmcv import Config
from torch.nn import BatchNorm1d, ReLU
from mmdet3d.core.bbox.samplers import IoUNegPiecewiseSampler
from mmdet3d.models import PartA2BboxHead
from mmdet3d.ops import make_sparse_convmodule
from mmdet3d.ops.spconv.conv import SubMConv3d
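# The tests below exercise PartA2BboxHead: its loss computation, RoI target
# assignment, box decoding with NMS at test time, multi-class NMS, and the
# make_sparse_convmodule helper used to build sparse convolution blocks.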
def test_loss():
self = PartA2BboxHead(
num_classes=3,
seg_in_channels=16,
part_in_channels=4,
seg_conv_channels=[64, 64],
part_conv_channels=[64, 64],
merge_conv_channels=[128, 128],
down_conv_channels=[128, 256],
shared_fc_channels=[256, 512, 512, 512],
cls_channels=[256, 256],
reg_channels=[256, 256])
cls_score = torch.Tensor([[-3.6810], [-3.9413], [-5.3971], [-17.1281],
[-5.9434], [-6.2251]])
bbox_pred = torch.Tensor(
[[
-6.3016e-03, -5.2294e-03, -1.2793e-02, -1.0602e-02, -7.4086e-04,
9.2471e-03, 7.3514e-03
],
[
-1.1975e-02, -1.1578e-02, -3.1219e-02, 2.7754e-02, 6.9775e-03,
9.4042e-04, 9.0472e-04
],
[
3.7539e-03, -9.1897e-03, -5.3666e-03, -1.0380e-05, 4.3467e-03,
4.2470e-03, 1.8355e-03
],
[
-7.6093e-02, -1.2497e-01, -9.2942e-02, 2.1404e-02, 2.3750e-02,
1.0365e-01, -1.3042e-02
],
[
2.7577e-03, -1.1514e-02, -1.1097e-02, -2.4946e-03, 2.3268e-03,
1.6797e-03, -1.4076e-03
],
[
3.9635e-03, -7.8551e-03, -3.5125e-03, 2.1229e-04, 9.7042e-03,
1.7499e-03, -5.1254e-03
]])
rois = torch.Tensor([
[0.0000, 13.3711, -12.5483, -1.9306, 1.7027, 4.2836, 1.4283, -1.1499],
[0.0000, 19.2472, -7.2655, -10.6641, 3.3078, 83.1976, 29.3337, 2.4501],
[0.0000, 13.8012, -10.9791, -3.0617, 0.2504, 1.2518, 0.8807, 3.1034],
[0.0000, 16.2736, -9.0284, -2.0494, 8.2697, 31.2336, 9.1006, 1.9208],
[0.0000, 10.4462, -13.6879, -3.1869, 7.3366, 0.3518, 1.7199, -0.7225],
[0.0000, 11.3374, -13.6671, -3.2332, 4.9934, 0.3750, 1.6033, -0.9665]
])
labels = torch.Tensor([0.7100, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000])
bbox_targets = torch.Tensor(
[[0.0598, 0.0243, -0.0984, -0.0454, 0.0066, 0.1114, 0.1714]])
pos_gt_bboxes = torch.Tensor(
[[13.6686, -12.5586, -2.1553, 1.6271, 4.3119, 1.5966, 2.1631]])
reg_mask = torch.Tensor([1, 0, 0, 0, 0, 0])
label_weights = torch.Tensor(
[0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078])
bbox_weights = torch.Tensor([1., 0., 0., 0., 0., 0.])
loss = self.loss(cls_score, bbox_pred, rois, labels, bbox_targets,
pos_gt_bboxes, reg_mask, label_weights, bbox_weights)
expected_loss_cls = torch.Tensor([
2.0579e-02, 1.5005e-04, 3.5252e-05, 0.0000e+00, 2.0433e-05, 1.5422e-05
])
expected_loss_bbox = torch.as_tensor(0.0622)
expected_loss_corner = torch.Tensor([0.1379])
assert torch.allclose(loss['loss_cls'], expected_loss_cls, 1e-3)
assert torch.allclose(loss['loss_bbox'], expected_loss_bbox, 1e-3)
assert torch.allclose(loss['loss_corner'], expected_loss_corner, 1e-3)
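# get_targets converts a sampling result (positive/negative RoIs and their
# IoUs with the ground truth) into per-RoI soft classification labels,
# encoded box targets for the positive RoI, and the corresponding regression
# masks and loss weights.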
def test_get_targets():
self = PartA2BboxHead(
num_classes=3,
seg_in_channels=16,
part_in_channels=4,
seg_conv_channels=[64, 64],
part_conv_channels=[64, 64],
merge_conv_channels=[128, 128],
down_conv_channels=[128, 256],
shared_fc_channels=[256, 512, 512, 512],
cls_channels=[256, 256],
reg_channels=[256, 256])
sampling_result = IoUNegPiecewiseSampler(
1,
pos_fraction=0.55,
neg_piece_fractions=[0.8, 0.2],
neg_iou_piece_thrs=[0.55, 0.1],
return_iou=True)
sampling_result.pos_bboxes = torch.Tensor(
[[8.1517, 0.0384, -1.9496, 1.5271, 4.1131, 1.4879, 1.2076]])
sampling_result.pos_gt_bboxes = torch.Tensor(
[[7.8417, -0.1405, -1.9652, 1.6122, 3.2838, 1.5331, -2.0835]])
sampling_result.iou = torch.Tensor([
6.7787e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
0.0000e+00, 1.2839e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
0.0000e+00, 0.0000e+00, 0.0000e+00, 7.0261e-04, 0.0000e+00, 0.0000e+00,
0.0000e+00, 0.0000e+00, 5.8915e-02, 0.0000e+00, 0.0000e+00, 0.0000e+00,
0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 5.6628e-06,
5.0271e-02, 0.0000e+00, 1.9608e-01, 0.0000e+00, 0.0000e+00, 2.3519e-01,
1.6589e-02, 0.0000e+00, 1.0162e-01, 2.1634e-02, 0.0000e+00, 0.0000e+00,
0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 5.6326e-02,
1.3810e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
4.5455e-02, 0.0000e+00, 1.0929e-03, 0.0000e+00, 8.8191e-02, 1.1012e-01,
0.0000e+00, 0.0000e+00, 0.0000e+00, 1.6236e-01, 0.0000e+00, 1.1342e-01,
1.0636e-01, 9.9803e-02, 5.7394e-02, 0.0000e+00, 1.6773e-01, 0.0000e+00,
0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 6.3464e-03,
0.0000e+00, 2.7977e-01, 0.0000e+00, 3.1252e-01, 2.1642e-01, 2.2945e-01,
0.0000e+00, 1.8297e-01, 0.0000e+00, 2.1908e-01, 1.1661e-01, 1.3513e-01,
1.5898e-01, 7.4368e-03, 1.2523e-01, 1.4735e-04, 0.0000e+00, 0.0000e+00,
0.0000e+00, 1.0948e-01, 2.5889e-01, 4.4585e-04, 8.6483e-02, 1.6376e-01,
0.0000e+00, 2.2894e-01, 2.7489e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00,
1.8334e-01, 1.0193e-01, 2.3389e-01, 1.1035e-01, 3.3700e-01, 1.4397e-01,
1.0379e-01, 0.0000e+00, 1.1226e-01, 0.0000e+00, 0.0000e+00, 1.6201e-01,
0.0000e+00, 1.3569e-01
])
rcnn_train_cfg = Config({
'assigner': [{
'type': 'MaxIoUAssigner',
'iou_calculator': {
'type': 'BboxOverlaps3D',
'coordinate': 'lidar'
},
'pos_iou_thr': 0.55,
'neg_iou_thr': 0.55,
'min_pos_iou': 0.55,
'ignore_iof_thr': -1
}, {
'type': 'MaxIoUAssigner',
'iou_calculator': {
'type': 'BboxOverlaps3D',
'coordinate': 'lidar'
},
'pos_iou_thr': 0.55,
'neg_iou_thr': 0.55,
'min_pos_iou': 0.55,
'ignore_iof_thr': -1
}, {
'type': 'MaxIoUAssigner',
'iou_calculator': {
'type': 'BboxOverlaps3D',
'coordinate': 'lidar'
},
'pos_iou_thr': 0.55,
'neg_iou_thr': 0.55,
'min_pos_iou': 0.55,
'ignore_iof_thr': -1
}],
'sampler': {
'type': 'IoUNegPiecewiseSampler',
'num': 128,
'pos_fraction': 0.55,
'neg_piece_fractions': [0.8, 0.2],
'neg_iou_piece_thrs': [0.55, 0.1],
'neg_pos_ub': -1,
'add_gt_as_proposals': False,
'return_iou': True
},
        'cls_pos_thr': 0.75,
        'cls_neg_thr': 0.25
})
    (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
     bbox_weights) = self.get_targets([sampling_result], rcnn_train_cfg)
expected_label = torch.Tensor([
0.8557, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0595, 0.0000, 0.1250, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0178, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000, 0.0498, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.1740, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000
])
expected_bbox_targets = torch.Tensor(
[[0.0805, 0.0130, 0.0047, 0.0542, -0.2252, 0.0299, -0.1495]])
expected_pos_gt_bboxes = torch.Tensor(
[[7.8417, -0.1405, -1.9652, 1.6122, 3.2838, 1.5331, -2.0835]])
expected_reg_mask = torch.LongTensor([
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0
])
expected_label_weights = torch.Tensor([
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078, 0.0078,
0.0078, 0.0078
])
expected_bbox_weights = torch.Tensor([
1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0.
])
assert torch.allclose(label, expected_label, 1e-2)
assert torch.allclose(bbox_targets, expected_bbox_targets, 1e-2)
assert torch.allclose(pos_gt_bboxes, expected_pos_gt_bboxes)
assert torch.all(reg_mask == expected_reg_mask)
assert torch.allclose(label_weights, expected_label_weights, 1e-2)
assert torch.allclose(bbox_weights, expected_bbox_weights)
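# get_bboxes decodes the RoIs with the predicted box residuals, applies
# (rotated) NMS with the thresholds from cfg and returns the surviving boxes
# together with their raw scores and predicted class labels.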
def test_get_bboxes():
if not torch.cuda.is_available():
        pytest.skip('test requires GPU and torch+cuda')
self = PartA2BboxHead(
num_classes=3,
seg_in_channels=16,
part_in_channels=4,
seg_conv_channels=[64, 64],
part_conv_channels=[64, 64],
merge_conv_channels=[128, 128],
down_conv_channels=[128, 256],
shared_fc_channels=[256, 512, 512, 512],
cls_channels=[256, 256],
reg_channels=[256, 256])
rois = torch.Tensor([[
0.0000e+00, 5.6284e+01, 2.5712e+01, -1.3196e+00, 1.5943e+00,
3.7509e+00, 1.4969e+00, 1.2105e-03
],
[
0.0000e+00, 5.4685e+01, 2.9132e+01, -1.9178e+00,
1.6337e+00, 4.1116e+00, 1.5472e+00, -1.7312e+00
],
[
0.0000e+00, 5.5927e+01, 2.5830e+01, -1.4099e+00,
1.5958e+00, 3.8861e+00, 1.4911e+00, -2.9276e+00
],
[
0.0000e+00, 5.6306e+01, 2.6310e+01, -1.3729e+00,
1.5893e+00, 3.7448e+00, 1.4924e+00, 1.6071e-01
],
[
0.0000e+00, 3.1633e+01, -5.8557e+00, -1.2541e+00,
1.6517e+00, 4.1829e+00, 1.5593e+00, -1.6037e+00
],
[
0.0000e+00, 3.1789e+01, -5.5308e+00, -1.3012e+00,
1.6412e+00, 4.1070e+00, 1.5487e+00, -1.6517e+00
]]).cuda()
cls_score = torch.Tensor([[-2.2061], [-2.1121], [-1.4478], [-2.9614],
[-0.1761], [0.7357]]).cuda()
bbox_pred = torch.Tensor(
[[
-4.7917e-02, -1.6504e-02, -2.2340e-02, 5.1296e-03, -2.0984e-02,
1.0598e-02, -1.1907e-01
],
[
-1.6261e-02, -5.4005e-02, 6.2480e-03, 1.5496e-03, -1.3285e-02,
8.1482e-03, -2.2707e-03
],
[
-3.9423e-02, 2.0151e-02, -2.1138e-02, -1.1845e-03, -1.5343e-02,
5.7208e-03, 8.5646e-03
],
[
6.3104e-02, -3.9307e-02, 2.3005e-02, -7.0528e-03, -9.2637e-05,
2.2656e-02, 1.6358e-02
],
[
-1.4864e-03, 5.6840e-02, 5.8247e-03, -3.5541e-03, -4.9658e-03,
2.5036e-03, 3.0302e-02
],
[
-4.3259e-02, -1.9963e-02, 3.5004e-02, 3.7546e-03, 1.0876e-02,
-3.9637e-04, 2.0445e-02
]]).cuda()
class_labels = [torch.Tensor([2, 2, 2, 2, 2, 2]).cuda()]
class_pred = [
torch.Tensor([[1.0877e-05, 1.0318e-05, 2.6599e-01],
[1.3105e-05, 1.1904e-05, 2.4432e-01],
[1.4530e-05, 1.4619e-05, 2.4395e-01],
[1.3251e-05, 1.3038e-05, 2.3703e-01],
[2.9156e-05, 2.5521e-05, 2.2826e-01],
[3.1665e-05, 2.9054e-05, 2.2077e-01]]).cuda()
]
cfg = Config(
dict(
use_rotate_nms=True,
use_raw_score=True,
nms_thr=0.01,
score_thr=0.1))
result_list = self.get_bboxes(rois, cls_score, bbox_pred, class_labels,
class_pred, None, cfg)
selected_bboxes, selected_scores, selected_label_preds = result_list[0]
expected_selected_bboxes = torch.Tensor(
[[56.2170, 25.9074, -1.3610, 1.6025, 3.6730, 1.5128, -0.1179],
[54.6521, 28.8846, -1.9145, 1.6362, 4.0573, 1.5599, -1.7335],
[31.6179, -5.6004, -1.2470, 1.6458, 4.1622, 1.5632, -1.5734]]).cuda()
expected_selected_scores = torch.Tensor([-2.2061, -2.1121, -0.1761]).cuda()
expected_selected_label_preds = torch.Tensor([2., 2., 2.]).cuda()
assert torch.allclose(selected_bboxes, expected_selected_bboxes, 1e-3)
assert torch.allclose(selected_scores, expected_selected_scores, 1e-3)
assert torch.allclose(selected_label_preds, expected_selected_label_preds)
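# multi_class_nms performs per-class NMS over the box probabilities; with the
# thresholds used here (0.1 and 0.001) only four of the ten candidate boxes
# are expected to survive (indices 0, 1, 4 and 8).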
def test_multi_class_nms():
if not torch.cuda.is_available():
        pytest.skip('test requires GPU and torch+cuda')
self = PartA2BboxHead(
num_classes=3,
seg_in_channels=16,
part_in_channels=4,
seg_conv_channels=[64, 64],
part_conv_channels=[64, 64],
merge_conv_channels=[128, 128],
down_conv_channels=[128, 256],
shared_fc_channels=[256, 512, 512, 512],
cls_channels=[256, 256],
reg_channels=[256, 256])
box_probs = torch.Tensor([[1.0877e-05, 1.0318e-05, 2.6599e-01],
[1.3105e-05, 1.1904e-05, 2.4432e-01],
[1.4530e-05, 1.4619e-05, 2.4395e-01],
[1.3251e-05, 1.3038e-05, 2.3703e-01],
[2.9156e-05, 2.5521e-05, 2.2826e-01],
[3.1665e-05, 2.9054e-05, 2.2077e-01],
[5.5738e-06, 6.2453e-06, 2.1978e-01],
[9.0193e-06, 9.2154e-06, 2.1418e-01],
[1.4004e-05, 1.3209e-05, 2.1316e-01],
[7.9210e-06, 8.1767e-06, 2.1304e-01]]).cuda()
box_preds = torch.Tensor(
[[
5.6217e+01, 2.5908e+01, -1.3611e+00, 1.6025e+00, 3.6730e+00,
1.5129e+00, -1.1786e-01
],
[
5.4653e+01, 2.8885e+01, -1.9145e+00, 1.6362e+00, 4.0574e+00,
1.5599e+00, -1.7335e+00
],
[
5.5809e+01, 2.5686e+01, -1.4457e+00, 1.5939e+00, 3.8270e+00,
1.4997e+00, -2.9191e+00
],
[
5.6107e+01, 2.6082e+01, -1.3557e+00, 1.5782e+00, 3.7444e+00,
1.5266e+00, 1.7707e-01
],
[
3.1618e+01, -5.6004e+00, -1.2470e+00, 1.6459e+00, 4.1622e+00,
1.5632e+00, -1.5734e+00
],
[
3.1605e+01, -5.6342e+00, -1.2467e+00, 1.6474e+00, 4.1519e+00,
1.5481e+00, -1.6313e+00
],
[
5.6211e+01, 2.7294e+01, -1.5350e+00, 1.5422e+00, 3.7733e+00,
1.5140e+00, 9.5846e-02
],
[
5.5907e+01, 2.7155e+01, -1.4712e+00, 1.5416e+00, 3.7611e+00,
1.5142e+00, -5.2059e-02
],
[
5.4000e+01, 3.0585e+01, -1.6874e+00, 1.6495e+00, 4.0376e+00,
1.5554e+00, -1.7900e+00
],
[
5.6007e+01, 2.6300e+01, -1.3945e+00, 1.5716e+00, 3.7064e+00,
1.4715e+00, -2.9639e+00
]]).cuda()
selected = self.multi_class_nms(box_probs, box_preds, 0.1, 0.001)
expected_selected = torch.Tensor([0, 1, 4, 8]).cuda()
assert torch.all(selected == expected_selected)
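# make_sparse_convmodule should validate the `order` argument and return a
# sequential module whose layers follow it, e.g. conv/norm/act by default or
# norm/act/conv for a pre-activation block.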
def test_make_sparse_convmodule():
with pytest.raises(AssertionError):
# assert invalid order setting
make_sparse_convmodule(
in_channels=4,
out_channels=8,
kernel_size=3,
indice_key='rcnn_part2',
norm_cfg=dict(type='BN1d'),
order=('norm', 'act', 'conv', 'norm'))
    with pytest.raises(AssertionError):
        # assert invalid type of order
        make_sparse_convmodule(
            in_channels=4,
            out_channels=8,
            kernel_size=3,
            indice_key='rcnn_part2',
            norm_cfg=dict(type='BN1d'),
            order=['norm', 'conv'])
    with pytest.raises(AssertionError):
        # assert invalid elements of order
        make_sparse_convmodule(
            in_channels=4,
            out_channels=8,
            kernel_size=3,
            indice_key='rcnn_part2',
            norm_cfg=dict(type='BN1d'),
            order=('conv', 'normal', 'activate'))
sparse_convmodule = make_sparse_convmodule(
in_channels=4,
out_channels=64,
kernel_size=3,
padding=1,
indice_key='rcnn_part0',
norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01))
assert isinstance(sparse_convmodule[0], SubMConv3d)
assert isinstance(sparse_convmodule[1], BatchNorm1d)
assert isinstance(sparse_convmodule[2], ReLU)
assert sparse_convmodule[1].num_features == 64
assert sparse_convmodule[1].eps == 0.001
assert sparse_convmodule[1].affine is True
assert sparse_convmodule[1].track_running_stats is True
assert isinstance(sparse_convmodule[2], ReLU)
assert sparse_convmodule[2].inplace is True
pre_act = make_sparse_convmodule(
in_channels=4,
out_channels=8,
kernel_size=3,
indice_key='rcnn_part1',
norm_cfg=dict(type='BN1d'),
order=('norm', 'act', 'conv'))
assert isinstance(pre_act[0], BatchNorm1d)
assert isinstance(pre_act[1], ReLU)
assert isinstance(pre_act[2], SubMConv3d)
import pytest
import torch
from mmdet3d.ops.roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_cpu,
from mmdet3d.ops.roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_batch,
points_in_boxes_cpu,
points_in_boxes_gpu)
......@@ -83,3 +84,29 @@ def test_points_in_boxes_cpu():
dtype=torch.int32)
assert point_indices.shape == torch.Size([2, 15])
assert (point_indices == expected_point_indices).all()
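# points_in_boxes_batch returns, for every point, a 0/1 membership indicator
# per box, so with 1 sample, 15 points and 2 boxes the output has shape
# (1, 15, 2).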
def test_points_in_boxes_batch():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
    boxes = torch.tensor(
        [[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],
          [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
        dtype=torch.float32).cuda()  # boxes (m, 7) with bottom center in lidar coordinate
    pts = torch.tensor(
        [[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
          [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
          [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20],
          [-16, -18, 9], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8],
          [-2, -3, -4]]],
        dtype=torch.float32).cuda()  # points (n, 3) in lidar coordinate
point_indices = points_in_boxes_batch(points=pts, boxes=boxes)
expected_point_indices = torch.tensor(
[[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0],
[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
dtype=torch.int32).cuda()
assert point_indices.shape == torch.Size([1, 15, 2])
assert (point_indices == expected_point_indices).all()
import torch
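# VoteModule (now under mmdet3d.models.model_utils) shifts seed points towards
# object centers; the test builds it with 3 votes per seed, an 8-channel input
# and a Chamfer-distance vote loss.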
def test_voting_module():
from mmdet3d.ops import VoteModule
def test_vote_module():
from mmdet3d.models.model_utils import VoteModule
self = VoteModule(vote_per_seed=3, in_channels=8)
vote_loss = dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)
self = VoteModule(vote_per_seed=3, in_channels=8, vote_loss=vote_loss)
seed_xyz = torch.rand([2, 64, 3], dtype=torch.float32) # (b, npoints, 3)
seed_features = torch.rand(
......
......@@ -8,8 +8,8 @@ from mmcv import track_iter_progress
from pycocotools.coco import COCO
import mmdet3d.core.bbox.box_np_ops as box_np_ops
from mmdet3d.core.evaluation.bbox_overlaps import bbox_overlaps
from mmdet3d.datasets import build_dataset
from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
from mmdet.ops import roi_align
......