Commit 3fff6789 authored by zhangwenwei

Merge branch 'clean_data-ptr' into 'master'

clean c files

See merge request open-mmlab/mmdet.3d!53
parents 16c3f6e1 d1b9ae40
@@ -16,25 +16,20 @@
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <tensorview/tensorview.h>

namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct SparseMaxPoolForwardFunctor {
  void operator()(const Device& d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const Index> indices, int size);
};

template <typename Device, typename T, typename Index>
struct SparseMaxPoolBackwardFunctor {
  void operator()(const Device& d, tv::TensorView<const T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const T> dout, tv::TensorView<T> din,
                  tv::TensorView<const Index> indices, int size);
};
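// Usage note (assumption, not stated in this hunk): these functors are
// specialized per device tag; the tv::GPU specializations appear in the
// maxpool.cu hunks further down in this merge request.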
...
@@ -4,7 +4,8 @@
#include <utility>

namespace spconv {
template <class... T>
struct mp_list {};

template <class T, T... I>
using mp_list_c = mp_list<std::integral_constant<T, I>...>;
@@ -16,7 +17,8 @@ constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
  return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
}

template <class F>
constexpr F mp_for_each_impl(mp_list<>, F &&f) {
  return std::forward<F>(f);
}
@@ -24,7 +26,8 @@ template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {
namespace detail {
template <class A, template <class...> class B>
struct mp_rename_impl {
  // An error "no type named 'type'" here means that the first argument to
  // mp_rename is not a list
};
@@ -39,7 +42,8 @@ struct mp_rename_impl<A<T...>, B> {
template <class A, template <class...> class B>
using mp_rename = typename detail::mp_rename_impl<A, B>::type;

template <class L, class F>
constexpr F mp_for_each(F &&f) {
  return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
}
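// Usage sketch (illustration, not part of this commit): visit each type in a
// compile-time list, receiving a value-initialized instance per element:
//   spconv::mp_for_each<spconv::mp_list<int, float>>(
//       [](auto v) { static_assert(sizeof(v) > 0, "non-empty"); });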
}  // namespace spconv
...
@@ -16,11 +16,13 @@
#define NMS_CPU_H
#include <pybind11/pybind11.h>
// must include pybind11/stl.h if using containers in STL in arguments.
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

#include <algorithm>
#include <boost/geometry.hpp>
#include <vector>

#include "box_iou.h"
#include "nms_gpu.h"

namespace spconv {
@@ -48,13 +50,11 @@ std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
  DType xx1, xx2, w, h, inter, ovr;
  for (int _i = 0; _i < ndets; ++_i) {
    i = order_r(_i);
    if (suppressed_rw(i) == 1) continue;
    keep.push_back(i);
    for (int _j = _i + 1; _j < ndets; ++_j) {
      j = order_r(_j);
      if (suppressed_rw(j) == 1) continue;
      xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
      xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
      w = xx2 - xx1 + eps;
@@ -65,8 +65,7 @@ std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
      if (h > 0) {
        inter = w * h;
        ovr = inter / (area_rw(i) + area_rw(j) - inter);
        if (ovr >= thresh) suppressed_rw(j) = 1;
      }
    }
  }
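// Greedy NMS (summary of the loop above, added for readability): boxes arrive
// in descending score order through order_r; each kept box suppresses any
// later box whose IoU, inter / (area_i + area_j - inter), reaches thresh.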
@@ -97,15 +96,12 @@ std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
  for (int _i = 0; _i < ndets; ++_i) {
    i = order_r(_i);
    if (suppressed_rw(i) == 1) continue;
    keep.push_back(i);
    for (int _j = _i + 1; _j < ndets; ++_j) {
      j = order_r(_j);
      if (suppressed_rw(j) == 1) continue;
      if (standup_iou_r(i, j) <= 0.0) continue;
      // std::cout << "pre_poly" << std::endl;
      try {
        bg::append(poly,
@@ -169,8 +165,7 @@ std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
        // std::cout << "post union area" << std::endl;
        // std::cout << union_area << "debug" << std::endl;
        overlap = inter_area / union_area;
        if (overlap >= thresh) suppressed_rw(j) = 1;
        poly_union.clear();
      }
    }
...
@@ -16,21 +16,16 @@
#define NMS_FUNCTOR_H_
#include <tensorview/tensorview.h>

namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct NonMaxSupressionFunctor {
  Index operator()(const Device& d, tv::TensorView<Index> keep,
                   tv::TensorView<const T> boxes, T threshold, T eps);
};

template <typename Device, typename T, typename Index>
struct rotateNonMaxSupressionFunctor {
  Index operator()(const Device& d, tv::TensorView<Index> keep,
                   tv::TensorView<const T> boxCorners,
                   tv::TensorView<const T> standupIoU, T threshold);
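// Note (inferred from nms_cpu.h above, not stated in this header): standupIoU
// holds axis-aligned bounding-box overlaps used as a cheap prefilter before
// the exact rotated-polygon intersection is evaluated.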
...
@@ -16,13 +16,15 @@
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
// must include pybind11/stl.h if using containers in STL in arguments.
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

#include <algorithm>
// #include <vector>
#include <math.h>

#include <iostream>

namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
@@ -64,13 +66,11 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -87,15 +87,14 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
  }
  for (int i = 0; i < voxel_num; ++i) {
    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
  }
  return voxel_num;
}
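// Summary (comment added for readability): each point maps to an integer
// voxel coordinate; coor_to_voxelidx acts as a dense lookup table so points
// in the same voxel share a slot, capped at max_voxels voxels and max_points
// points per voxel. The table is reset to -1 before returning so the caller
// can reuse it across frames.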
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
                               py::array_t<DType> voxels,
                               py::array_t<DType> means, py::array_t<int> coors,
                               py::array_t<int> num_points_per_voxel,
                               py::array_t<int> coor_to_voxelidx,
                               std::vector<DType> voxel_size,
@@ -131,13 +130,11 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -151,14 +148,15 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
      }
      num_points_per_voxel_rw(voxelidx) += 1;
      for (int k = 0; k < num_features; ++k) {
        means_rw(voxelidx, k) +=
            (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
      }
    }
  }
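  // The update above is an incremental mean: mean_{n+1} = mean_n +
  // (x - mean_n) / (n + 1); voxels holding fewer than max_points points are
  // padded with that mean in the loop below.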
  for (int i = 0; i < voxel_num; ++i) {
    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
    num = num_points_per_voxel_rw(i);
    for (int j = num; j < max_points; ++j) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(i, j, k) = means_rw(i, k);
      }
@@ -168,15 +166,12 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_height(
    py::array_t<DType> points, py::array_t<DType> voxels,
    py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
    std::vector<DType> voxel_size, std::vector<DType> coors_range,
    int max_points, int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto height_rw = height.template mutable_unchecked<2>();
  auto maxs_rw = maxs.template mutable_unchecked<2>();
@@ -208,13 +203,11 @@ int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> v
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -225,7 +218,8 @@ int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> v
    if (num < max_points) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, num, k) = points_rw(i, k);
        height_rw(voxelidx, k) =
            std::min(points_rw(i, k), height_rw(voxelidx, k));
        maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
      }
      num_points_per_voxel_rw(voxelidx) += 1;
@@ -241,15 +235,11 @@ int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> v
}

template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
                    py::array_t<DType> height, py::array_t<DType> maxs,
                    py::array_t<int> coor_to_voxelidx,
                    std::vector<DType> voxel_size,
                    std::vector<DType> coors_range, int max_voxels, DType eps) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto height_rw = height.template mutable_unchecked<1>();
  auto maxs_rw = maxs.template mutable_unchecked<1>();
@@ -278,8 +268,7 @@ int block_filtering(py::array_t<DType> points,
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
@@ -299,29 +288,22 @@ int block_filtering(py::array_t<DType> points,
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if ((maxs_rw(voxelidx) - height_rw(voxelidx, 2)) < eps) {
      mask(i) = 0;
    }
  }
}
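// Intent (inferred from the code above, not stated in the diff):
// block_filtering clears mask(i) for points whose block shows a height span
// below eps, i.e. the neighborhood is locally flat, which typically marks
// ground points.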
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
    py::array_t<DType> points, py::array_t<DType> voxels,
    py::array_t<int> voxel_mask, py::array_t<DType> mins,
    py::array_t<DType> maxs, py::array_t<int> coors,
    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
    std::vector<DType> voxel_size, std::vector<DType> coors_range,
    int max_points, int max_voxels, int block_factor, int block_size,
    DType height_threshold) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto mins_rw = mins.template mutable_unchecked<2>();
@@ -361,13 +343,11 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -381,8 +361,10 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
      }
      block_coor[0] = coor[1] / block_factor;
      block_coor[1] = coor[2] / block_factor;
      mins_rw(block_coor[0], block_coor[1]) =
          std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
      maxs_rw(block_coor[0], block_coor[1]) =
          std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
      num_points_per_voxel_rw(voxelidx) += 1;
    }
  }
@@ -394,13 +376,15 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
    block_coor[1] = coor[2] / block_factor;
    min_value = mins_rw(block_coor[0], block_coor[1]);
    max_value = maxs_rw(block_coor[0], block_coor[1]);
    startx = std::max(0, block_coor[0] - block_size / 2);
    stopx =
        std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
    starty = std::max(0, block_coor[1] - block_size / 2);
    stopy =
        std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
    for (int j = startx; j < stopx; ++j) {
      for (int k = starty; k < stopy; ++k) {
        min_value = std::min(min_value, mins_rw(j, k));
        max_value = std::max(max_value, maxs_rw(j, k));
      }
@@ -410,5 +394,4 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
  return voxel_num;
}
}  // namespace spconv
@@ -16,23 +16,21 @@
#define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h>

namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct SparseGatherFunctor {
  void operator()(const Device& d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size);
};

template <typename Device, typename T, typename Index>
struct SparseScatterAddFunctor {
  void operator()(const Device& d, tv::TensorView<T> out_features,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size,
                  bool stable = false);
};
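// Context (an aside on the spconv execution model, not stated in this file):
// sparse convolution runs as gather -> dense GEMM -> scatter-add;
// SparseGatherFunctor packs active input features into a contiguous buffer
// and SparseScatterAddFunctor accumulates results back into out_features.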
}  // namespace functor
}  // namespace spconv
...
#pragma once
// from tensorflow
namespace tv {
namespace detail {
template <typename T>
class KernelLoop {
  struct Iterator {
    __forceinline__ __device__ Iterator(T index, T delta)
        : index_(index), delta_(delta) {}
    __forceinline__ __device__ T operator*() const { return index_; }
    __forceinline__ __device__ Iterator &operator++() {
      index_ += delta_;
      return *this;
    }
    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
      bool greater = index_ > other.index_;
      bool less = index_ < other.index_;
      // Anything past an end iterator (delta_ == 0) is equal.
      // In range-based for loops, this optimizes to 'return less'.
      if (!other.delta_) {
        return less;
      }
      if (!delta_) {
        return greater;
      }
      return less || greater;
@@ -39,41 +32,40 @@ class KernelLoop
    const T delta_;
  };

 public:
  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}
  __forceinline__ __device__ Iterator begin() const {
    return Iterator{begin_, delta_};
  }
  __forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }

 private:
  T begin_;
  T delta_;
  T end_;
};
}  // namespace detail
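// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }
// (Comment restored here by analogy with the Y/Z helpers below.)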
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
  return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
                               gridDim.x * blockDim.x * NumILP, count);
}

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
  return detail::KernelLoop<T>(blockIdx.y * blockDim.y + threadIdx.y,
                               gridDim.y * blockDim.y * NumILP, count);
}

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
  return detail::KernelLoop<T>(blockIdx.z * blockDim.z + threadIdx.z,
                               gridDim.z * blockDim.z * NumILP, count);
}
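// Example (sketch, not part of this commit): a grid-stride elementwise kernel
// built on the helper above:
//   __global__ void scaleKernel(float *data, float alpha, int n) {
//     for (int i : tv::KernelLoopX<int>(n)) data[i] *= alpha;
//   }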
...
@@ -13,10 +13,11 @@
// limitations under the License.
#pragma once

#include <cuda_runtime_api.h>

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <memory>
// #include <prettyprint.h>
@@ -52,12 +53,12 @@ namespace tv {
#define TV_DEVICE_REQUIRE(expr, ...)                        \
  {                                                         \
    if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__);   \
    assert(expr);                                           \
  }

template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
  ss << val;
}
@@ -130,7 +131,7 @@ constexpr size_t calc_align(size_t ndim)
 */
template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
 public:
  TV_HOST_DEVICE_INLINE SimpleVector(){};
  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
    TV_ASSERT(q.size() <= MaxDim);
@@ -267,7 +268,7 @@ public:
    return const_iterator(mArray + mSize);
  }

 protected:
  T mArray[MaxDim];
  size_t mSize = 0;
};
@@ -275,11 +276,9 @@ protected:
template <typename T, size_t MaxDim>
bool operator==(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  if (lfs.size() != rfs.size()) return false;
  for (size_t i = 0; i < lfs.size(); ++i) {
    if (lfs[i] != rfs[i]) return false;
  }
  return true;
}
@@ -287,12 +286,12 @@ bool operator==(const SimpleVector<T, MaxDim> &lfs,
template <typename T, size_t MaxDim>
bool operator!=(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  return !(lfs == rfs);
}

struct Slice {
  template <class... Integers>
  TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
    SimpleVector<int, 3> slices{int(ints)...};
    mSlices[0] = -1;
@@ -333,7 +332,7 @@ struct Slice {
    return mSlices[idx];
  }

 protected:
  int mSlices[3];
};
@@ -372,8 +371,7 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
  }
  TV_HOST_DEVICE_INLINE size_t size() const {
    if (this->mSize == 0) return 0;
    size_t s = 1;
    for (int i = 0; i < int(this->mSize); ++i) {
      s *= this->mArray[i];
@@ -384,16 +382,14 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
    }
    return shape;
  }
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
    }
    return shape;
  }
@@ -479,7 +475,8 @@ TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
  return index;
}

template <int N>
struct ArrayIndexRowMajor {
  // mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
                                            const Shape &indexes) {
@@ -488,7 +485,8 @@ template <int N> struct ArrayIndexRowMajor {
  }
};

template <>
struct ArrayIndexRowMajor<0> {
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
                                            const Shape &indexes) {
    return 0;
@@ -496,24 +494,36 @@ template <> struct ArrayIndexRowMajor<0> {
};
namespace detail {
template <typename T>
constexpr const char *simpleTypeName(T val = T());
template <>
constexpr const char *simpleTypeName(float val) {
  return "float32";
}
template <>
constexpr const char *simpleTypeName(double val) {
  return "float64";
}
template <>
constexpr const char *simpleTypeName(int val) {
  return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned val) {
  return "uint32";
}
template <>
constexpr const char *simpleTypeName(long val) {
  return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long val) {
  return "uint64";
}
};  // namespace detail
template <typename T, int Rank = -1>
struct TensorView {
  TV_HOST_DEVICE_INLINE TensorView() {}
  explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Shape shape)
      : mPtr(ptr), mShape(shape) {}
@@ -526,29 +536,28 @@ template <typename T, int Rank = -1> struct TensorView {
    mShape = {int(shapes)...};
  }

  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &assign(
      const TensorView<T, Rank> &tensor) {
    TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
               "\n");
    T *ptr = mPtr;
    const T *other_ptr = tensor.data();
    for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);
    return *this;
  }

  template <typename T1>
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &assign(
      std::initializer_list<T1> seq) {
    TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
               "\n");
    T *ptr = mPtr;
    for (const T1 &s : seq) *(ptr++) = T(s);
    return *this;
  }

  template <class... Inds>
  TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
#ifdef TV_DEBUG
    int idxes[sizeof...(Inds)]{int(inds)...};
    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
@@ -610,7 +619,8 @@ template <typename T, int Rank = -1> struct TensorView {
    return mPtr[0];
  }

  template <class T1>
  TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
@@ -711,7 +721,8 @@ template <typename T, int Rank = -1> struct TensorView {
    return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
  }

  template <class T1>
  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
@@ -843,12 +854,12 @@ template <typename T, int Rank = -1> struct TensorView {
#endif
    return mPtr[idx];
  }*/

  TV_HOST_DEVICE_INLINE TensorView<T, Rank> operator[](
      SimpleVector<Slice> slice_vec) {
    return _subview(slice_vec);
  }
  TV_HOST_DEVICE_INLINE const TensorView<T, Rank> operator[](
      SimpleVector<Slice> slice_vec) const {
    return _subview(slice_vec);
  }
  TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
@@ -952,8 +963,7 @@ template <typename T, int Rank = -1> struct TensorView {
  std::string repr() const {
    std::ostringstream ss;
    if (empty()) return "";
    if (mShape.ndim() == 0) {
      ss << *mPtr;
      // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
@@ -980,14 +990,12 @@ template <typename T, int Rank = -1> struct TensorView {
        print_comma = false;
      }
    }
    if (print_comma && i != this->size() - 1) ss << ", ";
    for (int j = 0; j < inc_count; ++j) {
      ss << "]";
    }
    if (i != this->size() - 1) {
      if (inc_count != 0) ss << "\n";
      for (int j = 0; j < inc_count; ++j) {
        ss << "[";
      }
@@ -1000,11 +1008,11 @@ template <typename T, int Rank = -1> struct TensorView {
    return ss.str();
  }

 protected:
  // TODO: make this function public.
  // currently this function is called unexpectedly when using subview({0, 0}).
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> _subview(
      SimpleVector<Slice> slice_vec) {
    Shape new_shape;
    for (int i = 0; i < slice_vec.size(); ++i) {
      new_shape.push_back(slice_vec[i][0]);
@@ -1041,7 +1049,8 @@ protected:
    }
    return TensorView<T, Rank>(mPtr + offset, reduced_shape);
  }
  template <typename T1>
  TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
    return Slice{int(s), -1, -1};
  }
@@ -1064,17 +1073,30 @@ Os &operator<<(Os &os, const TensorView<const T, Rank> &dt) {
}
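// Usage sketch (illustration, not part of this commit): TensorView is a
// non-owning row-major view over an existing buffer:
//   float buf[6] = {0, 1, 2, 3, 4, 5};
//   tv::TensorView<float> view(buf, {2, 3});
//   float x = view(1, 2);  // reads buf[1 * 3 + 2]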
namespace detail {
template <typename T>
constexpr const char *printfTypeFormat(T val = T());
template <>
constexpr const char *printfTypeFormat(float val) {
  return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(double val) {
  return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(int val) {
  return "%d";
}
template <>
constexpr const char *printfTypeFormat(unsigned val) {
  return "%u";
}
template <>
constexpr const char *printfTypeFormat(long val) {
  return "%ld";
}
template <>
constexpr const char *printfTypeFormat(unsigned long val) {
  return "%lu";
}
};  // namespace detail
@@ -1082,8 +1104,7 @@ template <> constexpr const char *printfTypeFormat(unsigned long val) {
template <typename T>
TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
                                    const char *format) {
  if (tensor.empty()) return;
  if (tensor.ndim() == 0) {
    printf(format, tensor());
    printf("\n");
@@ -1108,14 +1129,12 @@ TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
        print_comma = false;
      }
    }
    if (print_comma && i != tensor.size() - 1) printf(", ");
    for (int j = 0; j < inc_count; ++j) {
      printf("]");
    }
    if (i != tensor.size() - 1) {
      if (inc_count != 0) printf("\n");
      for (int j = 0; j < inc_count; ++j) {
        printf("[");
      }
...
@@ -35,17 +35,14 @@ struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
                   bool transpose, bool resetGrid) {
    if (transpose)
      return getIndicePairsDeConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
    else
      return getIndicePairsConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
@@ -61,19 +58,18 @@ struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    return getIndicePairsSubM<Index, IndexGrid, NDim>(
        indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
  }
};
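// Background (an aside, not stated in this file): the indice-pair tables
// built here map each kernel offset to (input index, output index) pairs,
// which the gather/scatter functors consume during the convolution itself.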
}  // namespace functor

#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM)                           \
  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
                                                       NDIM>;               \
  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
                                                       NDIM>;

#define DECLARE_CPU_INDEX(Index)          \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
...
@@ -13,16 +13,17 @@
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/indice.cu.h>
#include <spconv/indice.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>

#include <chrono>
#include <limits>
#include <type_traits>

namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
@@ -41,8 +42,7 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
                   bool transpose) {
    Index batchSize = gridsOut.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    // auto timer = spconv::CudaContextTimer<>();
    if (transpose)
      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
@@ -75,8 +75,7 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
    Index batchSize = gridsOut.dim(0);
    auto kernelVolume = indicePairs.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    Index numAct = indicePairUnique.dim(0) - 1;
    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
@@ -111,8 +110,7 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    // auto timer = spconv::CudaContextTimer<>();
    prepareSubMGridKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
@@ -121,13 +119,15 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
                            kernelSize, stride, padding, dilation,
                            outSpatialShape);
    TV_CHECK_CUDA_ERR();
    // std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
    if (resetGrid) {
      resetGridSubMKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
                              numActIn);
      TV_CHECK_CUDA_ERR();
    }
    return numActIn;
...
@@ -13,13 +13,14 @@
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/maxpool.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>

#include <chrono>
#include <limits>
#include <type_traits>

namespace spconv {
@@ -54,9 +55,10 @@ __global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(T *outFeatures,
                                             const T *inFeatures,
                                             const Index *indicesIn,
                                             const Index *indicesOut,
                                             int numHot, int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideX[NumILP];
@@ -160,10 +162,11 @@ __global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures,
                                      const T *dout, T *din,
                                      const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  T in, out;
  Index idxo, idxi;
@@ -226,10 +229,11 @@ __global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
}

template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const T *outFeatures,
                                         const T *inFeatures, const T *dout,
                                         T *din, const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
@@ -255,7 +259,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
            reinterpret_cast<const VecType *>(inFeatures)[idxi];
        reinterpret_cast<VecType *>(bufdo)[0] =
            reinterpret_cast<const VecType *>(dout)[idxo];
        reinterpret_cast<VecType *>(bufdi)[0] =
            reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
        for (int i = 0; i < vecloadFactor; i++) {
@@ -263,16 +268,18 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
            bufdi[i] += bufdo[i];
          }
        }
        reinterpret_cast<VecType *>(din)[idxi] =
            reinterpret_cast<VecType *>(bufdi)[0];
      }
    }
}
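// Tiling note (description added as an aside): NumTLP is the feature-tile
// width a thread block handles and NumILP the per-thread unroll factor; the
// *Vec* kernels additionally move vecloadFactor = sizeof(VecType) / sizeof(T)
// elements per load through a wider VecType.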
template <typename T, typename Index, int NumTLP, int NumILP> template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void __global__ void maxPoolBwdGenericKernel(const T *outFeatures,
maxPoolBwdGenericKernel(const T *outFeatures, const T *inFeatures, const T *inFeatures, const T *dout,
const T *dout, T *din, const Index *indicesIn, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) { const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf. // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP]; int ILPStrideX[NumILP];
Index RI[NumILP]; Index RI[NumILP];
...@@ -313,8 +320,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> { ...@@ -313,8 +320,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures, void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures, tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) { tv::TensorView<const Index> indices, int size) {
if (size <= 0) if (size <= 0) return;
return;
int numPlanes = inFeatures.dim(1); int numPlanes = inFeatures.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
...@@ -326,7 +332,8 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> { ...@@ -326,7 +332,8 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
if (notFound) { if (notFound) {
if (numPlanes % NumTLP == 0) { if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) { if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP), <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(), d.getStream()>>>(outFeatures.data(), inFeatures.data(),
...@@ -387,8 +394,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -387,8 +394,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
tv::TensorView<const T> inFeatures, tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din, tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) { tv::TensorView<const Index> indices, int size) {
if (size <= 0) if (size <= 0) return;
return;
int numPlanes = inFeatures.dim(1); int numPlanes = inFeatures.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
...@@ -400,7 +406,8 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -400,7 +406,8 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
if (notFound) { if (notFound) {
if (numPlanes % NumTLP == 0) { if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) { if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP), <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(), d.getStream()>>>(outFeatures.data(), inFeatures.data(),
......
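Note on the backward kernels above: rather than storing argmax indices in the forward pass, the backward pass recomputes the winner test, so a gradient flows to an input voxel only where its feature equals the pooled output (the `bufdi[i] += bufdo[i]` update sits under that equality check). A minimal single-threaded reference of the same rule; the helper name is hypothetical and the real kernels tile this loop across threads with NumTLP/NumILP and vectorized loads:

```cpp
// Hypothetical reference helper (not part of this patch): the rule the
// maxPoolBwd* kernels implement, without tiling or vectorized loads.
template <typename T, typename Index>
void maxPoolBwdReference(const T *outFeatures, const T *inFeatures,
                         const T *dout, T *din, const Index *indicesIn,
                         const Index *indicesOut, int numHot, int numPlanes) {
  for (int i = 0; i < numHot; ++i) {
    const T *in = inFeatures + indicesIn[i] * numPlanes;
    const T *out = outFeatures + indicesOut[i] * numPlanes;
    const T *gradOut = dout + indicesOut[i] * numPlanes;
    T *gradIn = din + indicesIn[i] * numPlanes;
    for (int j = 0; j < numPlanes; ++j) {
      // The input that won the max receives the output gradient.
      if (in[j] == out[j]) gradIn[j] += gradOut[j];
    }
  }
}
```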
@@ -19,7 +19,8 @@ namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU& d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size) {
    int numPlanes = features.dim(1);
    for (int i = 0; i < size; ++i) {
@@ -33,15 +34,15 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size, bool stable) {
    int numPlanes = outFeatures.dim(1);
    const T* buf = buffer.data();
    T* out = outFeatures.data();
    for (int i = 0; i < size; ++i) {
      buf = buffer.data() + i * numPlanes;
      out = outFeatures.data() + indices[i] * numPlanes;
      for (int j = 0; j < numPlanes; ++j) {
        out[j] += buf[j];
      }
    }
@@ -50,7 +51,6 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {
}  // namespace functor

#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                         \
  template struct functor::SparseGatherFunctor<tv::CPU, T, Index>;  \
  template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
...
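Note: the two CPU functors above are the serial halves of spconv's gather-GEMM-scatter pattern: gather packs the rows selected by `indices` into a dense buffer, and scatter-add accumulates processed buffer rows back into `outFeatures`. A self-contained sketch of the same semantics on flat storage (names and shapes are illustrative, not the library API):

```cpp
#include <vector>

// Plain-C++ illustration of the functor semantics (illustrative names, not
// the library API). Rows are stored flat: row r spans
// [r * numPlanes, (r + 1) * numPlanes).
void gatherRows(std::vector<float> &buffer, const std::vector<float> &features,
                const std::vector<int> &indices, int size, int numPlanes) {
  for (int i = 0; i < size; ++i)
    for (int j = 0; j < numPlanes; ++j)
      buffer[i * numPlanes + j] = features[indices[i] * numPlanes + j];
}

void scatterAddRows(std::vector<float> &outFeatures,
                    const std::vector<float> &buffer,
                    const std::vector<int> &indices, int size, int numPlanes) {
  for (int i = 0; i < size; ++i)
    for (int j = 0; j < numPlanes; ++j)
      outFeatures[indices[i] * numPlanes + j] += buffer[i * numPlanes + j];
}
```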
@@ -13,17 +13,18 @@
// limitations under the License.

#include <ATen/ATen.h>
#include <spconv/mp_helper.h>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>

#include <chrono>
#include <limits>
#include <type_traits>

namespace spconv {
namespace functor {
template <typename T, typename Index>
@@ -34,8 +35,7 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
  void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = features.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
        gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
            <<<dim3(numPlanes / NumTLP, size / NumTLP),
               dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
               d.getStream()>>>(buffer.data(), features.data(),
                                indices.data(), nHotBlock,
                                numPlanes / vecloadFactor);
        TV_CHECK_CUDA_ERR();
      }
@@ -61,7 +62,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
               dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
               d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
                                features.data(), indices.data() + nHotBlock,
                                size - nHotBlock,
                                numPlanes / vecloadFactor);
        TV_CHECK_CUDA_ERR();
      }
      notFound = false;
@@ -89,8 +91,7 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size, bool stable) {
    if (size <= 0) return;
    int numPlanes = outFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor =
@@ -139,7 +140,6 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
};
}  // namespace functor

#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                         \
  template struct functor::SparseGatherFunctor<tv::GPU, T, Index>;  \
  template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
...
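Note: both GPU functors pick a kernel with the same arithmetic: a vectorized block kernel when the channel count is a multiple of NumTLP and there are enough full tiles of hot indices, with a generic kernel covering the tail; `vecloadFactor` is how many `T` elements one vector load moves. A minimal sketch of that dispatch arithmetic with illustrative constants (the real code tries several NumTLP candidates via mp_for_each):

```cpp
#include <cstdio>

struct float4_like { float x, y, z, w; };  // stands in for CUDA's float4

int main() {
  constexpr int NumTLP = 64;  // illustrative tile size, not the real list
  constexpr int vecloadFactor = sizeof(float4_like) / sizeof(float);  // 4
  int numPlanes = 128;  // channels per voxel
  int size = 1000;      // number of hot (active) indices
  // Rows covered by full NumTLP tiles; the remainder goes to the generic
  // kernel.
  int numHotBlock = (size / NumTLP) * NumTLP;
  bool useVecKernel = (numPlanes % NumTLP == 0) && (numHotBlock >= NumTLP);
  std::printf("vecloadFactor=%d numHotBlock=%d tail=%d useVecKernel=%d\n",
              vecloadFactor, numHotBlock, size - numHotBlock, useVecKernel);
  return 0;
}
```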