format code, add benchmark per layer

3517290c · yanyan · 540a2209 · 3517290c · 3517290c · 3517290c
Commit 3517290c authored Jul 09, 2020 by yanyan
20 changed files
--- a/include/spconv/fused_conv.cu.h
+++ b/include/spconv/fused_conv.cu.h
--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
@@ -21,15 +21,16 @@

 namespace spconv {

-template <bool UseDeconv, typename Index, unsigned NDim> struct ConvIndiceDispatch;
+template <bool UseDeconv, typename Index, unsigned NDim>
+struct ConvIndiceDispatch;

 template <typename Index, unsigned NDim>
-struct ConvIndiceDispatch<true, Index, NDim>{
-  constexpr static auto* func = getValidOutPosTranspose<Index, NDim>;
+struct ConvIndiceDispatch<true, Index, NDim> {
+  constexpr static auto *func = getValidOutPosTranspose<Index, NDim>;
 };
 template <typename Index, unsigned NDim>
-struct ConvIndiceDispatch<false, Index, NDim>{
-  constexpr static auto* func = getValidOutPos<Index, NDim>;
+struct ConvIndiceDispatch<false, Index, NDim> {
+  constexpr static auto *func = getValidOutPos<Index, NDim>;
 };

 template <typename Index, unsigned NDim, bool UseDeconv,
@@ -61,8 +62,8 @@ __global__ void prepareIndicePairsKernel(
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = ConvIndiceDispatch<UseDeconv, Index, NDim>::func(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(),
-        outSpatialShape.data(), validPoints);
+        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
+        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];

--- a/include/spconv/minkowski.cu.h
+++ b/include/spconv/minkowski.cu.h
@@ -89,7 +89,6 @@ __global__ void matmul(const Dtype *A, const int wA, const int hA,
  // C[wB * out_row + x] += Csub;
 }

-
 template <typename Dtype, typename Itype, int BLOCK_SIZE>
 __global__ void matmul2(const Dtype *A, const int wA, const int hA,
                        const Dtype *B, const int wB, const int hB,

--- a/include/spconv/point2voxel.cu.h
+++ b/include/spconv/point2voxel.cu.h
@@ -7,10 +7,8 @@
 namespace spconv {
 template <typename Index, unsigned NDim>
 __global__ void scatterPointToGridKernel(
-    tv::TensorView<const float> points,
-    tv::TensorView<const Index> indexes,
-    tv::TensorView<float> grids,
-    tv::TensorView<Index> numPointsPerGrid,
+    tv::TensorView<const float> points, tv::TensorView<const Index> indexes,
+    tv::TensorView<float> grids, tv::TensorView<Index> numPointsPerGrid,
    tv::TensorView<Index> pointIndex,
    const tv::SimpleVector<Index, NDim> gridShape) {
  Index index;
@@ -24,14 +22,15 @@ __global__ void scatterPointToGridKernel(
    atomicAdd(numPointsPerGrid.data() + index, Index(1));
 #pragma unroll
    for (int k = 0; k != numFeatures; ++k) {
-      atomicAdd(grids.data() + index * numFeatures + k, *(points.data() + ix * numFeatures + k));
+      atomicAdd(grids.data() + index * numFeatures + k,
+                *(points.data() + ix * numFeatures + k));
    }
  }
 }

 template <typename Index, unsigned NDim>
-__global__ void gatherPointFromGridKernel(
-    tv::TensorView<const float> grids,
+__global__ void
+gatherPointFromGridKernel(tv::TensorView<const float> grids,
                          tv::TensorView<const Index> numPointsPerGrid,
                          tv::TensorView<const Index> pointIndexUnique,
                          tv::TensorView<float> voxels,
@@ -47,14 +46,13 @@ __global__ void gatherPointFromGridKernel(
    for (int k = 0; k != numFeatures; ++k) {
      voxels(ix, k) = grids(index, k) / numPointsPerGrid(index);
    }
-    index = tv::rowArrayIdxInv<Index, NDim>(
-        index, coors.data() + ix * NDim, gridShape.data());
+    index = tv::rowArrayIdxInv<Index, NDim>(index, coors.data() + ix * NDim,
+                                            gridShape.data());
  }
 }

 template <typename Index>
-__global__ void resetGridKernel(
-    tv::TensorView<float> grids,
+__global__ void resetGridKernel(tv::TensorView<float> grids,
                                tv::TensorView<Index> numPointsPerGrid,
                                tv::TensorView<Index> pointIndexUnique) {
  Index index;
@@ -72,8 +70,8 @@ __global__ void resetGridKernel(
 }

 template <typename Index>
-__global__ void resetPointIndexKernel(
-    tv::TensorView<Index> pointIndex, const Index gridVolume) {
+__global__ void resetPointIndexKernel(tv::TensorView<Index> pointIndex,
+                                      const Index gridVolume) {
  int num_max_points = pointIndex.dim(0) - 1;

  for (int ix : tv::KernelLoopX<int>(num_max_points)) {

--- a/include/spconv/point2voxel_ops.h
+++ b/include/spconv/point2voxel_ops.h
@@ -21,15 +21,10 @@

 namespace spconv {

-int64_t
-pointsToVoxel(torch::Tensor points,
-              torch::Tensor indexes,
-              torch::Tensor pointIndex,
-              torch::Tensor grids,
-              torch::Tensor numPointsPerGrid,
-              torch::Tensor voxels,
-              torch::Tensor coors,
-              std::vector<int64_t> gridShape,
+int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
+                      torch::Tensor pointIndex, torch::Tensor grids,
+                      torch::Tensor numPointsPerGrid, torch::Tensor voxels,
+                      torch::Tensor coors, std::vector<int64_t> gridShape,
                      const int64_t ndim);

 } // namespace spconv
--- a/include/spconv/points2voxels.h
+++ b/include/spconv/points2voxels.h
@@ -3,20 +3,16 @@
 #include <tensorview/tensorview.h>
 #include <torch/script.h>

-
 namespace spconv {

-void scatter_point_to_grid_cuda(
-    torch::Tensor points,
-    torch::Tensor indexes,
+void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
                                torch::Tensor grids,
                                torch::Tensor numPointsPerGrid,
                                torch::Tensor pointIndex,
-    std::vector<int64_t> gridShape,
-    const int ndim);
+                                std::vector<int64_t> gridShape, const int ndim);

-void gather_point_from_grid_cuda(
-    torch::Tensor grids, torch::Tensor numPointsPerGrid,
+void gather_point_from_grid_cuda(torch::Tensor grids,
+                                 torch::Tensor numPointsPerGrid,
                                 torch::Tensor pointIndex,
                                 torch::Tensor pointIndexUnique,
                                 torch::Tensor voxels, torch::Tensor coors,

--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -23,9 +23,15 @@

 namespace spconv {

-enum ConvAlgo { kNative = 0, kBatch, kBatchGemmGather, kSparseConvNet, kMinkowskiEngine };
-using all_conv_algos_t =
-    tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather, kSparseConvNet, kMinkowskiEngine>;
+enum ConvAlgo {
+  kNative = 0,
+  kBatch,
+  kBatchGemmGather,
+  kSparseConvNet,
+  kMinkowskiEngine
+};
+using all_conv_algos_t = tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather,
+                                       kSparseConvNet, kMinkowskiEngine>;

 // torch.jit's doc says only support int64, so we need to convert to int32.
 std::vector<torch::Tensor>

--- a/include/tensorview/cc17.h
+++ b/include/tensorview/cc17.h
@@ -7,9 +7,10 @@ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
 Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
 Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
 Copyright (c) 2011-2013 NYU                      (Clement Farabet)
-Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
-Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
-Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
+                        Iain Melvin, Jason Weston)
+Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)

 From Caffe2:

@@ -53,8 +54,8 @@ modification, are permitted provided that the following conditions are met:
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

-3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
-   and IDIAP Research Institute nor the names of its contributors may be
+3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
+   America and IDIAP Research Institute nor the names of its contributors may be
   used to endorse or promote products derived from this software without
   specific prior written permission.

@@ -97,47 +98,67 @@ struct _identity final {
    return std::forward<T>(arg);
  }
 };
-template<class Func, class Enable = void>
+template <class Func, class Enable = void>
 struct function_takes_identity_argument : std::false_type {};
 #if defined(_MSC_VER)
-// For some weird reason, MSVC shows a compiler error when using guts::void_t instead of std::void_t.
-// But we're only building on MSVC versions that have std::void_t, so let's just use that one.
-template<class Func>
-struct function_takes_identity_argument<Func, std::void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {};
+// For some weird reason, MSVC shows a compiler error when using guts::void_t
+// instead of std::void_t. But we're only building on MSVC versions that have
+// std::void_t, so let's just use that one.
+template <class Func>
+struct function_takes_identity_argument<
+    Func, std::void_t<decltype(std::declval<Func>()(_identity()))>>
+    : std::true_type {};
 #else
-template<class Func>
-struct function_takes_identity_argument<Func, void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {};
+template <class Func>
+struct function_takes_identity_argument<
+    Func, void_t<decltype(std::declval<Func>()(_identity()))>>
+    : std::true_type {};
 #endif

-template<bool Condition>
-struct _if_constexpr;
-
-template<>
-struct _if_constexpr<true> final {
-  template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ThenCallback>::value, void*> = nullptr>
-  static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) {
-    // The _identity instance passed in can be used to delay evaluation of an expression,
-    // because the compiler can't know that it's just the identity we're passing in.
+template <bool Condition> struct _if_constexpr;
+
+template <> struct _if_constexpr<true> final {
+  template <
+      class ThenCallback, class ElseCallback,
+      std::enable_if_t<function_takes_identity_argument<ThenCallback>::value,
+                       void *> = nullptr>
+  static decltype(auto) call(ThenCallback &&thenCallback,
+                             ElseCallback && /* elseCallback */) {
+    // The _identity instance passed in can be used to delay evaluation of an
+    // expression, because the compiler can't know that it's just the identity
+    // we're passing in.
    return thenCallback(_identity());
  }

-  template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value, void*> = nullptr>
-  static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) {
+  template <
+      class ThenCallback, class ElseCallback,
+      std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value,
+                       void *> = nullptr>
+  static decltype(auto) call(ThenCallback &&thenCallback,
+                             ElseCallback && /* elseCallback */) {
    return thenCallback();
  }
 };

-template<>
-struct _if_constexpr<false> final {
-  template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ElseCallback>::value, void*> = nullptr>
-  static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) {
-    // The _identity instance passed in can be used to delay evaluation of an expression,
-    // because the compiler can't know that it's just the identity we're passing in.
+template <> struct _if_constexpr<false> final {
+  template <
+      class ThenCallback, class ElseCallback,
+      std::enable_if_t<function_takes_identity_argument<ElseCallback>::value,
+                       void *> = nullptr>
+  static decltype(auto) call(ThenCallback && /* thenCallback */,
+                             ElseCallback &&elseCallback) {
+    // The _identity instance passed in can be used to delay evaluation of an
+    // expression, because the compiler can't know that it's just the identity
+    // we're passing in.
    return elseCallback(_identity());
  }

-  template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value, void*> = nullptr>
-  static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) {
+  template <
+      class ThenCallback, class ElseCallback,
+      std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value,
+                       void *> = nullptr>
+  static decltype(auto) call(ThenCallback && /* thenCallback */,
+                             ElseCallback &&elseCallback) {
    return elseCallback();
  }
 };
@@ -173,33 +194,40 @@ struct _if_constexpr<false> final {
 *   template <class T>
 *   int func(T t) {
 *     return if_constexpr<std::is_same<T, MyClass1>::value>(
- *       [&](auto _) { return _(t).value; }, // this code is invalid for T == MyClass2, so a regular non-constexpr if statement wouldn't compile
- *       [&](auto _) { return _(t).val; }    // this code is invalid for T == MyClass1
+ *       // invalid for T == MyClass2, so a non-constexpr if wouldn't compile
+ *       [&](auto _) { return _(t).value; },
+ *       // invalid for T == MyClass1
+ *       [&](auto _) { return _(t).val; }
 *     );
 *   }
 *
- * Note: The _ argument passed in Example 3 is the identity function, i.e. it does nothing.
- *       It is used to force the compiler to delay type checking, because the compiler
- *       doesn't know what kind of _ is passed in. Without it, the compiler would fail
- *       when you try to access t.value but the member doesn't exist.
+ * Note: The _ argument passed in Example 3 is the identity function, i.e. it
+ * does nothing. It is used to force the compiler to delay type checking,
+ * because the compiler doesn't know what kind of _ is passed in. Without it,
+ * the compiler would fail when you try to access t.value but the member doesn't
+ * exist.
 *
- * Note: In Example 3, both branches return int, so func() returns int. This is not necessary.
- *       If func() had a return type of "auto", then both branches could return different
- *       types, say func<MyClass1>() could return int and func<MyClass2>() could return string.
+ * Note: In Example 3, both branches return int, so func() returns int. This is
+ * not necessary. If func() had a return type of "auto", then both branches
+ * could return different types, say func<MyClass1>() could return int and
+ * func<MyClass2>() could return string.
 */
-template<bool Condition, class ThenCallback, class ElseCallback>
-decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCallback) {
+template <bool Condition, class ThenCallback, class ElseCallback>
+decltype(auto) if_constexpr(ThenCallback &&thenCallback,
+                            ElseCallback &&elseCallback) {
 #if defined(__cpp_if_constexpr)
-  // If we have C++17, just use it's "if constexpr" feature instead of wrapping it.
-  // This will give us better error messages.
-  if constexpr(Condition) {
-    if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) {
+  // If we have C++17, just use its "if constexpr" feature instead of wrapping
+  // it. This will give us better error messages.
+  if constexpr (Condition) {
+    if constexpr (detail::function_takes_identity_argument<
+                      ThenCallback>::value) {
      return std::forward<ThenCallback>(thenCallback)(detail::_identity());
    } else {
      return std::forward<ThenCallback>(thenCallback)();
    }
  } else {
-    if constexpr (detail::function_takes_identity_argument<ElseCallback>::value) {
+    if constexpr (detail::function_takes_identity_argument<
+                      ElseCallback>::value) {
      return std::forward<ElseCallback>(elseCallback)(detail::_identity());
    } else {
      return std::forward<ElseCallback>(elseCallback)();
@@ -207,18 +235,20 @@ decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCall
  }
 #else
  // C++14 implementation of if constexpr
-  return detail::_if_constexpr<Condition>::call(std::forward<ThenCallback>(thenCallback),
+  return detail::_if_constexpr<Condition>::call(
+      std::forward<ThenCallback>(thenCallback),
      std::forward<ElseCallback>(elseCallback));
 #endif
 }

-template<bool Condition, class ThenCallback>
-decltype(auto) if_constexpr(ThenCallback&& thenCallback) {
+template <bool Condition, class ThenCallback>
+decltype(auto) if_constexpr(ThenCallback &&thenCallback) {
 #if defined(__cpp_if_constexpr)
-  // If we have C++17, just use it's "if constexpr" feature instead of wrapping it.
-  // This will give us better error messages.
-  if constexpr(Condition) {
-    if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) {
+  // If we have C++17, just use its "if constexpr" feature instead of wrapping
+  // it. This will give us better error messages.
+  if constexpr (Condition) {
+    if constexpr (detail::function_takes_identity_argument<
+                      ThenCallback>::value) {
      return std::forward<ThenCallback>(thenCallback)(detail::_identity());
    } else {
      return std::forward<ThenCallback>(thenCallback)();
@@ -226,9 +256,9 @@ decltype(auto) if_constexpr(ThenCallback&& thenCallback) {
  }
 #else
  // C++14 implementation of if constexpr
-  return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback), [] (auto) {});
+  return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback),
+                                 [](auto) {});
 #endif
 }

-
-}
+} // namespace tv
--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
@@ -22,13 +22,13 @@ If you can use libtorch, dont use tv::Tensor.
 */

 #pragma once
+#include "cc17.h"
 #include "mp_helper.h"
 #include "tensorview.h"
 #include <cstring>
 #include <iomanip>
 #include <memory>
 #include <type_traits>
-#include "cc17.h"
 #ifdef TV_CUDA
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
@@ -632,25 +632,30 @@ struct Tensor {
  tview() const {
    static_assert(Rank == -1 || Rank > 0, "error");
    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
-    return if_constexpr<(Rank > 0)>([&](auto _){
+    return if_constexpr<(Rank > 0)>(
+        [&](auto _) {
          TV_ASSERT_RT_ERR(Rank == ndim(), "error");
-      ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank), stride(Rank);
+          ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank),
+              stride(Rank);
          for (int i = 0; i < Rank; ++i) {
            shape[i] = shape_[i];
            stride[i] = stride_[i];
          }
-      return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
-          reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
-          stride);
-    }, [&](auto _){
+          return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
+                            Tindex>(
+              reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
+              shape, stride);
+        },
+        [&](auto _) {
          ShapeBase<TV_MAX_DIM, Tindex> shape(_(ndim())), stride(ndim());
          for (int i = 0; i < int(ndim()); ++i) {
            shape[i] = shape_[i];
            stride[i] = stride_[i];
          }
-      return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
-          reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
-          stride);
+          return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
+                            Tindex>(
+              reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
+              shape, stride);
        });
  }
  template <class... Inds> Tensor view(Inds... newShapes) const {

--- a/include/tensorview/tools.h
+++ b/include/tensorview/tools.h
@@ -36,22 +36,21 @@ template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
    return res;
  }
  template <int Count, typename F>
-  double benchmark(F&& f, int start=int(Count) * 0.3){
+  double benchmark(F &&f, int start = int(Count) * 0.3) {
    // std::vector<TimeT::rep> times;
    auto res = typename TimeT::rep();
    int count = 0;
    cudaDeviceSynchronize();
-    for (int i = 0; i < Count; ++i){
+    for (int i = 0; i < Count; ++i) {
      std::forward<F>(f)();
      auto time = report();
-      if (i >= start){
+      if (i >= start) {
        // times.push_back(time)
        res += time;
        count += 1;
      }
    }
    return res / double(count);
-
  }

 private:

--- a/include/tensorview/torch_utils.h
+++ b/include/tensorview/torch_utils.h
@@ -14,9 +14,9 @@

 #pragma once
 #include "mp_helper.h"
-#include <tensorview/tensorview.h>
-#include <tensorview/tensor.h>
 #include <ATen/ATen.h>
+#include <tensorview/tensor.h>
+#include <tensorview/tensorview.h>
 #include <torch/script.h>
 #ifdef TV_CUDA
 #include <ATen/cuda/CUDAContext.h>

--- a/spconv/__init__.py
+++ b/spconv/__init__.py
@@ -22,6 +22,7 @@ from spconv import ops, utils
 from spconv.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
                         SparseConvTranspose3d, SparseInverseConv2d,
                         SparseInverseConv3d, SubMConv2d, SubMConv3d)
+from spconv.core import SparseConvTensor
 from spconv.identity import Identity
 from spconv.modules import SparseModule, SparseSequential
 from spconv.ops import ConvAlgo
@@ -35,85 +36,6 @@ _LIB_PATH = str(Path(__file__).parent / _LIB_FILE_NAME)
 torch.ops.load_library(_LIB_PATH)


-def scatter_nd(indices, updates, shape):
-    """pytorch edition of tensorflow scatter_nd.
-    this function don't contain except handle code. so use this carefully
-    when indice repeats, don't support repeat add which is supported
-    in tensorflow.
-    """
-    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
-    ndim = indices.shape[-1]
-    output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
-    flatted_indices = indices.view(-1, ndim)
-    slices = [flatted_indices[:, i] for i in range(ndim)]
-    slices += [Ellipsis]
-    ret[slices] = updates.view(*output_shape)
-    return ret
-
-
-class SparseConvTensor(object):
-    def __init__(self, features, indices, spatial_shape, batch_size,
-                 grid=None):
-        """
-        Args:
-            features: [num_points, num_features] feature tensor
-            indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
-            spatial_shape: spatial shape of your sparse data
-            batch_size: batch size of your sparse data
-            grid: pre-allocated grid tensor. should be used when the volume of spatial shape
-                is very large.
-        """
-        self.features = features
-        self.indices = indices
-        self.spatial_shape = spatial_shape
-        self.batch_size = batch_size
-        self.indice_dict = {}
-        if grid is None:
-            grid = torch.Tensor()  # empty tensor
-        self.grid = grid
-
-    @classmethod
-    def from_dense(cls, x: torch.Tensor):
-        """create sparse tensor fron channel last dense tensor by to_sparse
-        x must be NHWC tensor, channel last
-        """
-        x = x.to_sparse(x.ndim - 1)
-        spatial_shape = x.shape[1:-1]
-        batch_size = x.shape[0]
-        indices_th = x.indices().permute(1, 0).contiguous().int()
-        features_th = x.values()
-        return cls(features_th, indices_th, spatial_shape, batch_size)
-
-    @property
-    def spatial_size(self):
-        return np.prod(self.spatial_shape)
-
-    def find_indice_pair(self, key):
-        if key is None:
-            return None
-        if key in self.indice_dict:
-            return self.indice_dict[key]
-        return None
-
-    def dense(self, channels_first=True):
-        output_shape = [self.batch_size] + list(
-            self.spatial_shape) + [self.features.shape[1]]
-        res = scatter_nd(
-            self.indices.to(self.features.device).long(), self.features,
-            output_shape)
-        if not channels_first:
-            return res
-        ndim = len(self.spatial_shape)
-        trans_params = list(range(0, ndim + 1))
-        trans_params.insert(1, ndim + 1)
-        return res.permute(*trans_params).contiguous()
-
-    @property
-    def sparity(self):
-        return self.indices.shape[0] / np.prod(
-            self.spatial_shape) / self.batch_size
-
-
 class ToDense(SparseModule):
    """convert SparseConvTensor to NCHW dense tensor.
    """

--- a/spconv/conv.py
+++ b/spconv/conv.py
@@ -24,6 +24,7 @@ from torch.nn.parameter import Parameter
 import spconv
 import spconv.functional as Fsp
 from spconv import ops
+from spconv.core import IndiceData, SparseConvTensor
 from spconv.modules import SparseModule


@@ -72,8 +73,9 @@ class SparseConvolution(SparseModule):
                 indice_key=None,
                 fused_bn=False,
                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
-        super(SparseConvolution, self).__init__()
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        super(SparseConvolution, self).__init__(name=name)
        assert groups == 1
        if not isinstance(kernel_size, (list, tuple)):
            kernel_size = [kernel_size] * ndim
@@ -123,8 +125,8 @@ class SparseConvolution(SparseModule):
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

-    def forward(self, input):
-        assert isinstance(input, spconv.SparseConvTensor)
+    def forward(self, input: SparseConvTensor):
+        assert isinstance(input, SparseConvTensor)
        features = input.features
        device = features.device
        indices = input.indices
@@ -143,29 +145,58 @@ class SparseConvolution(SparseModule):
            out_spatial_shape = spatial_shape
        # input.update_grid(out_spatial_shape)
        # t = time.time()
+        out_tensor = input.shadow_copy()
+        if input.benchmark:
+            if self.name is None:
+                raise ValueError(
+                    "you need to assign name to spmodules before benchmark (spconv.utils.bench.assign_name_to_spmod)"
+                )
+            if self.name not in input.benchmark_record:
+                input.benchmark_record[self.name] = {
+                    "type": "SparseConvolution",
+                    "indice_gen_time": [],
+                    "time": [],
+                    "num_points": [],
+                    "num_out_points": [],
+                    "params": {
+                        "kernel_size": self.kernel_size,
+                        "stride": self.stride,
+                        "padding": self.padding,
+                        "dilation": self.dilation,
+                        "output_padding": self.output_padding,
+                        "subm": self.subm,
+                        "transposed": self.transposed,
+                        "input_channels": self.in_channels,
+                        "out_channels": self.out_channels,
+                    }
+                }
        if self.conv1x1:
            features = torch.mm(
                input.features,
                self.weight.view(self.in_channels, self.out_channels))
            if self.bias is not None:
                features += self.bias
-            out_tensor = spconv.SparseConvTensor(features, input.indices,
-                                                 input.spatial_shape,
-                                                 input.batch_size)
-            out_tensor.indice_dict = input.indice_dict
-            out_tensor.grid = input.grid
+            out_tensor.features = features
            return out_tensor
        datas = input.find_indice_pair(self.indice_key)
        if self.inverse:
            assert datas is not None and self.indice_key is not None
-            _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas
+            outids = datas.indices
+            indice_pairs = datas.indice_pairs
+            indice_pair_num = datas.indice_pair_num
+            out_spatial_shape = datas.out_spatial_shape
            assert indice_pair_num.shape[0] == np.prod(
                self.kernel_size
            ), "inverse conv must have same kernel size as its couple conv"
        else:
            if self.indice_key is not None and datas is not None:
-                outids, _, indice_pairs, indice_pair_num, _ = datas
+                outids = datas.out_indices
+                indice_pairs = datas.indice_pairs
+                indice_pair_num = datas.indice_pair_num
            else:
+                if input.benchmark:
+                    torch.cuda.synchronize()
+                    t = time.time()
                outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
                    indices,
                    batch_size,
@@ -179,10 +210,19 @@ class SparseConvolution(SparseModule):
                    self.transposed,
                    grid=input.grid,
                    use_hash=self.use_hash)
-                input.indice_dict[self.indice_key] = (outids, indices,
-                                                      indice_pairs,
-                                                      indice_pair_num,
-                                                      spatial_shape)
+                if input.benchmark:
+                    torch.cuda.synchronize()
+                    interval = time.time() - t
+                    out_tensor.benchmark_record[
+                        self.name]["indice_gen_time"].append(interval)
+
+                indice_data = IndiceData(outids, indices, indice_pairs,
+                                         indice_pair_num, spatial_shape)
+                input.indice_dict[self.indice_key] = indice_data
+        if input.benchmark:
+            torch.cuda.synchronize()
+            t = time.time()
+
        if self.fused_bn:
            assert self.bias is not None
            out_features = ops.fused_indice_conv(features, self.weight,
@@ -210,10 +250,18 @@ class SparseConvolution(SparseModule):

            if self.bias is not None:
                out_features += self.bias
-        out_tensor = spconv.SparseConvTensor(out_features, outids,
-                                             out_spatial_shape, batch_size)
-        out_tensor.indice_dict = input.indice_dict
-        out_tensor.grid = input.grid
+        if input.benchmark:
+            torch.cuda.synchronize()
+            interval = time.time() - t
+            out_tensor.benchmark_record[self.name]["time"].append(interval)
+            out_tensor.benchmark_record[self.name]["num_points"].append(
+                features.shape[0])
+            out_tensor.benchmark_record[self.name]["num_out_points"].append(
+                out_features.shape[0])
+
+        out_tensor.features = out_features
+        out_tensor.indices = outids
+        out_tensor.spatial_shape = out_spatial_shape
        return out_tensor


@@ -229,7 +277,8 @@ class SparseConv2d(SparseConvolution):
                 bias=True,
                 indice_key=None,
                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseConv2d, self).__init__(2,
                                           in_channels,
                                           out_channels,
@@ -241,7 +290,8 @@ class SparseConv2d(SparseConvolution):
                                           bias,
                                           indice_key=indice_key,
                                           use_hash=use_hash,
-                                           algo=algo)
+                                           algo=algo,
+                                           name=name)


 class SparseConv3d(SparseConvolution):
@@ -256,7 +306,8 @@ class SparseConv3d(SparseConvolution):
                 bias=True,
                 indice_key=None,
                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseConv3d, self).__init__(3,
                                           in_channels,
                                           out_channels,
@@ -268,7 +319,8 @@ class SparseConv3d(SparseConvolution):
                                           bias,
                                           indice_key=indice_key,
                                           use_hash=use_hash,
-                                           algo=algo)
+                                           algo=algo,
+                                           name=name)


 class SparseConv4d(SparseConvolution):
@@ -283,7 +335,8 @@ class SparseConv4d(SparseConvolution):
                 bias=True,
                 indice_key=None,
                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseConv4d, self).__init__(4,
                                           in_channels,
                                           out_channels,
@@ -295,7 +348,8 @@ class SparseConv4d(SparseConvolution):
                                           bias,
                                           indice_key=indice_key,
                                           use_hash=use_hash,
-                                           algo=algo)
+                                           algo=algo,
+                                           name=name)


 class SparseConvTranspose2d(SparseConvolution):
@@ -310,7 +364,8 @@ class SparseConvTranspose2d(SparseConvolution):
                 bias=True,
                 indice_key=None,
                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseConvTranspose2d, self).__init__(2,
                                                    in_channels,
                                                    out_channels,
@@ -323,7 +378,8 @@ class SparseConvTranspose2d(SparseConvolution):
                                                    transposed=True,
                                                    indice_key=indice_key,
                                                    use_hash=use_hash,
-                                                    algo=algo)
+                                                    algo=algo,
+                                                    name=name)


 class SparseConvTranspose3d(SparseConvolution):
@@ -338,7 +394,8 @@ class SparseConvTranspose3d(SparseConvolution):
                 bias=True,
                 indice_key=None,
                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseConvTranspose3d, self).__init__(3,
                                                    in_channels,
                                                    out_channels,
@@ -351,7 +408,8 @@ class SparseConvTranspose3d(SparseConvolution):
                                                    transposed=True,
                                                    indice_key=indice_key,
                                                    use_hash=use_hash,
-                                                    algo=algo)
+                                                    algo=algo,
+                                                    name=name)


 class SparseInverseConv2d(SparseConvolution):
@@ -361,7 +419,8 @@ class SparseInverseConv2d(SparseConvolution):
                 kernel_size,
                 indice_key,
                 bias=True,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseInverseConv2d, self).__init__(2,
                                                  in_channels,
                                                  out_channels,
@@ -369,7 +428,8 @@ class SparseInverseConv2d(SparseConvolution):
                                                  bias=bias,
                                                  inverse=True,
                                                  indice_key=indice_key,
-                                                  algo=algo)
+                                                  algo=algo,
+                                                  name=name)


 class SparseInverseConv3d(SparseConvolution):
@@ -379,7 +439,8 @@ class SparseInverseConv3d(SparseConvolution):
                 kernel_size,
                 indice_key,
                 bias=True,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseInverseConv3d, self).__init__(3,
                                                  in_channels,
                                                  out_channels,
@@ -387,7 +448,8 @@ class SparseInverseConv3d(SparseConvolution):
                                                  bias=bias,
                                                  inverse=True,
                                                  indice_key=indice_key,
-                                                  algo=algo)
+                                                  algo=algo,
+                                                  name=name)


 class SubMConv2d(SparseConvolution):
@@ -402,7 +464,8 @@ class SubMConv2d(SparseConvolution):
                 bias=True,
                 indice_key=None,
                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SubMConv2d, self).__init__(2,
                                         in_channels,
                                         out_channels,
@@ -415,7 +478,8 @@ class SubMConv2d(SparseConvolution):
                                         True,
                                         indice_key=indice_key,
                                         use_hash=use_hash,
-                                         algo=algo)
+                                         algo=algo,
+                                         name=name)


 class SubMConv3d(SparseConvolution):
@@ -430,7 +494,8 @@ class SubMConv3d(SparseConvolution):
                 bias=True,
                 indice_key=None,
                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SubMConv3d, self).__init__(3,
                                         in_channels,
                                         out_channels,
@@ -443,7 +508,8 @@ class SubMConv3d(SparseConvolution):
                                         True,
                                         indice_key=indice_key,
                                         use_hash=use_hash,
-                                         algo=algo)
+                                         algo=algo,
+                                         name=name)


 class SubMConv4d(SparseConvolution):
@@ -458,7 +524,8 @@ class SubMConv4d(SparseConvolution):
                 bias=True,
                 indice_key=None,
                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SubMConv4d, self).__init__(4,
                                         in_channels,
                                         out_channels,
@@ -471,4 +538,5 @@ class SubMConv4d(SparseConvolution):
                                         True,
                                         indice_key=indice_key,
                                         use_hash=use_hash,
-                                         algo=algo)
+                                         algo=algo,
+                                         name=name)
--- a/spconv/core.py
+++ b/spconv/core.py
+from typing import Optional
+
+import numpy as np
+import torch
+
+
+class IndiceData(object):
+    """Plain container for the index tensors generated for one sparse operator.
+
+    Instances are stored in ``SparseConvTensor.indice_dict`` under a layer's
+    ``indice_key`` and looked up with ``SparseConvTensor.find_indice_pair``.
+    """
+    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
+                 out_spatial_shape):
+        # The arguments are stored as-is; no copy or validation is performed.
+        self.out_indices = out_indices
+        self.indices = indices
+        self.indice_pairs = indice_pairs
+        self.indice_pair_num = indice_pair_num
+        # NOTE(review): the callers visible in this change pass a variable
+        # named `spatial_shape` here -- confirm whether the input or the
+        # output spatial shape is intended.
+        self.out_spatial_shape = out_spatial_shape
+
+
+def scatter_nd(indices, updates, shape):
+    """PyTorch version of TensorFlow's ``scatter_nd``.
+
+    Creates a zero tensor of ``shape`` and writes ``updates`` into it at the
+    positions given by ``indices``.
+
+    Args:
+        indices: integer tensor whose last dimension (size ``ndim``) indexes
+            the first ``ndim`` dimensions of the result. Leading dimensions
+            are flattened.
+        updates: values to write; must be viewable as
+            ``[indices.numel() // ndim] + shape[ndim:]``.
+        shape: shape of the result (list, tuple or ``torch.Size``).
+
+    Returns:
+        Tensor of ``shape`` with the dtype and device of ``updates``.
+
+    Note:
+        This function performs no input validation, so use it carefully.
+        Unlike TensorFlow, repeated indices are not accumulated: the values
+        are assigned, not added.
+    """
+    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
+    ndim = indices.shape[-1]
+    flatted_indices = indices.view(-1, ndim)
+    # list(...) so that tuple / torch.Size shapes can be concatenated too.
+    # The leading dimension is the number of flattened index rows so that
+    # the updates line up with the flattened indices even when `indices`
+    # has more than two dimensions.
+    output_shape = [flatted_indices.shape[0]] + list(shape[ndim:])
+    # Index with a tuple: using a non-tuple sequence for multidimensional
+    # indexing is deprecated.
+    slices = tuple(flatted_indices[:, i] for i in range(ndim)) + (Ellipsis, )
+    ret[slices] = updates.view(*output_shape)
+    return ret
+
+
+class SparseConvTensor(object):
+    """Sparse tensor made of a feature matrix plus the indices of its points.
+
+    Besides the data it carries the caches shared between sparse operators:
+    ``indice_dict`` (keyed index data), ``grid`` and the benchmark records.
+    """
+    def __init__(self,
+                 features,
+                 indices,
+                 spatial_shape,
+                 batch_size,
+                 grid=None,
+                 benchmark=False):
+        """
+        Args:
+            features: [num_points, num_features] feature tensor
+            indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
+            spatial_shape: spatial shape of your sparse data
+            batch_size: batch size of your sparse data
+            grid: pre-allocated grid tensor. should be used when the volume of spatial shape
+                is very large.
+            benchmark: whether to enable benchmark. if enabled, all sparse operators will be
+                recorded in SparseConvTensor.benchmark_record.
+        """
+        self.features = features
+        self.indices = indices
+        self.spatial_shape = spatial_shape
+        self.batch_size = batch_size
+        # maps an indice_key to the data stored for it (see find_indice_pair)
+        self.indice_dict = {}
+        if grid is None:
+            grid = torch.Tensor()  # empty tensor
+        self.grid = grid
+        self.benchmark = benchmark
+        # per-operator records, keyed by module name, filled when benchmark is on
+        self.benchmark_record = {}
+
+    @classmethod
+    def from_dense(cls, x: torch.Tensor):
+        """Create a sparse tensor from a channel-last dense tensor via to_sparse.
+
+        x must be a channel-last tensor (e.g. NHWC). The result uses the
+        default ``grid`` and ``benchmark`` settings.
+        """
+        # keep the last (channel) dimension dense
+        x = x.to_sparse(x.ndim - 1)
+        spatial_shape = x.shape[1:-1]
+        batch_size = x.shape[0]
+        # transpose the sparse indices so that each row describes one point
+        indices_th = x.indices().permute(1, 0).contiguous().int()
+        features_th = x.values()
+        return cls(features_th, indices_th, spatial_shape, batch_size)
+
+    @property
+    def spatial_size(self):
+        """Product of all entries of ``spatial_shape``."""
+        return np.prod(self.spatial_shape)
+
+    def find_indice_pair(self, key) -> Optional[IndiceData]:
+        """Return the entry stored in ``indice_dict`` for ``key``.
+
+        Returns None when ``key`` is None or no entry exists for it.
+        """
+        if key is None:
+            return None
+        if key in self.indice_dict:
+            return self.indice_dict[key]
+        return None
+
+    def dense(self, channels_first=True):
+        """Scatter the features into a dense tensor.
+
+        Args:
+            channels_first: if False, return the channel-last result
+                ``[batch_size, *spatial_shape, num_features]``; otherwise the
+                channel axis is moved to position 1 and the result is made
+                contiguous.
+        """
+        output_shape = [self.batch_size] + list(
+            self.spatial_shape) + [self.features.shape[1]]
+        res = scatter_nd(
+            self.indices.to(self.features.device).long(), self.features,
+            output_shape)
+        if not channels_first:
+            return res
+        ndim = len(self.spatial_shape)
+        # permutation [0, ndim + 1, 1, ..., ndim]: last axis moved to index 1
+        trans_params = list(range(0, ndim + 1))
+        trans_params.insert(1, ndim + 1)
+        return res.permute(*trans_params).contiguous()
+
+    @property
+    def sparity(self):
+        """Number of points divided by ``prod(spatial_shape) * batch_size``."""
+        return self.indices.shape[0] / np.prod(
+            self.spatial_shape) / self.batch_size
+
+    def shadow_copy(self) -> "SparseConvTensor":
+        """Create a new spconv tensor with all members unchanged.
+
+        The copy references the same ``benchmark_record`` and ``indice_dict``
+        objects as this tensor, so entries added through either are visible
+        through both.
+        """
+        tensor = SparseConvTensor(self.features, self.indices,
+                                  self.spatial_shape, self.batch_size,
+                                  self.grid, self.benchmark)
+        tensor.benchmark_record = self.benchmark_record
+        tensor.indice_dict = self.indice_dict
+        return tensor
--- a/spconv/modules.py
+++ b/spconv/modules.py
@@ -49,7 +49,9 @@ def _mean_update(vals, m_vals, t):
 class SparseModule(nn.Module):
    """ place holder, all module subclass from this will take sptensor in SparseSequential.
    """
-    pass
+    def __init__(self, name=None):
+        super().__init__()
+        self.name = name


 class SparseSequential(SparseModule):

--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -26,6 +26,7 @@ class ConvAlgo(Enum):
    SparseConvNet = 3
    Minkowski = 4  # https://github.com/StanfordVL/MinkowskiEngine/blob/master/src/convolution.cu

+
 def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    ndim = len(input_size)
    output_size = []

--- a/spconv/pool.py
+++ b/spconv/pool.py
@@ -24,6 +24,7 @@ from torch.nn.parameter import Parameter
 import spconv
 import spconv.functional as Fsp
 from spconv import ops
+from spconv.core import IndiceData
 from spconv.modules import SparseModule


@@ -34,8 +35,10 @@ class SparseMaxPool(SparseModule):
                 stride=None,
                 padding=0,
                 dilation=1,
-                 subm=False):
-        super(SparseMaxPool, self).__init__()
+                 indice_key=None,
+                 subm=False,
+                 name=None):
+        super(SparseMaxPool, self).__init__(name=name)
        if not isinstance(kernel_size, (list, tuple)):
            kernel_size = [kernel_size] * ndim
        if stride is None:
@@ -52,6 +55,7 @@ class SparseMaxPool(SparseModule):
        self.padding = padding
        self.subm = subm
        self.dilation = dilation
+        self.indice_key = indice_key

    def forward(self, input):
        assert isinstance(input, spconv.SparseConvTensor)
@@ -66,6 +70,32 @@ class SparseMaxPool(SparseModule):
                self.dilation)
        else:
            out_spatial_shape = spatial_shape
+        out_tensor = input.shadow_copy()
+        if input.benchmark:
+            if self.name is None:
+                raise ValueError(
+                    "you need to assign name to spmodules before benchmark (spconv.utils.bench.assign_name_to_spmod)"
+                )
+            if self.name not in input.benchmark_record:
+                input.benchmark_record[self.name] = {
+                    "type": "SparseMaxPool",
+                    "indice_gen_time": [],
+                    "time": [],
+                    "num_points": [],
+                    "num_out_points": [],
+                    "params": {
+                        "kernel_size": self.kernel_size,
+                        "stride": self.stride,
+                        "padding": self.padding,
+                        "dilation": self.dilation,
+                        "channels": features.shape[1],
+                    }
+                }
+
+        if input.benchmark:
+            torch.cuda.synchronize()
+            t = time.time()
+
        outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
            indices,
            batch_size,
@@ -77,24 +107,65 @@ class SparseMaxPool(SparseModule):
            0,
            self.subm,
            grid=input.grid)
+        if input.benchmark:
+            torch.cuda.synchronize()
+            interval = time.time() - t
+            out_tensor.benchmark_record[self.name]["indice_gen_time"].append(
+                interval)
+            t = time.time()
+
+        if self.indice_key is not None:
+            datas = input.find_indice_pair(self.indice_key)
+            if datas is None:
+                indice_data = IndiceData(outids, indices, indice_pairs,
+                                         indice_pairs_num, spatial_shape)
+                input.indice_dict[self.indice_key] = indice_data
+            else:
+                raise ValueError("indice data exists")

        out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
                                          indice_pairs_num.to(device),
                                          outids.shape[0])
-        out_tensor = spconv.SparseConvTensor(out_features, outids,
-                                             out_spatial_shape, batch_size)
-        out_tensor.indice_dict = input.indice_dict
-        out_tensor.grid = input.grid
+        if input.benchmark:
+            torch.cuda.synchronize()
+            interval = time.time() - t
+            out_tensor.benchmark_record[self.name]["time"].append(interval)
+            out_tensor.benchmark_record[self.name]["num_points"].append(
+                features.shape[0])
+            out_tensor.benchmark_record[self.name]["num_out_points"].append(
+                out_features.shape[0])
+
+        out_tensor.features = out_features
+        out_tensor.indices = outids
+        out_tensor.spatial_shape = out_spatial_shape
        return out_tensor


 class SparseMaxPool2d(SparseMaxPool):
-    def __init__(self, kernel_size, stride=None, padding=0, dilation=1):
-        super(SparseMaxPool2d, self).__init__(2, kernel_size, stride, padding,
-                                              dilation)
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 dilation=1,
+                 name=None):
+        super(SparseMaxPool2d, self).__init__(2,
+                                              kernel_size,
+                                              stride,
+                                              padding,
+                                              dilation,
+                                              name=name)


 class SparseMaxPool3d(SparseMaxPool):
-    def __init__(self, kernel_size, stride=None, padding=0, dilation=1):
-        super(SparseMaxPool3d, self).__init__(3, kernel_size, stride, padding,
-                                              dilation)
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 dilation=1,
+                 name=None):
+        super(SparseMaxPool3d, self).__init__(3,
+                                              kernel_size,
+                                              stride,
+                                              padding,
+                                              dilation,
+                                              name=name)
--- a/spconv/spatial.py
+++ b/spconv/spatial.py
@@ -24,6 +24,7 @@ from torch.nn.parameter import Parameter
 import spconv
 from spconv.modules import SparseModule

+
 class RemoveDuplicate(SparseModule):
    def forward(self, x: spconv.SparseConvTensor):
        inds = x.indices
@@ -39,5 +40,6 @@ class RemoveDuplicate(SparseModule):
        _, unique_inds = torch.unique(indices_index)
        new_inds = inds[unique_inds]
        new_features = x.features[unique_inds]
-        res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape, x.batch_size, x.grid)
+        res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape,
+                                      x.batch_size, x.grid)
        return res
--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
@@ -294,20 +294,18 @@ class VoxelGeneratorV2:
    def grid_size(self):
        return self._grid_size

+
 class VoxelGeneratorV3:
-    def __init__(self,
-                 voxel_size,
-                 point_cloud_range,
-                 max_points,
-                 num_features,
-                 dtype,
-                 device):
+    def __init__(self, voxel_size, point_cloud_range, max_points, num_features,
+                 dtype, device):

        self._max_points = max_points

        self._point_cloud_range = point_cloud_range
        self._voxel_size = voxel_size
-        self._grid_size = torch.round((self._point_cloud_range[3:] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32)
+        self._grid_size = torch.round(
+            (self._point_cloud_range[3:] - self._point_cloud_range[:3]) /
+            self._voxel_size).to(torch.int32)
        grid_volume = self._grid_size.prod()
        self._grid_size = self._grid_size.cpu().numpy().tolist()
        self._ndim = len(self._grid_size)
@@ -315,19 +313,34 @@ class VoxelGeneratorV3:
        self._dtype = dtype
        self._device = device

-        self._point_index = torch.full([max_points + 1], grid_volume, dtype=torch.int32, device=self._device)
-        self._grids = torch.zeros([grid_volume, num_features], dtype=self._dtype, device=self._device)
-        self._num_points_per_grid = torch.zeros([grid_volume], dtype=torch.int32, device=self._device)
-        self._voxels = torch.zeros([max_points, num_features], dtype=self._dtype, device=self._device)
-        self._coors = torch.zeros([max_points, self._ndim], dtype=torch.int32, device=self._device)
+        self._point_index = torch.full([max_points + 1],
+                                       grid_volume,
+                                       dtype=torch.int32,
+                                       device=self._device)
+        self._grids = torch.zeros([grid_volume, num_features],
+                                  dtype=self._dtype,
+                                  device=self._device)
+        self._num_points_per_grid = torch.zeros([grid_volume],
+                                                dtype=torch.int32,
+                                                device=self._device)
+        self._voxels = torch.zeros([max_points, num_features],
+                                   dtype=self._dtype,
+                                   device=self._device)
+        self._coors = torch.zeros([max_points, self._ndim],
+                                  dtype=torch.int32,
+                                  device=self._device)

    def generate(self, points):
-        assert points.shape[0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(points.shape[0])
+        assert points.shape[
+            0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(
+                points.shape[0])
        points.to(self._dtype).to(self._device)
        return self.points_to_voxel(points)

    def generate_multi_gpu(self, points):
-        assert points.shape[0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(points.shape[0])
+        assert points.shape[
+            0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(
+                points.shape[0])
        points.to(self._dtype).to(self._device)
        return self.points_to_voxel(points)

@@ -351,23 +364,21 @@ class VoxelGeneratorV3:
            coors_range: [6] list/tuple or array or tensor, float. indicate voxel range.
                format: xyzxyz, minmax
        """
-        indexes = torch.floor((points[:, :3] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32)
-        num_voxel = torch.ops.spconv.points_to_voxel(points, indexes,
-                                                     self._point_index,
-                                                     self._grids,
-                                                     self._num_points_per_grid,
-                                                     self._voxels,
-                                                     self._coors,
-                                                     self._grid_size,
-                                                     self._ndim)
+        indexes = torch.floor((points[:, :3] - self._point_cloud_range[:3]) /
+                              self._voxel_size).to(torch.int32)
+        num_voxel = torch.ops.spconv.points_to_voxel(
+            points, indexes, self._point_index, self._grids,
+            self._num_points_per_grid, self._voxels, self._coors,
+            self._grid_size, self._ndim)
        voxels = self._voxels[:num_voxel, :]
        coors = self._coors[:num_voxel, :]

        # xyz --> zyx
        #coors = coors[::-1]
-        x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape([-1, 1]), coors[:, 2].reshape([-1, 1])
+        x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape(
+            [-1, 1]), coors[:, 2].reshape([-1, 1])
        coors = torch.cat([z, y, x], dim=1)
        # can be skipped
-#        x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
-#        voxels = torch.cat([z, y, x, f], dim=1)
+        #        x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
+        #        voxels = torch.cat([z, y, x, f], dim=1)
        return voxels, coors
--- a/src/spconv/fused_conv.cu
+++ b/src/spconv/fused_conv.cu
@@ -15,8 +15,8 @@
 #include <ATen/ATen.h>
 #include <spconv/fused_conv.cu.h>
 #include <spconv/fused_conv.h>
-#include <tensorview/torch_utils.h>
 #include <spconv/minkowski.cu.h>
+#include <tensorview/torch_utils.h>

 namespace spconv {
 void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
@@ -81,10 +81,9 @@ void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
  int step = (nHot + num_div - 1) / num_div;
  dim3 threads(shared_mem_size, shared_mem_size);

-
  tv::dispatch_torch<float>(dtype, [&](auto I) {
    using T = decltype(I);
-    tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue){
+    tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue) {
      constexpr int ShmemSize = decltype(ShSizeValue)::value;
      for (int s = 0; s < num_div; s++) {
        int remainder = nHot - step * s;
@@ -93,16 +92,18 @@ void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
                  (curr_num_active + threads.y - 1) / threads.y);
        matmul<T, int32_t, ShmemSize><<<grid, threads, 0, stream>>>(
            features.data_ptr<T>(), in_nchannel, curr_num_active,
-            filters.data_ptr<T>(), out_nchannel,
-            in_nchannel, output.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
+            filters.data_ptr<T>(), out_nchannel, in_nchannel,
+            output.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
            indicesOut.data_ptr<int32_t>());
      }
    });
  });
 }
-void fused_conv_backward_cuda_minkowski(torch::Tensor features, torch::Tensor din,
-                              torch::Tensor dout, torch::Tensor filters,
-                              torch::Tensor dfilters, torch::Tensor indicesIn,
+void fused_conv_backward_cuda_minkowski(torch::Tensor features,
+                                        torch::Tensor din, torch::Tensor dout,
+                                        torch::Tensor filters,
+                                        torch::Tensor dfilters,
+                                        torch::Tensor indicesIn,
                                        torch::Tensor indicesOut, int nHot) {
  auto dtype = features.scalar_type();
  auto in_nchannel = features.size(1);
@@ -131,7 +132,7 @@ void fused_conv_backward_cuda_minkowski(torch::Tensor features, torch::Tensor di

  tv::dispatch_torch<float>(dtype, [&](auto I) {
    using T = decltype(I);
-    tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue){
+    tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue) {
      constexpr int ShmemSize = decltype(ShSizeValue)::value;
      for (int s = 0; s < num_div; s++) {
        int remainder = nHot - step * s;