format code, add benchmark per layer

3517290c · yanyan · 540a2209 · 3517290c · 3517290c · 3517290c
Commit 3517290c authored Jul 09, 2020 by yanyan
9 changed files
--- a/src/spconv/point2voxel.cu
+++ b/src/spconv/point2voxel.cu
@@ -10,9 +10,7 @@
 namespace spconv {
-void scatter_point_to_grid_cuda(
+void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
-    torch::Tensor points,
-    torch::Tensor indexes,
                                torch::Tensor grids,
                                torch::Tensor numPointsPerGrid,
                                torch::Tensor pointIndex,
@@ -27,27 +25,25 @@ void scatter_point_to_grid_cuda(
      constexpr int NDim = decltype(I)::value;
      tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
      scatterPointToGridKernel<Index, NDim>
-          <<<tv::cuda::getBlocks(num_points), tv::cuda::CUDA_NUM_THREADS,
+          <<<tv::cuda::getBlocks(num_points), tv::cuda::CUDA_NUM_THREADS, 0,
-             0, stream>>>(tv::torch2tv<float>(points),
+             stream>>>(tv::torch2tv<float>(points),
-                          tv::torch2tv<Index>(indexes),
+                       tv::torch2tv<Index>(indexes), tv::torch2tv<float>(grids),
-                          tv::torch2tv<float>(grids),
                       tv::torch2tv<Index>(numPointsPerGrid),
-                          tv::torch2tv<Index>(pointIndex),
+                       tv::torch2tv<Index>(pointIndex), gs);
-                          gs);
      TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed");
 #ifdef TV_LOG_KERNEL_INFO
      cudaFuncAttributes attr;
-      checkCudaErrors(cudaFuncGetAttributes(
+      checkCudaErrors(
-          &attr, scatterPointToGridKernel<Index, NDim>));
+          cudaFuncGetAttributes(&attr, scatterPointToGridKernel<Index, NDim>));
-      tv::ssprint("scatterPointToGridKernel<", tv::type_s<Index>, NDim,
+      tv::ssprint("scatterPointToGridKernel<", tv::type_s<Index>, NDim, ">",
-                  ">", attr.numRegs);
+                  attr.numRegs);
 #endif
    });
  });
 }
-void gather_point_from_grid_cuda(
+void gather_point_from_grid_cuda(torch::Tensor grids,
-    torch::Tensor grids, torch::Tensor numPointsPerGrid,
+                                 torch::Tensor numPointsPerGrid,
                                 torch::Tensor pointIndex,
                                 torch::Tensor pointIndexUnique,
                                 torch::Tensor voxels, torch::Tensor coors,
@@ -57,15 +53,17 @@ void gather_point_from_grid_cuda(
  auto num_voxel = voxels.size(0);
  auto num_max_points = pointIndex.size(0) - 1;
  auto grid_volume = grids.size(0);
-  tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) {
+  tv::dispatch_torch<int32_t>(
+      pointIndexUnique.scalar_type(), [&](auto IndexValue) {
        using Index = decltype(IndexValue);
        tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
          constexpr int NDim = decltype(I)::value;
          tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
          resetPointIndexKernel<Index>
-          <<<tv::cuda::getBlocks(num_max_points), tv::cuda::CUDA_NUM_THREADS,
+              <<<tv::cuda::getBlocks(num_max_points),
-             0, stream>>>(tv::torch2tv<Index>(pointIndex), grid_volume);
+                 tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
+                  tv::torch2tv<Index>(pointIndex), grid_volume);
          TV_CHECK_CUDA_ERR_V2("resetPointIndexKernel failed");
 #ifdef TV_LOG_KERNEL_INFO
          cudaFuncAttributes attr0;
@@ -76,32 +74,30 @@ void gather_point_from_grid_cuda(
 #endif
          gatherPointFromGridKernel<Index, NDim>
-          <<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS,
+              <<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS, 0,
-             0, stream>>>(tv::torch2tv<float>(grids),
+                 stream>>>(tv::torch2tv<float>(grids),
                           tv::torch2tv<Index>(numPointsPerGrid),
                           tv::torch2tv<Index>(pointIndexUnique),
                           tv::torch2tv<float>(voxels),
-                          tv::torch2tv<Index>(coors),
+                           tv::torch2tv<Index>(coors), gs);
-                          gs);
          TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed");
 #ifdef TV_LOG_KERNEL_INFO
          cudaFuncAttributes attr1;
          checkCudaErrors(cudaFuncGetAttributes(
              &attr1, gatherPointFromGridKernel<Index, NDim>));
-      tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim, ">",
+          tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim,
-                  attr1.numRegs);
+                      ">", attr1.numRegs);
 #endif
-      resetGridKernel<Index>
+          resetGridKernel<Index><<<tv::cuda::getBlocks(num_voxel),
-          <<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS,
+                                   tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
-             0, stream>>>(tv::torch2tv<float>(grids),
+              tv::torch2tv<float>(grids), tv::torch2tv<Index>(numPointsPerGrid),
-                          tv::torch2tv<Index>(numPointsPerGrid),
              tv::torch2tv<Index>(pointIndexUnique));
          TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
 #ifdef TV_LOG_KERNEL_INFO
          cudaFuncAttributes attr2;
-      checkCudaErrors(cudaFuncGetAttributes(
+          checkCudaErrors(
-          &attr2, resetGridKernel<Index, NDim>));
+              cudaFuncGetAttributes(&attr2, resetGridKernel<Index, NDim>));
          tv::ssprint("resetGridKernel<", tv::type_s<Index>, NDim, ">",
                      attr2.numRegs);
 #endif

--- a/src/spconv/point2voxel_ops.cc
+++ b/src/spconv/point2voxel_ops.cc
@@ -3,23 +3,18 @@
 namespace spconv {
-int64_t
+int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
-pointsToVoxel(torch::Tensor points,
+                      torch::Tensor pointIndex, torch::Tensor grids,
-              torch::Tensor indexes,
+                      torch::Tensor numPointsPerGrid, torch::Tensor voxels,
-              torch::Tensor pointIndex,
+                      torch::Tensor coors, std::vector<int64_t> gridShape,
-              torch::Tensor grids,
-              torch::Tensor numPointsPerGrid,
-              torch::Tensor voxels,
-              torch::Tensor coors,
-              std::vector<int64_t> gridShape,
                      const int64_t ndim) {
  if (points.device().type() == torch::kCPU) {
    TV_THROW_INVALID_ARG("not support cpu currently");
  }
 #ifdef TV_CUDA
  else if (points.device().type() == torch::kCUDA) {
-    scatter_point_to_grid_cuda(points, indexes, grids,
+    scatter_point_to_grid_cuda(points, indexes, grids, numPointsPerGrid,
-        numPointsPerGrid, pointIndex, gridShape, ndim);
+                               pointIndex, gridShape, ndim);
  }
 #endif
  else {
@@ -33,8 +28,9 @@ pointsToVoxel(torch::Tensor points,
  }
 #ifdef TV_CUDA
  else if (points.device().type() == torch::kCUDA) {
-    gather_point_from_grid_cuda(grids, numPointsPerGrid,
+    gather_point_from_grid_cuda(grids, numPointsPerGrid, pointIndex,
-        pointIndex, pointIndexUnique, voxels, coors, gridShape, ndim);
+                                pointIndexUnique, voxels, coors, gridShape,
+                                ndim);
  }
 #endif
  else {

--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
@@ -247,10 +247,10 @@ torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters,
 }
 template <int Algo>
-torch::Tensor
+torch::Tensor indiceConvFused(torch::Tensor features, torch::Tensor filters,
-indiceConvFused(torch::Tensor features, torch::Tensor filters,
+                              torch::Tensor indicePairs,
-                        torch::Tensor indicePairs, torch::Tensor indiceNum,
+                              torch::Tensor indiceNum, int64_t numActOut,
-                        int64_t numActOut, int64_t _inverse, int64_t _subM) {
+                              int64_t _inverse, int64_t _subM) {
  auto kernelVolume = indiceNum.size(0);
  // auto timer = spconv::CudaContextTimer<>();
  bool subM = _subM != 0;
@@ -282,7 +282,8 @@ indiceConvFused(torch::Tensor features, torch::Tensor filters,
    }
 #ifdef TV_CUDA
    else if (device == torch::kCUDA) {
-      FusedConvDispatch<Algo>::fwd(output, features, filters[i], indicePairs[inverse][i],
+      FusedConvDispatch<Algo>::fwd(output, features, filters[i],
+                                   indicePairs[inverse][i],
                                   indicePairs[!inverse][i], nHot);
    }
 #endif
@@ -518,8 +519,7 @@ template <int Algo>
 std::vector<torch::Tensor>
 indiceConvBwFused(torch::Tensor features, torch::Tensor filters,
                  torch::Tensor outGrad, torch::Tensor indicePairs,
-                          torch::Tensor indiceNum, int64_t _inverse,
+                  torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
-                          int64_t _subM) {
  auto kernelVolume = indiceNum.size(0);
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
@@ -723,7 +723,6 @@ template <> struct ConvDispatch<kMinkowskiEngine> {
  constexpr static auto *bwd = indiceConvBwFused<kFMinkowskiEngine>;
 };
 torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, int64_t _inverse, int64_t _subM,

--- a/test/benchmark.py
+++ b/test/benchmark.py
@@ -26,38 +26,104 @@ class Net(nn.Module):
    def __init__(self, shape, algo):
        super().__init__()
        self.net = spconv.SparseSequential(
-            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0", algo=algo),
+            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
-            spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0", algo=algo),
+                              algo=algo),
+            spconv.SubMConv3d(64,
+                              64,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
            # nn.BatchNorm1d(32),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(64, 96, 3, bias=False, indice_key="c1", algo=algo),
+            spconv.SubMConv3d(64,
-            spconv.SubMConv3d(96, 96, 3, bias=False, indice_key="c1", algo=algo),
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+            spconv.SubMConv3d(96,
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
            # nn.BatchNorm1d(64),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(96, 128, 3, bias=False, indice_key="c2", algo=algo),
+            spconv.SubMConv3d(96,
-            spconv.SubMConv3d(128, 128, 3, bias=False, indice_key="c2", algo=algo),
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo),
+            spconv.SubMConv3d(128,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(128, 160, 3, bias=False, indice_key="c3", algo=algo),
+            spconv.SubMConv3d(128,
-            spconv.SubMConv3d(160, 160, 3, bias=False, indice_key="c3", algo=algo),
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo),
+            spconv.SubMConv3d(160,
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(160, 192, 3, bias=False, indice_key="c4", algo=algo),
+            spconv.SubMConv3d(160,
-            spconv.SubMConv3d(192, 192, 3, bias=False, indice_key="c4", algo=algo),
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo),
+            spconv.SubMConv3d(192,
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(192, 224, 3, bias=False, indice_key="c5", algo=algo),
+            spconv.SubMConv3d(192,
-            spconv.SubMConv3d(224, 224, 3, bias=False, indice_key="c5", algo=algo),
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo),
+            spconv.SubMConv3d(224,
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(224, 256, 3, bias=False, indice_key="c6", algo=algo),
+            spconv.SubMConv3d(224,
-            spconv.SubMConv3d(256, 256, 3, bias=False, indice_key="c6", algo=algo),
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo),
+            spconv.SubMConv3d(256,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo),
        )
        max_batch_size = 1
        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.

--- a/test/benchmark_detail.py
+++ b/test/benchmark_detail.py
+import time
+from pathlib import Path
+import numpy as np
+import torch
+from torch import nn
+import spconv
+from spconv.utils import VoxelGeneratorV2
+def waymo_data(batch_size=1):
+    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
+                           150000)
+    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
+    pc = data["pc"]
+    data = gen.generate(pc)
+    voxels = data["voxels"].reshape(-1, 3)
+    coors = data["coordinates"]
+    N = coors.shape[0]
+    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
+    return voxels, coors, gen.grid_size
+class Net(nn.Module):
+    def __init__(self, shape, algo):
+        super().__init__()
+        self.net = spconv.SparseSequential(
+            spconv.SubMConv3d(3,
+                              64,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo,
+                              name="subm-0-0"),
+            spconv.SubMConv3d(64,
+                              64,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo,
+                              name="subm-0-1"),
+            # nn.BatchNorm1d(32),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2, name="pool-0"),
+            spconv.SubMConv3d(64,
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo,
+                              name="subm-1-0"),
+            spconv.SubMConv3d(96,
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo,
+                              name="subm-1-1"),
+            # nn.BatchNorm1d(64),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2, name="pool-1"),
+            spconv.SubMConv3d(96,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo,
+                              name="subm-2-0"),
+            spconv.SubMConv3d(128,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo,
+                              name="subm-2-1"),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2, name="pool-2"),
+            spconv.SubMConv3d(128,
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo,
+                              name="subm-3-0"),
+            spconv.SubMConv3d(160,
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo,
+                              name="subm-3-1"),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2, name="pool-3"),
+            spconv.SubMConv3d(160,
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo,
+                              name="subm-4-0"),
+            spconv.SubMConv3d(192,
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo,
+                              name="subm-4-1"),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2, name="pool-4"),
+            spconv.SubMConv3d(192,
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo,
+                              name="subm-5-0"),
+            spconv.SubMConv3d(224,
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo,
+                              name="subm-5-1"),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2, name="pool-5"),
+            spconv.SubMConv3d(224,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo,
+                              name="subm-6-0"),
+            spconv.SubMConv3d(256,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo,
+                              name="subm-6-1"),
+        )
+        max_batch_size = 1
+        # The grid (dense map) is used for indice generation; using a pre-allocated grid runs faster.
+        self.grid = torch.full([max_batch_size, *shape], -1,
+                               dtype=torch.int32).cuda()
+        # self.grid = None
+        self.shape = shape
+    def forward(self, features, coors, batch_size):
+        x = spconv.SparseConvTensor(features,
+                                    coors,
+                                    self.shape,
+                                    batch_size,
+                                    self.grid,
+                                    benchmark=True)
+        return self.net(x)
+def main():
+    dtype = torch.float32
+    voxels, coors, spatial_shape = waymo_data()
+    voxels_th = torch.from_numpy(voxels).cuda().to(dtype)
+    coors_th = torch.from_numpy(coors).cuda().int()
+    algo = spconv.ConvAlgo.Minkowski
+    net = Net(spatial_shape[::-1], algo).cuda().eval().to(dtype)
+    print(coors_th.shape)
+    out = net(voxels_th, coors_th, 1)
+    print(out.spatial_shape)
+    times = []
+    detail_bench = {}
+    detail_ind_gen_bench = {}
+    with torch.no_grad():
+        for i in range(20):
+            torch.cuda.synchronize()
+            t = time.time()
+            out = net(voxels_th, coors_th, 1)
+            for k, v in out.benchmark_record.items():
+                if k not in detail_bench:
+                    detail_bench[k] = []
+                    detail_ind_gen_bench[k] = []
+                detail_bench[k].extend(v["time"])
+                detail_ind_gen_bench[k].extend(v["indice_gen_time"])
+            torch.cuda.synchronize()
+            times.append(time.time() - t)
+    # print((net.grid == -1).float().sum(), net.grid.numel())
+    # print("spconv time", time.time() - t)
+    print("spconv time", np.mean(times[10:]))
+    print(detail_bench["subm-6-0"])
+    print(detail_ind_gen_bench["subm-6-0"])
+if __name__ == "__main__":
+    main()
--- a/test/benchmark_points_to_voxel.py
+++ b/test/benchmark_points_to_voxel.py
@@ -13,10 +13,14 @@ def waymo_data_gpu(batch_size=1):
    print('gpu with total points available per voxel')
    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    points = torch.from_numpy(data['pc']).cuda().float()
-    voxel_size = torch.Tensor([0.1, 0.1, 0.1]).to(points.dtype).to(points.device)
+    voxel_size = torch.Tensor([0.1, 0.1,
-    coors_range = torch.Tensor([-80, -80, -2, 80, 80, 6]).to(points.dtype).to(points.device)
+                               0.1]).to(points.dtype).to(points.device)
+    coors_range = torch.Tensor([-80, -80, -2, 80, 80,
-    gen = VoxelGeneratorV3(voxel_size, coors_range, max_points=200000,
+                                6]).to(points.dtype).to(points.device)
+    gen = VoxelGeneratorV3(voxel_size,
+                           coors_range,
+                           max_points=200000,
                           num_features=points.shape[1],
                           dtype=points.dtype,
                           device=points.device)
@@ -40,8 +44,8 @@ def waymo_data_gpu(batch_size=1):
 def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
    print('cpu with %d max points per voxel' % max_points_per_voxel)
-    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], max_points_per_voxel,
+    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6],
-                           150000)
+                           max_points_per_voxel, 150000)
    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    pc = data["pc"]
    data = gen.generate(pc)
@@ -62,6 +66,7 @@ def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
    return voxels, coors, gen.grid_size
 def get_index(coor, grid_size):
    index = coor[0]
    for c, g in zip(coor[1:], grid_size):
@@ -100,5 +105,6 @@ def main():
    print('Perfect GPU Voxelization!!!')
 if __name__ == "__main__":
    main()
--- a/test/benchmark_points_to_voxel_gpu.py
+++ b/test/benchmark_points_to_voxel_gpu.py
@@ -12,8 +12,10 @@ from spconv.utils import VoxelGeneratorV3
 def waymo_data(batch_size=1):
    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    points = torch.from_numpy(data['pc']).cuda().float()
-    voxel_size = torch.Tensor([0.1, 0.1, 0.1]).to(points.dtype).to(points.device)
+    voxel_size = torch.Tensor([0.1, 0.1,
-    coors_range = torch.Tensor([-80, -80, -2, 80, 80, 6]).to(points.dtype).to(points.device)
+                               0.1]).to(points.dtype).to(points.device)
+    coors_range = torch.Tensor([-80, -80, -2, 80, 80,
+                                6]).to(points.dtype).to(points.device)
    gen = VoxelGeneratorV3(voxel_size, coors_range)
    voxels, coors = gen.generate(points)
@@ -28,43 +30,111 @@ class Net(nn.Module):
        super().__init__()
        self.device = device
        self.net = spconv.SparseSequential(
-            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0", algo=algo),
+            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
-            spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0", algo=algo),
+                              algo=algo),
+            spconv.SubMConv3d(64,
+                              64,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
            # nn.BatchNorm1d(32),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(64, 96, 3, bias=False, indice_key="c1", algo=algo),
+            spconv.SubMConv3d(64,
-            spconv.SubMConv3d(96, 96, 3, bias=False, indice_key="c1", algo=algo),
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+            spconv.SubMConv3d(96,
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
            # nn.BatchNorm1d(64),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(96, 128, 3, bias=False, indice_key="c2", algo=algo),
+            spconv.SubMConv3d(96,
-            spconv.SubMConv3d(128, 128, 3, bias=False, indice_key="c2", algo=algo),
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo),
+            spconv.SubMConv3d(128,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(128, 160, 3, bias=False, indice_key="c3", algo=algo),
+            spconv.SubMConv3d(128,
-            spconv.SubMConv3d(160, 160, 3, bias=False, indice_key="c3", algo=algo),
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo),
+            spconv.SubMConv3d(160,
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(160, 192, 3, bias=False, indice_key="c4", algo=algo),
+            spconv.SubMConv3d(160,
-            spconv.SubMConv3d(192, 192, 3, bias=False, indice_key="c4", algo=algo),
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo),
+            spconv.SubMConv3d(192,
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(192, 224, 3, bias=False, indice_key="c5", algo=algo),
+            spconv.SubMConv3d(192,
-            spconv.SubMConv3d(224, 224, 3, bias=False, indice_key="c5", algo=algo),
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo),
+            spconv.SubMConv3d(224,
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo),
            # nn.BatchNorm1d(128),
            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(224, 256, 3, bias=False, indice_key="c6", algo=algo),
+            spconv.SubMConv3d(224,
-            spconv.SubMConv3d(256, 256, 3, bias=False, indice_key="c6", algo=algo),
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo),
+            spconv.SubMConv3d(256,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo),
        )
        max_batch_size = 1
        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
-        self.grid = torch.full([max_batch_size, *shape], -1,
+        self.grid = torch.full([max_batch_size, *shape],
-                               dtype=torch.int32, device=self.device)
+                               -1,
+                               dtype=torch.int32,
+                               device=self.device)
        # self.grid = None
        self.shape = shape
@@ -78,7 +148,8 @@ def main():
    voxels, coors, spatial_shape = waymo_data()
    voxels_th, coors_th = voxels, coors
    algo = spconv.ConvAlgo.Native
-    net = Net(spatial_shape[::-1], algo, voxels_th.device).cuda(device=voxels_th.device).eval().float()
+    net = Net(spatial_shape[::-1], algo,
+              voxels_th.device).cuda(device=voxels_th.device).eval().float()
    print(coors_th.shape)
    out = net(voxels_th, coors_th, 1)
    print(out.spatial_shape)

--- a/cutlass @ fd7e058d
+++ b/cutlass @ fd7e058d
-Subproject commit 86931fef8538008a1a92036732b3eb7fe47b25d0
+Subproject commit fd7e058d0cb3e4bf743edc530c7778a210cb168b
--- a/mp11 @ 29764aad
+++ b/mp11 @ 29764aad
-Subproject commit 10ba80acb91f138170b7a22bb86523cb07d6f942
+Subproject commit 29764aad4881fde809af6a025c12012e47a55515