sync some code

d17a00e0 · yanyan · 11bcbbf6 · d17a00e0 · d17a00e0 · d17a00e0
Commit d17a00e0 authored Jun 05, 2020 by yanyan
13 changed files
--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
@@ -269,7 +269,7 @@ __global__ void getSubMIndicePairsKernel(
 template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
          unsigned K2>
-__global__ void getSubMIndicePairsKernel3(
+__global__ void getSubMIndicePairsUnrollKernel3(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
@@ -290,25 +290,26 @@ __global__ void getSubMIndicePairsKernel3(
 #pragma unroll
        for (int k = 0; k < K2; ++k) {
          offset = i * K1 * K2 + j * K2 + k;
-          if (offset > center){
+          if (offset > center) {
            continue;
          }
-          if (center == offset){
+          if (center == offset) {
-              // center of subm indice pairs dont need atomicadd
+            // center of subm indice pairs dont need atomicadd
-              indicePairs(1, offset, ix) = ix;
+            indicePairs(1, offset, ix) = ix;
-              indicePairs(0, offset, ix) = ix;
+            indicePairs(0, offset, ix) = ix;
-          }else{
+          } else {
            point[2] = indice_data[3] - k + K2 / 2;
            point[1] = indice_data[2] - j + K1 / 2;
            point[0] = indice_data[1] - i + K0 / 2;
-            if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[2] >= 0 &&
+            if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
-                point[2] < outSpatialShape[2] && point[0] >= 0 &&
+                point[2] >= 0 && point[2] < outSpatialShape[2] &&
-                point[0] < outSpatialShape[0]) {
+                point[0] >= 0 && point[0] < outSpatialShape[0]) {
              index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
                          point, outSpatialShape.data(), 0) +
                      spatialVolume * indice_data[0];
              if (gridsOut[index] != -1) {
-                // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
+                // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
+                // - 1]
                Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
                atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
                indicePairs(1, offset, oldNum) = gridsOut[index];
@@ -325,7 +326,7 @@ __global__ void getSubMIndicePairsKernel3(
 }
 template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
-__global__ void getSubMIndicePairsKernel2(
+__global__ void getSubMIndicePairsUnrollKernel2(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume) {
@@ -344,14 +345,14 @@ __global__ void getSubMIndicePairsKernel2(
 #pragma unroll
      for (int j = 0; j < K1; ++j) {
        offset = i * K1 + j;
-        if (offset > center){
+        if (offset > center) {
          continue;
        }
-        if (center == offset){
+        if (center == offset) {
-            // center of subm indice pairs dont need atomicadd
+          // center of subm indice pairs dont need atomicadd
-            indicePairs(1, offset, ix) = ix;
+          indicePairs(1, offset, ix) = ix;
-            indicePairs(0, offset, ix) = ix;
+          indicePairs(0, offset, ix) = ix;
-        }else{
+        } else {
          point[1] = indice_data[2] - j + K1 / 2;
          point[0] = indice_data[1] - i + K0 / 2;
          if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
@@ -418,6 +419,130 @@ __global__ void getSubMIndicePairsHashKernel(
  }
 }
+template <typename Index, unsigned K0, unsigned K1, unsigned K2,
+          unsigned kNumHashFunctions = 4>
+__global__ void getSubMIndicePairsHashUnrollKernel3(
+    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
+    tv::TensorView<Index> indiceNum,
+    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume,
+    unsigned table_size, const cuhash::Entry *table,
+    cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
+    unsigned stash_count) {
+  auto numActIn = indicesIn.dim(0);
+  Index index = 0;
+  Index offset;
+  Index point[3];
+  constexpr unsigned KV = K0 * K1 * K2;
+  constexpr unsigned center = KV / 2;
+  *(indiceNum.data() + center) = numActIn;
+  for (int ix : tv::KernelLoopX<int>(numActIn)) {
+    const Index *indice_data = indicesIn.data() + ix * (3 + 1);
+#pragma unroll
+    for (int i = 0; i < K0; ++i) {
+#pragma unroll
+      for (int j = 0; j < K1; ++j) {
+#pragma unroll
+        for (int k = 0; k < K2; ++k) {
+          offset = i * K1 * K2 + j * K2 + k;
+          if (offset > center) {
+            continue;
+          }
+          if (center == offset) {
+            // center of subm indice pairs dont need atomicadd
+            indicePairs(1, offset, ix) = ix;
+            indicePairs(0, offset, ix) = ix;
+          } else {
+            point[2] = indice_data[3] - k + K2 / 2;
+            point[1] = indice_data[2] - j + K1 / 2;
+            point[0] = indice_data[1] - i + K0 / 2;
+            if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
+                point[2] >= 0 && point[2] < outSpatialShape[2] &&
+                point[0] >= 0 && point[0] < outSpatialShape[0]) {
+              index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
+                          point, outSpatialShape.data(), 0) +
+                      spatialVolume * indice_data[0];
+              auto val =
+                  cuhash::retrieve((unsigned)(index), table_size, table,
+                                   constants, stash_constants, stash_count);
+              if (val != cuhash::kNotFound) {
+                // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
+                // - 1]
+                Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+                atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
+                indicePairs(1, offset, oldNum) = val;
+                indicePairs(0, offset, oldNum) = ix;
+                indicePairs(1, KV - offset - 1, oldNum) = ix;
+                indicePairs(0, KV - offset - 1, oldNum) = val;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+template <typename Index, unsigned K0, unsigned K1,
+          unsigned kNumHashFunctions = 4>
+__global__ void getSubMIndicePairsHashUnrollKernel2(
+    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
+    tv::TensorView<Index> indiceNum,
+    const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume,
+    unsigned table_size, const cuhash::Entry *table,
+    cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
+    unsigned stash_count) {
+  auto numActIn = indicesIn.dim(0);
+  Index index = 0;
+  Index offset;
+  Index point[2];
+  constexpr unsigned KV = K0 * K1;
+  constexpr unsigned center = KV / 2;
+  *(indiceNum.data() + center) = numActIn;
+  for (int ix : tv::KernelLoopX<int>(numActIn)) {
+    const Index *indice_data = indicesIn.data() + ix * (2 + 1);
+#pragma unroll
+    for (int i = 0; i < K0; ++i) {
+#pragma unroll
+      for (int j = 0; j < K1; ++j) {
+        offset = i * K1 + j;
+        if (offset > center) {
+          continue;
+        }
+        if (center == offset) {
+          // center of subm indice pairs dont need atomicadd
+          indicePairs(1, offset, ix) = ix;
+          indicePairs(0, offset, ix) = ix;
+        } else {
+          point[1] = indice_data[2] - j + K1 / 2;
+          point[0] = indice_data[1] - i + K0 / 2;
+          if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
+              point[0] < outSpatialShape[0]) {
+            index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
+                        point, outSpatialShape.data(), 0) +
+                    spatialVolume * indice_data[0];
+            auto val =
+                cuhash::retrieve((unsigned)(index), table_size, table,
+                                 constants, stash_constants, stash_count);
+            if (val != cuhash::kNotFound) {
+              // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i -
+              // 1]
+              Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+              atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
+              indicePairs(1, offset, oldNum) = val;
+              indicePairs(0, offset, oldNum) = ix;
+              indicePairs(1, KV - offset - 1, oldNum) = ix;
+              indicePairs(0, KV - offset - 1, oldNum) = val;
+            }
+          }
+        }
+      }
+    }
+  }
+}
 template <typename Index, typename IndexGrid, unsigned NDim>
 __global__ void resetGridKernel(const Index *indicePairUnique,
                                tv::TensorView<IndexGrid> gridsOut,
@@ -437,22 +562,13 @@ template <typename Index, typename IndexGrid, unsigned NDim>
 __global__ void
 resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
                    const tv::SimpleVector<Index, NDim> outSpatialShape,
-                    int numAct) {
+                    int numAct, Index spatialVolume) {
-  Index outSpatialShapeReg[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    outSpatialShapeReg[i] = outSpatialShape[i];
-  }
-  Index spatialVolume = 1;
  auto indsPtr = indices;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
  Index index;
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    indsPtr = indices + ix * (NDim + 1);
    index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indsPtr + 1,
-                                                        outSpatialShapeReg, 0);
+                                                        outSpatialShape.data(), 0);
    gridsOut[index + spatialVolume * indsPtr[0]] = -1;
  }
 }

--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -27,7 +27,7 @@ enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
 // torch.jit's doc says only support int64, so we need to convert to int32.
 std::vector<torch::Tensor>
-getIndicePairs(torch::Tensor indices, int64_t batchSize,
+getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
               std::vector<int64_t> outSpatialShape,
               std::vector<int64_t> spatialShape,
               std::vector<int64_t> kernelSize, std::vector<int64_t> stride,

--- a/include/sphash/hashmap.h
+++ b/include/sphash/hashmap.h
+#include <tensorview/tensor.h>
+namespace spconv {
+enum HashTypes {
+    kDenseMap = 0,
+    kCUDPPHash = 1
+};
+template <int Impl>
+struct HashMap;
+template<>
+struct HashMap<kDenseMap>{
+};
+}
\ No newline at end of file
--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
@@ -510,6 +510,20 @@ struct DispatchInt<T<Args...>> {
  }
 };
+template <class T> struct DispatchIntNoexcept;
+template <template <class...> class T, class... Args>
+struct DispatchIntNoexcept<T<Args...>> {
+  template <typename F> inline bool operator()(int t, F &&f) {
+    return dispatch_int_noexcept<Args::value...>(t, std::forward<F>(f));
+  }
+  template <typename F, typename BinaryPredicate>
+  inline bool operator()(int t, BinaryPredicate p, F &&f) {
+    return dispatch_int_noexcept<Args::value...>(t, p, std::forward<F>(f));
+  }
+};
 constexpr size_t kTensorMaxDim = 10;
 using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;

--- a/spconv/__init__.py
+++ b/spconv/__init__.py
@@ -68,6 +68,8 @@ class SparseConvTensor(object):
        self.spatial_shape = spatial_shape
        self.batch_size = batch_size
        self.indice_dict = {}
+        if grid is None:
+            grid = torch.Tensor() # empty tensor
        self.grid = grid
    @classmethod

--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -88,23 +88,13 @@ def get_indice_pairs(indices,
    else:
        out_shape = spatial_shape
    if grid is None:
-        res = torch.ops.spconv.get_indice_pairs(indices, batch_size, out_shape,
+        grid = torch.Tensor()
-                                                spatial_shape, ksize, stride,
+    res = torch.ops.spconv.get_indice_pairs(indices, grid, batch_size, out_shape,
-                                                padding, dilation, out_padding,
+                                            spatial_shape, ksize, stride,
-                                                int(subm), int(transpose),
+                                            padding, dilation, out_padding,
-                                                int(use_hash))
+                                            int(subm), int(transpose),
-        return res
+                                            int(use_hash))
-    else:
+    return res
-        if ndim == 2:
-            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_2d
-        elif ndim == 3:
-            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_3d
-        else:
-            raise NotImplementedError
-        return get_indice_pairs_func(indices, grid, batch_size, out_shape,
-                                     spatial_shape, ksize, stride, padding,
-                                     dilation, out_padding, int(subm),
-                                     int(transpose), int(use_hash))
 def indice_conv(features,

--- a/spconv/pool.py
+++ b/spconv/pool.py
@@ -46,7 +46,6 @@ class SparseMaxPool(SparseModule):
            padding = [padding] * ndim
        if not isinstance(dilation, (list, tuple)):
            dilation = [dilation] * ndim
        self.ndim = ndim
        self.kernel_size = kernel_size
        self.stride = stride
@@ -61,6 +60,8 @@ class SparseMaxPool(SparseModule):
        indices = input.indices
        spatial_shape = input.spatial_shape
        batch_size = input.batch_size
+        torch.cuda.synchronize()
+        t = time.time()
        if not self.subm:
            out_spatial_shape = ops.get_conv_output_size(
                spatial_shape, self.kernel_size, self.stride, self.padding,
@@ -69,11 +70,14 @@ class SparseMaxPool(SparseModule):
            out_spatial_shape = spatial_shape
        outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
            indices, batch_size, spatial_shape, self.kernel_size, self.stride,
-            self.padding, self.dilation, 0, self.subm)
+            self.padding, self.dilation, 0, self.subm, grid=input.grid)
        out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
                                          indice_pairs_num.to(device),
                                          outids.shape[0])
+        torch.cuda.synchronize()
+        print("maxpool", spatial_shape, time.time() - t)
        out_tensor = spconv.SparseConvTensor(out_features, outids,
                                             out_spatial_shape, batch_size)
        out_tensor.indice_dict = input.indice_dict

--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
@@ -23,7 +23,6 @@ static auto registry =
    torch::RegisterOperators()
        .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
        .op("spconv::indice_conv", &spconv::indiceConv)
-        .op("spconv::indice_conv_batch", &spconv::indiceConvBatch)
        .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
        .op("spconv::fused_indice_conv_bn", &spconv::fusedIndiceConvBatchNorm)
        .op("spconv::indice_maxpool", &spconv::indiceMaxPool)

--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -42,6 +42,8 @@ int create_conv_indice_pair_p1_cuda(
  auto ndim = kernelSize.size();
  auto numActIn = indicesIn.size(0);
  auto kernelVolume = indiceNum.size(0);
+  // auto timer = spconv::CudaContextTimer<>();
  if (numActIn == 0)
    return 0;
  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
@@ -77,6 +79,7 @@ int create_conv_indice_pair_p1_cuda(
                                  pa, di, ou);
              TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
            }
+            // tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
 #ifdef TV_LOG_KERNEL_INFO
            cudaFuncAttributes attr;
            checkCudaErrors(cudaFuncGetAttributes(
@@ -193,10 +196,6 @@ int create_conv_indice_pair_p2_cuda(
  return numAct;
 }
-template <typename T> struct is_valid {
-  __device__ __forceinline__ bool operator()(const T x) { return x != -1; }
-};
 int create_submconv_indice_pair_cuda(
    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
@@ -226,6 +225,8 @@ int create_submconv_indice_pair_cuda(
      for (int i = 0; i < NDim; ++i) {
        spatialVolume *= outSpatialShape[i];
      }
+      auto dispatcher = tv::DispatchIntNoexcept<tv::mp_list_c<int, 1, 3, 5>>();
      if (useHash) {
        auto table = cuhash::HashTable();
        // std::cout << "create " << numAct << " size table..." << std::endl;
@@ -252,18 +253,71 @@ int create_submconv_indice_pair_cuda(
        auto constants = table.get_constants_4();
        auto stash_constants = table.get_stash_constants();
        auto stash_count = table.get_stash_count();
-        tv::DispatchInt<max_kernel_vol_t>()(
+        bool dilation_one = true;
-            kernelVolume, std::less_equal<int>(), [&](auto I2) {
+        for (int i = 0; i < NDim; ++i) {
-              constexpr int MaxKernelVolume = decltype(I2)::value;
+          dilation_one &= di[i] == 1;
-              getSubMIndicePairsHashKernel<Index, NDim, MaxKernelVolume>
+        }
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
+        auto found = false;
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
+        if (dilation_one && (NDim == 2 || NDim == 3)) {
-                                  tv::torch2tv<Index>(indicePairs),
+          auto indiceNumCpu = indiceNum.cpu();
-                                  tv::torch2tv<Index>(indiceNum), ks, st, pa,
+          if (NDim == 2) {
-                                  di, ou, tableSize, tableData, constants,
+            tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
-                                  stash_constants, stash_count);
+                                           outSpatialShape.end());
-              TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
+            dispatcher(kernelSize[0], [&](auto K0C) {
+              dispatcher(kernelSize[1], [&](auto K1C) {
+                constexpr int K0 = decltype(K0C)::value;
+                constexpr int K1 = decltype(K1C)::value;
+                found = true;
+                getSubMIndicePairsHashUnrollKernel2<Index, K0, K1>
+                    <<<tv::cuda::getBlocks(numActIn),
+                       tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
+                        tv::torch2tv<Index>(indicesIn),
+                        tv::torch2tv<Index>(indicePairs),
+                        tv::torch2tv<Index>(indiceNum), ou_, spatialVolume,
+                        tableSize, tableData, constants, stash_constants,
+                        stash_count);
+              });
            });
+          } else if (NDim == 3) {
+            tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
+                                           outSpatialShape.end());
+            dispatcher(kernelSize[0], [&](auto K0C) {
+              dispatcher(kernelSize[1], [&](auto K1C) {
+                dispatcher(kernelSize[2], [&](auto K2C) {
+                  constexpr int K0 = decltype(K0C)::value;
+                  constexpr int K1 = decltype(K1C)::value;
+                  constexpr int K2 = decltype(K2C)::value;
+                  found = true;
+                  getSubMIndicePairsHashUnrollKernel3<Index, K0, K1, K2>
+                      <<<tv::cuda::getBlocks(numActIn),
+                         tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
+                          tv::torch2tv<Index>(indicesIn),
+                          tv::torch2tv<Index>(indicePairs),
+                          tv::torch2tv<Index>(indiceNum), ou_, spatialVolume,
+                          tableSize, tableData, constants, stash_constants,
+                          stash_count);
+                });
+              });
+            });
+          }
+        }
+        if (!found) {
+          tv::DispatchInt<max_kernel_vol_t>()(
+              kernelVolume, std::less_equal<int>(), [&](auto I2) {
+                constexpr int MaxKernelVolume = decltype(I2)::value;
+                getSubMIndicePairsHashKernel<Index, NDim, MaxKernelVolume>
+                    <<<tv::cuda::getBlocks(numActIn),
+                       tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
+                        tv::torch2tv<Index>(indicesIn),
+                        tv::torch2tv<Index>(indicePairs),
+                        tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou,
+                        tableSize, tableData, constants, stash_constants,
+                        stash_count);
+                TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
+              });
+        }
      } else {
        // auto timer = spconv::CudaContextTimer<>();
        prepareSubMGridKernel<Index, IndexGrid, NDim>
@@ -280,16 +334,17 @@ int create_submconv_indice_pair_cuda(
        auto found = false;
        if (dilation_one && (NDim == 2 || NDim == 3)) {
          auto indiceNumCpu = indiceNum.cpu();
          if (NDim == 2) {
            tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
                                           outSpatialShape.end());
-            tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
+            dispatcher(kernelSize[0], [&](auto K0C) {
-              tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
+              dispatcher(kernelSize[1], [&](auto K1C) {
                constexpr int K0 = decltype(K0C)::value;
                constexpr int K1 = decltype(K1C)::value;
                found = true;
-                getSubMIndicePairsKernel2<Index, IndexGrid, K0, K1>
+                getSubMIndicePairsUnrollKernel2<Index, IndexGrid, K0, K1>
                    <<<tv::cuda::getBlocks(numActIn),
                       tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                        tv::torch2tv<Index>(indicesIn),
@@ -301,24 +356,21 @@ int create_submconv_indice_pair_cuda(
          } else if (NDim == 3) {
            tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
                                           outSpatialShape.end());
+            dispatcher(kernelSize[0], [&](auto K0C) {
-            tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
+              dispatcher(kernelSize[1], [&](auto K1C) {
-              tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
+                dispatcher(kernelSize[2], [&](auto K2C) {
-                tv::dispatch_int_noexcept<1, 3, 5>(
+                  constexpr int K0 = decltype(K0C)::value;
-                    kernelSize[2], [&](auto K2C) {
+                  constexpr int K1 = decltype(K1C)::value;
-                      constexpr int K0 = decltype(K0C)::value;
+                  constexpr int K2 = decltype(K2C)::value;
-                      constexpr int K1 = decltype(K1C)::value;
+                  found = true;
-                      constexpr int K2 = decltype(K2C)::value;
+                  getSubMIndicePairsUnrollKernel3<Index, IndexGrid, K0, K1, K2>
-                      found = true;
+                      <<<tv::cuda::getBlocks(numActIn),
-                      getSubMIndicePairsKernel3<Index, IndexGrid, K0, K1, K2>
+                         tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
-                          <<<tv::cuda::getBlocks(numActIn),
+                          tv::torch2tv<Index>(indicesIn),
-                             tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
+                          tv::torch2tv<IndexGrid>(gridsOut),
-                              tv::torch2tv<Index>(indicesIn),
+                          tv::torch2tv<Index>(indicePairs),
-                              tv::torch2tv<IndexGrid>(gridsOut),
+                          tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
-                              tv::torch2tv<Index>(indicePairs),
+                });
-                              tv::torch2tv<Index>(indiceNum), ou_,
-                              spatialVolume);
-                    });
              });
            });
          }
@@ -344,7 +396,7 @@ int create_submconv_indice_pair_cuda(
        resetGridSubMKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(indicesIn.data_ptr<Index>(),
-                         tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn);
+                         tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn, spatialVolume);
        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
      }
    });

--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
@@ -2,7 +2,7 @@
 namespace spconv {
 std::vector<torch::Tensor>
-getIndicePairs(torch::Tensor indices, int64_t batchSize,
+getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
               std::vector<int64_t> outSpatialShape,
               std::vector<int64_t> spatialShape,
               std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
@@ -47,8 +47,11 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
  if (useHash) {
    gridSize = batchSize;
  }
-  torch::Tensor gridOut = torch::full(
+  bool resetGrid = gridOut.numel() != 0;
-      {gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
+  if (!resetGrid){
+    gridOut = torch::full(
+          {gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
+  }
  gridOut = gridOut.view({batchSize, -1});
  int64_t numActOut = -1;
  for (int i = 0; i < NDim; ++i) {
@@ -68,7 +71,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
    else if (indices.device().type() == torch::kCUDA) {
      numActOut = create_submconv_indice_pair_cuda(
          indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
-          dilation, outSpatialShape, transpose, false, useHash);
+          dilation, outSpatialShape, transpose, resetGrid, useHash);
      if (numActOut == -1) {
        auto device = indices.device();
        indicePairs = indicePairs.to({torch::kCPU});
@@ -98,10 +101,10 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
    if (indices.device().type() == torch::kCPU) {
      numActOut = create_conv_indice_pair_cpu(
          indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,
-          padding, dilation, outSpatialShape, transpose, false, useHash);
+          padding, dilation, outSpatialShape, transpose, resetGrid, useHash);
    }
 #ifdef TV_CUDA
-    else if (indices.device().type() == torch::kCUDA) {
+    else if (indices.device().type() == torch::kCUDA) {      
      numActOut = create_conv_indice_pair_p1_cuda(
          indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
          padding, dilation, outSpatialShape, transpose);
@@ -110,7 +113,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
        indicePairUnique = std::get<0>(res);
        numActOut = create_conv_indice_pair_p2_cuda(
            indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique,
-            outSpatialShape, transpose, false, useHash);
+            outSpatialShape, transpose, resetGrid, useHash);
        if (numActOut == -1) {
          auto device = indices.device();
          outInds = outInds.to({torch::kCPU});
@@ -188,7 +191,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
-  // tv::ssprint("first subm gemm time", timer.report() / 1000.0);
+  // tv::ssprint("first subm gemm time", timer.report() / 1000.0, std::vector<int>(indicePairNumCpu.data_ptr<int>(),
+  //                      indicePairNumCpu.data_ptr<int>() + kernelVolume));
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
@@ -237,6 +241,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
    // totalSAddTime += timer.report() / 1000.0;
  }
  // tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
+  // tv::ssprint("final subm gemm time", timer.report() / 1000.0);
  return output;
 }

--- a/test/benchmark.py
+++ b/test/benchmark.py
+import torch 
+import spconv 
+import numpy as np 
+from spconv.utils import VoxelGeneratorV2
+from pathlib import Path 
+from torch import nn
+import time 
+def waymo_data(batch_size=1):
+    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1, 150000)
+    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
+    pc = data["pc"]
+    data = gen.generate(pc)
+    voxels = data["voxels"].reshape(-1, 3)
+    coors = data["coordinates"]
+    N = coors.shape[0]
+    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
+    return voxels, coors, gen.grid_size
+class Net(nn.Module):
+    def __init__(self,
+                 shape):
+        super().__init__()
+        self.net = spconv.SparseSequential(
+            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0"),
+            spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0"),
+            # nn.BatchNorm1d(32),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(64, 96, 3, bias=False, indice_key="c1"),
+            spconv.SubMConv3d(96, 96, 3, bias=False, indice_key="c1"),
+            # nn.BatchNorm1d(64),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(96, 128, 3, bias=False, indice_key="c2"),
+            spconv.SubMConv3d(128, 128, 3, bias=False, indice_key="c2"),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(128, 160, 3, bias=False, indice_key="c3"),
+            spconv.SubMConv3d(160, 160, 3, bias=False, indice_key="c3"),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(160, 192, 3, bias=False, indice_key="c4"),
+            spconv.SubMConv3d(192, 192, 3, bias=False, indice_key="c4"),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(192, 224, 3, bias=False, indice_key="c5"),
+            spconv.SubMConv3d(224, 224, 3, bias=False, indice_key="c5"),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(224, 256, 3, bias=False, indice_key="c6"),
+            spconv.SubMConv3d(256, 256, 3, bias=False, indice_key="c6"),
+        )
+        max_batch_size = 1
+        self.grid = torch.full([max_batch_size, *shape], -1, dtype=torch.int32).cuda()
+        # self.grid = None
+        self.shape = shape
+    def forward(self, features, coors, batch_size):
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
+                                    self.grid)
+        return self.net(x)
+def main():
+    voxels, coors, spatial_shape = waymo_data()
+    voxels_th = torch.from_numpy(voxels).cuda().float()
+    coors_th = torch.from_numpy(coors).cuda()
+    net = Net(spatial_shape[::-1]).cuda().eval().float()
+    print(coors_th.shape)
+    out = net(voxels_th, coors_th, 1)
+    print(out.spatial_shape)
+    times = []
+    with torch.no_grad():
+        for i in range(20):
+            torch.cuda.synchronize()
+            t = time.time()
+            out = net(voxels_th, coors_th, 1)
+            torch.cuda.synchronize()
+            times.append(time.time() - t)
+    # print((net.grid == -1).float().sum(), net.grid.numel())
+    # print("spconv time", time.time() - t)
+    print("spconv time", np.mean(times[10:]))
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/test/data/benchmark-pc.npz
+++ b/test/data/benchmark-pc.npz
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
 if __name__ == '__main__':
-    main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
+    main(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
-    main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.half)
+    main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
    # TestCase().assertAllClose(out_my, out_ref)
    # unittest.main()
    # TestSpConv().testSpConv3d()