faster subm indice generation by

unroll loop for common params

faster subm indice generation by
unroll loop for common params
492865a3 · yanyan · 9a23d934 · 492865a3 · 492865a3 · 492865a3
Commit 492865a3 authored Jun 04, 2020 by yanyan
20 changed files
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ Since install newest driver and CUDA is very simple on windows, please use CUDA
 ```Python
 features = # your features with shape [N, numPlanes]
 indices = # your indices/coordinates with shape [N, ndim + 1], batch index must be put in indices[:, 0]
-spatial_shape = # spatial shape of your sparse tensor.
+spatial_shape = # spatial shape of your sparse tensor, spatial_shape[i] is shape of indices[:, 1 + i].
 batch_size = # batch size of your sparse tensor.
 x = spconv.SparseConvTensor(features, indices, spatial_shape, batch_size)
 x_dense_NCHW = x.dense() # convert sparse tensor to dense NCHW tensor.

--- a/include/spconv/cublas_gemm.h
+++ b/include/spconv/cublas_gemm.h
+#pragma once
+#include <cublas_v2.h>
+#include <tensorview/tensorview.h>
+namespace spconv {
+template <class T>
+cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n, int k,
+                           const T *alpha, const T *A, int lda, const T *B,
+                           int ldb, const T *beta, T *C, int ldc);
+template <class T>
+cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
+                              cublasOperation_t transb, int m, int n, int k,
+                              const T *alpha, const T *A, int lda, const T *B,
+                              int ldb, const T *beta, T *C, int ldc) {
+  return cublasTgemm<T>(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda,
+                        beta, C, ldc);
+}
+template <class T> inline T constant_scalar(float data) { return T(data); }
+template <class T>
+cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
+                    const tv::TensorView<T> A, const tv::TensorView<T> B,
+                    tv::TensorView<T> C) {
+  TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
+  TV_ASSERT_RT_ERR(B.ndim() == 2, "error");
+  auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
+  auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
+  int m = transa ? A.dim(1) : A.dim(0);
+  int n = transb ? B.dim(0) : B.dim(1);
+  int ka = transa ? A.dim(0) : A.dim(1);
+  int kb = transb ? B.dim(1) : B.dim(0);
+  int lda = transa ? m : ka;
+  int ldb = transb ? ka : n;
+  int ldc = n;
+  TV_ASSERT_RT_ERR(ka == kb, "error");
+  T alpha = constant_scalar<T>(1);
+  T beta = constant_scalar<T>(0);
+  return cublasTgemmRow<T>(handle, transa_cublas, transb_cublas, m, n, ka,
+                           &alpha, A.data(), lda, B.data(), ldb, &beta,
+                           C.data(), ldc);
+}
+} // namespace spconv
--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
@@ -54,9 +54,10 @@ __global__ void prepareIndicePairsKernel(
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
-      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(0, offset, oldNum) = ix;
-      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
+      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
+                  pointPtr, outSpatialShape.data(), 0) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(1, offset, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
@@ -97,9 +98,10 @@ __global__ void prepareDeConvIndicePairsKernel(
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
-      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(0, offset, oldNum) = ix;
-      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
+      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
+                  pointPtr, outSpatialShape.data(), 0) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(1, offset, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
@@ -190,21 +192,16 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
 }
 template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
+__global__ void prepareSubMGridKernel(
-prepareSubMGridKernel(tv::TensorView<const Index> indicesIn,
+    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-                      tv::TensorView<IndexGrid> gridsOut,
+    const tv::SimpleVector<Index, NDim> outSpatialShape, Index spatialVolume) {
-                      const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
  Index index = 0;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
+    index =
-                                         outSpatialShape.data()) +
+        tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-            spatialVolume * indicesIn(ix, 0);
+            indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) +
+        spatialVolume * indicesIn(ix, 0);
    gridsOut[index] = ix;
  }
 }
@@ -258,10 +255,11 @@ __global__ void getSubMIndicePairsKernel(
    for (int i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
-      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
+      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
+                  pointPtr, outSpatialShape.data(), 0) +
              spatialVolume * indicesIn(ix, 0);
      if (gridsOut[index] > -1) {
-        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+        Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
        indicePairs(1, offset, oldNum) = gridsOut[index];
        indicePairs(0, offset, oldNum) = ix;
      }
@@ -269,6 +267,82 @@ __global__ void getSubMIndicePairsKernel(
  }
 }
+template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
+          unsigned K2>
+__global__ void getSubMIndicePairsKernel3(
+    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
+    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
+    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
+  auto numActIn = indicesIn.dim(0);
+  Index point[3];
+  Index index = 0;
+  Index offset;
+  for (int ix : tv::KernelLoopX<int>(numActIn)) {
+    const Index *indice_data = indicesIn.data() + ix * (3 + 1);
+#pragma unroll
+    for (int i = 0; i < K0; ++i) {
+#pragma unroll
+      for (int j = 0; j < K1; ++j) {
+#pragma unroll
+        for (int k = 0; k < K2; ++k) {
+          offset = i * K1 * K2 + j * K2 + k;
+          point[2] = indice_data[3] - k + K2 / 2;
+          point[1] = indice_data[2] - j + K1 / 2;
+          point[0] = indice_data[1] - i + K0 / 2;
+          if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[2] >= 0 &&
+              point[2] < outSpatialShape[2] && point[0] >= 0 &&
+              point[0] < outSpatialShape[0]) {
+            index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
+                        point, outSpatialShape.data(), 0) +
+                    spatialVolume * indice_data[0];
+            if (gridsOut[index] != -1) {
+              Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+              indicePairs(1, offset, oldNum) = gridsOut[index];
+              indicePairs(0, offset, oldNum) = ix;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
+__global__ void getSubMIndicePairsKernel2(
+    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
+    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
+    const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume) {
+  auto numActIn = indicesIn.dim(0);
+  Index point[2];
+  Index index = 0;
+  Index offset;
+  for (int ix : tv::KernelLoopX<int>(numActIn)) {
+    const Index *indice_data = indicesIn.data() + ix * (2 + 1);
+#pragma unroll
+    for (int i = 0; i < K0; ++i) {
+#pragma unroll
+      for (int j = 0; j < K1; ++j) {
+        offset = i * K1 + j;
+        point[1] = indice_data[2] - j + K1 / 2;
+        point[0] = indice_data[1] - i + K0 / 2;
+        if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
+            point[0] < outSpatialShape[0]) {
+          index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
+                      point, outSpatialShape.data(), 0) +
+                  spatialVolume * indice_data[0];
+          if (gridsOut[index] > -1) {
+            Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+            indicePairs(1, offset, oldNum) = gridsOut[index];
+            indicePairs(0, offset, oldNum) = ix;
+          }
+        }
+      }
+    }
+  }
+}
 template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
          unsigned kNumHashFunctions = 4>
 __global__ void getSubMIndicePairsHashKernel(
@@ -299,12 +373,13 @@ __global__ void getSubMIndicePairsHashKernel(
    for (int i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
-      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
+      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
+                  pointPtr, outSpatialShape.data(), 0) +
              spatialVolume * indicesIn(ix, 0);
      auto val = cuhash::retrieve((unsigned)(index), table_size, table,
                                  constants, stash_constants, stash_count);
      if (val != cuhash::kNotFound) {
-        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+        Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
        indicePairs(1, offset, oldNum) = val;
        indicePairs(0, offset, oldNum) = ix;
      }
@@ -332,7 +407,7 @@ __global__ void
 resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
                    const tv::SimpleVector<Index, NDim> outSpatialShape,
                    int numAct) {
-  int outSpatialShapeReg[NDim];
+  Index outSpatialShapeReg[NDim];
  for (int i = 0; i < NDim; ++i) {
    outSpatialShapeReg[i] = outSpatialShape[i];
  }
@@ -345,11 +420,14 @@ resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
  Index index;
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    indsPtr = indices + ix * (NDim + 1);
-    index = tv::rowArrayIdx<Index, NDim>(indsPtr + 1, outSpatialShapeReg);
+    index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indsPtr + 1,
+                                                        outSpatialShapeReg, 0);
    gridsOut[index + spatialVolume * indsPtr[0]] = -1;
  }
 }
 } // namespace spconv
+#undef atomicAdd
 #endif
\ No newline at end of file
--- a/include/spconv/indice.h
+++ b/include/spconv/indice.h
@@ -18,65 +18,6 @@
 #include <torch/script.h>
 namespace spconv {
-namespace functor {
-template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctorP1 {
-  Index operator()(const Device &d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<Index> indicesOut,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   tv::TensorView<Index> indicePairUnique,
-                   const tv::SimpleVector<Index, NDim> kernelSize,
-                   const tv::SimpleVector<Index, NDim> stride,
-                   const tv::SimpleVector<Index, NDim> padding,
-                   const tv::SimpleVector<Index, NDim> dilation,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose);
-};
-template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctorP2 {
-  Index operator()(const Device &d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<Index> indicesOut,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   tv::TensorView<Index> indicePairUnique,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid = false, bool useHash = true);
-};
-template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctor {
-  Index operator()(const Device &d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<Index> indicesOut,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   const tv::SimpleVector<Index, NDim> kernelSize,
-                   const tv::SimpleVector<Index, NDim> stride,
-                   const tv::SimpleVector<Index, NDim> padding,
-                   const tv::SimpleVector<Index, NDim> dilation,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid = false, bool useHash = true);
-};
-template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
-struct CreateSubMIndicePairFunctor {
-  Index operator()(const Device &d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   const tv::SimpleVector<Index, NDim> kernelSize,
-                   const tv::SimpleVector<Index, NDim> stride,
-                   const tv::SimpleVector<Index, NDim> padding,
-                   const tv::SimpleVector<Index, NDim> dilation,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid = false, bool useHash = true);
-};
-} // namespace functor
 int create_conv_indice_pair_p1_cuda(
    torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
    torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,

--- a/include/spconv/reordering.cu.h
+++ b/include/spconv/reordering.cu.h
@@ -16,8 +16,8 @@
 #define REORDERING_CU_H_
 #include <THC/THCAtomics.cuh>
 #include <THC/THCNumerics.cuh>
-#include <tensorview/kernel_utils.h>
 #include <cuda_fp16.h>
+#include <tensorview/kernel_utils.h>
 #if PYTORCH_VERSION < 10500
 #define TH_ATOMIC_ADD atomicAdd
@@ -299,7 +299,7 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
              __hadd2(reinterpret_cast<__half2 *>(buf)[i],
                      reinterpret_cast<__half2 *>(buf2)[i]);
        }
-#else 
+#else
 #pragma unroll
        for (int i = 0; i < vecloadFactor; i++) {
          buf[i] += buf2[i];
@@ -319,8 +319,8 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
 template <typename T, typename Index, int NumTLP, int NumILP>
 __global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
-                                         const Index *indices, int size,
+                                      const Index *indices, int size,
-                                         int numPlanes) {
+                                      int numPlanes) {
  int ILPStrideX[NumILP];
 #pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
@@ -330,7 +330,8 @@ __global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
 #pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      outFeatures[indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y] += buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
+      outFeatures[indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y] +=
+          buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
    }
  }
 }
@@ -338,8 +339,8 @@ __global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
 #if __CUDA_ARCH__ >= 530
 template <typename T, typename Index, int NumTLP, int NumILP>
 __global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
-                                         const Index *indices, int size,
+                                          const Index *indices, int size,
-                                         int numPlanes) {
+                                          int numPlanes) {
  int ILPStrideX[NumILP];
 #pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
@@ -351,9 +352,10 @@ __global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
 #pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
-      reinterpret_cast<__half2 *>(outFeatures)[idx] =
+      reinterpret_cast<__half2 *>(outFeatures)[idx] = __hadd2(
-          __hadd2(reinterpret_cast<__half2 *>(outFeatures)[idx],
+          reinterpret_cast<__half2 *>(outFeatures)[idx],
-                  reinterpret_cast<__half2 *>(buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
+          reinterpret_cast<__half2 *>(
+              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
    }
  }
 }

--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -26,321 +26,15 @@ namespace spconv {
 enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
 // torch.jit's doc says only support int64, so we need to convert to int32.
-template <unsigned NDim>
 std::vector<torch::Tensor>
-getIndicePair(torch::Tensor indices, int64_t batchSize,
+getIndicePairs(torch::Tensor indices, int64_t batchSize,
-              std::vector<int64_t> outSpatialShape,
+               std::vector<int64_t> outSpatialShape,
-              std::vector<int64_t> spatialShape,
+               std::vector<int64_t> spatialShape,
-              std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
+               std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-              std::vector<int64_t> padding, std::vector<int64_t> dilation,
+               std::vector<int64_t> padding, std::vector<int64_t> dilation,
-              std::vector<int64_t> outPadding, int64_t _subM,
+               std::vector<int64_t> outPadding, int64_t _subM,
-              int64_t _transpose, int64_t _useHash) {
+               int64_t _transpose, int64_t _useHash);
-  // auto timer = spconv::CudaContextTimer<>();
-  bool subM = _subM != 0;
-  bool transpose = _transpose != 0;
-  // CPU always use hash (tsl::robin_map).
-  bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU;
-  auto numAct = indices.size(0);
-  auto coorDim = indices.size(1) - 1; // batchIdx + xyz
-  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
-  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
-  auto kernelVolume = kernelSize[0];
-  for (int i = 1; i < kernelSize.size(); ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
-  auto outputVolume = outSpatialShape[0];
-  for (int i = 1; i < outSpatialShape.size(); ++i) {
-    outputVolume *= outSpatialShape[i];
-  }
-  std::string msg = "due to limits of cuda hash, the volume of dense space "
-                    "include batch size ";
-  msg += "must less than std::numeric_limits<int>::max() = 2e9";
-  TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(),
-                   msg);
-  torch::Tensor indicePairs =
-      torch::full({2, kernelVolume, numAct}, -1,
-                  torch::dtype(torch::kInt32).device(indices.device()));
-  torch::Tensor indiceNum = torch::zeros(
-      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
-  auto gridSize = batchSize * outputVolume;
-  if (useHash) {
-    gridSize = batchSize;
-  }
-  torch::Tensor gridOut = torch::full(
-      {gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
-  gridOut = gridOut.view({batchSize, -1});
-  int64_t numActOut = -1;
-  tv::SimpleVector<int, NDim> outSpatialShape32;
-  tv::SimpleVector<int, NDim> kernelSize32;
-  tv::SimpleVector<int, NDim> stride32;
-  tv::SimpleVector<int, NDim> padding32;
-  tv::SimpleVector<int, NDim> dilation32;
-  for (int i = 0; i < NDim; ++i) {
-    outSpatialShape32.push_back(outSpatialShape[i]);
-    kernelSize32.push_back(kernelSize[i]);
-    if (subM) {
-      stride32.push_back(1);
-      padding32.push_back(kernelSize[i] / 2);
-      dilation32.push_back(dilation[i]);
-    } else {
-      stride32.push_back(stride[i]);
-      padding32.push_back(padding[i]);
-      dilation32.push_back(dilation[i]);
-    }
-  }
-  if (subM) {
-    if (indices.device().type() == torch::kCPU) {
-      auto getIndicePairFtor =
-          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::CPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
-          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
-          dilation32, outSpatialShape32, transpose, false, useHash);
-    }
-#ifdef TV_CUDA
-    else if (indices.device().type() == torch::kCUDA) {
-      auto getIndicePairFtor =
-          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::TorchGPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
-          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
-          dilation32, outSpatialShape32, transpose, false, useHash);
-      if (numActOut == -1) {
-        // build hash failed. use CPU algorithm
-        auto device = indices.device();
-        indicePairs = indicePairs.to({torch::kCPU});
-        indiceNum = indiceNum.to({torch::kCPU});
-        indices = indices.to({torch::kCPU});
-        auto getIndicePairFtor =
-            functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
-        numActOut = getIndicePairFtor(
-            tv::CPU(), tv::torch2tv<const int>(indices),
-            tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
-            tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
-            dilation32, outSpatialShape32, transpose, false, useHash);
-        return {indices.to(device), indicePairs.to(device),
-                indiceNum.to(device)};
-      }
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-    return {indices, indicePairs, indiceNum};
-  } else {
-    auto indicePairUnique = torch::full(
-        {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
-        torch::dtype(torch::kInt32).device(indices.device()));
-    torch::Tensor outInds =
-        torch::zeros({numAct * kernelVolume, coorDim + 1},
-                     torch::dtype(torch::kInt32).device(indices.device()));
-    if (indices.device().type() == torch::kCPU) {
-      auto getIndicePairFtor =
-          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::CPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
-          transpose);
-    }
-#ifdef TV_CUDA
-    else if (indices.device().type() == torch::kCUDA) {
-      auto getIndicePairFtorP1 =
-          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
-      auto getIndicePairFtorP2 =
-          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
-      numActOut = getIndicePairFtorP1(
-          tv::TorchGPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
-          padding32, dilation32, outSpatialShape32, transpose);
-      if (numActOut > 0) {
-        auto res = torch::_unique(indicePairUnique);
-        indicePairUnique = std::get<0>(res);
-        numActOut = getIndicePairFtorP2(
-            tv::TorchGPU(), tv::torch2tv<const int>(indices),
-            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
-            false, useHash);
-        if (numActOut == -1) {
-          // build hash failed. use CPU algorithm
-          auto getIndicePairFtor =
-              functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
-          auto device = indices.device();
-          outInds = outInds.to({torch::kCPU});
-          indicePairs = indicePairs.to({torch::kCPU});
-          indiceNum = indiceNum.to({torch::kCPU});
-          indices = indices.to({torch::kCPU});
-          numActOut = getIndicePairFtor(
-              tv::CPU(), tv::torch2tv<const int>(indices),
-              tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-              tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-              kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
-              transpose);
-          return {outInds.to(device).slice(0, 0, numActOut),
-                  indicePairs.to(device), indiceNum.to(device)};
-        }
-      }
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
-  }
-}
-std::vector<torch::Tensor>
-getIndicePairV2(torch::Tensor indices, int64_t batchSize,
-                std::vector<int64_t> outSpatialShape,
-                std::vector<int64_t> spatialShape,
-                std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-                std::vector<int64_t> padding, std::vector<int64_t> dilation,
-                std::vector<int64_t> outPadding, int64_t _subM,
-                int64_t _transpose, int64_t _useHash);
-template <unsigned NDim>
-std::vector<torch::Tensor> getIndicePairPreGrid(
-    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
-    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
-    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-    std::vector<int64_t> padding, std::vector<int64_t> dilation,
-    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose,
-    int64_t _useHash) {
-  // auto timer = spconv::CudaContextTimer<>();
-  bool subM = _subM != 0;
-  bool transpose = _transpose != 0;
-  bool useHash = _useHash != 0;
-  TV_ASSERT_RT_ERR(!useHash, "error");
-  auto numAct = indices.size(0);
-  auto coorDim = indices.size(1) - 1; // batchIdx + xyz
-  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
-  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
-  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
-  auto kernelVolume = kernelSize[0];
-  for (int i = 1; i < kernelSize.size(); ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
-  auto outputVolume = outSpatialShape[0];
-  for (int i = 1; i < outSpatialShape.size(); ++i) {
-    outputVolume *= outSpatialShape[i];
-  }
-  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
-  torch::Tensor indicePairs =
-      torch::full({kernelVolume, 2, numAct}, -1,
-                  torch::dtype(torch::kInt32).device(indices.device()));
-  torch::Tensor indiceNum = torch::zeros(
-      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
-  // std::cout << "full time " << timer.report() / 1000.0 << std::endl;
-  int64_t numActOut = -1;
-  tv::SimpleVector<int, NDim> outSpatialShape32;
-  tv::SimpleVector<int, NDim> kernelSize32;
-  tv::SimpleVector<int, NDim> stride32;
-  tv::SimpleVector<int, NDim> padding32;
-  tv::SimpleVector<int, NDim> dilation32;
-  auto indicePairUnique = torch::full(
-      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
-      torch::dtype(torch::kInt32).device(indices.device()));
-  for (int i = 0; i < NDim; ++i) {
-    outSpatialShape32.push_back(outSpatialShape[i]);
-    kernelSize32.push_back(kernelSize[i]);
-    if (subM) {
-      stride32.push_back(1);
-      padding32.push_back(kernelSize[i] / 2);
-      dilation32.push_back(dilation[i]);
-    } else {
-      stride32.push_back(stride[i]);
-      padding32.push_back(padding[i]);
-      dilation32.push_back(dilation[i]);
-    }
-  }
-  if (subM) {
-    if (indices.device().type() == torch::kCPU) {
-      auto getIndicePairFtor =
-          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::CPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
-          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
-          dilation32, outSpatialShape32, transpose);
-      gridOut.fill_(-1);
-    }
-#ifdef TV_CUDA
-    else if (indices.device().type() == torch::kCUDA) {
-      auto getIndicePairFtor =
-          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::TorchGPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
-          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
-          dilation32, outSpatialShape32, transpose, true);
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-    return {indices, indicePairs, indiceNum};
-  } else {
-    torch::Tensor outInds =
-        torch::zeros({numAct * kernelVolume, coorDim + 1},
-                     torch::dtype(torch::kInt32).device(indices.device()));
-    if (indices.device().type() == torch::kCPU) {
-      auto getIndicePairFtor =
-          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
-      numActOut = getIndicePairFtor(
-          tv::CPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
-          transpose, true);
-      gridOut.fill_(-1);
-    }
-#ifdef TV_CUDA
-    else if (indices.device().type() == torch::kCUDA) {
-      auto getIndicePairFtorP1 =
-          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
-      auto getIndicePairFtorP2 =
-          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
-      numActOut = getIndicePairFtorP1(
-          tv::TorchGPU(), tv::torch2tv<const int>(indices),
-          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
-          padding32, dilation32, outSpatialShape32, transpose);
-      if (numActOut > 0) {
-        auto res = torch::_unique(indicePairUnique);
-        indicePairUnique = std::get<0>(res);
-        numActOut = getIndicePairFtorP2(
-            tv::TorchGPU(), tv::torch2tv<const int>(indices),
-            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
-            true);
-      }
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
-  }
-}
 torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
                              torch::Tensor indicePairs,
                              torch::Tensor indiceNum, int64_t numActOut,

--- a/include/tensorrt/inference.h
+++ b/include/tensorrt/inference.h
@@ -66,7 +66,7 @@ public:
 class InferenceContext {
 public:
-  explicit InferenceContext(const std::string& engine_bin, int device)
+  explicit InferenceContext(const std::string &engine_bin, int device)
      : logger_(nvinfer1::ILogger::Severity::kINFO), device_(device) {
    TV_ASSERT_INVALID_ARG(device >= 0, "invalid device id");
    int deviceCount;
@@ -159,9 +159,9 @@ public:
    return (*this)(inputs_vec);
  }
-  tv::Tensor operator[](std::string name){
+  tv::Tensor operator[](std::string name) {
    auto iter = name_to_host_mem_.find(name);
-    if (iter == name_to_host_mem_.end()){
+    if (iter == name_to_host_mem_.end()) {
      TV_THROW_INVALID_ARG(name, "not found.");
    }
    return iter->second;
@@ -204,4 +204,4 @@ private:
  int device_;
 };
-} // namespace trtplus
+} // namespace trt
--- a/include/tensorview/mp_helper.h
+++ b/include/tensorview/mp_helper.h
@@ -23,6 +23,9 @@ template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {
 } // namespace detail
+template <class... T>
+using mp_length = std::integral_constant<std::size_t, sizeof...(T)>;
 namespace detail {
 template <class A, template <class...> class B> struct mp_rename_impl {
@@ -40,6 +43,8 @@ struct mp_rename_impl<A<T...>, B> {
 template <class A, template <class...> class B>
 using mp_rename = typename detail::mp_rename_impl<A, B>::type;
+template <class L> using mp_size = mp_rename<L, mp_length>;
 template <class L, class F> constexpr F mp_for_each(F &&f) {
  return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
 }

--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
@@ -56,9 +56,15 @@ using dtype_collection_t =
    tv::mp_list_c<int, float32, int32, int16, int8, float64, bool_, uint8,
                  float16, int64, uint16, uint32, uint64>;
+#ifdef TV_CUDA
 using all_tensor_types_t =
    std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
               uint16_t, uint32_t, uint64_t, bool>;
+#else
+using all_tensor_types_t =
+    std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
+               uint16_t, uint32_t, uint64_t, bool>;
+#endif
 template <typename T> class TensorStorage {
 public:
@@ -395,6 +401,63 @@ void dispatch_int(int idx, BinaryPredicate p, F &&f) {
  }
 }
+// Ts is pack of mp_list_c
+template <class... Ts, typename Iterator, typename F>
+bool dispatch_container_noexcept(Iterator begin, Iterator end, F &&f) {
+  static_assert(sizeof...(Ts) > 0,
+                "you need to provide at least one candidate");
+  bool notFound = true;
+  mp_for_each<mp_list<Ts...>>([=, &notFound, &f](auto I) {
+    using val_lst_t = decltype(I);
+    auto val_lst_size = mp_size<val_lst_t>::value;
+    bool equal = true;
+    std::size_t count = 0;
+    auto iter = begin;
+    mp_for_each<val_lst_t>([&](auto E) {
+      if (iter == end || !equal) {
+        return;
+      }
+      if (count >= val_lst_size) {
+        TV_THROW_INVALID_ARG("iterator length invalid:", val_lst_size);
+      }
+      constexpr auto c = decltype(E)::value;
+      if (c != *iter) {
+        equal = false;
+      }
+      ++count;
+      std::advance(iter, 1);
+    });
+    if (count != val_lst_size || iter != end) {
+      equal = false;
+    }
+    if (equal && notFound) {
+      std::forward<F>(f)(I);
+      notFound = false;
+    }
+  });
+  return !notFound;
+}
+template <class... Ts, typename Iterator, typename F>
+void dispatch_container(Iterator begin, Iterator end, F &&f) {
+  if (!dispatch_container_noexcept<Ts...>(begin, end, std::forward<F>(f))) {
+    std::stringstream ss;
+    ss << "unknown value [";
+    for (auto iter = begin; iter != end; std::advance(iter, 1)) {
+      ss << *iter << ",";
+    }
+    ss << "], available: ";
+    mp_for_each<mp_list<Ts...>>([=, &ss](auto I) {
+      ss << "[";
+      mp_for_each<decltype(I)>(
+          [=, &ss](auto E) { ss << decltype(E)::value << ","; });
+      ss << "]";
+    });
+    TV_THROW_RT_ERR(ss.str());
+  }
+}
 /*
 template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
  return dispatch_scalar<int, Is...>(idx, f);
@@ -410,6 +473,26 @@ struct Dispatch<T<Args...>> {
  }
 };
+template <class T> struct DispatchContainer;
+template <template <class...> class T, class... Args>
+struct DispatchContainer<T<Args...>> {
+  template <typename Iterator, typename F>
+  inline void operator()(Iterator begin, Iterator end, F &&f) {
+    return dispatch_container<Args...>(begin, end, std::forward<F>(f));
+  }
+};
+template <class T> struct DispatchContainerNoexcept;
+template <template <class...> class T, class... Args>
+struct DispatchContainerNoexcept<T<Args...>> {
+  template <typename Iterator, typename F>
+  inline bool operator()(Iterator begin, Iterator end, F &&f) {
+    return dispatch_container_noexcept<Args...>(begin, end, std::forward<F>(f));
+  }
+};
 template <class T> struct DispatchInt;
 // Args should be std::integral_constant<int, value>
@@ -531,13 +614,11 @@ struct Tensor {
            typename Tindex = int,
            typename std::enable_if<Rank == -1, int>::type = 0>
  TensorView<T, Rank, PtrTraits, Tindex> tview() {
-    using tv_shape_t =
-        typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
    writable_check();
    static_assert(Rank == -1 || Rank > 0, "error");
    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
    ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
-    for (int i = 0; i < ndim(); ++i) {
+    for (size_t i = 0; i < ndim(); ++i) {
      shape[i] = shape_[i];
      stride[i] = stride_[i];
    }
@@ -579,7 +660,7 @@ struct Tensor {
    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
    ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
-    for (int i = 0; i < ndim(); ++i) {
+    for (int i = 0; i < int(ndim()); ++i) {
      shape[i] = shape_[i];
      stride[i] = stride_[i];
    }
@@ -621,6 +702,21 @@ struct Tensor {
    return res;
  }
+  Tensor operator[](int64_t index) {
+    TV_ASSERT_INVALID_ARG(ndim() > 1, "error");
+    if (index < 0) {
+      index += dim(0);
+    }
+    TV_ASSERT_INVALID_ARG(index < dim(0), "error");
+    Tensor res = Tensor();
+    res.storage_ = storage_;
+    res.shape_ = shape_.subshape(1);
+    res.offset_ = offset_ + index * stride_[0];
+    res.stride_ = stride_.subshape(1);
+    res.writeable_ = writeable_;
+    return res;
+  }
  Tensor squeeze() const { return view(shape_.squeeze()); }
  Tensor squeeze(int axis) const {
@@ -665,6 +761,7 @@ struct Tensor {
  size_t ndim() const { return shape_.ndim(); }
  const TensorShape &shape() const { return shape_; }
+  const TensorShape &sizes() const { return shape_; }
  const TensorShape &stride() const { return stride_; }
  int dim(int idx) const {
@@ -679,6 +776,7 @@ struct Tensor {
  const uint8_t *raw_data() const { return storage_->data() + offset_; }
  size_t raw_size() const { return size() * itemsize(); }
  size_t size() const { return shape_.size(); }
+  size_t size(int64_t idx) const { return dim(idx); }
  size_t itemsize() const { return detail::sizeof_dtype(dtype_); }
  Tensor &zero_() {
    writable_check();
@@ -716,6 +814,16 @@ struct Tensor {
    return reinterpret_cast<const T *>(raw_data());
  }
+  template <typename T> T *data_ptr() { return data<T>(); }
+  template <typename T> const T *data_ptr() const { return data<T>(); }
+  void *data_ptr() { return reinterpret_cast<void *>(raw_data()); }
+  const void *data_ptr() const {
+    return reinterpret_cast<const void *>(raw_data());
+  }
  void copy_(const Tensor &tensor) {
    writable_check();
    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
@@ -837,8 +945,8 @@ struct Tensor {
        using Tcur = decltype(Icur);
        if (std::is_convertible<Tcur, Tdst>::value) {
          auto ptr = this->data<Tcur>();
-          tensor = Tensor(this->shape_, this->stride_, dtype, this->device(), this->pinned(),
+          tensor = Tensor(this->shape_, this->stride_, dtype, this->device(),
-                          this->storage_->managed());
+                          this->pinned(), this->storage_->managed());
          std::copy(ptr, ptr + this->size(), tensor.data<Tdst>());
        } else {
          TV_THROW_INVALID_ARG("not convertable from", type_s<Tcur>, "to",

--- a/include/tensorview/tensorview.h
+++ b/include/tensorview/tensorview.h
@@ -13,13 +13,13 @@
 // limitations under the License.
 #pragma once
+#include "common.h"
+#include "prettyprint.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
-#include "common.h"
-#include "prettyprint.h"
 #include <iostream>
+#include <iterator>
 #include <memory>
 #include <sstream>
 #include <type_traits>
@@ -121,6 +121,15 @@ constexpr size_t calc_align(size_t ndim)
    return 64;
 }
 */
+namespace detail {
+template <typename _InIter>
+using _RequireInputIter = typename std::enable_if<std::is_convertible<
+    typename std::iterator_traits<_InIter>::iterator_category,
+    std::input_iterator_tag>::value>::type;
+}
 template <typename T, size_t MaxDim = TV_MAX_DIM>
 struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
 public:
@@ -131,7 +140,8 @@ public:
      array_[i] = init;
    }
  };
-  template <typename Iterator> SimpleVector(Iterator first, Iterator last) {
+  template <typename Iterator, typename = detail::_RequireInputIter<Iterator>>
+  SimpleVector(Iterator first, Iterator last) {
    size_ = 0;
    for (; first != last; ++first) {
      if (size_ >= MaxDim) {
@@ -588,6 +598,12 @@ template <int N, int Ndim> struct ArrayIndexRowMajor {
    return ArrayIndexRowMajor<N - 1, Ndim>::runShape(
        shape, (index + start) * shape[Ndim - N + 1], inds...);
  }
+  template <typename TShape, typename Tinit>
+  TV_HOST_DEVICE_INLINE static unsigned
+  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
+    return ArrayIndexRowMajor<N - 1, Ndim>::runPtrs(
+        indexes, shape, (indexes[Ndim - N] + start) * shape[Ndim - N + 1]);
+  }
 };
 template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
@@ -601,6 +617,11 @@ template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
                                                 Tinit start, T idx) {
    return start + idx;
  }
+  template <typename TShape, typename Tinit>
+  TV_HOST_DEVICE_INLINE static unsigned
+  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
+    return start + indexes[Ndim - 1];
+  }
 };
 template <> struct ArrayIndexRowMajor<0, 0> {
@@ -613,6 +634,11 @@ template <> struct ArrayIndexRowMajor<0, 0> {
                                                 Tinit start) {
    return 0;
  }
+  template <typename TShape, typename Tinit>
+  TV_HOST_DEVICE_INLINE static unsigned
+  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
+    return 0;
+  }
 };
 template <int N, int Ndim> struct ArrayIndexStride {
@@ -744,8 +770,8 @@ struct TensorAccesserBase {
  }
 protected:
-  const Tindex *stride_ptr_;
  ptr_t ptr_;
+  const Tindex *stride_ptr_;
 };
 } // namespace detail
@@ -1091,8 +1117,7 @@ struct TensorView {
    return TensorView<T, Rank, PtrTraits, Tindex>(
        ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
  }
-  template <typename Os>
+  template <typename Os> std::string repr(Os &ss) const {
-  std::string repr(Os &ss, int limit = 1000, int limit_axis = 6) const {
    if (empty())
      return "";
    if (shape_.ndim() == 0) {
@@ -1100,7 +1125,6 @@ struct TensorView {
      ss << *ptr_;
      return ss.str();
    }
-    bool enable_limit = size() > limit;
    SimpleVector<int64_t, TV_MAX_DIM> prev(ndim(), -1);
    SimpleVector<int64_t, TV_MAX_DIM> nd_index(ndim());
@@ -1111,7 +1135,7 @@ struct TensorView {
    ss << "Tensor[" << type_s<T> << "]: shape=" << shape()
       << ", stride=" << stride() << std::endl;
    auto ndimValue = ndim();
-    for (int64_t i = 0; i < size(); ++i) {
+    for (int64_t i = 0; i < int64_t(size()); ++i) {
      rowArrayIdxInv(i, nd_index.data(), _shape.data(), ndimValue);
      bool newline = false;
      int end_count = 0;

--- a/include/tensorview/torch_utils.h
+++ b/include/tensorview/torch_utils.h
@@ -133,7 +133,6 @@ torch::Tensor torch_slice_first_axis(torch::Tensor tensor, T start, T end) {
  auto tensor_shape = tensor.sizes();
  std::vector<int64_t> shape(tensor_shape.begin(), tensor_shape.end());
  shape[0] = end - start;
-  auto dtype = tensor.scalar_type();
  uint8_t *ptr = reinterpret_cast<uint8_t *>(tensor.data_ptr());
  res = torch::from_blob(ptr + start * tensor.stride(0) * tensor.itemsize(),
                         torch::IntArrayRef(shape), tensor.options());

--- a/spconv/__init__.py
+++ b/spconv/__init__.py
@@ -96,8 +96,9 @@ class SparseConvTensor(object):
    def dense(self, channels_first=True):
        output_shape = [self.batch_size] + list(
            self.spatial_shape) + [self.features.shape[1]]
-        res = scatter_nd(self.indices.long().to(self.features.device),
+        res = scatter_nd(
-                         self.features, output_shape)
+            self.indices.to(self.features.device).long(), self.features,
+            output_shape)
        if not channels_first:
            return res
        ndim = len(self.spatial_shape)

--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -20,9 +20,9 @@ import spconv
 class ConvAlgo(Enum):
-    Native = 0 # small memory cost, faster when number of points is large.
+    Native = 0  # small memory cost, faster when number of points is large.
-    Batch = 1 # high memory cost, faster when number of points is small (< 50000)
+    Batch = 1  # high memory cost, faster when number of points is small (< 50000)
-    BatchGemmGather = 2 # high memory cost, faster when number of points medium
+    BatchGemmGather = 2  # high memory cost, faster when number of points medium
 def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
@@ -88,12 +88,11 @@ def get_indice_pairs(indices,
    else:
        out_shape = spatial_shape
    if grid is None:
-        res = torch.ops.spconv.get_indice_pairs_v2(indices, batch_size,
+        res = torch.ops.spconv.get_indice_pairs(indices, batch_size, out_shape,
-                                                   out_shape, spatial_shape,
+                                                spatial_shape, ksize, stride,
-                                                   ksize, stride, padding,
+                                                padding, dilation, out_padding,
-                                                   dilation, out_padding,
+                                                int(subm), int(transpose),
-                                                   int(subm), int(transpose),
+                                                int(use_hash))
-                                                   int(use_hash))
        return res
    else:
        if ndim == 2:

--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
 set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc)
 if (SPCONV_BuildCUDA)
-    set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu)
+    set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc)
 endif()
 add_library(spconv SHARED ${ALL_FILES})

--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
@@ -21,14 +21,7 @@
 static auto registry =
    torch::RegisterOperators()
-        .op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
+        .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
-        .op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
-        .op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
-        .op("spconv::get_indice_pairs_v2", &spconv::getIndicePairV2)
-        .op("spconv::get_indice_pairs_grid_2d",
-            &spconv::getIndicePairPreGrid<2>)
-        .op("spconv::get_indice_pairs_grid_3d",
-            &spconv::getIndicePairPreGrid<3>)
        .op("spconv::indice_conv", &spconv::indiceConv)
        .op("spconv::indice_conv_batch", &spconv::indiceConvBatch)
        .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)

--- a/src/spconv/cublas_gemm.cc
+++ b/src/spconv/cublas_gemm.cc
+#include <ATen/ATen.h>
+#include <spconv/cublas_gemm.h>
+namespace spconv {
+template <>
+cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n, int k,
+                           const float *alpha, const float *A, int lda,
+                           const float *B, int ldb, const float *beta, float *C,
+                           int ldc) {
+  return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
+                     beta, C, ldc);
+}
+template <>
+cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n, int k,
+                           const __half *alpha, const __half *A, int lda,
+                           const __half *B, int ldb, const __half *beta,
+                           __half *C, int ldc) {
+  return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
+                     beta, C, ldc);
+}
+template <>
+cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n, int k,
+                           const at::Half *alpha, const at::Half *A, int lda,
+                           const at::Half *B, int ldb, const at::Half *beta,
+                           at::Half *C, int ldc) {
+  return cublasHgemm(handle, transa, transb, m, n, k,
+                     reinterpret_cast<const __half *>(alpha),
+                     reinterpret_cast<const __half *>(A), lda,
+                     reinterpret_cast<const __half *>(B), ldb,
+                     reinterpret_cast<const __half *>(beta),
+                     reinterpret_cast<__half *>(C), ldc);
+}
+template <>
+cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n, int k,
+                           const double *alpha, const double *A, int lda,
+                           const double *B, int ldb, const double *beta,
+                           double *C, int ldc) {
+  return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
+                     beta, C, ldc);
+}
+template <> inline __half constant_scalar(float data) {
+  return __float2half(data);
+}
+} // namespace spconv
\ No newline at end of file
--- a/src/spconv/indice.cc
+++ b/src/spconv/indice.cc
@@ -327,67 +327,4 @@ int create_submconv_indice_pair_cpu(
  return numActIn;
 }
-namespace functor {
-template <typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
-  Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<Index> indicesOut,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   const tv::SimpleVector<Index, NDim> kernelSize,
-                   const tv::SimpleVector<Index, NDim> stride,
-                   const tv::SimpleVector<Index, NDim> padding,
-                   const tv::SimpleVector<Index, NDim> dilation,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid, bool useHash) {
-    if (transpose)
-      return getIndicePairsDeConv<Index, IndexGrid, NDim>(
-          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
-          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
-          outSpatialShape.data());
-    else
-      return getIndicePairsConv<Index, IndexGrid, NDim>(
-          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
-          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
-          outSpatialShape.data());
-  }
-};
-template <typename Index, typename IndexGrid, unsigned NDim>
-struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
-  Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   const tv::SimpleVector<Index, NDim> kernelSize,
-                   const tv::SimpleVector<Index, NDim> stride,
-                   const tv::SimpleVector<Index, NDim> padding,
-                   const tv::SimpleVector<Index, NDim> dilation,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid, bool useHash) {
-    return getIndicePairsSubM<Index, IndexGrid, NDim>(
-        indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
-  }
-};
-} // namespace functor
-#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM)                              \
-  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int,    \
-                                                       NDIM>;                  \
-  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int,    \
-                                                       NDIM>;
-#define DECLARE_CPU_INDEX(Index)                                               \
-  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1);                                      \
-  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2);                                      \
-  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3);                                      \
-  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);
-DECLARE_CPU_INDEX(int);
-DECLARE_CPU_INDEX(long);
-#undef DECLARE_CPU_INDEX
-#undef DECLARE_CPU_SPECS_INDEX_NDIM
 } // namespace spconv
--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -20,10 +20,11 @@
 #include <spconv/indice.h>
 #include <tensorview/cuda_utils.h>
 #include <tensorview/mp_helper.h>
-#include <tensorview/torch_utils.h>
 #include <tensorview/tensor.h>
 #include <tensorview/tensorview.h>
+#include <tensorview/torch_utils.h>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
 #include <type_traits>
 #include <utility/timer.h>
@@ -78,10 +79,12 @@ int create_conv_indice_pair_p1_cuda(
            }
 #ifdef TV_LOG_KERNEL_INFO
            cudaFuncAttributes attr;
-            checkCudaErrors(cudaFuncGetAttributes(&attr, prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
+            checkCudaErrors(cudaFuncGetAttributes(
-            tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim, MaxKernelVolume, ">", attr.numRegs);
+                &attr,
+                prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
+            tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
+                        MaxKernelVolume, ">", attr.numRegs);
 #endif
          });
    });
  });
@@ -166,13 +169,16 @@ int create_conv_indice_pair_p2_cuda(
        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
 #ifdef TV_LOG_KERNEL_INFO
        cudaFuncAttributes attr;
-        checkCudaErrors(cudaFuncGetAttributes(&attr, assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>));
+        checkCudaErrors(cudaFuncGetAttributes(
-        tv::ssprint("assignGridAndIndiceOutKernel<", tv::type_s<Index>, NDim, ">", attr.numRegs);
+            &attr, assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>));
+        tv::ssprint("assignGridAndIndiceOutKernel<", tv::type_s<Index>, NDim,
+                    ">", attr.numRegs);
        cudaFuncAttributes attr2;
-        checkCudaErrors(cudaFuncGetAttributes(&attr2, assignIndicePairsKernel<Index, IndexGrid, NDim>));
+        checkCudaErrors(cudaFuncGetAttributes(
-        tv::ssprint("assignIndicePairsKernel<", tv::type_s<Index>, NDim, ">", attr2.numRegs);
+            &attr2, assignIndicePairsKernel<Index, IndexGrid, NDim>));
+        tv::ssprint("assignIndicePairsKernel<", tv::type_s<Index>, NDim, ">",
+                    attr2.numRegs);
 #endif
      }
      if (resetGrid && (!useHash)) {
@@ -187,6 +193,10 @@ int create_conv_indice_pair_p2_cuda(
  return numAct;
 }
+template <typename T> struct is_valid {
+  __device__ __forceinline__ bool operator()(const T x) { return x != -1; }
+};
 int create_submconv_indice_pair_cuda(
    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
@@ -212,6 +222,10 @@ int create_submconv_indice_pair_cuda(
      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
                                       outSpatialShape.end());
+      Index spatialVolume = 1;
+      for (int i = 0; i < NDim; ++i) {
+        spatialVolume *= outSpatialShape[i];
+      }
      if (useHash) {
        auto table = cuhash::HashTable();
        // std::cout << "create " << numAct << " size table..." << std::endl;
@@ -251,23 +265,79 @@ int create_submconv_indice_pair_cuda(
              TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
            });
      } else {
+        // auto timer = spconv::CudaContextTimer<>();
        prepareSubMGridKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesIn),
-                         tv::torch2tv<IndexGrid>(gridsOut), ou);
+                         tv::torch2tv<IndexGrid>(gridsOut), ou, spatialVolume);
+        // tv::ssprint("prepareSubMGridKernel", timer.report() / 1000.0);
        TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
-        tv::DispatchInt<max_kernel_vol_t>()(
+        // when dilation all one, we use a simple kernel to calc result
-            ndim, std::less_equal<int>(), [&](auto I2) {
+        bool dilation_one = true;
-              constexpr int MaxKernelVolume = decltype(I2)::value;
+        for (int i = 0; i < NDim; ++i) {
-              getSubMIndicePairsKernel<Index, IndexGrid, NDim, MaxKernelVolume>
+          dilation_one &= di[i] == 1;
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
+        }
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
+        auto found = false;
-                                  tv::torch2tv<IndexGrid>(gridsOut),
+        if (dilation_one && (NDim == 2 || NDim == 3)) {
-                                  tv::torch2tv<Index>(indicePairs),
+          auto indiceNumCpu = indiceNum.cpu();
-                                  tv::torch2tv<Index>(indiceNum), ks, st, pa,
+          if (NDim == 2) {
-                                  di, ou);
+            tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
-              TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
+                                           outSpatialShape.end());
+            tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
+              tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
+                constexpr int K0 = decltype(K0C)::value;
+                constexpr int K1 = decltype(K1C)::value;
+                found = true;
+                getSubMIndicePairsKernel2<Index, IndexGrid, K0, K1>
+                    <<<tv::cuda::getBlocks(numActIn),
+                       tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
+                        tv::torch2tv<Index>(indicesIn),
+                        tv::torch2tv<IndexGrid>(gridsOut),
+                        tv::torch2tv<Index>(indicePairs),
+                        tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
+              });
+            });
+          } else if (NDim == 3) {
+            tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
+                                           outSpatialShape.end());
+            tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
+              tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
+                tv::dispatch_int_noexcept<1, 3, 5>(
+                    kernelSize[2], [&](auto K2C) {
+                      constexpr int K0 = decltype(K0C)::value;
+                      constexpr int K1 = decltype(K1C)::value;
+                      constexpr int K2 = decltype(K2C)::value;
+                      found = true;
+                      getSubMIndicePairsKernel3<Index, IndexGrid, K0, K1, K2>
+                          <<<tv::cuda::getBlocks(numActIn),
+                             tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
+                              tv::torch2tv<Index>(indicesIn),
+                              tv::torch2tv<IndexGrid>(gridsOut),
+                              tv::torch2tv<Index>(indicePairs),
+                              tv::torch2tv<Index>(indiceNum), ou_,
+                              spatialVolume);
+                    });
+              });
            });
+          }
+        }
+        if (!found) {
+          tv::DispatchInt<
+              max_kernel_vol_t>()(ndim, std::less_equal<int>(), [&](auto I2) {
+            constexpr int MaxKernelVolume = decltype(I2)::value;
+            getSubMIndicePairsKernel<Index, IndexGrid, NDim, MaxKernelVolume>
+                <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
+                   stream>>>(tv::torch2tv<Index>(indicesIn),
+                             tv::torch2tv<IndexGrid>(gridsOut),
+                             tv::torch2tv<Index>(indicePairs),
+                             tv::torch2tv<Index>(indiceNum), ks, st, pa, di,
+                             ou);
+            TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsKernel failed");
+          });
+        }
+        // tv::ssprint("getSubMIndicePairsKernel", timer.report() / 1000.0);
      }
      if (resetGrid && (!useHash)) {
@@ -282,209 +352,4 @@ int create_submconv_indice_pair_cuda(
  return numActIn;
 }
-namespace functor {
-template <typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
-  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<Index> indicesOut,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   tv::TensorView<Index> indicePairUnique,
-                   const tv::SimpleVector<Index, NDim> kernelSize,
-                   const tv::SimpleVector<Index, NDim> stride,
-                   const tv::SimpleVector<Index, NDim> padding,
-                   const tv::SimpleVector<Index, NDim> dilation,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose) {
-    Index batchSize = gridsOut.dim(0);
-    auto numActIn = indicesIn.dim(0);
-    if (numActIn == 0)
-      return 0;
-    // auto timer = spconv::CudaContextTimer<>();
-    if (transpose)
-      prepareDeConvIndicePairsKernel<Index, NDim, 4096>
-          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesIn, indicePairs, indiceNum,
-                              indicePairUnique, kernelSize, stride, padding,
-                              dilation, outSpatialShape);
-    else
-      prepareIndicePairsKernel<Index, NDim, 4096>
-          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesIn, indicePairs, indiceNum,
-                              indicePairUnique, kernelSize, stride, padding,
-                              dilation, outSpatialShape);
-    TV_CHECK_CUDA_ERR();
-    // std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
-    return 1;
-  }
-};
-template <typename Index, typename IndexGrid, unsigned NDim>
-struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
-  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<Index> indicesOut,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   tv::TensorView<Index> indicePairUnique,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid, bool useHash) {
-    Index batchSize = gridsOut.dim(0);
-    auto kernelVolume = indiceNum.dim(0);
-    auto numActIn = indicesIn.dim(0);
-    if (numActIn == 0)
-      return 0;
-    // after unique, there is a std::numeric_limits<int>::max() in the end of
-    // indicePairUnique
-    Index numAct = indicePairUnique.dim(0) - 1;
-    if (useHash) {
-      auto table = cuhash::HashTable();
-      // std::cout << "create " << numAct << " size table..." << std::endl;
-      table.Initialize(numAct, 2.0, 4);
-      unsigned *d_values = nullptr;
-      cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
-      TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
-      arangeKernel<unsigned>
-          <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(d_values, numAct);
-      bool res = table.Build(
-          numAct, reinterpret_cast<unsigned *>(indicePairUnique.data()),
-          d_values);
-      cudaFree(d_values);
-      if (!res) {
-        return -1; // use -1 to tell outside use CPU implementation
-      }
-      assignIndiceOutKernel<Index, NDim>
-          <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesOut, numAct, indicePairUnique,
-                              outSpatialShape, batchSize);
-      TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
-      auto tableSize = table.get_table_size();
-      auto tableData = table.data();
-      auto constants = table.get_constants_4();
-      auto stash_constants = table.get_stash_constants();
-      auto stash_count = table.get_stash_count();
-      assignIndicePairsHashKernel<Index, NDim>
-          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesOut, numActIn, indicePairs,
-                              indicePairUnique, tableSize, tableData, constants,
-                              stash_constants, stash_count);
-      TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
-    } else {
-      assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
-          <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
-                              indicePairUnique, outSpatialShape, batchSize);
-      TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
-      assignIndicePairsKernel<Index, IndexGrid, NDim>
-          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
-                              indicePairUnique, outSpatialShape);
-      TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
-    }
-    if (resetGrid && (!useHash)) {
-      resetGridKernel<Index, IndexGrid, NDim>
-          <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
-      TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
-    }
-    return numAct;
-  }
-};
-template <typename Index, typename IndexGrid, unsigned NDim>
-struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
-  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
-                   tv::TensorView<IndexGrid> gridsOut,
-                   tv::TensorView<Index> indicePairs,
-                   tv::TensorView<Index> indiceNum,
-                   const tv::SimpleVector<Index, NDim> kernelSize,
-                   const tv::SimpleVector<Index, NDim> stride,
-                   const tv::SimpleVector<Index, NDim> padding,
-                   const tv::SimpleVector<Index, NDim> dilation,
-                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid, bool useHash) {
-    auto numActIn = indicesIn.dim(0);
-    if (numActIn == 0)
-      return 0;
-    // auto timer = spconv::CudaContextTimer<>();
-    if (useHash) {
-      auto table = cuhash::HashTable();
-      // std::cout << "subm create " << numActIn << " size table..." <<
-      // std::endl;
-      table.Initialize(numActIn, 2.0, 4);
-      unsigned *d_keyvalues = nullptr;
-      cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
-      unsigned *d_values = d_keyvalues + numActIn;
-      prepareSubMHashKernel<Index, NDim>
-          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesIn, d_keyvalues, d_values,
-                              outSpatialShape);
-      TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
-      bool res =
-          table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
-                      reinterpret_cast<unsigned *>(d_values));
-      cudaFree(d_keyvalues);
-      if (!res) {
-        return -1; // use -1 to tell outside use CPU implementation
-      }
-      auto tableSize = table.get_table_size();
-      auto tableData = table.data();
-      auto constants = table.get_constants_4();
-      auto stash_constants = table.get_stash_constants();
-      auto stash_count = table.get_stash_count();
-      getSubMIndicePairsHashKernel<Index, NDim, 4096>
-          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesIn, indicePairs, indiceNum, kernelSize,
-                              stride, padding, dilation, outSpatialShape,
-                              tableSize, tableData, constants, stash_constants,
-                              stash_count);
-      TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
-    } else {
-      prepareSubMGridKernel<Index, IndexGrid, NDim>
-          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
-      TV_CHECK_CUDA_ERR();
-      getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
-          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
-                              kernelSize, stride, padding, dilation,
-                              outSpatialShape);
-      TV_CHECK_CUDA_ERR();
-    }
-    // std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
-    if (resetGrid && (!useHash)) {
-      resetGridSubMKernel<Index, IndexGrid, NDim>
-          <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
-                              numActIn);
-      TV_CHECK_CUDA_ERR();
-    }
-    return numActIn;
-  }
-};
-} // namespace functor
-#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM)                              \
-  template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int,    \
-                                                       NDIM>;                  \
-  template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int,  \
-                                                         NDIM>;                \
-  template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int,  \
-                                                         NDIM>;                \
-  template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int,    \
-                                                       NDIM>;
-#define DECLARE_GPU_INDEX(Index)                                               \
-  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1);                                      \
-  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2);                                      \
-  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3);                                      \
-  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);
-DECLARE_GPU_INDEX(int);
-#undef DECLARE_GPU_INDEX
-#undef DECLARE_GPU_SPECS_INDEX_NDIM
 } // namespace spconv
\ No newline at end of file
--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
@@ -30,10 +30,12 @@ namespace spconv {
 using float_types_t = tv::mp_list<float, double, at::Half>;
 using int_types_t = tv::mp_list<int32_t, int64_t>;
 template <typename T>
-using half_vec_t = std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
+using half_vec_t =
+    std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
 template <typename T>
-using half_vec_sadd_t = std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
+using half_vec_sadd_t =
-  using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
+    std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
+using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
 void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                        torch::Tensor indices, int size) {
@@ -51,49 +53,55 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
      bool notFound = true;
      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-      tv::mp_for_each<kernel_block_t>(
+      tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
-          [=, &buffer, &features, &indices, &notFound](auto NumTLP) {
+                                       &notFound](auto NumTLP) {
-            constexpr int NumILP = NumTLP / 4;
+        constexpr int NumILP = NumTLP / 4;
-            // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
+        // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
-            int nHotBlock = (size / NumTLP) * NumTLP;
+        int nHotBlock = (size / NumTLP) * NumTLP;
-            if (notFound) {
+        if (notFound) {
-              if (numPlanes % NumTLP == 0) {
+          if (numPlanes % NumTLP == 0) {
-                if (nHotBlock >= NumTLP) {
+            if (nHotBlock >= NumTLP) {
-                  gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
+              gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                       vecload_type_t>
+                                   vecload_type_t>
-                      <<<dim3(size / NumTLP, numPlanes / NumTLP),
+                  <<<dim3(size / NumTLP, numPlanes / NumTLP),
-                         dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
+                     dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                         stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
+                     stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
-                                   indices.data_ptr<Index>(), nHotBlock,
+                               indices.data_ptr<Index>(), nHotBlock,
-                                   numPlanes / vecloadFactor);
+                               numPlanes / vecloadFactor);
 #ifdef TV_LOG_KERNEL_INFO
-                  cudaFuncAttributes attr;
+              cudaFuncAttributes attr;
-                  checkCudaErrors(cudaFuncGetAttributes(&attr, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
+              checkCudaErrors(cudaFuncGetAttributes(
-                                       vecload_type_t>));
+                  &attr, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                  tv::ssprint("gatherVecBlockKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+                                              vecload_type_t>));
+              tv::ssprint("gatherVecBlockKernel<", tv::type_s<T>,
+                          tv::type_s<Index>, int(NumTLP), NumILP, ">",
+                          attr.numRegs);
 #endif
-                  TV_CHECK_CUDA_ERR();
+              TV_CHECK_CUDA_ERR();
-                }
+            }
-                if (size - nHotBlock > 0) {
+            if (size - nHotBlock > 0) {
-                  gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
+              gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
-                      <<<dim3(1, numPlanes / NumTLP),
+                  <<<dim3(1, numPlanes / NumTLP),
-                         dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
+                     dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                         stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
+                     stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
-                                   features.data_ptr<T>(),
+                               features.data_ptr<T>(),
-                                   indices.data_ptr<Index>() + nHotBlock,
+                               indices.data_ptr<Index>() + nHotBlock,
-                                   size - nHotBlock, numPlanes / vecloadFactor);
+                               size - nHotBlock, numPlanes / vecloadFactor);
 #ifdef TV_LOG_KERNEL_INFO
-                  cudaFuncAttributes attr;
+              cudaFuncAttributes attr;
-                  checkCudaErrors(cudaFuncGetAttributes(&attr, gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>));
+              checkCudaErrors(cudaFuncGetAttributes(
-                  tv::ssprint("gatherVecKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+                  &attr, gatherVecKernel<T, Index, int(NumTLP), NumILP,
+                                         vecload_type_t>));
+              tv::ssprint("gatherVecKernel<", tv::type_s<T>, tv::type_s<Index>,
+                          int(NumTLP), NumILP, ">", attr.numRegs);
 #endif
-                  TV_CHECK_CUDA_ERR();
+              TV_CHECK_CUDA_ERR();
-                }
-                notFound = false;
-              }
            }
-          });
+            notFound = false;
+          }
+        }
+      });
      if (notFound) {
        constexpr int NumTLP = 64;
        constexpr int NumILP = NumTLP / 4;
@@ -105,8 +113,10 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                indices.data_ptr<Index>(), size, numPlanes);
 #ifdef TV_LOG_KERNEL_INFO
        cudaFuncAttributes attr;
-        checkCudaErrors(cudaFuncGetAttributes(&attr, gatherGenericKernel<T, Index, NumTLP, NumILP>));
+        checkCudaErrors(cudaFuncGetAttributes(
-        tv::ssprint("gatherGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+            &attr, gatherGenericKernel<T, Index, NumTLP, NumILP>));
+        tv::ssprint("gatherGenericKernel<", tv::type_s<T>, tv::type_s<Index>,
+                    int(NumTLP), NumILP, ">", attr.numRegs);
 #endif
        TV_CHECK_CUDA_ERR();
@@ -151,9 +161,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
                               numPlanes / vecloadFactor);
 #ifdef TV_LOG_KERNEL_INFO
              cudaFuncAttributes attr;
-              checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
+              checkCudaErrors(cudaFuncGetAttributes(
-                                       vecload_type_t>));
+                  &attr, scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
-              tv::ssprint("scatterAddVecBlockKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+                                                  vecload_type_t>));
+              tv::ssprint("scatterAddVecBlockKernel<", tv::type_s<T>,
+                          tv::type_s<Index>, int(NumTLP), NumILP, ">",
+                          attr.numRegs);
 #endif
              TV_CHECK_CUDA_ERR();
@@ -167,8 +180,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
                                  size - nHotBlock, numPlanes);
 #ifdef TV_LOG_KERNEL_INFO
              cudaFuncAttributes attr;
-              checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
+              checkCudaErrors(cudaFuncGetAttributes(
-              tv::ssprint("scatterAddGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+                  &attr,
+                  scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
+              tv::ssprint("scatterAddGenericKernel<", tv::type_s<T>,
+                          tv::type_s<Index>, int(NumTLP), NumILP, ">",
+                          attr.numRegs);
 #endif
              TV_CHECK_CUDA_ERR();
            }
@@ -187,8 +204,10 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
                indices.data_ptr<Index>(), size, numPlanes);
 #ifdef TV_LOG_KERNEL_INFO
        cudaFuncAttributes attr;
-        checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
+        checkCudaErrors(cudaFuncGetAttributes(
-        tv::ssprint("notfound scatterAddGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+            &attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
+        tv::ssprint("notfound scatterAddGenericKernel<", tv::type_s<T>,
+                    tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
 #endif
        TV_CHECK_CUDA_ERR();

--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
@@ -2,13 +2,13 @@
 namespace spconv {
 std::vector<torch::Tensor>
-getIndicePairV2(torch::Tensor indices, int64_t batchSize,
+getIndicePairs(torch::Tensor indices, int64_t batchSize,
-                std::vector<int64_t> outSpatialShape,
+               std::vector<int64_t> outSpatialShape,
-                std::vector<int64_t> spatialShape,
+               std::vector<int64_t> spatialShape,
-                std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
+               std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-                std::vector<int64_t> padding, std::vector<int64_t> dilation,
+               std::vector<int64_t> padding, std::vector<int64_t> dilation,
-                std::vector<int64_t> outPadding, int64_t _subM,
+               std::vector<int64_t> outPadding, int64_t _subM,
-                int64_t _transpose, int64_t _useHash) {
+               int64_t _transpose, int64_t _useHash) {
  // auto timer = spconv::CudaContextTimer<>();
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
@@ -57,6 +57,7 @@ getIndicePairV2(torch::Tensor indices, int64_t batchSize,
      stride[i] = 1;
    }
  }
+  // tv::ssprint("prepare", timer.report() / 1000.0);
  if (subM) {
    if (indices.device().type() == torch::kCPU) {
      numActOut = create_submconv_indice_pair_cpu(
@@ -85,6 +86,7 @@ getIndicePairV2(torch::Tensor indices, int64_t batchSize,
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
+    // tv::ssprint("subm", timer.report() / 1000.0);
    return {indices, indicePairs, indiceNum};
  } else {
    auto indicePairUnique = torch::full(
@@ -193,6 +195,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
+    // TODO torch::from_blob is a little slow
    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
@@ -406,11 +409,11 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  auto filterShape = filters.sizes();
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
+  torch::Tensor filtersGrad = torch::empty(filterShape, options);
  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
+      torch::empty({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
+      torch::empty({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});