working on remove functor

6c767a51 · Yan Yan · 19e73bbe · 6c767a51 · 6c767a51 · 6c767a51
Commit 6c767a51 authored May 21, 2020 by Yan Yan
13 changed files
--- a/include/spconv/fused_spconv_ops.h
+++ b/include/spconv/fused_spconv_ops.h
@@ -24,7 +24,6 @@
 namespace spconv {
 // torch.jit's doc says only support int64, so we need to convert to int32.

-template <typename T>
 torch::Tensor
 fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor bias, torch::Tensor indicePairs,
@@ -80,31 +79,17 @@ fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
      continue;
    }
    // auto timer = spconv::CudaContextTimer<>();
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
+    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
+    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr(),
                                            {nHot, numInPlanes}, options);

    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
+      sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
    }
 #ifdef TV_CUDA
    else if (device == torch::kCUDA) {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-      /* slower than SparseGatherFunctor, may due to int->long conversion
-      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
-      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
-      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
-      features, 0, indicePairBlob);*/
+      sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
    }
 #endif
    else {
@@ -116,20 +101,11 @@ fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
    // totalGEMMTime += timer.report() / 1000.0;

    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
+      sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse], nHot);
    }
 #ifdef TV_CUDA
    else if (device == torch::kCUDA) {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-      TV_CHECK_CUDA_ERR();
+      sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse], nHot);
    }
 #endif
    else {

--- a/include/spconv/indice.h
+++ b/include/spconv/indice.h
@@ -97,6 +97,21 @@ int create_submconv_indice_pair_cuda(
    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
    bool transpose, bool resetGrid, bool useHash);

+int create_conv_indice_pair_cpu(
+    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
+    torch::Tensor indicePairs, torch::Tensor indiceNum,
+    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
+    std::vector<int64_t> padding, std::vector<int64_t> dilation,
+    std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
+    bool useHash);
+
+int create_submconv_indice_pair_cpu(
+    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
+    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
+    std::vector<int64_t> stride, std::vector<int64_t> padding,
+    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
+    bool transpose, bool resetGrid, bool useHash);
+
 } // namespace spconv

 #endif
\ No newline at end of file
--- a/include/spconv/reordering.h
+++ b/include/spconv/reordering.h
@@ -15,24 +15,19 @@
 #ifndef SPARSE_REORDERING_FUNCTOR_H_
 #define SPARSE_REORDERING_FUNCTOR_H_
 #include <tensorview/tensorview.h>
+#include <torch/script.h>

 namespace spconv {
-namespace functor {
-template <typename Device, typename T, typename Index>
-struct SparseGatherFunctor {
-  void operator()(const Device &d, tv::TensorView<T> buffer,
-                  tv::TensorView<const T> features,
-                  tv::TensorView<const Index> indices, int size);
-};
+void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
+                        torch::Tensor indices, int size);
+void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
+                             torch::Tensor indices, int size);
+
+void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
+                        torch::Tensor indices, int size);
+void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
+                             torch::Tensor indices, int size);

-template <typename Device, typename T, typename Index>
-struct SparseScatterAddFunctor {
-  void operator()(const Device &d, tv::TensorView<T> out_features,
-                  tv::TensorView<const T> buffer,
-                  tv::TensorView<const Index> indices, int size,
-                  bool stable = false);
-};
-} // namespace functor
 } // namespace spconv

 #endif
\ No newline at end of file
--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -198,6 +198,15 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
  }
 }

+std::vector<torch::Tensor>
+getIndicePairV2(torch::Tensor indices, int64_t batchSize,
+                std::vector<int64_t> outSpatialShape,
+                std::vector<int64_t> spatialShape,
+                std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
+                std::vector<int64_t> padding, std::vector<int64_t> dilation,
+                std::vector<int64_t> outPadding, int64_t _subM,
+                int64_t _transpose, int64_t _useHash);
+
 template <unsigned NDim>
 std::vector<torch::Tensor> getIndicePairPreGrid(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
@@ -333,7 +342,6 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
 torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, int64_t _inverse, int64_t _subM);
-
 std::vector<torch::Tensor>
 indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,

--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
@@ -52,6 +52,10 @@ enum DType {

 namespace detail {

+using dtype_collection_t =
+    tv::mp_list_c<int, float32, int32, int16, int8, float64, bool_, uint8,
+                  float16, int64, uint16, uint32, uint64>;
+
 using all_tensor_types_t =
    std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
               uint16_t, uint32_t, uint64_t, bool>;
@@ -305,7 +309,7 @@ template <class... Ts, typename F> void dispatch(DType t, F &&f) {
  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
  bool notFound = true;
  mp_for_each<mp_list<Ts...>>([=, &notFound, &f](auto I) {
-    if (type_v<decltype(I)> == t) {
+    if (type_v<decltype(I)> == t && notFound) {
      std::forward<F>(f)(decltype(I)());
      notFound = false;
    }
@@ -325,7 +329,7 @@ template <typename T, T... Is, typename F> void dispatch_scalar(T idx, F &&f) {
                "you need to provide at least one candidate");
  bool notFound = true;
  mp_for_each<mp_list_c<T, Is...>>([=, &notFound, &f](auto I) {
-    if (T(I) == idx) {
+    if (T(I) == idx && notFound) {
      std::forward<F>(f)(I);
      notFound = false;
    }
@@ -343,7 +347,27 @@ template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
                "you need to provide at least one candidate");
  bool notFound = true;
  mp_for_each<mp_list_c<int, Is...>>([=, &notFound, &f](auto I) {
-    if (int(I) == idx) {
+    if (decltype(I)::value == idx && notFound) {
+      std::forward<F>(f)(I);
+      notFound = false;
+    }
+  });
+  if (notFound) {
+    std::stringstream ss;
+    mp_for_each<mp_list_c<int, Is...>>(
+        [=, &ss](auto I) { ss << decltype(I)::value << " "; });
+    TV_THROW_RT_ERR("unknown value", idx, ", available:", ss.str());
+  }
+}
+
+template <int... Is, typename F, class BinaryPredicate>
+void dispatch_int(int idx, BinaryPredicate p, F &&f) {
+  // BinaryPredicate: BinaryPredicate(idx, candidate)
+  static_assert(sizeof...(Is) > 0,
+                "you need to provide at least one candidate");
+  bool notFound = true;
+  mp_for_each<mp_list_c<int, Is...>>([=, &notFound, &f](auto I) {
+    if (p(idx, decltype(I)::value) && notFound) {
      std::forward<F>(f)(I);
      notFound = false;
    }
@@ -351,7 +375,7 @@ template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
  if (notFound) {
    std::stringstream ss;
    mp_for_each<mp_list_c<int, Is...>>(
-        [=, &ss](auto I) { ss << int(I) << " "; });
+        [=, &ss](auto I) { ss << decltype(I)::value << " "; });
    TV_THROW_RT_ERR("unknown value", idx, ", available:", ss.str());
  }
 }
@@ -373,12 +397,16 @@ struct Dispatch<T<Args...>> {

 template <class T> struct DispatchInt;

-template <template <int...> class T, int... Ints>
-struct DispatchInt<T<Ints...>> {
+template <template<class...> class Tin, template<class, int> class T, int... Ints>
+struct DispatchInt<Tin<T<int, Ints>...>> {
  template <typename F> inline void operator()(int t, F &&f) {
    return dispatch_int<Ints...>(t, std::forward<F>(f));
  }
+  template <typename F, typename BinaryPredicate> inline void operator()(int t, BinaryPredicate p, F &&f) {
+    return dispatch_int<Ints...>(t, p, std::forward<F>(f));
+  }
 };
+
 constexpr size_t kTensorMaxDim = 10;
 using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;


--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -81,16 +81,7 @@ def get_indice_pairs(indices,
    else:
        out_shape = spatial_shape
    if grid is None:
-        if ndim == 2:
-            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_2d
-        elif ndim == 3:
-            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_3d
-        elif ndim == 4:
-            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_4d
-        else:
-            raise NotImplementedError
-
-        res = get_indice_pairs_func(indices, batch_size, out_shape,
+        res = torch.ops.spconv.get_indice_pairs_v2(indices, batch_size, out_shape,
                                    spatial_shape, ksize, stride, padding,
                                    dilation, out_padding, int(subm),
                                    int(transpose), int(use_hash))
@@ -115,7 +106,7 @@ def indice_conv(features,
                num_activate_out,
                inverse=False,
                subm=False):
-    return torch.ops.spconv.indice_conv(features, filters, indice_pairs,
+    return torch.ops.spconv.indice_conv_v2(features, filters, indice_pairs,
                                        indice_pair_num, num_activate_out,
                                        int(inverse), int(subm))


--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
@@ -12,28 +12,27 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <spconv/fused_spconv_ops.h>
 #include <spconv/nms_ops.h>
 #include <spconv/pillar_scatter_ops.h>
 #include <spconv/pool_ops.h>
 #include <spconv/spconv_ops.h>
 #include <torch/script.h>
+#include <spconv/fused_spconv_ops.h>

 static auto registry =
    torch::RegisterOperators()
        .op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
        .op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
        .op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
+        .op("spconv::get_indice_pairs_v2", &spconv::getIndicePairV2)
        .op("spconv::get_indice_pairs_grid_2d",
            &spconv::getIndicePairPreGrid<2>)
        .op("spconv::get_indice_pairs_grid_3d",
            &spconv::getIndicePairPreGrid<3>)
        .op("spconv::indice_conv", &spconv::indiceConv)
        .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
-        .op("spconv::fused_indice_conv_fp32",
-            &spconv::fusedIndiceConvBatchNorm<float>)
-        .op("spconv::fused_indice_conv_half",
-            &spconv::fusedIndiceConvBatchNorm<at::Half>)
+        .op("spconv::fused_indice_conv_bn",
+            &spconv::fusedIndiceConvBatchNorm)
        .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
        .op("spconv::indice_maxpool_backward_fp32",
            &spconv::indiceMaxPoolBackward<float>)

--- a/src/spconv/indice.cc
+++ b/src/spconv/indice.cc
@@ -16,6 +16,7 @@
 #include <spconv/geometry.h>
 #include <spconv/indice.h>
 #include <spconv/spconv_ops.h>
+#include <tensorview/tensor.h>
 #include <torch/script.h>

 namespace spconv {
@@ -253,6 +254,79 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
 }
 #endif

+// Dispatches on the index dtype and ndim (2..4) to getIndicePairsDeConv (when
+// transpose is set) or getIndicePairsConv; returns its result, 0 if no inputs.
+int create_conv_indice_pair_cpu(
+    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
+    torch::Tensor indicePairs, torch::Tensor indiceNum,
+    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
+    std::vector<int64_t> padding, std::vector<int64_t> dilation,
+    std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
+    bool useHash) {
+  auto ndim = outSpatialShape.size();
+  auto numActIn = indicesIn.size(0);
+  if (numActIn == 0)
+    return 0;
+  tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
+    using Index = decltype(V);
+    using IndexGrid = int32_t;
+    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
+      constexpr int NDim = decltype(I)::value;
+      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
+      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
+      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
+      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
+      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
+                                       outSpatialShape.end());
+      if (transpose)
+        numActIn = getIndicePairsDeConv<Index, IndexGrid, NDim>(
+            tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
+            tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
+            tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
+            di.data(), ou.data());
+      else
+        numActIn = getIndicePairsConv<Index, IndexGrid, NDim>(
+            tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
+            tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
+            tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
+            di.data(), ou.data());
+    });
+  });
+  return numActIn;
+}
+
+// Dispatches on the index dtype and ndim (2..4) to getIndicePairsSubM and
+// returns its result, or 0 when indicesIn has no rows.
+int create_submconv_indice_pair_cpu(
+    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
+    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
+    std::vector<int64_t> stride, std::vector<int64_t> padding,
+    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
+    bool transpose, bool resetGrid, bool useHash) {
+  auto ndim = outSpatialShape.size();
+  auto numActIn = indicesIn.size(0);
+  if (numActIn == 0)
+    return 0;
+  tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
+    using Index = decltype(V);
+    using IndexGrid = int32_t;
+    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
+      constexpr int NDim = decltype(I)::value;
+      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
+      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
+      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
+      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
+      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
+                                       outSpatialShape.end());
+      numActIn = getIndicePairsSubM<Index, IndexGrid, NDim>(
+          tv::torch2tv<Index>(indicesIn), tv::torch2tv<IndexGrid>(gridsOut),
+          tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indiceNum),
+          ks.data(), st.data(), pa.data(), di.data(), ou.data());
+    });
+  });
+  return numActIn;
+}
+
 namespace functor {
 template <typename Index, typename IndexGrid, unsigned NDim>
 struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {

--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -38,37 +38,43 @@ int create_conv_indice_pair_p1_cuda(
  auto stream = at::cuda::getCurrentCUDAStream();
  auto ndim = kernelSize.size();
  auto numActIn = indicesIn.size(0);
+  auto kernelVolume = indicePairs.size(0);
  if (numActIn == 0)
    return 0;
-  // dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
-  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
-    using Index = decltype(V);
+  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
+    using Index = decltype(IndexValue);
    using IndexGrid = int32_t;
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = I;
+      constexpr int NDim = decltype(I)::value;
      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
                                       outSpatialShape.end());
-      if (transpose) {
-        prepareDeConvIndicePairsKernel<Index, NDim, 4096>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesIn),
-                         tv::torch2tv<Index>(indicePairs),
-                         tv::torch2tv<Index>(indiceNum),
-                         tv::torch2tv<Index>(indicePairUnique), ks, st, pa, di,
-                         ou);
-      } else {
-        prepareIndicePairsKernel<Index, NDim, 4096>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesIn),
-                         tv::torch2tv<Index>(indicePairs),
-                         tv::torch2tv<Index>(indiceNum),
-                         tv::torch2tv<Index>(indicePairUnique), ks, st, pa, di,
-                         ou);
-      }
+      tv::dispatch_int<16, 32, 256, 4096>(
+          kernelVolume, std::less_equal<int>(), [&](auto I2) {
+            constexpr int MaxKernelVolume = decltype(I2)::value;
+            if (transpose) {
+              prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
+                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
+                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
+                                  tv::torch2tv<Index>(indicePairs),
+                                  tv::torch2tv<Index>(indiceNum),
+                                  tv::torch2tv<Index>(indicePairUnique), ks, st,
+                                  pa, di, ou);
+              TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
+            } else {
+              prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
+                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
+                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
+                                  tv::torch2tv<Index>(indicePairs),
+                                  tv::torch2tv<Index>(indiceNum),
+                                  tv::torch2tv<Index>(indicePairUnique), ks, st,
+                                  pa, di, ou);
+              TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
+            }
+          });
    });
  });
  return 1;
@@ -88,12 +94,11 @@ int create_conv_indice_pair_p2_cuda(
  auto kernelVolume = indicePairs.size(0);
  if (numActIn == 0)
    return 0;
-  // dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
-  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
-    using Index = decltype(V);
+  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
+    using Index = decltype(IndexValue);
    using IndexGrid = int32_t;
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = I;
+      constexpr int NDim = decltype(I)::value;
      using IndexGrid = int32_t;
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
                                       outSpatialShape.end());
@@ -122,6 +127,8 @@ int create_conv_indice_pair_p2_cuda(
            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesOut), numAct,
                         tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
+        TV_CHECK_CUDA_ERR_V2("assignIndiceOutKernel failed");
+
        auto tableSize = table.get_table_size();
        auto tableData = table.data();
        auto constants = table.get_constants_4();
@@ -133,6 +140,7 @@ int create_conv_indice_pair_p2_cuda(
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indicePairUnique), tableSize,
                         tableData, constants, stash_constants, stash_count);
+        TV_CHECK_CUDA_ERR_V2("assignIndicePairsHashKernel failed");

      } else {
        assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
@@ -145,7 +153,7 @@ int create_conv_indice_pair_p2_cuda(
        assignIndicePairsKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesOut),
-                         tv::torch2tv<IndexGrid>(gridsOut), numAct,
+                         tv::torch2tv<IndexGrid>(gridsOut), numActIn,
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indicePairUnique), ou);
        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
@@ -177,11 +185,11 @@ int create_submconv_indice_pair_cuda(
  auto kernelVolume = indicePairs.size(0);
  if (numActIn == 0)
    return 0;
-  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
-    using Index = decltype(V);
+  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
+    using Index = decltype(IndexValue);
    using IndexGrid = int32_t;
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = I;
+      constexpr int NDim = decltype(I)::value;
      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
@@ -214,26 +222,36 @@ int create_submconv_indice_pair_cuda(
        auto constants = table.get_constants_4();
        auto stash_constants = table.get_stash_constants();
        auto stash_count = table.get_stash_count();
-        getSubMIndicePairsHashKernel<Index, NDim, 4096>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesIn),
-                         tv::torch2tv<Index>(indicePairs),
-                         tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou,
-                         tableSize, tableData, constants, stash_constants,
-                         stash_count);
+        tv::dispatch_int<16, 32, 256, 4096>(
+            kernelVolume, std::less_equal<int>(), [&](auto I2) {
+              constexpr int MaxKernelVolume = decltype(I2)::value;
+              getSubMIndicePairsHashKernel<Index, NDim, MaxKernelVolume>
+                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
+                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
+                                  tv::torch2tv<Index>(indicePairs),
+                                  tv::torch2tv<Index>(indiceNum), ks, st, pa,
+                                  di, ou, tableSize, tableData, constants,
+                                  stash_constants, stash_count);
+              TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
+            });
      } else {
        prepareSubMGridKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesIn),
                         tv::torch2tv<IndexGrid>(gridsOut), ou);
        TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
-        getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesIn),
-                         tv::torch2tv<IndexGrid>(gridsOut),
-                         tv::torch2tv<Index>(indicePairs),
-                         tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou);
-        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
+        // Size the kernel's compile-time bound from kernelVolume, not ndim.
+        tv::dispatch_int<16, 32, 256, 4096>(
+            kernelVolume, std::less_equal<int>(), [&](auto I2) {
+              constexpr int MaxKernelVolume = decltype(I2)::value;
+              getSubMIndicePairsKernel<Index, IndexGrid, NDim, MaxKernelVolume>
+                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
+                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
+                                  tv::torch2tv<IndexGrid>(gridsOut),
+                                  tv::torch2tv<Index>(indicePairs),
+                                  tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou);
+              TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsKernel failed");
+            });
      }

      if (resetGrid && (!useHash)) {

--- a/src/spconv/reordering.cc
+++ b/src/spconv/reordering.cc
@@ -14,59 +14,60 @@

 #include <ATen/Parallel.h>
 #include <spconv/reordering.h>
+#include <tensorview/torch_utils.h>
 #include <torch/script.h>

 namespace spconv {
-namespace functor {
-template <typename T, typename Index>
-struct SparseGatherFunctor<tv::CPU, T, Index> {
-  void operator()(const tv::CPU &d, tv::TensorView<T> buffer,
-                  tv::TensorView<const T> features,
-                  tv::TensorView<const Index> indices, int size) {
-    int numPlanes = features.dim(1);
-    at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
-      for (int i = begin; i < end; ++i) {
-        std::memcpy(buffer.data() + i * numPlanes,
-                    features.data() + indices[i] * numPlanes,
-                    sizeof(T) * numPlanes);
-      }
-    });
-  }
-};
-
-template <typename T, typename Index>
-struct SparseScatterAddFunctor<tv::CPU, T, Index> {
-  void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
-                  tv::TensorView<const T> buffer,
-                  tv::TensorView<const Index> indices, int size, bool stable) {
-    int numPlanes = outFeatures.dim(1);
-    const T *buf = buffer.data();
-    T *out = outFeatures.data();
-    for (int i = 0; i < size; ++i) {
-      buf = buffer.data() + i * numPlanes;
-      out = outFeatures.data() + indices[i] * numPlanes;
-      for (int j = 0; j < numPlanes; ++j) {
-        out[j] += buf[j];
-      }
-    }
-  }
-};
-
-} // namespace functor
+using float_types_t = tv::mp_list<float, double, at::Half>;
+using int_types_t = tv::mp_list<int32_t, int64_t>;

-#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                                    \
-  template struct functor::SparseGatherFunctor<tv::CPU, T, Index>;             \
-  template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
-
-#define DECLARE_CPU_SPECS(T)                                                   \
-  DECLARE_CPU_SPECS_T_INDEX(T, int);                                           \
-  DECLARE_CPU_SPECS_T_INDEX(T, long);
+// Copies row indices[i] of features into row i of buffer, for i in [0, size).
+// Rows are features.size(1) contiguous elements; offsets are 64-bit.
+void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
+                       torch::Tensor indices, int size) {
+  const int64_t numPlanes = features.size(1);
+  tv::DispatchTorch<float_types_t>()(features.scalar_type(), [&](auto TValue) {
+    using T = decltype(TValue);
+    tv::DispatchTorch<int_types_t>()(indices.scalar_type(), [&](auto IV) {
+      using Index = decltype(IV);
+      const Index *indices_data = indices.data_ptr<Index>();
+      T *buffer_data = buffer.data_ptr<T>();
+      const T *features_data = features.data_ptr<T>();
+      at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
+        for (int64_t i = begin; i < end; ++i) {
+          std::memcpy(buffer_data + i * numPlanes,
+                      features_data + indices_data[i] * numPlanes,
+                      sizeof(T) * numPlanes);
+        }
+      });
+    });
+  });
+}

-DECLARE_CPU_SPECS(float);
-DECLARE_CPU_SPECS(double);
-DECLARE_CPU_SPECS(at::Half);
+// Adds row i of buffer into row indices[i] of outFeatures, for i in [0, size).
+// NOTE(review): serial loop, presumably because indices may repeat -- confirm.
+void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
+                            torch::Tensor indices, int size) {
+  const int64_t numPlanes = outFeatures.size(1);
+  auto dtype = outFeatures.scalar_type();
+  auto int_dtype = indices.scalar_type();

-#undef DECLARE_CPU_SPECS
-#undef DECLARE_CPU_SPECS_T_INDEX
+  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
+    using T = decltype(TValue);
+    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
+      using Index = decltype(IndexValue);
+      const Index *indices_data = indices.data_ptr<Index>();
+      const T *buffer_data = buffer.data_ptr<T>();
+      T *features_data = outFeatures.data_ptr<T>();
+      for (int64_t i = 0; i < size; ++i) {
+        const T *buf = buffer_data + i * numPlanes;
+        T *out = features_data + indices_data[i] * numPlanes;
+        for (int64_t j = 0; j < numPlanes; ++j) {
+          out[j] += buf[j];
+        }
+      }
+    });
+  });
+}

 } // namespace spconv
--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
@@ -20,137 +20,148 @@
 #include <tensorview/cuda_utils.h>
 #include <tensorview/kernel_utils.h>
 #include <tensorview/mp_helper.h>
+#include <tensorview/tensor.h>
 #include <tensorview/tensorview.h>
+#include <tensorview/torch_utils.h>
 #include <type_traits>
 #include <utility/timer.h>

 namespace spconv {
-namespace functor {
-template <typename T, typename Index>
-struct SparseGatherFunctor<tv::GPU, T, Index> {
-  using vecload_type_t =
-      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-  using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
-  void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
-                  tv::TensorView<const T> features,
-                  tv::TensorView<const Index> indices, int size) {
-    if (size <= 0)
-      return;
-    int numPlanes = features.dim(1);
-    bool notFound = true;
-    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-    tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
-                                     &notFound](auto NumTLP) {
-      constexpr int NumILP = NumTLP / 4;
-      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
-      int nHotBlock = (size / NumTLP) * NumTLP;
-      if (notFound) {
-        if (numPlanes % NumTLP == 0) {
-          if (nHotBlock >= NumTLP) {
-            gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
-                <<<dim3(numPlanes / NumTLP, size / NumTLP),
-                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                   d.getStream()>>>(buffer.data(), features.data(),
-                                    indices.data(), nHotBlock,
-                                    numPlanes / vecloadFactor);

-            TV_CHECK_CUDA_ERR();
-          }
-          if (size - nHotBlock > 0) {
-            gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
-                <<<dim3(1, numPlanes / NumTLP),
-                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
-                                    features.data(), indices.data() + nHotBlock,
-                                    size - nHotBlock,
-                                    numPlanes / vecloadFactor);
-            TV_CHECK_CUDA_ERR();
-          }
-          notFound = false;
-        }
-      }
-    });
+void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, // CUDA gather entry point: picks one of the gather* kernels below and launches it on the current stream.
+                        torch::Tensor indices, int size) { // NOTE(review): presumably buffer[i] = features[indices[i]] for i < size, like the CPU gather — kernel bodies are not visible here, confirm.
+  if (size <= 0) // nothing to do for an empty (or negative) row count
+    return;
+  int numPlanes = features.size(1); // extent of dimension 1 of `features`
+  auto stream = at::cuda::getCurrentCUDAStream(); // every kernel below is launched on this stream

-    if (notFound) {
-      constexpr int NumTLP = 64;
-      constexpr int NumILP = NumTLP / 4;
-      gatherGenericKernel<T, Index, NumTLP, NumILP>
-          <<<dim3(tv::cuda::DivUp(size, NumTLP),
-                  tv::cuda::DivUp(numPlanes, NumTLP)),
-             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
-              buffer.data(), features.data(), indices.data(), size, numPlanes);
-      TV_CHECK_CUDA_ERR();
-    }
-  }
-};
-template <typename T, typename Index>
-struct SparseScatterAddFunctor<tv::GPU, T, Index> {
-  using vecload_type_t =
-      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-  using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
-  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
-                  tv::TensorView<const T> buffer,
-                  tv::TensorView<const Index> indices, int size, bool stable) {
-    if (size <= 0)
-      return;
-    int numPlanes = outFeatures.dim(1);
-    bool notFound = true;
-    constexpr int vecloadFactor =
-        sizeof(vecload_type_t) / sizeof(T); // important for half.
-    tv::mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
-                                     &notFound](auto NumTLP) {
-      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
-      constexpr int NumILP = NumTLP / 4;
-      int nHotBlock = (size / NumTLP) * NumTLP;
-      if (notFound) {
-        if (numPlanes % NumTLP == 0) {
-          if (nHotBlock >= NumTLP) {
-            scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                     vecload_type_t>
-                <<<dim3(numPlanes / NumTLP, size / NumTLP),
-                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                   d.getStream()>>>(outFeatures.data(), buffer.data(),
-                                    indices.data(), nHotBlock,
-                                    numPlanes / vecloadFactor);
-            TV_CHECK_CUDA_ERR();
-          }
-          if (size - nHotBlock > 0) {
-            scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
-                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                   0, d.getStream()>>>(
-                    outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
-                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);
+  tv::dispatch_torch<float, double,
+                     at::Half>(features.scalar_type(), [&](auto TValue) { // outer dispatch: element type T taken from the dtype of `features`
+    using T = decltype(TValue);
+    using vecload_type_t =
+        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>; // vector type handed to the Vec kernels: int2 for Half, int4 otherwise
+    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>; // candidate NumTLP values visited by mp_for_each below
+
+    tv::dispatch_torch<int32_t, int64_t>(
+        indices.scalar_type(), [&](auto IndexValue) { // inner dispatch: index type taken from the dtype of `indices`
+          using Index = decltype(IndexValue);
+          bool notFound = true; // stays true until some candidate NumTLP divides numPlanes
+          constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); // number of T elements per vecload_type_t
+          tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
+                                           &notFound](auto NumTLP) {
+            constexpr int NumILP = NumTLP / 4;
+            // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
+            int nHotBlock = (size / NumTLP) * NumTLP; // `size` rounded down to a multiple of NumTLP
+            if (notFound) { // skip the remaining candidates once one has been used
+              if (numPlanes % NumTLP == 0) { // the Vec kernels are only used when NumTLP divides numPlanes
+                if (nHotBlock >= NumTLP) { // the first nHotBlock rows go to the block kernel
+                  gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
+                                       vecload_type_t>
+                      <<<dim3(numPlanes / NumTLP, size / NumTLP),
+                         dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
+                         stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
+                                   indices.data_ptr<Index>(), nHotBlock,
+                                   numPlanes / vecloadFactor); // last argument is numPlanes counted in vecload_type_t units
+
+                  TV_CHECK_CUDA_ERR();
+                }
+                if (size - nHotBlock > 0) { // leftover rows (fewer than NumTLP) go to gatherVecKernel
+                  gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
+                      <<<dim3(1, numPlanes / NumTLP),
+                         dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
+                         stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes, // skip the rows already handled by the block kernel
+                                   features.data_ptr<T>(),
+                                   indices.data_ptr<Index>() + nHotBlock,
+                                   size - nHotBlock, numPlanes / vecloadFactor);
+                  TV_CHECK_CUDA_ERR();
+                }
+                notFound = false;
+              }
+            }
+          });
+
+          if (notFound) { // no candidate divided numPlanes: fall back to the generic kernel with NumTLP = 64
+            constexpr int NumTLP = 64;
+            constexpr int NumILP = NumTLP / 4;
+            gatherGenericKernel<T, Index, NumTLP, NumILP>
+                <<<dim3(tv::cuda::DivUp(size, NumTLP),
+                        tv::cuda::DivUp(numPlanes, NumTLP)),
+                   dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
+                    buffer.data_ptr<T>(), features.data_ptr<T>(),
+                    indices.data_ptr<Index>(), size, numPlanes); // generic kernel receives the unscaled numPlanes
             TV_CHECK_CUDA_ERR();
           }
-          notFound = false;
-        }
-      }
-    });
-    if (notFound) {
-      constexpr int NumTLP = 64;
-      constexpr int NumILP = NumTLP / 4;
-      scatterAddGenericKernel<T, Index, NumTLP, NumILP>
-          <<<dim3(tv::cuda::DivUp(size, NumTLP),
-                  tv::cuda::DivUp(numPlanes, NumTLP)),
-             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
-              outFeatures.data(), buffer.data(), indices.data(), size,
-              numPlanes);
-      TV_CHECK_CUDA_ERR();
-    }
-  }
-};
-} // namespace functor
+        });
+  });
+}

-#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                    \
-  template struct functor::SparseGatherFunctor<tv::GPU, T, Index>;             \
-  template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
+void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, // CUDA scatter-add entry point: picks one of the scatterAdd* kernels below and launches it on the current stream.
+                             torch::Tensor indices, int size) { // NOTE(review): presumably outFeatures[indices[i]] += buffer[i] for i < size, like the CPU scatter-add — kernel bodies are not visible here, confirm.
+  if (size <= 0) // nothing to do for an empty (or negative) row count
+    return;
+  int numPlanes = outFeatures.size(1); // extent of dimension 1 of `outFeatures`
+  auto stream = at::cuda::getCurrentCUDAStream(); // every kernel below is launched on this stream

-#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
+  tv::dispatch_torch<float, double, at::Half>(
+      outFeatures.scalar_type(), [&](auto TValue) { // outer dispatch: element type T taken from the dtype of `outFeatures`
+        using T = decltype(TValue);
+        using vecload_type_t =
+            std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>; // vector type handed to the Vec kernel: int2 for Half, int4 otherwise
+        using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>; // candidate NumTLP values visited by mp_for_each below

-DECLARE_GPU_SPECS(float);
-DECLARE_GPU_SPECS(double);
-DECLARE_GPU_SPECS(at::Half);
+        tv::dispatch_torch<int32_t, int64_t>(
+            indices.scalar_type(), [&](auto IndexValue) { // inner dispatch: index type taken from the dtype of `indices`
+              using Index = decltype(IndexValue);
+              bool notFound = true; // stays true until some candidate NumTLP divides numPlanes
+              constexpr int vecloadFactor =
+                  sizeof(vecload_type_t) / sizeof(T); // important for half.
+              tv::mp_for_each<kernel_block_t>(
+                  [=, &outFeatures, &buffer, &indices, &notFound](auto NumTLP) {
+                    // constexpr int NumILP = NumTLP / (64 / (NumTLP /
+                    // vecloadFactor));
+                    constexpr int NumILP = NumTLP / 4;
+                    int nHotBlock = (size / NumTLP) * NumTLP; // `size` rounded down to a multiple of NumTLP
+                    if (notFound) { // skip the remaining candidates once one has been used
+                      if (numPlanes % NumTLP == 0) { // the Vec block kernel is only used when NumTLP divides numPlanes
+                        if (nHotBlock >= NumTLP) { // the first nHotBlock rows go to the block kernel
+                          scatterAddVecBlockKernel<T, Index, int(NumTLP),
+                                                   NumILP, vecload_type_t>
+                              <<<dim3(numPlanes / NumTLP, size / NumTLP),
+                                 dim3(NumTLP / vecloadFactor, NumTLP / NumILP),
+                                 0, stream>>>(outFeatures.data_ptr<T>(),
+                                              buffer.data_ptr<T>(),
+                                              indices.data_ptr<Index>(),
+                                              nHotBlock,
+                                              numPlanes / vecloadFactor); // last argument is numPlanes counted in vecload_type_t units
+                          TV_CHECK_CUDA_ERR();
+                        }
+                        if (size - nHotBlock > 0) { // leftover rows (fewer than NumTLP) go to the generic kernel instantiated with this NumTLP
+                          scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
+                              <<<dim3(1, numPlanes / NumTLP),
+                                 dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
+                                  outFeatures.data_ptr<T>(),
+                                  buffer.data_ptr<T>() + nHotBlock * numPlanes, // skip the rows already handled by the block kernel
+                                  indices.data_ptr<Index>() + nHotBlock,
+                                  size - nHotBlock, numPlanes); // generic kernel receives the unscaled numPlanes
+                          TV_CHECK_CUDA_ERR();
+                        }
+                        notFound = false;
+                      }
+                    }
+                  });
+              if (notFound) { // no candidate divided numPlanes: generic kernel over all rows with NumTLP = 64
+                constexpr int NumTLP = 64;
+                constexpr int NumILP = NumTLP / 4;
+                scatterAddGenericKernel<T, Index, NumTLP, NumILP>
+                    <<<dim3(tv::cuda::DivUp(size, NumTLP),
+                            tv::cuda::DivUp(numPlanes, NumTLP)),
+                       dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
+                        outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
+                        indices.data_ptr<Index>(), size, numPlanes);
+                TV_CHECK_CUDA_ERR();
+              }
+            });
+      });
+}

-#undef DECLARE_GPU_SPECS
-#undef DECLARE_GPU_SPECS_T_INDEX
 } // namespace spconv
\ No newline at end of file
--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
 #include <spconv/spconv_ops.h>
 namespace spconv {

+/// Build the indice-pair tables for a sparse convolution over `indices`.
+///
+/// @param indices          2-D tensor with one row per active site; it must
+///                         have `kernelSize.size() + 1` columns (batch index
+///                         followed by the spatial coordinates).
+/// @param batchSize        number of batches; `batchSize * prod(outSpatialShape)`
+///                         must stay below `std::numeric_limits<int>::max()`.
+/// @param outSpatialShape  output spatial extent, one entry per dimension.
+/// @param spatialShape     not read by this function.
+/// @param kernelSize       kernel extent per dimension; the product must not
+///                         exceed 4096.
+/// @param stride, padding, dilation, outPadding
+///                         per-dimension settings, each with one entry per
+///                         dimension. For submanifold convolution `padding`
+///                         is overwritten with `kernelSize / 2` and `stride`
+///                         with 1.
+/// @param _subM, _transpose, _useHash
+///                         integer flags, non-zero meaning true. Hashing is
+///                         always enabled when `indices` lives on the CPU.
+/// @return `{indices, indicePairs, indiceNum}` for submanifold convolution,
+///         otherwise `{outInds[0:numActOut], indicePairs, indiceNum}`.
+///         `indicePairs` is int32 of shape `{kernelVolume, 2, numAct}`
+///         initialised to -1; `indiceNum` is int32 of shape `{kernelVolume}`
+///         initialised to 0.
+std::vector<torch::Tensor>
+getIndicePairV2(torch::Tensor indices, int64_t batchSize,
+                std::vector<int64_t> outSpatialShape,
+                std::vector<int64_t> spatialShape,
+                std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
+                std::vector<int64_t> padding, std::vector<int64_t> dilation,
+                std::vector<int64_t> outPadding, int64_t _subM,
+                int64_t _transpose, int64_t _useHash) {
+  // auto timer = spconv::CudaContextTimer<>();
+  bool subM = _subM != 0;
+  bool transpose = _transpose != 0;
+  auto NDim = kernelSize.size();
+  // CPU always use hash (tsl::robin_map).
+  bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU;
+  auto numAct = indices.size(0);
+  auto coorDim = indices.size(1) - 1; // batchIdx + xyz
+  // kernelSize[0] and outSpatialShape[0] are read below, so an empty kernel
+  // must be rejected before any element access.
+  TV_ASSERT_RT_ERR(NDim > 0, "kernelSize must not be empty");
+  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
+  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
+  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
+  auto kernelVolume = kernelSize[0];
+  for (size_t i = 1; i < kernelSize.size(); ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
+  auto outputVolume = outSpatialShape[0];
+  for (size_t i = 1; i < outSpatialShape.size(); ++i) {
+    outputVolume *= outSpatialShape[i];
+  }
+  std::string msg = "due to limits of cuda hash, the volume of dense space "
+                    "include batch size ";
+  msg += "must less than std::numeric_limits<int>::max() = 2e9";
+  TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(),
+                   msg);
+  torch::Tensor indicePairs =
+      torch::full({kernelVolume, 2, numAct}, -1,
+                  torch::dtype(torch::kInt32).device(indices.device()));
+  torch::Tensor indiceNum = torch::zeros(
+      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
+  // With hashing only `batchSize` grid entries are allocated instead of the
+  // full dense volume.
+  auto gridSize = batchSize * outputVolume;
+  if (useHash) {
+    gridSize = batchSize;
+  }
+  torch::Tensor gridOut = torch::full(
+      {gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
+  gridOut = gridOut.view({batchSize, -1});
+  int64_t numActOut = -1;
+  if (subM) {
+    // Submanifold convolution always uses "same" padding and unit stride.
+    for (size_t i = 0; i < NDim; ++i) {
+      padding[i] = kernelSize[i] / 2;
+      stride[i] = 1;
+    }
+    if (indices.device().type() == torch::kCPU) {
+      numActOut = create_submconv_indice_pair_cpu(
+          indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
+          dilation, outSpatialShape, transpose, false, useHash);
+    }
+#ifdef TV_CUDA
+    else if (indices.device().type() == torch::kCUDA) {
+      numActOut = create_submconv_indice_pair_cuda(
+          indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
+          dilation, outSpatialShape, transpose, false, useHash);
+    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+    return {indices, indicePairs, indiceNum};
+  } else {
+    torch::Tensor outInds =
+        torch::zeros({numAct * kernelVolume, coorDim + 1},
+                     torch::dtype(torch::kInt32).device(indices.device()));
+    if (indices.device().type() == torch::kCPU) {
+      numActOut = create_conv_indice_pair_cpu(
+          indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,
+          padding, dilation, outSpatialShape, transpose, false, useHash);
+    }
+#ifdef TV_CUDA
+    else if (indices.device().type() == torch::kCUDA) {
+      // Scratch tensor used only by the two-phase CUDA path, so it is
+      // allocated here rather than on every call.
+      auto indicePairUnique = torch::full(
+          {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
+          torch::dtype(torch::kInt32).device(indices.device()));
+      numActOut = create_conv_indice_pair_p1_cuda(
+          indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
+          padding, dilation, outSpatialShape, transpose);
+      if (numActOut > 0) {
+        // Phase 2 receives the deduplicated contents of indicePairUnique.
+        auto res = torch::_unique(indicePairUnique);
+        indicePairUnique = std::get<0>(res);
+        numActOut = create_conv_indice_pair_p2_cuda(
+            indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique,
+            outSpatialShape, transpose, false, useHash);
+      }
+    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
+  }
+}
+
+
 torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
@@ -47,81 +153,59 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
-  tv::dispatch_torch<float, double, at::Half>(
-      features.scalar_type(), [&](auto I) {
-        using T = decltype(I);
-        for (int i = 0; i < kernelVolume; ++i) {
-          auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-          if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-            continue;
-          }
-          // auto timer = spconv::CudaContextTimer<>();
-          auto outputBufferBlob = torch::from_blob(
-              outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options);
-          auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
-                                                  {nHot, numInPlanes}, options);
-
-          if (device == torch::kCPU) {
-            functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-            gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
-                       tv::torch2tv<const T>(features),
-                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                       nHot);
-          }
+  for (int i = 0; i < kernelVolume; ++i) {
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
+    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
+      continue;
+    }
+    // auto timer = spconv::CudaContextTimer<>();
+    auto outputBufferBlob = torch::from_blob(
+        outputBuffer.data_ptr(), {nHot, numOutPlanes}, options);
+    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr(),
+                                            {nHot, numInPlanes}, options);
+
+    if (device == torch::kCPU) {
+      sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
+    }
 #ifdef TV_CUDA
-          else if (device == torch::kCUDA) {
-            functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-            gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
-                       tv::torch2tv<const T>(features),
-                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                       nHot);
-            TV_CHECK_CUDA_ERR();
-            /* slower than SparseGatherFunctor, may due to int->long conversion
-    auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
-    auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
-    {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
-    features, 0, indicePairBlob);*/
-          }
+    else if (device == torch::kCUDA) {
+      sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
+      /* slower than SparseGatherFunctor, may due to int->long conversion
+      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
+      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
+      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
+      features, 0, indicePairBlob);*/
+    }
 #endif
-          else {
-            TV_ASSERT_INVALID_ARG(false, "unknown device type");
-          }
-
-          // totalGatherTime += timer.report() / 1000.0;
-          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-          // totalGEMMTime += timer.report() / 1000.0;
-
-          if (device == torch::kCPU) {
-            functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-            scatterFtor(
-                tv::CPU(), tv::torch2tv<T>(output),
-                tv::torch2tv<const T>(outputBuffer),
-                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
-                true);
-          }
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+
+    // totalGatherTime += timer.report() / 1000.0;
+    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
+    // totalGEMMTime += timer.report() / 1000.0;
+
+    if (device == torch::kCPU) {
+      sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse], nHot);
+    }
 #ifdef TV_CUDA
-          else if (device == torch::kCUDA) {
-            functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-            scatterFtor(
-                tv::TorchGPU(), tv::torch2tv<T>(output),
-                tv::torch2tv<const T>(outputBuffer),
-                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
-                true);
-            TV_CHECK_CUDA_ERR();
-          }
+    else if (device == torch::kCUDA) {
+      sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse], nHot);
+    }
 #endif
-          else {
-            TV_ASSERT_INVALID_ARG(false, "unknown device type");
-          }
-          // totalSAddTime += timer.report() / 1000.0;
-        }
-      });
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+    // totalSAddTime += timer.report() / 1000.0;
+  }
+
  // std::cout << "gather time " << totalGatherTime << std::endl;
  // std::cout << "gemm time " << totalGEMMTime << std::endl;
  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
  return output;
 }

+
 std::vector<torch::Tensor>
 indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,
@@ -158,77 +242,47 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
    torch::mm_out(filterGradSub, features.t(), outGrad);
    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
  }
-  tv::dispatch_torch<float, double,
-                     at::Half>(features.scalar_type(), [&](auto I) {
-    using T = decltype(I);
-    for (int i = 0; i < kernelVolume; ++i) {
-      auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-      if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-        continue;
-      }
-      if (device == torch::kCPU) {
-        functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-        functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
-        gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
-                   tv::torch2tv<const T>(features),
-                   tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                   nHot);
-        gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
-                      tv::torch2tv<const T>(outGrad),
-                      tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                      nHot);
-      }
+  for (int i = 0; i < kernelVolume; ++i) {
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
+    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
+      continue;
+    }
+    if (device == torch::kCPU) {
+      sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
+      sparse_gather_cpu(outputBuffer, outGrad, indicePairs[i][!inverse], nHot);
+    }
 #ifdef TV_CUDA
-      else if (device == torch::kCUDA) {
-        functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-        functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
-        gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
-                   tv::torch2tv<const T>(features),
-                   tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                   nHot);
-        TV_CHECK_CUDA_ERR();
-        gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
-                      tv::torch2tv<const T>(outGrad),
-                      tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                      nHot);
-        TV_CHECK_CUDA_ERR();
-      }
+    else if (device == torch::kCUDA) {
+      sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
+      sparse_gather_cuda(outputBuffer, outGrad, indicePairs[i][!inverse], nHot);
+    }
 #endif
-      else {
-        TV_ASSERT_INVALID_ARG(false, "unknown device type");
-      }
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }

-      auto filterGradSub = filtersGrad[i];
-      auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
-                                               {nHot, numOutPlanes}, options);
-      auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
-                                              {nHot, numInPlanes}, options);
-
-      torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
-      torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
-      if (device == torch::kCPU) {
-        functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-        scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
-                    tv::torch2tv<const T>(inputBuffer),
-                    tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                    nHot);
-      }
+    auto filterGradSub = filtersGrad[i];
+    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
+                                              {nHot, numOutPlanes}, options);
+    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr(),
+                                            {nHot, numInPlanes}, options);
+
+    torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
+    torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
+    if (device == torch::kCPU) {
+      sparse_scatter_add_cpu(inputBuffer, inputGrad, indicePairs[i][inverse], nHot);
+    }
 #ifdef TV_CUDA
-      else if (device == torch::kCUDA) {
-        functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-        scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
-                    tv::torch2tv<const T>(inputBuffer),
-                    tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                    nHot);
-        TV_CHECK_CUDA_ERR();
-      }
+    else if (device == torch::kCUDA) {
+      sparse_scatter_add_cuda(inputBuffer, inputGrad, indicePairs[i][inverse], nHot);
+    }
 #endif
-      else {
-        TV_ASSERT_INVALID_ARG(false, "unknown device type");
-      }
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
    }
-  });
+  }
  return {inputGrad, filtersGrad.view(filterShape)};
 }

+
 } // namespace spconv
\ No newline at end of file
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -392,7 +392,7 @@ class TestSpConv(TestCase):

    def testSpDeConv3d(self):
        np.random.seed(484)
-        devices = ["cuda:0", "cpu:0"]
+        devices = ["cuda:0"]
        shapes = [[19, 18, 17]]
        batchsizes = [1, 2]

@@ -598,9 +598,9 @@ def main():
    shapes = [[50, 30, 30]]
    batchsizes = [2]

-    in_channels = [256]
-    out_channels = [256]
-    ksizes = [(3, 1, 1)]
+    in_channels = [32]
+    out_channels = [64]
+    ksizes = [(3, 3, 3)]
    strides = [1]
    paddings = [0]
    dilations = [1]
@@ -654,5 +654,6 @@ def main():


 if __name__ == '__main__':
-    main()
+    # main()
    # unittest.main()
+    TestSpConv().testSpDeConv3d()