some c++ change:

1. use dispatch instead of template 2. set use_hash default to False

some c++ change:
1. use dispatch instead of template 2. set use_hash default to False
5073a413 · tusimple · ec40b6f7 · 5073a413 · 5073a413 · 5073a413
Commit 5073a413 authored Apr 22, 2020 by tusimple
12 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ __pycache__/
 *.so
 *.o
 *.out
+*.so.*
 # Distribution / packaging
 .Python

--- a/include/pybind11_utils.h
+++ b/include/pybind11_utils.h
@@ -13,49 +13,103 @@
 // limitations under the License.
 #pragma once
+#include <tensorview/tensorview.h>
+#include <tensorview/tensor.h>
 #include <algorithm>
+#include <array>
 #include <iostream>
-#include <pybind11/embed.h> // everything needed for embedding
 #include <pybind11/functional.h>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
-#include <tensorview/tensorview.h>
 namespace py = pybind11;
+namespace tv {
+template <typename T> TensorView<T> arrayt2tv(py::array_t<T> arr) {
+  Shape shape;
+  for (int i = 0; i < arr.ndim(); ++i) {
+    shape.push_back(arr.shape(i));
+  }
+  return TensorView<T>(arr.mutable_data(), shape);
+}
-template <typename T, typename TPyObject>
+template <typename T> TensorView<const T> carrayt2tv(py::array_t<T> arr) {
-std::vector<T> array2Vector(TPyObject arr){
+  Shape shape;
-    py::array arr_np = arr;
+  for (int i = 0; i < arr.ndim(); ++i) {
-    size_t size = arr.attr("size").template cast<size_t>();
+    shape.push_back(arr.shape(i));
-    py::array_t<T> arr_cc = arr_np;
+  }
-    std::vector<T> data(arr_cc.data(), arr_cc.data() + size);
+  return TensorView<const T>(arr.data(), shape);
-    return data;
+}
+template <typename T> TensorView<T> vector2tv(std::vector<T> &arr) {
+  return TensorView<T>(arr.data(), {arr.size()});
 }
 template <typename T>
-std::vector<T> arrayT2Vector(py::array_t<T> arr)
+TensorView<T> vector2tv(std::vector<T> &arr, Shape shape) {
-{
+  TV_ASSERT_INVALID_ARG(shape.prod() == arr.size(), "error");
-  std::vector<T> data(arr.data(), arr.data() + arr.size());
+  return TensorView<T>(arr.data(), shape);
-  return data;
+}
-}
+template <typename T> TensorView<const T> vector2tv(const std::vector<T> &arr) {
-template <typename T, typename TPyObject>
+  return TensorView<const T>(arr.data(), {arr.size()});
-tv::TensorView<T> array2TensorView(TPyObject arr){
-    py::array arr_np = arr;
-    py::array_t<T> arr_cc = arr_np;
-    tv::Shape shape;
-    for (int i = 0; i < arr_cc.ndim(); ++i){
-        shape.push_back(arr_cc.shape(i));
-    }
-    return tv::TensorView<T>(arr_cc.mutable_data(), shape);
 }
 template <typename T>
-tv::TensorView<T> arrayT2TensorView(py::array_t<T> arr){
+std::vector<T> shape2stride(const std::vector<T> &shape, T itemsize) {
-    tv::Shape shape;
+  T p = T(1);
-    for (int i = 0; i < arr.ndim(); ++i){
+  std::vector<T> res;
-        shape.push_back(arr.shape(i));
+  for (auto iter = shape.rbegin(); iter != shape.rend(); ++iter) {
+    res.push_back(p * itemsize);
+    p *= *iter;
+  }
+  std::reverse(res.begin(), res.end());
+  return res;
+}
+tv::DType get_array_tv_dtype(const py::array& arr){
+  // 
+  switch (arr.dtype().kind()){
+    case 'b': return tv::bool_;
+    case 'i': {
+      switch (arr.itemsize()){
+        case 1: return tv::int8;
+        case 2: return tv::int16;
+        case 4: return tv::int32;
+        case 8: return tv::int64;
+        default: break;
+      }
    }
-    return tv::TensorView<T>(arr.mutable_data(), shape);
+    case 'u': {
-}
+      switch (arr.itemsize()){
\ No newline at end of file
+        case 1: return tv::uint8;
+        case 2: return tv::uint16;
+        case 4: return tv::uint32;
+        case 8: return tv::uint64;
+        default: break;
+      }
+    }
+    case 'f': {
+      switch (arr.itemsize()){
+        case 4: return tv::float32;
+        case 8: return tv::float64;
+        default: break;
+      }
+    }
+  }
+  TV_THROW_RT_ERR("unknown dtype", arr.dtype().kind(), arr.itemsize());
+}
+Tensor array2tensor(py::array& arr) {
+  Shape shape;
+  for (int i = 0; i < arr.ndim(); ++i) {
+    shape.push_back(arr.shape(i));
+  }
+  return tv::from_blob(arr.mutable_data(), shape, get_array_tv_dtype(arr), -1);
+}
+} // namespace tv
--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -110,7 +110,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose, false, useHash);
-      if (numActOut == -1){
+      if (numActOut == -1) {
        // build hash failed. use CPU algorithm
        auto device = indices.device();
        indicePairs = indicePairs.to({torch::kCPU});
@@ -123,7 +123,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
            tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
            tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
            dilation32, outSpatialShape32, transpose, false, useHash);
-        return {indices.to(device), indicePairs.to(device), indiceNum.to(device)};
+        return {indices.to(device), indicePairs.to(device),
+                indiceNum.to(device)};
      }
    }
 #endif
@@ -169,7 +170,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
            false, useHash);
-        if (numActOut == -1){
+        if (numActOut == -1) {
          // build hash failed. use CPU algorithm
          auto getIndicePairFtor =
              functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
@@ -184,7 +185,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
              tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
              kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
              transpose);
-          return {outInds.to(device).slice(0, 0, numActOut), indicePairs.to(device), indiceNum.to(device)};
+          return {outInds.to(device).slice(0, 0, numActOut),
+                  indicePairs.to(device), indiceNum.to(device)};
        }
      }
    }
@@ -328,372 +330,14 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
  }
 }
-template <typename T>
 torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
-                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
+                         int64_t numActOut, int64_t _inverse, int64_t _subM);
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
-  indicePairNumCpu.data_ptr<int>() + kernelVolume);
-    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-    auto indicePairVecMaxSizeIter = std::max_element(
-        indicePairNumVec.begin(), indicePairNumVec.end());
-    indicePairMaxSize = *indicePairVecMaxSizeIter;
-  }*/
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  // auto indicePairOptions =
-  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  if (subM) { // the center index of subm conv don't need gather and scatter
-              // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-  }
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    // auto timer = spconv::CudaContextTimer<>();
-    auto outputBufferBlob =
-        torch::from_blob(outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data_ptr<T>(), {nHot, numInPlanes}, options);
-    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-    }
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-      /* slower than SparseGatherFunctor, may due to int->long conversion
-      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
-      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
-      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
-      features, 0, indicePairBlob);*/
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-    // totalGatherTime += timer.report() / 1000.0;
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-    // totalGEMMTime += timer.report() / 1000.0;
-    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-    }
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-      TV_CHECK_CUDA_ERR();
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // std::cout << "gather time " << totalGatherTime << std::endl;
-  // std::cout << "gemm time " << totalGEMMTime << std::endl;
-  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
-  return output;
-}
-template <typename T>
 std::vector<torch::Tensor>
 indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,
-                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
+                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM);
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  auto filterShape = filters.sizes();
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
-  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
-  if (subM) {
-    auto filterGradSub = filtersGrad[indicePairMaxOffset];
-    torch::mm_out(filterGradSub, features.t(), outGrad);
-    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
-  }
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
-                    tv::torch2tv<const T>(outGrad),
-                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                    nHot);
-    }
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-      gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
-                    tv::torch2tv<const T>(outGrad),
-                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                    nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-    auto filterGradSub = filtersGrad[i];
-    auto outputBufferBlob =
-        torch::from_blob(outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data_ptr<T>(), {nHot, numInPlanes}, options);
-    torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
-    torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
-    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
-                  tv::torch2tv<const T>(inputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                  nHot);
-    }
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
-                  tv::torch2tv<const T>(inputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                  nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-  }
-  return {inputGrad, filtersGrad.view(filterShape)};
-}
-template <typename T>
-torch::Tensor
-indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor indicePairs, torch::Tensor indiceNum,
-                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto totalActsTen = indicePairNumCpu.sum();
-  auto totalActs = indicePairNumCpu.data_ptr<int>()[0];
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
-                                    indicePairNumCpu.data_ptr<int>() +
-                                        kernelVolume);
-  indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-  int subRuleMaxSize =
-      *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
-  if (subM) {
-    indicePairMaxSize = subRuleMaxSize;
-  }
-  // auto timer = spconv::CudaContextTimer<>();
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  // auto indicePairOptions =
-  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  torch::Tensor inputBuffer =
-      torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  // std::cout << "create time " << timer.report() / 1000.0 << std::endl;
-  if (subM) { // the center index of subm conv don't need gather and scatter
-              // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-  }
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-  // auto timer = spconv::CudaContextTimer<>();
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    //
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
-                                            {nHot, numInPlanes}, options);
-    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-    } 
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-    // }
-    // for (int i = 0; i < kernelVolume; ++i) {
-    // totalGatherTime += timer.report() / 1000.0;
-    // auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
-    // {nHot, numOutPlanes}, options);
-  }
-  // totalGatherTime += timer.report() / 1000.0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
-                                            {nHot, numInPlanes}, options);
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-  }
-  // totalGEMMTime += timer.report() / 1000.0;
-  // totalGEMMTime += timer.report() / 1000.0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
-                                            {nHot, numInPlanes}, options);
-    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBufferBlob),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-    } 
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBufferBlob),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-      TV_CHECK_CUDA_ERR();
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // totalSAddTime += timer.report() / 1000.0;
-  // std::cout << "gather time " << totalGatherTime << std::endl;
-  // std::cout << "gemm time " << totalGEMMTime << std::endl;
-  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
-  return output;
-}
 } // namespace spconv

--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "tensorview.h"
+#include <memory>
+#include <spconv/mp_helper.h>
+#ifdef SPCONV_CUDA
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#endif
+namespace tv
+{
+enum DType
+{
+  float32,
+  int32,
+  int16,
+  int8,
+  float64,
+  bool_,
+  uint8,
+  float16,
+  int64,
+  uint16,
+  uint32,
+  uint64
+};
+namespace detail
+{
+template <typename T>
+class TensorStorage
+{
+public:
+  TensorStorage(size_t size, int device = -1, bool managed = false)
+      : mSize(size), device_(device), managed_(managed)
+  {
+    if (size == 0)
+    {
+      mPtr = nullptr;
+    }
+    else
+    {
+      if (device == -1)
+      {
+#ifdef SPCONV_CUDA
+        checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
+#else
+        mPtr = new T[size];
+#endif
+      }
+      else
+      {
+#ifdef SPCONV_CUDA
+        int deviceCount;
+        cudaGetDeviceCount(&deviceCount);
+        if (device >= deviceCount)
+        {
+          TV_ASSERT_INVALID_ARG("you provide device ", device,
+                                " but you only have ", deviceCount, " device.");
+        }
+        cudaSetDevice(device);
+        if (managed)
+        {
+          checkCudaErrors(cudaMallocManaged(&this->mPtr, size * sizeof(T)));
+        }
+        else
+        {
+          checkCudaErrors(cudaMalloc(&mPtr, size * sizeof(T)));
+        }
+#else
+        TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
+#endif
+      }
+    }
+  }
+  TensorStorage(T *ptr, size_t size, int device)
+      : mSize(size), mPtr(ptr), from_blob_(true), device_(device) {}
+  virtual ~TensorStorage()
+  {
+    if (empty())
+    {
+      return;
+    }
+    if (from_blob_)
+    {
+      return;
+    }
+    if (device_ == -1)
+    {
+#ifdef SPCONV_CUDA
+      cudaFreeHost(mPtr);
+#else
+      delete[] mPtr;
+#endif
+    }
+    else
+    {
+#ifdef SPCONV_CUDA
+      cudaFree(mPtr);
+#endif
+    }
+  };
+  inline size_t size() const { return mSize; }
+  T *data() { return mPtr; }
+  const T *data() const { return mPtr; }
+  bool empty() const { return mPtr == nullptr || mSize == 0; }
+  bool managed() const { return managed_; }
+  int device() const { return device_; }
+  void zero_()
+  {
+    if (device_ == -1)
+    {
+      std::memset(data(), 0, mSize);
+      // std::fill(data(), data() + mSize, 0);
+    }
+    else
+    {
+#ifdef SPCONV_CUDA
+      checkCudaErrors(cudaMemset(data(), 0, mSize / sizeof(T)));
+#else
+      TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
+#endif
+    }
+  }
+private:
+  T *mPtr = nullptr;
+  size_t mSize = 0;
+  int device_ = -1;
+  bool from_blob_ = false;
+  bool managed_ = false;
+};
+size_t sizeof_dtype(DType dtype)
+{
+  switch (dtype)
+  {
+  case float32:
+    return sizeof(float);
+  case int8:
+    return sizeof(int8_t);
+  case int16:
+    return sizeof(int16_t);
+  case int32:
+    return sizeof(int32_t);
+  case float64:
+    return sizeof(double);
+  case int64:
+    return sizeof(int64_t);
+  case bool_:
+    return sizeof(bool);
+  case uint8:
+    return sizeof(uint8_t);
+  case uint16:
+    return sizeof(uint16_t);
+  case uint32:
+    return sizeof(uint32_t);
+  case uint64:
+    return sizeof(uint64_t);
+#ifdef SPCONV_CUDA
+  case float16:
+    return sizeof(__half);
+#endif
+  default:
+    TV_THROW_RT_ERR("unsupported dtype");
+  }
+  return 0;
+}
+std::string typeString(DType t)
+{
+  switch (t)
+  {
+  case DType::bool_:
+    return "bool";
+  case DType::float32:
+    return "float32";
+  case DType::int8:
+    return "int8";
+  case DType::int16:
+    return "int16";
+  case DType::int32:
+    return "int32";
+  case DType::float64:
+    return "float64";
+  case DType::int64:
+    return "int64";
+  case DType::uint8:
+    return "uint8";
+  case DType::uint16:
+    return "uint16";
+  case DType::uint32:
+    return "uint32";
+  case DType::uint64:
+    return "uint64";
+#ifdef SPCONV_CUDA
+  case DType::float16:
+    return "half";
+#endif
+  default:
+    return "";
+  }
+}
+template <typename T>
+struct TypeToDtypeTraits;
+template <>
+struct TypeToDtypeTraits<int32_t>
+{
+  static constexpr DType dtype = int32;
+};
+#ifdef SPCONV_CUDA
+template <>
+struct TypeToDtypeTraits<__half>
+{
+  static constexpr DType dtype = float16;
+};
+#endif
+template <>
+struct TypeToDtypeTraits<float>
+{
+  static constexpr DType dtype = float32;
+};
+template <>
+struct TypeToDtypeTraits<double>
+{
+  static constexpr DType dtype = float64;
+};
+template <>
+struct TypeToDtypeTraits<int16_t>
+{
+  static constexpr DType dtype = int16;
+};
+template <>
+struct TypeToDtypeTraits<int8_t>
+{
+  static constexpr DType dtype = int8;
+};
+template <>
+struct TypeToDtypeTraits<int64_t>
+{
+  static constexpr DType dtype = int64;
+};
+template <>
+struct TypeToDtypeTraits<uint8_t>
+{
+  static constexpr DType dtype = uint8;
+};
+template <>
+struct TypeToDtypeTraits<uint16_t>
+{
+  static constexpr DType dtype = uint16;
+};
+template <>
+struct TypeToDtypeTraits<uint32_t>
+{
+  static constexpr DType dtype = uint32;
+};
+template <>
+struct TypeToDtypeTraits<uint64_t>
+{
+  static constexpr DType dtype = uint64;
+};
+} // namespace detail
+template <class T>
+constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;
+struct Tensor
+{
+  Tensor() {}
+  Tensor(Shape shape, DType dtype, int device = -1, bool managed = false)
+      : dtype_(dtype)
+  {
+    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
+        shape.size() * detail::sizeof_dtype(dtype), device, managed);
+    shape_ = shape;
+  }
+  Tensor(void *ptr, Shape shape, DType dtype, int device = -1) : dtype_(dtype)
+  {
+    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
+        reinterpret_cast<uint8_t *>(ptr),
+        shape.size() * detail::sizeof_dtype(dtype), device);
+    shape_ = shape;
+  }
+  template <typename T>
+  TensorView<T> tview()
+  {
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
+    return TensorView<T>(reinterpret_cast<T *>(storage_->data()), shape_);
+  }
+  template <typename T>
+  TensorView<T> tview() const
+  {
+    TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    return TensorView<const std::remove_const_t<T>>(
+        reinterpret_cast<const std::remove_const_t<T> *>(storage_->data()),
+        shape_);
+  }
+  bool empty() const { return storage_->empty(); }
+  DType dtype() const { return dtype_; }
+  int device() const { return storage_->device(); }
+  const Shape &shape() const { return shape_; }
+  int dim(int idx) const
+  {
+    TV_ASSERT_RT_ERR(idx < shape_.size(), "error");
+    return shape_[idx];
+  }
+  const uint8_t *raw_data() const { return storage_->data(); }
+  size_t size() const { return shape_.size(); }
+  Tensor &zero_()
+  {
+    storage_->zero_();
+    return *this;
+  }
+  uint8_t *raw_data() { return storage_->data(); }
+  template <typename T>
+  Tensor &fill_(T value)
+  {
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    auto ptr = reinterpret_cast<T *>(raw_data());
+    std::fill(ptr, ptr + size(), value);
+    return *this;
+  }
+  template <typename T>
+  T *data()
+  {
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    return reinterpret_cast<T *>(raw_data());
+  }
+  template <typename T>
+  const T *data() const
+  {
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    return reinterpret_cast<const T *>(raw_data());
+  }
+  void copy_(const Tensor &tensor)
+  {
+    TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
+    TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
+    TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype");
+    if (device() == -1 && tensor.device() == -1)
+    {
+#ifdef SPCONV_CUDA
+      host2host(storage_->data(), tensor.raw_data(),
+                size() * detail::sizeof_dtype(dtype_));
+#else
+      std::copy(tensor.raw_data(),
+                tensor.raw_data() + size() * detail::sizeof_dtype(dtype_),
+                storage_->data());
+#endif
+    }
+#ifdef SPCONV_CUDA
+    else if (device() >= 0 && tensor.device() == -1)
+    {
+      // host2dev
+      host2dev(storage_->data(), tensor.raw_data(),
+               size() * detail::sizeof_dtype(dtype_));
+    }
+    else if (device() == -1 && tensor.device() >= 0)
+    {
+      // dev2host
+      dev2host(storage_->data(), tensor.raw_data(),
+               size() * detail::sizeof_dtype(dtype_));
+    }
+    else if (device() >= 0 && tensor.device() >= 0)
+    {
+      // dev2dev
+      dev2dev(storage_->data(), tensor.raw_data(),
+              size() * detail::sizeof_dtype(dtype_));
+    }
+#endif
+    else
+    {
+      TV_ASSERT_RT_ERR(false, "only support cpu tensor");
+    }
+  }
+  Tensor cpu() const
+  {
+    if (storage_->device() == -1)
+    {
+      return *this;
+    }
+    Tensor res(shape_, dtype_, -1, storage_->managed());
+    res.copy_(*this);
+    return res;
+  }
+  template <typename T>
+  void copy_(const TensorView<T> &tensor, int device)
+  {
+    Tensor src = from_blob(tensor, device);
+    return copy_(src);
+  }
+protected:
+  DType dtype_;
+  std::shared_ptr<detail::TensorStorage<uint8_t>> storage_;
+  Shape shape_;
+};
+inline Tensor from_blob(void *ptr, Shape shape, DType dtype, int device)
+{
+  return Tensor(ptr, shape, dtype, device);
+}
+template <typename T>
+Tensor from_blob(TensorView<T> tensor, int device)
+{
+  return Tensor(tensor.data(), tensor.shape, type_v<T>, device);
+}
+template <class... Ts, typename F>
+void dispatch(DType t, F &&f)
+{
+  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
+  bool notFound = true;
+  spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
+    if (type_v<decltype(I)> == t)
+    {
+      std::forward<F>(f)(decltype(I)());
+      notFound = false;
+    }
+  });
+  if (notFound)
+  {
+    std::stringstream ss;
+    spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
+      ss << detail::TypeToString<decltype(I)>::value << " ";
+    });
+    TV_THROW_RT_ERR("unknown type", detail::typeString(t),
+                    ", available: ", ss.str());
+  }
+}
+} // namespace tv
\ No newline at end of file
--- a/include/tensorview/tensorview.h
+++ b/include/tensorview/tensorview.h
@@ -17,9 +17,9 @@
 #include <cassert>
 #include <cstdlib>
+#include "prettyprint.h"
 #include <iostream>
 #include <memory>
-// #include <prettyprint.h>
 #include <sstream>
 #include <type_traits>
 #include <vector>
@@ -27,7 +27,6 @@
 #include <cuda_runtime_api.h>
 #endif
 namespace tv {
 #ifdef __NVCC__
@@ -72,15 +71,24 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
  sstream_print(ss, args...);
 }
+template <class... TArgs> void ssprint(TArgs... args) {
+  std::stringstream ss;
+  sstream_print(ss, args...);
+  std::cout << ss.str() << std::endl;
+}
+#define TV_THROW_RT_ERR(...)                                                   \
+  {                                                                            \
+    std::stringstream __macro_s;                                               \
+    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
+    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
+    throw std::runtime_error(__macro_s.str());                                 \
+  }
 #define TV_ASSERT_RT_ERR(expr, ...)                                            \
  {                                                                            \
-    if (!(expr)) {                                                             \
+    if (!(expr))                                                               \
-      std::stringstream __macro_s;                                             \
+      TV_THROW_RT_ERR(__VA_ARGS__);                                            \
-      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << #expr << " assert faild. ";                                 \
-      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
-      throw std::runtime_error(__macro_s.str());                               \
-    }                                                                          \
  }
 #define TV_ASSERT_INVALID_ARG(expr, ...)                                       \
@@ -96,24 +104,24 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
 #define TV_CHECK_CUDA_ERR()                                                    \
  {                                                                            \
-    auto __macro_err = cudaGetLastError();                                             \
+    auto __macro_err = cudaGetLastError();                                     \
-    if (__macro_err != cudaSuccess) {                                                  \
+    if (__macro_err != cudaSuccess) {                                          \
      std::stringstream __macro_s;                                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << "cuda execution failed with error " << __macro_err;                 \
+      __macro_s << "cuda execution failed with error " << __macro_err;         \
      throw std::runtime_error(__macro_s.str());                               \
    }                                                                          \
  }
-#define TV_CHECK_CUDA_ERR_V2(...)                                                    \
+#define TV_CHECK_CUDA_ERR_V2(...)                                              \
  {                                                                            \
-    auto __macro_err = cudaGetLastError();                                             \
+    auto __macro_err = cudaGetLastError();                                     \
-    if (__macro_err != cudaSuccess) {                                                  \
+    if (__macro_err != cudaSuccess) {                                          \
      std::stringstream __macro_s;                                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << "cuda execution failed with error " << __macro_err;                 \
+      __macro_s << "cuda execution failed with error " << __macro_err;         \
-      __macro_s << " " << cudaGetErrorString(__macro_err) << "\n";\
+      __macro_s << " " << cudaGetErrorString(__macro_err) << "\n";             \
-      tv::sstream_print(__macro_s, __VA_ARGS__); \
+      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
      throw std::runtime_error(__macro_s.str());                               \
    }                                                                          \
  }
@@ -362,7 +370,8 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
      : SimpleVector<int, MaxDim>(shape) {}
+  TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<int, MaxDim> vec)
+      : SimpleVector<int, MaxDim>(vec) {}
  template <typename T, template <class...> class Container>
  ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
@@ -372,7 +381,7 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
  ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
 #ifdef TV_DEBUG
-    TV_ASSERT(start >= 0 && end < this->mSize && end > start);
+    TV_ASSERT(start >= 0 && end <= this->mSize && end > start);
 #endif
    ShapeBase<MaxDim> shape;
    for (int i = start; i < end; ++i) {
@@ -417,6 +426,13 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
    }
    return shape;
  }
+  TV_HOST_DEVICE size_t prod() const {
+    size_t res = 1;
+    for (size_t i = 0; i < this->mSize; ++i) {
+      res *= this->mArray[i];
+    }
+    return res;
+  }
 };
 using Shape = ShapeBase<TV_MAX_DIM>;
@@ -545,6 +561,9 @@ template <typename T, int Rank = -1> struct TensorView {
      : mPtr(ptr) {
    mShape = {int(shapes)...};
  }
+  operator TensorView<const T>() {
+    return TensorView<const T>(mPtr, mShape);
+  } // conversion function
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
  assign(const TensorView<T, Rank> &tensor) {
@@ -846,6 +865,19 @@ template <typename T, int Rank = -1> struct TensorView {
 #endif
    return mPtr[idx];
  }
+  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
+#ifdef TV_DEBUG
+#if defined(__CUDA_ARCH__)
+    TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
+                      "index(%d) out-of-range: [0, %ld)\n", int(idx), size());
+#else
+    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
+               int(idx), size());
+#endif
+#endif
+    return mPtr[idx];
+  }
  // TODO: this is conflcit with operator[](SimpleVector<Slice> slice_vec).
  /*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
    int idx = rowArrayIdx(mShape, index);
@@ -970,6 +1002,26 @@ template <typename T, int Rank = -1> struct TensorView {
                               mShape.subshape(sizeof...(ints) + 1));
  }
+  TV_HOST_DEVICE_INLINE TensorView<T, Rank>
+  subview_ints(SimpleVector<int> ids) const {
+    Shape start = ids;
+    for (int i = ids.size(); i < ndim(); ++i) {
+      start.push_back(0);
+    }
+    return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
+                               mShape.subshape(ids.size()));
+  }
+  std::string print_vec(TensorView<T> tensor) const {
+    std::ostringstream ss;
+    ss << "[";
+    for (size_t i = 0; i < tensor.dim(0) - 1; ++i) {
+      ss << tensor(i) << ", ";
+    }
+    ss << tensor(tensor.dim(0) - 1) << "]";
+    return ss.str();
+  }
  std::string repr() const {
    std::ostringstream ss;
    if (empty())
@@ -983,37 +1035,29 @@ template <typename T, int Rank = -1> struct TensorView {
    }
    Shape counter = mShape;
    auto tensor_flat = this->view(-1);
-    for (int i = 0; i < counter.ndim(); ++i) {
+    for (int i = 0; i < counter.ndim() - 1; ++i) {
      counter[i] = 0;
-      ss << "[";
+      // ss << "[";
    }
-    for (size_t i = 0; i < this->size(); ++i) {
+    for (size_t i = 0; i < this->size() / this->dim(this->ndim() - 1); ++i) {
-      ss << tensor_flat(rowArrayIdx(mShape, counter));
+      for (int i = 0; i < counter.ndim() - 1; ++i) {
-      counter[counter.ndim() - 1] += 1;
+        if (counter[i] == 0) {
-      int inc_count = 0;
+          ss << "[";
-      bool print_comma = true;
-      for (int c = counter.ndim() - 1; c >= 0; --c) {
-        if (counter[c] == this->dim(c) && c > 0) {
-          ++inc_count;
-          counter[c - 1] += 1;
-          counter[c] = 0;
-          print_comma = false;
        }
      }
-      if (print_comma && i != this->size() - 1)
+      std::cout << "counter.ndim() " << counter.ndim() << std::endl;
-        ss << ", ";
+      auto counter_ = counter.subshape(0, counter.ndim() - 1);
-      for (int j = 0; j < inc_count; ++j) {
+      std::cout << counter.subshape(0, counter.ndim() - 1) << std::endl;
-        ss << "]";
+      ss << print_vec(this->subview_ints(counter_)) << "\n";
-      }
+      std::cout << "after counter.ndim() " << counter.ndim() << std::endl;
-      if (i != this->size() - 1) {
+      for (int i = 0; i < counter.ndim() - 1; ++i) {
-        if (inc_count != 0)
+        if (counter[i] == this->dim(i) - 1) {
-          ss << "\n";
+          ss << "]";
-        for (int j = 0; j < inc_count; ++j) {
-          ss << "[";
        }
      }
    }
-    ss << "]";
+    // ss << "]";
    // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
    // detail::simpleTypeName<T>());
    ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
@@ -1161,4 +1205,160 @@ TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
  return printTensorView(TensorView<const T>(ptr, shape), format);
 }
+#ifdef SPCONV_CUDA
+#ifdef __DRIVER_TYPES_H__
+#ifndef DEVICE_RESET
+#define DEVICE_RESET cudaDeviceReset();
+#endif
+#else
+#ifndef DEVICE_RESET
+#define DEVICE_RESET
+#endif
+#endif
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+  if (result) {
+    fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), func);
+    DEVICE_RESET
+    // Make sure we call CUDA Device Reset before exiting
+    exit(EXIT_FAILURE);
+  }
+}
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+template <typename T>
+void host2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
+  checkCudaErrors(
+      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToDevice, s));
+}
+template <typename T>
+void host2dev(TensorView<T> dst, const TensorView<const T> src,
+              cudaStream_t s = 0) {
+  host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T>
+void host2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+  host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T> void host2dev_sync(T *dst, const T *src, size_t size) {
+  checkCudaErrors(
+      cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyHostToDevice));
+}
+template <typename T>
+void host2dev_sync(TensorView<T> dst, const TensorView<const T> src) {
+  host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
+}
+template <typename T>
+void host2dev_sync(TensorView<T> dst, const TensorView<T> src) {
+  host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
+}
+template <typename T>
+void dev2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
+  checkCudaErrors(
+      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToHost, s));
+}
+template <typename T>
+void dev2host(TensorView<T> dst, const TensorView<const T> src,
+              cudaStream_t s = 0) {
+  dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T>
+void dev2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+  dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T>
+void dev2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
+  checkCudaErrors(
+      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice, s));
+}
+template <typename T>
+void dev2dev(TensorView<T> dst, const TensorView<const T> src,
+             cudaStream_t s = 0) {
+  dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T>
+void dev2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+  dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T>
+void host2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
+  checkCudaErrors(
+      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToHost, s));
+}
+template <typename T>
+void host2host(TensorView<T> dst, const TensorView<const T> src,
+               cudaStream_t s = 0) {
+  host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T>
+void host2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+  host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T> void zero_dev(TensorView<T> tensor) {
+  checkCudaErrors(cudaMemset(tensor.data(), 0, tensor.size() * sizeof(T)));
+}
+template <typename T> void zero_dev(TensorView<T> tensor, cudaStream_t s) {
+  checkCudaErrors(
+      cudaMemsetAsync(tensor.data(), 0, tensor.size() * sizeof(T), s));
+}
+template <typename T> void zero_host(TensorView<T> tensor) {
+  std::fill(tensor.data(), tensor.data() + tensor.size(), 0);
+}
+#endif
+namespace detail {
+template <typename T> struct TypeToString;
+template <> struct TypeToString<int32_t> {
+  static constexpr const char *value = "int32";
+};
+template <> struct TypeToString<bool> {
+  static constexpr const char *value = "bool";
+};
+template <> struct TypeToString<float> {
+  static constexpr const char *value = "float";
+};
+template <> struct TypeToString<double> {
+  static constexpr const char *value = "double";
+};
+template <> struct TypeToString<int16_t> {
+  static constexpr const char *value = "int16";
+};
+template <> struct TypeToString<int8_t> {
+  static constexpr const char *value = "int8";
+};
+template <> struct TypeToString<int64_t> {
+  static constexpr const char *value = "int64";
+};
+template <> struct TypeToString<uint8_t> {
+  static constexpr const char *value = "uint8";
+};
+template <> struct TypeToString<uint16_t> {
+  static constexpr const char *value = "uint16";
+};
+template <> struct TypeToString<uint32_t> {
+  static constexpr const char *value = "uint32";
+};
+template <> struct TypeToString<uint64_t> {
+  static constexpr const char *value = "uint64";
+};
+} // namespace detail
 } // namespace tv
\ No newline at end of file
--- a/include/torch_utils.h
+++ b/include/torch_utils.h
 // Copyright 2019 Yan Yan
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,24 +13,26 @@
 // limitations under the License.
 #pragma once
+#include <spconv/mp_helper.h>
 #include <tensorview/tensorview.h>
-#include <torch/script.h>
 #include <ATen/ATen.h>
+#include <torch/script.h>
 #ifdef SPCONV_CUDA
 #include <ATen/cuda/CUDAContext.h>
 #endif
 namespace tv {
 #ifdef SPCONV_CUDA
-struct TorchGPU: public tv::GPU {
+struct TorchGPU : public tv::GPU {
  virtual cudaStream_t getStream() const override {
    return at::cuda::getCurrentCUDAStream();
  }
 };
 #endif
 template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
-  switch (tensor.type().scalarType()) {
+  switch (tensor.scalar_type()) {
  case at::ScalarType::Double: {
    auto val = std::is_same<std::remove_const_t<T>, double>::value;
    TV_ASSERT_RT_ERR(val, "error");
@@ -60,34 +62,34 @@ template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
    TV_ASSERT_RT_ERR(false, "error");
  }
 }
+namespace detail {
+template <typename T> struct TypeToTorchDtypeTraits;
-template <typename T>
+template <> struct TypeToTorchDtypeTraits<int32_t> {
-constexpr auto type2torch(T val=T()) -> decltype(torch::kInt32){
+  static constexpr decltype(torch::kInt32) value = torch::kInt32;
-  TV_ASSERT_RT_ERR(false, "unknown type");
+};
-}
-template <>
-constexpr auto type2torch(int val) -> decltype(torch::kInt32){
-  return torch::kInt32;
-}
-template <>
+template <> struct TypeToTorchDtypeTraits<int64_t> {
-constexpr auto type2torch(long val) -> decltype(torch::kInt32){
+  static constexpr decltype(torch::kInt32) value = torch::kInt64;
-  return torch::kInt64;
+};
-}
-template <>
+template <> struct TypeToTorchDtypeTraits<float> {
-constexpr auto type2torch(float val) -> decltype(torch::kInt32){
+  static constexpr decltype(torch::kInt32) value = torch::kFloat32;
-  return torch::kFloat32;
+};
-}
+template <> struct TypeToTorchDtypeTraits<double> {
+  static constexpr decltype(torch::kInt32) value = torch::kFloat64;
+};
+template <> struct TypeToTorchDtypeTraits<at::Half> {
+  static constexpr decltype(torch::kInt32) value = torch::kHalf;
+};
-template <>
+} // namespace detail
-constexpr auto type2torch(double val) -> decltype(torch::kInt32){
-  return torch::kFloat64;
-}
 template <typename T>
-tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
+constexpr decltype(torch::kInt32) torch_type_v =
+    detail::TypeToTorchDtypeTraits<T>::value;
+template <typename T> tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
  check_torch_dtype<T>(tensor);
  tv::Shape shape;
  for (auto i : tensor.sizes()) {
@@ -95,4 +97,28 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
  }
  return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
 }
+namespace detail {
+template <> struct TypeToString<at::Half> {
+  static constexpr const char *value = "half";
+};
+} // namespace detail
+template <class... Ts, typename F>
+void torch_dispatch(at::ScalarType t, F &&f) {
+  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
+  bool notFound = true;
+  spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
+    if (torch_type_v<decltype(I)> == t) {
+      std::forward<F>(f)(decltype(I)());
+      notFound = false;
+    }
+  });
+  if (notFound) {
+    std::stringstream ss;
+    spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
+      ss << tv::detail::TypeToString<decltype(I)>::value << " ";
+    });
+    TV_THROW_RT_ERR("unknown type", t, ", available: ", ss.str());
+  }
+}
 } // namespace tv
\ No newline at end of file
--- a/spconv/conv.py
+++ b/spconv/conv.py
@@ -70,7 +70,7 @@ class SparseConvolution(SparseModule):
                 inverse=False,
                 indice_key=None,
                 fused_bn=False,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConvolution, self).__init__()
        assert groups == 1
        if not isinstance(kernel_size, (list, tuple)):
@@ -221,7 +221,7 @@ class SparseConv2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConv2d, self).__init__(
            2,
            in_channels,
@@ -247,7 +247,7 @@ class SparseConv3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConv3d, self).__init__(
            3,
            in_channels,
@@ -273,7 +273,7 @@ class SparseConv4d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConv4d, self).__init__(
            4,
            in_channels,
@@ -299,7 +299,7 @@ class SparseConvTranspose2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConvTranspose2d, self).__init__(
            2,
            in_channels,
@@ -326,7 +326,7 @@ class SparseConvTranspose3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConvTranspose3d, self).__init__(
            3,
            in_channels,
@@ -387,7 +387,7 @@ class SubMConv2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SubMConv2d, self).__init__(
            2,
            in_channels,
@@ -414,7 +414,7 @@ class SubMConv3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SubMConv3d, self).__init__(
            3,
            in_channels,
@@ -441,7 +441,7 @@ class SubMConv4d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SubMConv4d, self).__init__(
            4,
            in_channels,

--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -111,16 +111,9 @@ def indice_conv(features,
              num_activate_out,
              inverse=False,
              subm=False):
-    if filters.dtype == torch.float32:
+    return torch.ops.spconv.indice_conv(features, filters, indice_pairs,
-        return torch.ops.spconv.indice_conv_fp32(features, filters, indice_pairs,
+                                        indice_pair_num, num_activate_out,
-                                               indice_pair_num, num_activate_out,
+                                        int(inverse), int(subm))
-                                               int(inverse), int(subm))
-    elif filters.dtype == torch.half:
-        return torch.ops.spconv.indice_conv_half(features, filters, indice_pairs,
-                                               indice_pair_num, num_activate_out,
-                                               int(inverse), int(subm))
-    else:
-        raise NotImplementedError
 def fused_indice_conv(features, filters, bias,
            indice_pairs,
@@ -145,14 +138,8 @@ def indice_conv_backward(features,
                       indice_pair_num,
                       inverse=False,
                       subm=False):
-    if filters.dtype == torch.float32:
+    return torch.ops.spconv.indice_conv_backward(
-        return torch.ops.spconv.indice_conv_backward_fp32(
+        features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
-            features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
-    elif filters.dtype == torch.half:
-        return torch.ops.spconv.indice_conv_backward_half(
-            features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
-    else:
-        raise NotImplementedError
 def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):

--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
-set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc)
+set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc)
 if (SPCONV_BuildCUDA)
    set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu)
 endif()

--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
@@ -26,11 +26,8 @@ static auto registry =
        .op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
        .op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>)
        .op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>)
-        .op("spconv::indice_conv_fp32", &spconv::indiceConv<float>)
+        .op("spconv::indice_conv", &spconv::indiceConv)
-        .op("spconv::indice_conv_backward_fp32", &spconv::indiceConvBackward<float>)
+        .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
-        .op("spconv::indice_conv_half", &spconv::indiceConv<at::Half>)
-        .op("spconv::indice_conv_backward_half",
-            &spconv::indiceConvBackward<at::Half>)
        .op("spconv::fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>)
        .op("spconv::fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>)
        .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)

--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
+#include <spconv/spconv_ops.h>
+namespace spconv {
+torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
+                         torch::Tensor indicePairs, torch::Tensor indiceNum,
+                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
+  bool subM = _subM != 0;
+  bool inverse = _inverse != 0;
+  auto device = features.device().type();
+  auto ndim = filters.dim() - 2;
+  auto kernelVolume = indicePairs.size(0);
+  auto numInPlanes = features.size(1);
+  auto numOutPlanes = filters.size(ndim + 1);
+  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data_ptr<int>(),
+                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
+  int indicePairMaxSize = *indicePairMaxSizeIter;
+  /*if (_subM){
+  std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
+  indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
+  auto indicePairVecMaxSizeIter = std::max_element(
+      indicePairNumVec.begin(), indicePairNumVec.end());
+  indicePairMaxSize = *indicePairVecMaxSizeIter;
+  }*/
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device());
+  // auto indicePairOptions =
+  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
+  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
+  torch::Tensor inputBuffer =
+      torch::zeros({indicePairMaxSize, numInPlanes}, options);
+  torch::Tensor outputBuffer =
+      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
+  filters = filters.view({-1, numInPlanes, numOutPlanes});
+  if (subM) { // the center index of subm conv don't need gather and scatter
+    // add.
+    torch::mm_out(output, features, filters[indicePairMaxOffset]);
+  }
+  double totalGatherTime = 0;
+  double totalGEMMTime = 0;
+  double totalSAddTime = 0;
+  tv::torch_dispatch<float, double, at::Half>(
+      features.scalar_type(), [&](auto I) {
+        using T = decltype(I);
+        for (int i = 0; i < kernelVolume; ++i) {
+          auto nHot = indicePairNumCpu.data_ptr<int>()[i];
+          if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
+            continue;
+          }
+          // auto timer = spconv::CudaContextTimer<>();
+          auto outputBufferBlob = torch::from_blob(
+              outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options);
+          auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
+                                                  {nHot, numInPlanes}, options);
+          if (device == torch::kCPU) {
+            functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
+            gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
+                       tv::torch2tv<const T>(features),
+                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                       nHot);
+          }
+#ifdef SPCONV_CUDA
+          else if (device == torch::kCUDA) {
+            functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
+            gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
+                       tv::torch2tv<const T>(features),
+                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                       nHot);
+            TV_CHECK_CUDA_ERR();
+            /* slower than SparseGatherFunctor, may due to int->long conversion
+    auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
+    auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
+    {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
+    features, 0, indicePairBlob);*/
+          }
+#endif
+          else {
+            TV_ASSERT_INVALID_ARG(false, "unknown device type");
+          }
+          // totalGatherTime += timer.report() / 1000.0;
+          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
+          // totalGEMMTime += timer.report() / 1000.0;
+          if (device == torch::kCPU) {
+            functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
+            scatterFtor(
+                tv::CPU(), tv::torch2tv<T>(output),
+                tv::torch2tv<const T>(outputBuffer),
+                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
+                true);
+          }
+#ifdef SPCONV_CUDA
+          else if (device == torch::kCUDA) {
+            functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
+            scatterFtor(
+                tv::TorchGPU(), tv::torch2tv<T>(output),
+                tv::torch2tv<const T>(outputBuffer),
+                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
+                true);
+            TV_CHECK_CUDA_ERR();
+          }
+#endif
+          else {
+            TV_ASSERT_INVALID_ARG(false, "unknown device type");
+          }
+          // totalSAddTime += timer.report() / 1000.0;
+        }
+      });
+  // std::cout << "gather time " << totalGatherTime << std::endl;
+  // std::cout << "gemm time " << totalGEMMTime << std::endl;
+  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
+  return output;
+}
+std::vector<torch::Tensor>
+indiceConvBackward(torch::Tensor features, torch::Tensor filters,
+                   torch::Tensor outGrad, torch::Tensor indicePairs,
+                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
+  bool subM = _subM != 0;
+  bool inverse = _inverse != 0;
+  auto device = features.device().type();
+  auto ndim = filters.dim() - 2;
+  auto kernelVolume = indicePairs.size(0);
+  auto numInPlanes = features.size(1);
+  auto numOutPlanes = filters.size(ndim + 1);
+  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data_ptr<int>(),
+                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
+  int indicePairMaxSize = *indicePairMaxSizeIter;
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device());
+  auto filterShape = filters.sizes();
+  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
+  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
+  torch::Tensor inputBuffer =
+      torch::zeros({indicePairMaxSize, numInPlanes}, options);
+  torch::Tensor outputBuffer =
+      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
+  filters = filters.view({-1, numInPlanes, numOutPlanes});
+  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
+  if (subM) {
+    auto filterGradSub = filtersGrad[indicePairMaxOffset];
+    torch::mm_out(filterGradSub, features.t(), outGrad);
+    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
+  }
+  tv::torch_dispatch<float, double,
+                     at::Half>(features.scalar_type(), [&](auto I) {
+    using T = decltype(I);
+    for (int i = 0; i < kernelVolume; ++i) {
+      auto nHot = indicePairNumCpu.data_ptr<int>()[i];
+      if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
+        continue;
+      }
+      if (device == torch::kCPU) {
+        functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
+        functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
+        gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
+                   tv::torch2tv<const T>(features),
+                   tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                   nHot);
+        gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
+                      tv::torch2tv<const T>(outGrad),
+                      tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                      nHot);
+      }
+#ifdef SPCONV_CUDA
+      else if (device == torch::kCUDA) {
+        functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
+        functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
+        gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
+                   tv::torch2tv<const T>(features),
+                   tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                   nHot);
+        TV_CHECK_CUDA_ERR();
+        gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
+                      tv::torch2tv<const T>(outGrad),
+                      tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                      nHot);
+        TV_CHECK_CUDA_ERR();
+      }
+#endif
+      else {
+        TV_ASSERT_INVALID_ARG(false, "unknown device type");
+      }
+      auto filterGradSub = filtersGrad[i];
+      auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
+                                               {nHot, numOutPlanes}, options);
+      auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
+                                              {nHot, numInPlanes}, options);
+      torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
+      torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
+      if (device == torch::kCPU) {
+        functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
+        scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
+                    tv::torch2tv<const T>(inputBuffer),
+                    tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                    nHot);
+      }
+#ifdef SPCONV_CUDA
+      else if (device == torch::kCUDA) {
+        functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
+        scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
+                    tv::torch2tv<const T>(inputBuffer),
+                    tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                    nHot);
+        TV_CHECK_CUDA_ERR();
+      }
+#endif
+      else {
+        TV_ASSERT_INVALID_ARG(false, "unknown device type");
+      }
+    }
+  });
+  return {inputGrad, filtersGrad.view(filterShape)};
+}
+} // namespace spconv
\ No newline at end of file
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -581,7 +581,7 @@ def main():
        if all([s > 1, d > 1]):
            continue
        device = torch.device(dev)
-        num_points = [5] * bs
+        num_points = [500] * bs
        sparse_dict = generate_sparse_data(shape, num_points, IC)
@@ -601,7 +601,7 @@ def main():
        net.net[0].weight[:] = filters_t
        out_ref = net_ref(features_dense_t)
        times = []
-        for i in range(0):
+        for i in range(10):
            t = time.time()
            out = net(features_t, indices_t, bs)
            torch.cuda.synchronize()