Merge branch 'master' into master

f0d7a46d · Yan Yan · GitHub · 999c834c · 83344f71 · f0d7a46d
Unverified Commit f0d7a46d authored May 20, 2020 by Yan Yan Committed by GitHub May 20, 2020
20 changed files
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
+name: build
+
+on:
+  push:
+    branches:
+      - master
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    - cron:  '0 0 * * 6' # base builds run every saturday
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    env:
+      DOCKER_IMAGE_NAME: scrin/dev-spconv
+      DOCKER_FILE_PATH: ./Dockerfile
+
+    # TODO: create a action to reuse code. the problem is how to reuse docker-login.
+    steps:
+      - uses: actions/checkout@master
+      - name: Build Docker
+        run: |
+          docker build . --file ${{env.DOCKER_FILE_PATH}} --tag ${{env.DOCKER_IMAGE_NAME}}:latest
+          docker tag ${{env.DOCKER_IMAGE_NAME}}:latest ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
+          
+
+      - name: Login to Registry
+        uses: azure/docker-login@v1
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Publish to Registry
+        run: |
+          docker push ${{env.DOCKER_IMAGE_NAME}}:latest
+          docker push ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ __pycache__/
 *.so
 *.o
 *.out
+*.so.*

 # Distribution / packaging
 .Python

--- a/Dockerfile
+++ b/Dockerfile
+FROM scrin/dev:latest
+
+RUN PROBLEM_FILE=/usr/local/lib/python3.8/dist-packages/torch/share/cmake/Caffe2/Caffe2Targets.cmake && \
+    sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces;-fopenmp//g' $PROBLEM_FILE && \
+    sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces//g' $PROBLEM_FILE && \
+    cd /root && \
+    git clone --depth 1 --recursive https://www.github.com/traveller59/spconv.git && \
+    cd ./spconv && \
+    SPCONV_FORCE_BUILD_CUDA=1 python setup.py install
--- a/README.md
+++ b/README.md
 # SpConv: PyTorch Spatially Sparse Convolution Library

+[![Build Status](https://github.com/traveller59/spconv/workflows/build/badge.svg)](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild)
+
 This is a spatially sparse convolution library like [SparseConvNet](https://github.com/facebookresearch/SparseConvNet) but faster and easy to read. This library provide sparse convolution/transposed, submanifold convolution, inverse convolution and sparse maxpool.

-The GPU Indice Generation algorithm is a unofficial implementation of paper [SECOND](http://www.mdpi.com/1424-8220/18/10/3337). That algorithm (don't include GPU SubM indice generation algorithm) may be protected by patent.

-This project only support CUDA 9.0+ or CPU only. If you are using cuda 8.0, please update it to 9.0.
+2020-5-2, we add ConcatTable, JoinTable, AddTable, and Identity function to build ResNet and Unet in this version of spconv.

-This project only support tensors with spatial volume less than ```std::numeric_limits<int>::max()``` (~2e9). if someone really need very large space, open an issue.

-## News:
+## Docker:

-2019-5-24: spconv v1.1 released, now indice generation will use hash table as default (CPU code only support hash table). you can use ```use_hash=False``` to use dense table when using CUDA. In addition, add CPU only build support.
+```docker pull scrin/dev-spconv```, contains python 3.8, cuda 10.1, fish shell, newest pytorch and tensorflow.

 ## Install on Ubuntu 16.04/18.04

+* if you are using pytorch 1.4+ and encounter "nvcc fatal: unknown -Wall", you need to go to torch package dir and remove flags contains "-Wall" in INTERFACE_COMPILE_OPTIONS in Caffe2Targets.cmake. This problem can't be fixed in this project (to avoid this, I need to remove all torch dependency in cuda sources and drop half support).
+
 0. Use ```git clone xxx.git --recursive``` to clone this repo.

-1. Install boost headers to your system include path, you can use either ```sudo apt-get install libboostall-dev``` or download compressed files from boost official website and copy headers to include path.
+1. Install boost headers to your system include path, you can use either ```sudo apt-get install libboost-all-dev``` or download compressed files from boost official website and copy headers to include path.

 2. Download cmake >= 3.13.2, then add cmake executables to PATH.

@@ -168,4 +170,4 @@ This project is licensed under the Apache license 2.0 License - see the [LICENSE

 The [CUDPP](https://github.com/cudpp/cudpp) hash code is licensed under BSD License.

-The [robin-map](https://github.com/Tessil/robin-map) code is licensed under MIT license.
\ No newline at end of file
+The [robin-map](https://github.com/Tessil/robin-map) code is licensed under MIT license.
--- a/include/pybind11_utils.h
+++ b/include/pybind11_utils.h
@@ -13,49 +13,103 @@
 // limitations under the License.

 #pragma once
+#include <tensorview/tensorview.h>
+#include <tensorview/tensor.h>
+
 #include <algorithm>
+#include <array>
 #include <iostream>
-#include <pybind11/embed.h> // everything needed for embedding
 #include <pybind11/functional.h>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>

-#include <tensorview/tensorview.h>
-
 namespace py = pybind11;
+namespace tv {
+
+template <typename T> TensorView<T> arrayt2tv(py::array_t<T> arr) {
+  Shape shape;
+  for (int i = 0; i < arr.ndim(); ++i) {
+    shape.push_back(arr.shape(i));
+  }
+  return TensorView<T>(arr.mutable_data(), shape);
+}

-template <typename T, typename TPyObject>
-std::vector<T> array2Vector(TPyObject arr){
-    py::array arr_np = arr;
-    size_t size = arr.attr("size").template cast<size_t>();
-    py::array_t<T> arr_cc = arr_np;
-    std::vector<T> data(arr_cc.data(), arr_cc.data() + size);
-    return data;
+template <typename T> TensorView<const T> carrayt2tv(py::array_t<T> arr) {
+  Shape shape;
+  for (int i = 0; i < arr.ndim(); ++i) {
+    shape.push_back(arr.shape(i));
+  }
+  return TensorView<const T>(arr.data(), shape);
+}
+
+template <typename T> TensorView<T> vector2tv(std::vector<T> &arr) {
+  return TensorView<T>(arr.data(), {arr.size()});
 }

 template <typename T>
-std::vector<T> arrayT2Vector(py::array_t<T> arr)
-{
-  std::vector<T> data(arr.data(), arr.data() + arr.size());
-  return data;
-}
-
-template <typename T, typename TPyObject>
-tv::TensorView<T> array2TensorView(TPyObject arr){
-    py::array arr_np = arr;
-    py::array_t<T> arr_cc = arr_np;
-    tv::Shape shape;
-    for (int i = 0; i < arr_cc.ndim(); ++i){
-        shape.push_back(arr_cc.shape(i));
-    }
-    return tv::TensorView<T>(arr_cc.mutable_data(), shape);
+TensorView<T> vector2tv(std::vector<T> &arr, Shape shape) {
+  TV_ASSERT_INVALID_ARG(shape.prod() == arr.size(), "error");
+  return TensorView<T>(arr.data(), shape);
+}
+
+template <typename T> TensorView<const T> vector2tv(const std::vector<T> &arr) {
+  return TensorView<const T>(arr.data(), {arr.size()});
 }
+
 template <typename T>
-tv::TensorView<T> arrayT2TensorView(py::array_t<T> arr){
-    tv::Shape shape;
-    for (int i = 0; i < arr.ndim(); ++i){
-        shape.push_back(arr.shape(i));
+std::vector<T> shape2stride(const std::vector<T> &shape, T itemsize) {
+  T p = T(1);
+  std::vector<T> res;
+  for (auto iter = shape.rbegin(); iter != shape.rend(); ++iter) {
+    res.push_back(p * itemsize);
+    p *= *iter;
+  }
+  std::reverse(res.begin(), res.end());
+  return res;
+}
+
+tv::DType get_array_tv_dtype(const py::array& arr){
+  // 
+  switch (arr.dtype().kind()){
+    case 'b': return tv::bool_;
+    case 'i': {
+      switch (arr.itemsize()){
+        case 1: return tv::int8;
+        case 2: return tv::int16;
+        case 4: return tv::int32;
+        case 8: return tv::int64;
+        default: break;
+      }
    }
-    return tv::TensorView<T>(arr.mutable_data(), shape);
-}
\ No newline at end of file
+    case 'u': {
+      switch (arr.itemsize()){
+        case 1: return tv::uint8;
+        case 2: return tv::uint16;
+        case 4: return tv::uint32;
+        case 8: return tv::uint64;
+        default: break;
+      }
+    }
+    case 'f': {
+      switch (arr.itemsize()){
+        case 4: return tv::float32;
+        case 8: return tv::float64;
+        default: break;
+      }
+    }
+  }
+  TV_THROW_RT_ERR("unknown dtype", arr.dtype().kind(), arr.itemsize());
+}
+
+
+Tensor array2tensor(py::array& arr) {
+  Shape shape;
+  for (int i = 0; i < arr.ndim(); ++i) {
+    shape.push_back(arr.shape(i));
+  }
+  return tv::from_blob(arr.mutable_data(), shape, get_array_tv_dtype(arr), -1);
+}
+
+
+} // namespace tv
--- a/include/spconv/fused_spconv_ops.h
+++ b/include/spconv/fused_spconv_ops.h
@@ -37,12 +37,12 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter = std::max_element(
-      indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
+      indicePairNumCpu.data_ptr<int>(), indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  
  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
+    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(), indicePairNumCpu.data_ptr<int>() + kernelVolume);
    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);

    auto indicePairVecMaxSizeIter = std::max_element(
@@ -68,15 +68,15 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // auto timer = spconv::CudaContextTimer<>();
    auto outputBufferBlob =
-        torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
+        torch::from_blob(outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
+        torch::from_blob(inputBuffer.data_ptr<T>(), {nHot, numInPlanes}, options);

    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;

--- a/include/spconv/pillar_scatter_ops.h
+++ b/include/spconv/pillar_scatter_ops.h
@@ -37,7 +37,7 @@ torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,

  tv::check_torch_dtype<int>(shape);
  tv::check_torch_dtype<T>(coors);
-  auto shapeData = shape.data<int>();
+  auto shapeData = shape.data_ptr<int>();
  torch::Tensor canvas =
      torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
                   features.options());

--- a/include/spconv/point2voxel.h
+++ b/include/spconv/point2voxel.h
@@ -71,7 +71,7 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels)
-        break;
+        continue;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -139,7 +139,7 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points,
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels)
-        break;
+        continue;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -225,7 +225,7 @@ int points_to_voxel_3d_with_filtering(
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels)
-        break;
+        continue;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -274,4 +274,4 @@ int points_to_voxel_3d_with_filtering(
  return voxel_num;
 }

-} // namespace spconv
\ No newline at end of file
+} // namespace spconv
--- a/include/spconv/pool_ops.h
+++ b/include/spconv/pool_ops.h
@@ -33,7 +33,7 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
  double totalTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }
@@ -75,7 +75,7 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
    auto kernelVolume = indicePairs.size(0);
  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }

--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -110,7 +110,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose, false, useHash);
-      if (numActOut == -1){
+      if (numActOut == -1) {
        // build hash failed. use CPU algorithm
        auto device = indices.device();
        indicePairs = indicePairs.to({torch::kCPU});
@@ -123,7 +123,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
            tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
            tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
            dilation32, outSpatialShape32, transpose, false, useHash);
-        return {indices.to(device), indicePairs.to(device), indiceNum.to(device)};
+        return {indices.to(device), indicePairs.to(device),
+                indiceNum.to(device)};
      }
    }
 #endif
@@ -169,7 +170,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
            false, useHash);
-        if (numActOut == -1){
+        if (numActOut == -1) {
          // build hash failed. use CPU algorithm
          auto getIndicePairFtor =
              functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
@@ -184,7 +185,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
              tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
              kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
              transpose);
-          return {outInds.to(device).slice(0, 0, numActOut), indicePairs.to(device), indiceNum.to(device)};
+          return {outInds.to(device).slice(0, 0, numActOut),
+                  indicePairs.to(device), indiceNum.to(device)};
        }
      }
    }
@@ -328,372 +330,14 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
  }
 }

-template <typename T>
 torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
-                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data<int>(),
-                       indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-
-  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(),
-  indicePairNumCpu.data<int>() + kernelVolume);
-    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-
-    auto indicePairVecMaxSizeIter = std::max_element(
-        indicePairNumVec.begin(), indicePairNumVec.end());
-    indicePairMaxSize = *indicePairVecMaxSizeIter;
-  }*/
-
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  // auto indicePairOptions =
-  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  if (subM) { // the center index of subm conv don't need gather and scatter
-              // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-  }
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    // auto timer = spconv::CudaContextTimer<>();
-    auto outputBufferBlob =
-        torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
-
-    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-    }
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-      /* slower than SparseGatherFunctor, may due to int->long conversion
-      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
-      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
-      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
-      features, 0, indicePairBlob);*/
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    // totalGatherTime += timer.report() / 1000.0;
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-    // totalGEMMTime += timer.report() / 1000.0;
-
-    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-    }
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-      TV_CHECK_CUDA_ERR();
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // std::cout << "gather time " << totalGatherTime << std::endl;
-  // std::cout << "gemm time " << totalGEMMTime << std::endl;
-  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
-  return output;
-}
+                         int64_t numActOut, int64_t _inverse, int64_t _subM);

-template <typename T>
 std::vector<torch::Tensor>
 indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,
-                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data<int>(),
-                       indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  auto filterShape = filters.sizes();
-  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
-  torch::Tensor inputBuffer =
-      torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
-
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
-  if (subM) {
-    auto filterGradSub = filtersGrad[indicePairMaxOffset];
-    torch::mm_out(filterGradSub, features.t(), outGrad);
-    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
-  }
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
-                    tv::torch2tv<const T>(outGrad),
-                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                    nHot);
-    }
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-      gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
-                    tv::torch2tv<const T>(outGrad),
-                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                    nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    auto filterGradSub = filtersGrad[i];
-    auto outputBufferBlob =
-        torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
-    auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
-
-    torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
-    torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
-    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
-                  tv::torch2tv<const T>(inputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                  nHot);
-    }
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
-                  tv::torch2tv<const T>(inputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                  nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-  }
-  return {inputGrad, filtersGrad.view(filterShape)};
-}
-
-template <typename T>
-torch::Tensor
-indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor indicePairs, torch::Tensor indiceNum,
-                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
-  bool subM = _subM != 0;
-  bool inverse = _inverse != 0;
-
-  auto device = features.device().type();
-  auto ndim = filters.dim() - 2;
-  auto kernelVolume = indicePairs.size(0);
-  auto numInPlanes = features.size(1);
-  auto numOutPlanes = filters.size(ndim + 1);
-  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto totalActsTen = indicePairNumCpu.sum();
-  auto totalActs = indicePairNumCpu.data<int>()[0];
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data<int>(),
-                       indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data<int>();
-  int indicePairMaxSize = *indicePairMaxSizeIter;
-  std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(),
-                                    indicePairNumCpu.data<int>() +
-                                        kernelVolume);
-  indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-  int subRuleMaxSize =
-      *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
-  if (subM) {
-    indicePairMaxSize = subRuleMaxSize;
-  }
-  // auto timer = spconv::CudaContextTimer<>();
-  auto options =
-      torch::TensorOptions().dtype(features.dtype()).device(features.device());
-  // auto indicePairOptions =
-  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  torch::Tensor inputBuffer =
-      torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer =
-      torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
-  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  // std::cout << "create time " << timer.report() / 1000.0 << std::endl;
-  if (subM) { // the center index of subm conv don't need gather and scatter
-              // add.
-    torch::mm_out(output, features, filters[indicePairMaxOffset]);
-  }
-  double totalGatherTime = 0;
-  double totalGEMMTime = 0;
-  double totalSAddTime = 0;
-  // auto timer = spconv::CudaContextTimer<>();
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    //
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
-                                            {nHot, numInPlanes}, options);
-    if (device == torch::kCPU) {
-      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
-      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-    } 
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
-      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
-                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
-                 nHot);
-      TV_CHECK_CUDA_ERR();
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    // }
-    // for (int i = 0; i < kernelVolume; ++i) {
-    // totalGatherTime += timer.report() / 1000.0;
-    // auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
-    // {nHot, numOutPlanes}, options);
-  }
-  // totalGatherTime += timer.report() / 1000.0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
-                                            {nHot, numInPlanes}, options);
-
-    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
-  }
-  // totalGEMMTime += timer.report() / 1000.0;
-  // totalGEMMTime += timer.report() / 1000.0;
-  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
-    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
-      continue;
-    }
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
-                                            {nHot, numInPlanes}, options);
-
-    if (device == torch::kCPU) {
-      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
-      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBufferBlob),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-    } 
-#ifdef SPCONV_CUDA
-    else if (device == torch::kCUDA) {
-      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
-      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
-                  tv::torch2tv<const T>(outputBufferBlob),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
-                  nHot, true);
-      TV_CHECK_CUDA_ERR();
-    }
-#endif
-    else {
-      TV_ASSERT_INVALID_ARG(false, "unknown device type");
-    }
-
-    // totalSAddTime += timer.report() / 1000.0;
-  }
-  // totalSAddTime += timer.report() / 1000.0;
-  // std::cout << "gather time " << totalGatherTime << std::endl;
-  // std::cout << "gemm time " << totalGEMMTime << std::endl;
-  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
-  return output;
-}
+                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM);

 } // namespace spconv


--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "tensorview.h"
+#include <memory>
+#include <spconv/mp_helper.h>
+#ifdef SPCONV_CUDA
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#endif
+
+namespace tv
+{
+enum DType
+{
+  float32,
+  int32,
+  int16,
+  int8,
+  float64,
+  bool_,
+  uint8,
+  float16,
+  int64,
+  uint16,
+  uint32,
+  uint64
+};
+
+namespace detail
+{
+
+template <typename T>
+class TensorStorage
+{
+public:
+  TensorStorage(size_t size, int device = -1, bool managed = false)
+      : mSize(size), device_(device), managed_(managed)
+  {
+    if (size == 0)
+    {
+      mPtr = nullptr;
+    }
+    else
+    {
+      if (device == -1)
+      {
+#ifdef SPCONV_CUDA
+        checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
+#else
+        mPtr = new T[size];
+#endif
+      }
+      else
+      {
+#ifdef SPCONV_CUDA
+        int deviceCount;
+        cudaGetDeviceCount(&deviceCount);
+        if (device >= deviceCount)
+        {
+          TV_ASSERT_INVALID_ARG("you provide device ", device,
+                                " but you only have ", deviceCount, " device.");
+        }
+        cudaSetDevice(device);
+        if (managed)
+        {
+          checkCudaErrors(cudaMallocManaged(&this->mPtr, size * sizeof(T)));
+        }
+        else
+        {
+          checkCudaErrors(cudaMalloc(&mPtr, size * sizeof(T)));
+        }
+#else
+        TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
+#endif
+      }
+    }
+  }
+  TensorStorage(T *ptr, size_t size, int device)
+      : mSize(size), mPtr(ptr), from_blob_(true), device_(device) {}
+
+  virtual ~TensorStorage()
+  {
+    if (empty())
+    {
+      return;
+    }
+    if (from_blob_)
+    {
+      return;
+    }
+    if (device_ == -1)
+    {
+#ifdef SPCONV_CUDA
+      cudaFreeHost(mPtr);
+#else
+      delete[] mPtr;
+#endif
+    }
+    else
+    {
+#ifdef SPCONV_CUDA
+      cudaFree(mPtr);
+#endif
+    }
+  };
+
+  inline size_t size() const { return mSize; }
+
+  T *data() { return mPtr; }
+  const T *data() const { return mPtr; }
+
+  bool empty() const { return mPtr == nullptr || mSize == 0; }
+  bool managed() const { return managed_; }
+  int device() const { return device_; }
+  void zero_()
+  {
+    if (device_ == -1)
+    {
+      std::memset(data(), 0, mSize);
+      // std::fill(data(), data() + mSize, 0);
+    }
+    else
+    {
+#ifdef SPCONV_CUDA
+      checkCudaErrors(cudaMemset(data(), 0, mSize / sizeof(T)));
+#else
+      TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
+#endif
+    }
+  }
+
+private:
+  T *mPtr = nullptr;
+  size_t mSize = 0;
+  int device_ = -1;
+  bool from_blob_ = false;
+  bool managed_ = false;
+};
+
+size_t sizeof_dtype(DType dtype)
+{
+  switch (dtype)
+  {
+  case float32:
+    return sizeof(float);
+  case int8:
+    return sizeof(int8_t);
+  case int16:
+    return sizeof(int16_t);
+  case int32:
+    return sizeof(int32_t);
+  case float64:
+    return sizeof(double);
+  case int64:
+    return sizeof(int64_t);
+  case bool_:
+    return sizeof(bool);
+  case uint8:
+    return sizeof(uint8_t);
+  case uint16:
+    return sizeof(uint16_t);
+  case uint32:
+    return sizeof(uint32_t);
+  case uint64:
+    return sizeof(uint64_t);
+#ifdef SPCONV_CUDA
+  case float16:
+    return sizeof(__half);
+#endif
+  default:
+    TV_THROW_RT_ERR("unsupported dtype");
+  }
+  return 0;
+}
+
+std::string typeString(DType t)
+{
+  switch (t)
+  {
+  case DType::bool_:
+    return "bool";
+  case DType::float32:
+    return "float32";
+  case DType::int8:
+    return "int8";
+  case DType::int16:
+    return "int16";
+  case DType::int32:
+    return "int32";
+  case DType::float64:
+    return "float64";
+  case DType::int64:
+    return "int64";
+  case DType::uint8:
+    return "uint8";
+  case DType::uint16:
+    return "uint16";
+  case DType::uint32:
+    return "uint32";
+  case DType::uint64:
+    return "uint64";
+#ifdef SPCONV_CUDA
+  case DType::float16:
+    return "half";
+#endif
+  default:
+    return "";
+  }
+}
+
+template <typename T>
+struct TypeToDtypeTraits;
+
+template <>
+struct TypeToDtypeTraits<int32_t>
+{
+  static constexpr DType dtype = int32;
+};
+
+#ifdef SPCONV_CUDA
+template <>
+struct TypeToDtypeTraits<__half>
+{
+  static constexpr DType dtype = float16;
+};
+#endif
+
+template <>
+struct TypeToDtypeTraits<float>
+{
+  static constexpr DType dtype = float32;
+};
+template <>
+struct TypeToDtypeTraits<double>
+{
+  static constexpr DType dtype = float64;
+};
+template <>
+struct TypeToDtypeTraits<int16_t>
+{
+  static constexpr DType dtype = int16;
+};
+template <>
+struct TypeToDtypeTraits<int8_t>
+{
+  static constexpr DType dtype = int8;
+};
+template <>
+struct TypeToDtypeTraits<int64_t>
+{
+  static constexpr DType dtype = int64;
+};
+template <>
+struct TypeToDtypeTraits<uint8_t>
+{
+  static constexpr DType dtype = uint8;
+};
+template <>
+struct TypeToDtypeTraits<uint16_t>
+{
+  static constexpr DType dtype = uint16;
+};
+template <>
+struct TypeToDtypeTraits<uint32_t>
+{
+  static constexpr DType dtype = uint32;
+};
+template <>
+struct TypeToDtypeTraits<uint64_t>
+{
+  static constexpr DType dtype = uint64;
+};
+
+} // namespace detail
+
+template <class T>
+constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;
+
+struct Tensor
+{
+  Tensor() {}
+  Tensor(Shape shape, DType dtype, int device = -1, bool managed = false)
+      : dtype_(dtype)
+  {
+    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
+        shape.size() * detail::sizeof_dtype(dtype), device, managed);
+    shape_ = shape;
+  }
+  Tensor(void *ptr, Shape shape, DType dtype, int device = -1) : dtype_(dtype)
+  {
+    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
+        reinterpret_cast<uint8_t *>(ptr),
+        shape.size() * detail::sizeof_dtype(dtype), device);
+    shape_ = shape;
+  }
+
+  template <typename T>
+  TensorView<T> tview()
+  {
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
+    return TensorView<T>(reinterpret_cast<T *>(storage_->data()), shape_);
+  }
+  template <typename T>
+  TensorView<T> tview() const
+  {
+    TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    return TensorView<const std::remove_const_t<T>>(
+        reinterpret_cast<const std::remove_const_t<T> *>(storage_->data()),
+        shape_);
+  }
+  bool empty() const { return storage_->empty(); }
+  DType dtype() const { return dtype_; }
+  int device() const { return storage_->device(); }
+  const Shape &shape() const { return shape_; }
+  int dim(int idx) const
+  {
+    TV_ASSERT_RT_ERR(idx < shape_.size(), "error");
+    return shape_[idx];
+  }
+  const uint8_t *raw_data() const { return storage_->data(); }
+  size_t size() const { return shape_.size(); }
+  Tensor &zero_()
+  {
+    storage_->zero_();
+    return *this;
+  }
+  uint8_t *raw_data() { return storage_->data(); }
+  template <typename T>
+  Tensor &fill_(T value)
+  {
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    auto ptr = reinterpret_cast<T *>(raw_data());
+    std::fill(ptr, ptr + size(), value);
+    return *this;
+  }
+
+  template <typename T>
+  T *data()
+  {
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    return reinterpret_cast<T *>(raw_data());
+  }
+
+  template <typename T>
+  const T *data() const
+  {
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    return reinterpret_cast<const T *>(raw_data());
+  }
+
+  void copy_(const Tensor &tensor)
+  {
+    TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
+    TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
+    TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype");
+    if (device() == -1 && tensor.device() == -1)
+    {
+#ifdef SPCONV_CUDA
+      host2host(storage_->data(), tensor.raw_data(),
+                size() * detail::sizeof_dtype(dtype_));
+#else
+      std::copy(tensor.raw_data(),
+                tensor.raw_data() + size() * detail::sizeof_dtype(dtype_),
+                storage_->data());
+#endif
+    }
+#ifdef SPCONV_CUDA
+    else if (device() >= 0 && tensor.device() == -1)
+    {
+      // host2dev
+      host2dev(storage_->data(), tensor.raw_data(),
+               size() * detail::sizeof_dtype(dtype_));
+    }
+    else if (device() == -1 && tensor.device() >= 0)
+    {
+      // dev2host
+      dev2host(storage_->data(), tensor.raw_data(),
+               size() * detail::sizeof_dtype(dtype_));
+    }
+    else if (device() >= 0 && tensor.device() >= 0)
+    {
+      // dev2dev
+      dev2dev(storage_->data(), tensor.raw_data(),
+              size() * detail::sizeof_dtype(dtype_));
+    }
+#endif
+    else
+    {
+      TV_ASSERT_RT_ERR(false, "only support cpu tensor");
+    }
+  }
+
+  Tensor cpu() const
+  {
+    if (storage_->device() == -1)
+    {
+      return *this;
+    }
+    Tensor res(shape_, dtype_, -1, storage_->managed());
+    res.copy_(*this);
+    return res;
+  }
+
+  template <typename T>
+  void copy_(const TensorView<T> &tensor, int device)
+  {
+    Tensor src = from_blob(tensor, device);
+    return copy_(src);
+  }
+
+protected:
+  DType dtype_;
+  std::shared_ptr<detail::TensorStorage<uint8_t>> storage_;
+  Shape shape_;
+};
+
+inline Tensor from_blob(void *ptr, Shape shape, DType dtype, int device)
+{
+  return Tensor(ptr, shape, dtype, device);
+}
+
+template <typename T>
+Tensor from_blob(TensorView<T> tensor, int device)
+{
+  return Tensor(tensor.data(), tensor.shape, type_v<T>, device);
+}
+
+template <class... Ts, typename F>
+void dispatch(DType t, F &&f)
+{
+  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
+  bool notFound = true;
+  spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
+    if (type_v<decltype(I)> == t)
+    {
+      std::forward<F>(f)(decltype(I)());
+      notFound = false;
+    }
+  });
+  if (notFound)
+  {
+    std::stringstream ss;
+    spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
+      ss << detail::TypeToString<decltype(I)>::value << " ";
+    });
+    TV_THROW_RT_ERR("unknown type", detail::typeString(t),
+                    ", available: ", ss.str());
+  }
+}
+
+} // namespace tv
\ No newline at end of file
--- a/include/tensorview/tensorview.h
+++ b/include/tensorview/tensorview.h
@@ -17,9 +17,9 @@
 #include <cassert>
 #include <cstdlib>

+#include "prettyprint.h"
 #include <iostream>
 #include <memory>
-// #include <prettyprint.h>
 #include <sstream>
 #include <type_traits>
 #include <vector>
@@ -27,7 +27,6 @@
 #include <cuda_runtime_api.h>
 #endif

-
 namespace tv {

 #ifdef __NVCC__
@@ -72,15 +71,24 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
  sstream_print(ss, args...);
 }

+template <class... TArgs> void ssprint(TArgs... args) {
+  std::stringstream ss;
+  sstream_print(ss, args...);
+  std::cout << ss.str() << std::endl;
+}
+
+#define TV_THROW_RT_ERR(...)                                                   \
+  {                                                                            \
+    std::stringstream __macro_s;                                               \
+    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
+    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
+    throw std::runtime_error(__macro_s.str());                                 \
+  }
+
 #define TV_ASSERT_RT_ERR(expr, ...)                                            \
  {                                                                            \
-    if (!(expr)) {                                                             \
-      std::stringstream __macro_s;                                             \
-      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << #expr << " assert faild. ";                                 \
-      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
-      throw std::runtime_error(__macro_s.str());                               \
-    }                                                                          \
+    if (!(expr))                                                               \
+      TV_THROW_RT_ERR(__VA_ARGS__);                                            \
  }

 #define TV_ASSERT_INVALID_ARG(expr, ...)                                       \
@@ -96,24 +104,24 @@ void sstream_print(SStream &ss, T val, TArgs... args) {

 #define TV_CHECK_CUDA_ERR()                                                    \
  {                                                                            \
-    auto __macro_err = cudaGetLastError();                                             \
-    if (__macro_err != cudaSuccess) {                                                  \
+    auto __macro_err = cudaGetLastError();                                     \
+    if (__macro_err != cudaSuccess) {                                          \
      std::stringstream __macro_s;                                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << "cuda execution failed with error " << __macro_err;                 \
+      __macro_s << "cuda execution failed with error " << __macro_err;         \
      throw std::runtime_error(__macro_s.str());                               \
    }                                                                          \
  }

-#define TV_CHECK_CUDA_ERR_V2(...)                                                    \
+#define TV_CHECK_CUDA_ERR_V2(...)                                              \
  {                                                                            \
-    auto __macro_err = cudaGetLastError();                                             \
-    if (__macro_err != cudaSuccess) {                                                  \
+    auto __macro_err = cudaGetLastError();                                     \
+    if (__macro_err != cudaSuccess) {                                          \
      std::stringstream __macro_s;                                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << "cuda execution failed with error " << __macro_err;                 \
-      __macro_s << " " << cudaGetErrorString(__macro_err) << "\n";\
-      tv::sstream_print(__macro_s, __VA_ARGS__); \
+      __macro_s << "cuda execution failed with error " << __macro_err;         \
+      __macro_s << " " << cudaGetErrorString(__macro_err) << "\n";             \
+      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
      throw std::runtime_error(__macro_s.str());                               \
    }                                                                          \
  }
@@ -362,7 +370,8 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
      : SimpleVector<int, MaxDim>(shape) {}
-
+  TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<int, MaxDim> vec)
+      : SimpleVector<int, MaxDim>(vec) {}
  template <typename T, template <class...> class Container>
  ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
@@ -372,7 +381,7 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
  ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
 #ifdef TV_DEBUG
-    TV_ASSERT(start >= 0 && end < this->mSize && end > start);
+    TV_ASSERT(start >= 0 && end <= this->mSize && end > start);
 #endif
    ShapeBase<MaxDim> shape;
    for (int i = start; i < end; ++i) {
@@ -417,6 +426,13 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
    }
    return shape;
  }
+  TV_HOST_DEVICE size_t prod() const {
+    size_t res = 1;
+    for (size_t i = 0; i < this->mSize; ++i) {
+      res *= this->mArray[i];
+    }
+    return res;
+  }
 };

 using Shape = ShapeBase<TV_MAX_DIM>;
@@ -545,6 +561,9 @@ template <typename T, int Rank = -1> struct TensorView {
      : mPtr(ptr) {
    mShape = {int(shapes)...};
  }
+  operator TensorView<const T>() {
+    return TensorView<const T>(mPtr, mShape);
+  } // conversion function

  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
  assign(const TensorView<T, Rank> &tensor) {
@@ -846,6 +865,19 @@ template <typename T, int Rank = -1> struct TensorView {
 #endif
    return mPtr[idx];
  }
+  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
+#ifdef TV_DEBUG
+#if defined(__CUDA_ARCH__)
+    TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
+                      "index(%d) out-of-range: [0, %ld)\n", int(idx), size());
+#else
+    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
+               int(idx), size());
+#endif
+#endif
+    return mPtr[idx];
+  }
+
  // TODO: this is conflcit with operator[](SimpleVector<Slice> slice_vec).
  /*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
    int idx = rowArrayIdx(mShape, index);
@@ -970,6 +1002,26 @@ template <typename T, int Rank = -1> struct TensorView {
                               mShape.subshape(sizeof...(ints) + 1));
  }

+  TV_HOST_DEVICE_INLINE TensorView<T, Rank>
+  subview_ints(SimpleVector<int> ids) const {
+    Shape start = ids;
+    for (int i = ids.size(); i < ndim(); ++i) {
+      start.push_back(0);
+    }
+    return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
+                               mShape.subshape(ids.size()));
+  }
+
+  std::string print_vec(TensorView<T> tensor) const {
+    std::ostringstream ss;
+    ss << "[";
+    for (size_t i = 0; i < tensor.dim(0) - 1; ++i) {
+      ss << tensor(i) << ", ";
+    }
+    ss << tensor(tensor.dim(0) - 1) << "]";
+    return ss.str();
+  }
+
  std::string repr() const {
    std::ostringstream ss;
    if (empty())
@@ -983,37 +1035,29 @@ template <typename T, int Rank = -1> struct TensorView {
    }
    Shape counter = mShape;
    auto tensor_flat = this->view(-1);
-    for (int i = 0; i < counter.ndim(); ++i) {
+
+    for (int i = 0; i < counter.ndim() - 1; ++i) {
      counter[i] = 0;
-      ss << "[";
+      // ss << "[";
    }
-    for (size_t i = 0; i < this->size(); ++i) {
-      ss << tensor_flat(rowArrayIdx(mShape, counter));
-      counter[counter.ndim() - 1] += 1;
-      int inc_count = 0;
-      bool print_comma = true;
-      for (int c = counter.ndim() - 1; c >= 0; --c) {
-        if (counter[c] == this->dim(c) && c > 0) {
-          ++inc_count;
-          counter[c - 1] += 1;
-          counter[c] = 0;
-          print_comma = false;
+    for (size_t i = 0; i < this->size() / this->dim(this->ndim() - 1); ++i) {
+      for (int i = 0; i < counter.ndim() - 1; ++i) {
+        if (counter[i] == 0) {
+          ss << "[";
        }
      }
-      if (print_comma && i != this->size() - 1)
-        ss << ", ";
-      for (int j = 0; j < inc_count; ++j) {
-        ss << "]";
-      }
-      if (i != this->size() - 1) {
-        if (inc_count != 0)
-          ss << "\n";
-        for (int j = 0; j < inc_count; ++j) {
-          ss << "[";
+      std::cout << "counter.ndim() " << counter.ndim() << std::endl;
+      auto counter_ = counter.subshape(0, counter.ndim() - 1);
+      std::cout << counter.subshape(0, counter.ndim() - 1) << std::endl;
+      ss << print_vec(this->subview_ints(counter_)) << "\n";
+      std::cout << "after counter.ndim() " << counter.ndim() << std::endl;
+      for (int i = 0; i < counter.ndim() - 1; ++i) {
+        if (counter[i] == this->dim(i) - 1) {
+          ss << "]";
        }
      }
    }
-    ss << "]";
+    // ss << "]";
    // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
    // detail::simpleTypeName<T>());
    ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
@@ -1161,4 +1205,160 @@ TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
  return printTensorView(TensorView<const T>(ptr, shape), format);
 }

+#ifdef SPCONV_CUDA
+
+#ifdef __DRIVER_TYPES_H__
+#ifndef DEVICE_RESET
+#define DEVICE_RESET cudaDeviceReset();
+#endif
+#else
+#ifndef DEVICE_RESET
+#define DEVICE_RESET
+#endif
+#endif
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+  if (result) {
+    fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), func);
+    DEVICE_RESET
+    // Make sure we call CUDA Device Reset before exiting
+    exit(EXIT_FAILURE);
+  }
+}
+
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+template <typename T>
+void host2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
+  checkCudaErrors(
+      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToDevice, s));
+}
+template <typename T>
+void host2dev(TensorView<T> dst, const TensorView<const T> src,
+              cudaStream_t s = 0) {
+  host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T>
+void host2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+  host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+
+template <typename T> void host2dev_sync(T *dst, const T *src, size_t size) {
+  checkCudaErrors(
+      cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyHostToDevice));
+}
+template <typename T>
+void host2dev_sync(TensorView<T> dst, const TensorView<const T> src) {
+  host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
+}
+template <typename T>
+void host2dev_sync(TensorView<T> dst, const TensorView<T> src) {
+  host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
+}
+
+template <typename T>
+void dev2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
+  checkCudaErrors(
+      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToHost, s));
+}
+
+template <typename T>
+void dev2host(TensorView<T> dst, const TensorView<const T> src,
+              cudaStream_t s = 0) {
+  dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+
+template <typename T>
+void dev2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+  dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+
+template <typename T>
+void dev2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
+  checkCudaErrors(
+      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice, s));
+}
+
+template <typename T>
+void dev2dev(TensorView<T> dst, const TensorView<const T> src,
+             cudaStream_t s = 0) {
+  dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T>
+void dev2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+  dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+
+template <typename T>
+void host2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
+  checkCudaErrors(
+      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToHost, s));
+}
+
+template <typename T>
+void host2host(TensorView<T> dst, const TensorView<const T> src,
+               cudaStream_t s = 0) {
+  host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+template <typename T>
+void host2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+  host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
+}
+
+template <typename T> void zero_dev(TensorView<T> tensor) {
+  checkCudaErrors(cudaMemset(tensor.data(), 0, tensor.size() * sizeof(T)));
+}
+
+template <typename T> void zero_dev(TensorView<T> tensor, cudaStream_t s) {
+  checkCudaErrors(
+      cudaMemsetAsync(tensor.data(), 0, tensor.size() * sizeof(T), s));
+}
+template <typename T> void zero_host(TensorView<T> tensor) {
+  std::fill(tensor.data(), tensor.data() + tensor.size(), 0);
+}
+
+#endif
+
+namespace detail {
+template <typename T> struct TypeToString;
+
+template <> struct TypeToString<int32_t> {
+  static constexpr const char *value = "int32";
+};
+template <> struct TypeToString<bool> {
+  static constexpr const char *value = "bool";
+};
+template <> struct TypeToString<float> {
+  static constexpr const char *value = "float";
+};
+template <> struct TypeToString<double> {
+  static constexpr const char *value = "double";
+};
+template <> struct TypeToString<int16_t> {
+  static constexpr const char *value = "int16";
+};
+template <> struct TypeToString<int8_t> {
+  static constexpr const char *value = "int8";
+};
+template <> struct TypeToString<int64_t> {
+  static constexpr const char *value = "int64";
+};
+template <> struct TypeToString<uint8_t> {
+  static constexpr const char *value = "uint8";
+};
+template <> struct TypeToString<uint16_t> {
+  static constexpr const char *value = "uint16";
+};
+template <> struct TypeToString<uint32_t> {
+  static constexpr const char *value = "uint32";
+};
+template <> struct TypeToString<uint64_t> {
+  static constexpr const char *value = "uint64";
+};
+
+} // namespace detail
+
 } // namespace tv
\ No newline at end of file
--- a/include/torch_utils.h
+++ b/include/torch_utils.h
 // Copyright 2019 Yan Yan
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,24 +13,26 @@
 // limitations under the License.

 #pragma once
+#include <spconv/mp_helper.h>
 #include <tensorview/tensorview.h>
-#include <torch/script.h>
+
 #include <ATen/ATen.h>
+#include <torch/script.h>
 #ifdef SPCONV_CUDA
 #include <ATen/cuda/CUDAContext.h>
 #endif

 namespace tv {
-  
+
 #ifdef SPCONV_CUDA
-struct TorchGPU: public tv::GPU {
+struct TorchGPU : public tv::GPU {
  virtual cudaStream_t getStream() const override {
    return at::cuda::getCurrentCUDAStream();
  }
 };
 #endif
 template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
-  switch (tensor.type().scalarType()) {
+  switch (tensor.scalar_type()) {
  case at::ScalarType::Double: {
    auto val = std::is_same<std::remove_const_t<T>, double>::value;
    TV_ASSERT_RT_ERR(val, "error");
@@ -60,39 +62,63 @@ template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
    TV_ASSERT_RT_ERR(false, "error");
  }
 }
+namespace detail {
+template <typename T> struct TypeToTorchDtypeTraits;

-template <typename T>
-constexpr auto type2torch(T val=T()) -> decltype(torch::kInt32){
-  TV_ASSERT_RT_ERR(false, "unknown type");
-}
-
-template <>
-constexpr auto type2torch(int val) -> decltype(torch::kInt32){
-  return torch::kInt32;
-}
+template <> struct TypeToTorchDtypeTraits<int32_t> {
+  static constexpr decltype(torch::kInt32) value = torch::kInt32;
+};

-template <>
-constexpr auto type2torch(long val) -> decltype(torch::kInt32){
-  return torch::kInt64;
-}
+template <> struct TypeToTorchDtypeTraits<int64_t> {
+  static constexpr decltype(torch::kInt32) value = torch::kInt64;
+};

-template <>
-constexpr auto type2torch(float val) -> decltype(torch::kInt32){
-  return torch::kFloat32;
-}
+template <> struct TypeToTorchDtypeTraits<float> {
+  static constexpr decltype(torch::kInt32) value = torch::kFloat32;
+};
+template <> struct TypeToTorchDtypeTraits<double> {
+  static constexpr decltype(torch::kInt32) value = torch::kFloat64;
+};
+template <> struct TypeToTorchDtypeTraits<at::Half> {
+  static constexpr decltype(torch::kInt32) value = torch::kHalf;
+};

-template <>
-constexpr auto type2torch(double val) -> decltype(torch::kInt32){
-  return torch::kFloat64;
-}
+} // namespace detail

 template <typename T>
-tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
+constexpr decltype(torch::kInt32) torch_type_v =
+    detail::TypeToTorchDtypeTraits<T>::value;
+
+template <typename T> tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
  check_torch_dtype<T>(tensor);
  tv::Shape shape;
  for (auto i : tensor.sizes()) {
    shape.push_back(i);
  }
-  return tv::TensorView<T>(tensor.data<std::remove_const_t<T>>(), shape);
+  return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
 }
+namespace detail {
+template <> struct TypeToString<at::Half> {
+  static constexpr const char *value = "half";
+};
+} // namespace detail
+template <class... Ts, typename F>
+void torch_dispatch(at::ScalarType t, F &&f) {
+  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
+  bool notFound = true;
+  spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
+    if (torch_type_v<decltype(I)> == t) {
+      std::forward<F>(f)(decltype(I)());
+      notFound = false;
+    }
+  });
+  if (notFound) {
+    std::stringstream ss;
+    spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
+      ss << tv::detail::TypeToString<decltype(I)>::value << " ";
+    });
+    TV_THROW_RT_ERR("unknown type", t, ", available: ", ss.str());
+  }
+}
+
 } // namespace tv
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -15,6 +15,8 @@ from pathlib import Path

 LIBTORCH_ROOT = str(Path(torch.__file__).parent)

+SPCONV_FORCE_BUILD_CUDA = os.getenv("SPCONV_FORCE_BUILD_CUDA")
+
 PYTHON_VERSION = "{}.{}".format(sys.version_info.major, sys.version_info.minor)

 class CMakeExtension(Extension):
@@ -46,7 +48,7 @@ class CMakeBuild(build_ext):
                      '-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
                      '-DSPCONV_BuildTests=OFF',
                      ] #  -arch=sm_61
-        if not torch.cuda.is_available():
+        if not torch.cuda.is_available() and SPCONV_FORCE_BUILD_CUDA is None:
            cmake_args += ['-DSPCONV_BuildCUDA=OFF']
        else:
            cuda_flags = ["\"--expt-relaxed-constexpr\""]

--- a/spconv/__init__.py
+++ b/spconv/__init__.py
 # Copyright 2019 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,8 @@ from spconv.conv import SparseConvTranspose2d, SparseConvTranspose3d
 from spconv.conv import SparseInverseConv2d, SparseInverseConv3d
 from spconv.modules import SparseModule, SparseSequential
 from spconv.pool import SparseMaxPool2d, SparseMaxPool3d
+from spconv.tables import ConcatTable, JoinTable, AddTable
+from spconv.identity import Identity

 from spconv import ops

@@ -55,7 +57,7 @@ class SparseConvTensor(object):
                is very large.
        """
        self.features = features
-        self.indices = indices 
+        self.indices = indices
        if self.indices.dtype != torch.int32:
            self.indices.int()
        self.spatial_shape = spatial_shape
@@ -69,7 +71,7 @@ class SparseConvTensor(object):

    def find_indice_pair(self, key):
        if key is None:
-            return None 
+            return None
        if key in self.indice_dict:
            return self.indice_dict[key]
        return None
@@ -100,4 +102,4 @@ class RemoveGrid(SparseModule):
    """
    def forward(self, x: SparseConvTensor):
        x.grid = None
-        return x
\ No newline at end of file
+        return x
--- a/spconv/conv.py
+++ b/spconv/conv.py
@@ -70,7 +70,7 @@ class SparseConvolution(SparseModule):
                 inverse=False,
                 indice_key=None,
                 fused_bn=False,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConvolution, self).__init__()
        assert groups == 1
        if not isinstance(kernel_size, (list, tuple)):
@@ -221,7 +221,7 @@ class SparseConv2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConv2d, self).__init__(
            2,
            in_channels,
@@ -247,7 +247,7 @@ class SparseConv3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConv3d, self).__init__(
            3,
            in_channels,
@@ -273,7 +273,7 @@ class SparseConv4d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConv4d, self).__init__(
            4,
            in_channels,
@@ -299,7 +299,7 @@ class SparseConvTranspose2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConvTranspose2d, self).__init__(
            2,
            in_channels,
@@ -326,7 +326,7 @@ class SparseConvTranspose3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SparseConvTranspose3d, self).__init__(
            3,
            in_channels,
@@ -387,7 +387,7 @@ class SubMConv2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SubMConv2d, self).__init__(
            2,
            in_channels,
@@ -414,7 +414,7 @@ class SubMConv3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SubMConv3d, self).__init__(
            3,
            in_channels,
@@ -441,7 +441,7 @@ class SubMConv4d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=True):
+                 use_hash=False):
        super(SubMConv4d, self).__init__(
            4,
            in_channels,

--- a/spconv/identity.py
+++ b/spconv/identity.py
+# Copyright 2016-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torch.nn import Module
+
+
+class Identity(Module):
+    def forward(self, input):
+        return input
+
+    def input_spatial_size(self, out_size):
+        return out_size
--- a/spconv/modules.py
+++ b/spconv/modules.py
@@ -72,7 +72,7 @@ class SparseSequential(SparseModule):
                  ('conv2', SparseConv2d(20,64,5)),
                  ('relu2', nn.ReLU())
                ]))
-        
+
        # Example of using Sequential with kwargs(python 3.6+)
        model = SparseSequential(
                  conv1=SparseConv2d(1,20,5),
@@ -125,9 +125,12 @@ class SparseSequential(SparseModule):
    def forward(self, input):
        for k, module in self._modules.items():
            if is_spconv_module(module):  # use SpConvTensor as input
-                assert isinstance(input, spconv.SparseConvTensor)
-                self._sparity_dict[k] = input.sparity
-                input = module(input)
+                if isinstance(input, list):
+                    input = module(input)
+                else:
+                    assert isinstance(input, spconv.SparseConvTensor)
+                    self._sparity_dict[k] = input.sparity
+                    input = module(input)
            else:
                if isinstance(input, spconv.SparseConvTensor):
                    if input.indices.shape[0] != 0:

--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -111,16 +111,9 @@ def indice_conv(features,
              num_activate_out,
              inverse=False,
              subm=False):
-    if filters.dtype == torch.float32:
-        return torch.ops.spconv.indice_conv_fp32(features, filters, indice_pairs,
-                                               indice_pair_num, num_activate_out,
-                                               int(inverse), int(subm))
-    elif filters.dtype == torch.half:
-        return torch.ops.spconv.indice_conv_half(features, filters, indice_pairs,
-                                               indice_pair_num, num_activate_out,
-                                               int(inverse), int(subm))
-    else:
-        raise NotImplementedError
+    return torch.ops.spconv.indice_conv(features, filters, indice_pairs,
+                                        indice_pair_num, num_activate_out,
+                                        int(inverse), int(subm))

 def fused_indice_conv(features, filters, bias,
            indice_pairs,
@@ -145,14 +138,8 @@ def indice_conv_backward(features,
                       indice_pair_num,
                       inverse=False,
                       subm=False):
-    if filters.dtype == torch.float32:
-        return torch.ops.spconv.indice_conv_backward_fp32(
-            features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
-    elif filters.dtype == torch.half:
-        return torch.ops.spconv.indice_conv_backward_half(
-            features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
-    else:
-        raise NotImplementedError
+    return torch.ops.spconv.indice_conv_backward(
+        features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))


 def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):

--- a/spconv/tables.py
+++ b/spconv/tables.py
+
+from torch.autograd import Function
+#from torch.nn import Module
+from spconv.modules import SparseModule
+import spconv
+import torch
+
+class JoinTable(SparseModule):# Module):
+    def forward(self, input):
+        output = spconv.SparseConvTensor(
+                torch.cat([i.features for i in input],1), input[1].indices,
+                input[1].spatial_shape, input[0].batch_size )
+        output.indice_dict = input[1].indice_dict
+        output.grid = input[1].grid
+        return output
+
+    def input_spatial_size(self, out_size):
+        return out_size
+
+
+class AddTable(SparseModule): # Module):
+    def forward(self, input):
+        output = spconv.SparseConvTensor(
+                sum([i.features for i in input]), input[1].indices,
+                input[1].spatial_shape, input[1].batch_size )
+        output.indice_dict = input[1].indice_dict
+        output.grid = input[1].grid
+
+        return output
+
+    def input_spatial_size(self, out_size):
+        return out_size
+
+
+class ConcatTable(SparseModule): # Module):
+    def forward(self, input):
+        return [module(input) for module in self._modules.values()]
+
+    def add(self, module):
+        self._modules[str(len(self._modules))] = module
+        return self
+
+    def input_spatial_size(self, out_size):
+        return self._modules['0'].input_spatial_size(out_size)